blob: 3b0bcf074efd0b124695adfc9ceb8cacb723ae77 [file] [log] [blame]
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
#include <unicode/casemap.h>
#include "base/notreached.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/string_impl.h"
#include "third_party/blink/renderer/platform/wtf/text/string_view.h"
#include "third_party/blink/renderer/platform/wtf/text/text_offset_map.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
namespace WTF {
namespace {
inline bool LocaleIdMatchesLang(const AtomicString& locale_id,
const StringView& lang) {
CHECK_GE(lang.length(), 2u);
CHECK_LE(lang.length(), 3u);
if (!locale_id.Impl() || !locale_id.Impl()->StartsWithIgnoringCase(lang))
return false;
if (locale_id.Impl()->length() == lang.length())
return true;
const UChar maybe_delimiter = (*locale_id.Impl())[lang.length()];
return maybe_delimiter == '-' || maybe_delimiter == '_' ||
maybe_delimiter == '@';
}
enum class CaseMapType { kLower, kUpper };
scoped_refptr<StringImpl> CaseConvert(CaseMapType type,
StringImpl* source,
const char* locale,
TextOffsetMap* offset_map = nullptr) {
DCHECK(source);
CHECK_LE(source->length(),
static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max()));
const wtf_size_t source_length = source->length();
scoped_refptr<StringImpl> upconverted = source->UpconvertedString();
const UChar* source16 = upconverted->Characters16();
UChar* data16;
wtf_size_t target_length = source_length;
scoped_refptr<StringImpl> output =
StringImpl::CreateUninitialized(target_length, data16);
while (true) {
UErrorCode status = U_ZERO_ERROR;
icu::Edits edits;
switch (type) {
case CaseMapType::kLower:
target_length = icu::CaseMap::toLower(
locale, /* options */ 0,
reinterpret_cast<const char16_t*>(source16), source_length,
reinterpret_cast<char16_t*>(data16), target_length, &edits, status);
break;
case CaseMapType::kUpper:
target_length = icu::CaseMap::toUpper(
locale, /* options */ 0,
reinterpret_cast<const char16_t*>(source16), source_length,
reinterpret_cast<char16_t*>(data16), target_length, &edits, status);
break;
}
if (U_SUCCESS(status)) {
if (!edits.hasChanges())
return source;
if (offset_map)
offset_map->Append(edits);
if (source_length == target_length)
return output;
return output->Substring(0, target_length);
}
// Expand the buffer and retry if the target is longer.
if (status == U_BUFFER_OVERFLOW_ERROR) {
output = StringImpl::CreateUninitialized(target_length, data16);
continue;
}
NOTREACHED();
return source;
}
}
} // namespace
const char* CaseMap::Locale::turkic_or_azeri_ = "tr";
const char* CaseMap::Locale::greek_ = "el";
const char* CaseMap::Locale::lithuanian_ = "lt";
CaseMap::Locale::Locale(const AtomicString& locale) {
// Use the more optimized code path most of the time.
//
// Only Turkic (tr and az) languages and Lithuanian requires
// locale-specific lowercasing rules. Even though CLDR has el-Lower,
// it's identical to the locale-agnostic lowercasing. Context-dependent
// handling of Greek capital sigma is built into the common lowercasing
// function in ICU.
//
// Only Turkic (tr and az) languages, Greek and Lithuanian require
// locale-specific uppercasing rules.
if (UNLIKELY(LocaleIdMatchesLang(locale, "tr") ||
LocaleIdMatchesLang(locale, "az")))
case_map_locale_ = turkic_or_azeri_;
else if (UNLIKELY(LocaleIdMatchesLang(locale, "el")))
case_map_locale_ = greek_;
else if (UNLIKELY(LocaleIdMatchesLang(locale, "lt")))
case_map_locale_ = lithuanian_;
else
case_map_locale_ = nullptr;
}
scoped_refptr<StringImpl> CaseMap::TryFastToLowerInvariant(StringImpl* source) {
DCHECK(source);
// Note: This is a hot function in the Dromaeo benchmark, specifically the
// no-op code path up through the first 'return' statement.
// First scan the string for uppercase and non-ASCII characters:
if (source->Is8Bit()) {
wtf_size_t first_index_to_be_lowered = source->length();
for (wtf_size_t i = 0; i < source->length(); ++i) {
LChar ch = source->Characters8()[i];
if (UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
first_index_to_be_lowered = i;
break;
}
}
// Nothing to do if the string is all ASCII with no uppercase.
if (first_index_to_be_lowered == source->length())
return source;
LChar* data8;
scoped_refptr<StringImpl> new_impl =
StringImpl::CreateUninitialized(source->length(), data8);
memcpy(data8, source->Characters8(), first_index_to_be_lowered);
for (wtf_size_t i = first_index_to_be_lowered; i < source->length(); ++i) {
LChar ch = source->Characters8()[i];
data8[i] = UNLIKELY(ch & ~0x7F) ? static_cast<LChar>(unicode::ToLower(ch))
: ToASCIILower(ch);
}
return new_impl;
}
bool no_upper = true;
UChar ored = 0;
const UChar* end = source->Characters16() + source->length();
for (const UChar* chp = source->Characters16(); chp != end; ++chp) {
if (UNLIKELY(IsASCIIUpper(*chp)))
no_upper = false;
ored |= *chp;
}
// Nothing to do if the string is all ASCII with no uppercase.
if (no_upper && !(ored & ~0x7F))
return source;
CHECK_LE(source->length(),
static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max()));
int32_t length = source->length();
if (!(ored & ~0x7F)) {
UChar* data16;
scoped_refptr<StringImpl> new_impl =
StringImpl::CreateUninitialized(source->length(), data16);
for (int32_t i = 0; i < length; ++i) {
UChar c = source->Characters16()[i];
data16[i] = ToASCIILower(c);
}
return new_impl;
}
// The fast code path was not able to handle this case.
return nullptr;
}
scoped_refptr<StringImpl> CaseMap::FastToLowerInvariant(StringImpl* source) {
// Note: This is a hot function in the Dromaeo benchmark.
DCHECK(source);
if (scoped_refptr<StringImpl> result = TryFastToLowerInvariant(source))
return result;
const char* locale = ""; // "" = root locale.
return CaseConvert(CaseMapType::kLower, source, locale);
}
scoped_refptr<StringImpl> CaseMap::ToLowerInvariant(StringImpl* source,
TextOffsetMap* offset_map) {
DCHECK(source);
DCHECK(!offset_map || offset_map->IsEmpty());
if (scoped_refptr<StringImpl> result = TryFastToLowerInvariant(source))
return result;
const char* locale = ""; // "" = root locale.
return CaseConvert(CaseMapType::kLower, source, locale, offset_map);
}
scoped_refptr<StringImpl> CaseMap::ToUpperInvariant(StringImpl* source,
TextOffsetMap* offset_map) {
DCHECK(source);
DCHECK(!offset_map || offset_map->IsEmpty());
// This function could be optimized for no-op cases the way LowerUnicode() is,
// but in empirical testing, few actual calls to UpperUnicode() are no-ops, so
// it wouldn't be worth the extra time for pre-scanning.
CHECK_LE(source->length(),
static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max()));
int32_t length = source->length();
if (source->Is8Bit()) {
LChar* data8;
scoped_refptr<StringImpl> new_impl =
StringImpl::CreateUninitialized(source->length(), data8);
// Do a faster loop for the case where all the characters are ASCII.
LChar ored = 0;
for (int i = 0; i < length; ++i) {
LChar c = source->Characters8()[i];
ored |= c;
data8[i] = ToASCIIUpper(c);
}
if (!(ored & ~0x7F))
return new_impl;
// Do a slower implementation for cases that include non-ASCII Latin-1
// characters.
int number_sharp_s_characters = 0;
// There are two special cases.
// 1. latin-1 characters when converted to upper case are 16 bit
// characters.
// 2. Lower case sharp-S converts to "SS" (two characters)
for (int32_t i = 0; i < length; ++i) {
LChar c = source->Characters8()[i];
if (UNLIKELY(c == kSmallLetterSharpSCharacter))
++number_sharp_s_characters;
UChar upper = static_cast<UChar>(unicode::ToUpper(c));
if (UNLIKELY(upper > 0xff)) {
// Since this upper-cased character does not fit in an 8-bit string, we
// need to take the 16-bit path.
goto upconvert;
}
data8[i] = static_cast<LChar>(upper);
}
if (!number_sharp_s_characters)
return new_impl;
// We have numberSSCharacters sharp-s characters, but none of the other
// special characters.
new_impl = StringImpl::CreateUninitialized(
source->length() + number_sharp_s_characters, data8);
LChar* dest = data8;
for (int32_t i = 0; i < length; ++i) {
LChar c = source->Characters8()[i];
if (c == kSmallLetterSharpSCharacter) {
*dest++ = 'S';
*dest++ = 'S';
if (offset_map)
offset_map->Append(i + 1, dest - data8);
} else {
*dest++ = static_cast<LChar>(unicode::ToUpper(c));
}
}
return new_impl;
}
upconvert:
scoped_refptr<StringImpl> upconverted = source->UpconvertedString();
const UChar* source16 = upconverted->Characters16();
UChar* data16;
scoped_refptr<StringImpl> new_impl =
StringImpl::CreateUninitialized(source->length(), data16);
// Do a faster loop for the case where all the characters are ASCII.
UChar ored = 0;
for (int i = 0; i < length; ++i) {
UChar c = source16[i];
ored |= c;
data16[i] = ToASCIIUpper(c);
}
if (!(ored & ~0x7F))
return new_impl;
// Do a slower implementation for cases that include non-ASCII characters.
const char* locale = ""; // "" = root locale.
return CaseConvert(CaseMapType::kUpper, source, locale, offset_map);
}
scoped_refptr<StringImpl> CaseMap::ToLower(StringImpl* source,
TextOffsetMap* offset_map) const {
DCHECK(source);
DCHECK(!offset_map || offset_map->IsEmpty());
if (!case_map_locale_)
return ToLowerInvariant(source, offset_map);
return CaseConvert(CaseMapType::kLower, source, case_map_locale_, offset_map);
}
scoped_refptr<StringImpl> CaseMap::ToUpper(StringImpl* source,
TextOffsetMap* offset_map) const {
DCHECK(source);
DCHECK(!offset_map || offset_map->IsEmpty());
if (!case_map_locale_)
return ToUpperInvariant(source, offset_map);
return CaseConvert(CaseMapType::kUpper, source, case_map_locale_, offset_map);
}
String CaseMap::ToLower(const String& source, TextOffsetMap* offset_map) const {
DCHECK(!offset_map || offset_map->IsEmpty());
if (StringImpl* impl = source.Impl())
return ToLower(impl, offset_map);
return String();
}
String CaseMap::ToUpper(const String& source, TextOffsetMap* offset_map) const {
DCHECK(!offset_map || offset_map->IsEmpty());
if (StringImpl* impl = source.Impl())
return ToUpper(impl, offset_map);
return String();
}
} // namespace WTF