| // Copyright 2019 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "third_party/blink/renderer/platform/wtf/text/case_map.h" |
| |
| #include <unicode/casemap.h> |
| |
| #include "base/notreached.h" |
| #include "third_party/blink/renderer/platform/wtf/text/atomic_string.h" |
| #include "third_party/blink/renderer/platform/wtf/text/character_names.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_impl.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_view.h" |
| #include "third_party/blink/renderer/platform/wtf/text/text_offset_map.h" |
| #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" |
| |
| namespace WTF { |
| |
| namespace { |
| |
| inline bool LocaleIdMatchesLang(const AtomicString& locale_id, |
| const StringView& lang) { |
| CHECK_GE(lang.length(), 2u); |
| CHECK_LE(lang.length(), 3u); |
| if (!locale_id.Impl() || !locale_id.Impl()->StartsWithIgnoringCase(lang)) |
| return false; |
| if (locale_id.Impl()->length() == lang.length()) |
| return true; |
| const UChar maybe_delimiter = (*locale_id.Impl())[lang.length()]; |
| return maybe_delimiter == '-' || maybe_delimiter == '_' || |
| maybe_delimiter == '@'; |
| } |
| |
// Direction of the case mapping performed by CaseConvert().
enum class CaseMapType {
  kLower,  // Convert to lowercase.
  kUpper,  // Convert to uppercase.
};
| |
| scoped_refptr<StringImpl> CaseConvert(CaseMapType type, |
| StringImpl* source, |
| const char* locale, |
| TextOffsetMap* offset_map = nullptr) { |
| DCHECK(source); |
| CHECK_LE(source->length(), |
| static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max())); |
| const wtf_size_t source_length = source->length(); |
| |
| scoped_refptr<StringImpl> upconverted = source->UpconvertedString(); |
| const UChar* source16 = upconverted->Characters16(); |
| |
| UChar* data16; |
| wtf_size_t target_length = source_length; |
| scoped_refptr<StringImpl> output = |
| StringImpl::CreateUninitialized(target_length, data16); |
| while (true) { |
| UErrorCode status = U_ZERO_ERROR; |
| icu::Edits edits; |
| switch (type) { |
| case CaseMapType::kLower: |
| target_length = icu::CaseMap::toLower( |
| locale, /* options */ 0, |
| reinterpret_cast<const char16_t*>(source16), source_length, |
| reinterpret_cast<char16_t*>(data16), target_length, &edits, status); |
| break; |
| case CaseMapType::kUpper: |
| target_length = icu::CaseMap::toUpper( |
| locale, /* options */ 0, |
| reinterpret_cast<const char16_t*>(source16), source_length, |
| reinterpret_cast<char16_t*>(data16), target_length, &edits, status); |
| break; |
| } |
| if (U_SUCCESS(status)) { |
| if (!edits.hasChanges()) |
| return source; |
| |
| if (offset_map) |
| offset_map->Append(edits); |
| |
| if (source_length == target_length) |
| return output; |
| return output->Substring(0, target_length); |
| } |
| |
| // Expand the buffer and retry if the target is longer. |
| if (status == U_BUFFER_OVERFLOW_ERROR) { |
| output = StringImpl::CreateUninitialized(target_length, data16); |
| continue; |
| } |
| |
| NOTREACHED(); |
| return source; |
| } |
| } |
| |
| } // namespace |
| |
| const char* CaseMap::Locale::turkic_or_azeri_ = "tr"; |
| const char* CaseMap::Locale::greek_ = "el"; |
| const char* CaseMap::Locale::lithuanian_ = "lt"; |
| |
| CaseMap::Locale::Locale(const AtomicString& locale) { |
| // Use the more optimized code path most of the time. |
| // |
| // Only Turkic (tr and az) languages and Lithuanian requires |
| // locale-specific lowercasing rules. Even though CLDR has el-Lower, |
| // it's identical to the locale-agnostic lowercasing. Context-dependent |
| // handling of Greek capital sigma is built into the common lowercasing |
| // function in ICU. |
| // |
| // Only Turkic (tr and az) languages, Greek and Lithuanian require |
| // locale-specific uppercasing rules. |
| if (UNLIKELY(LocaleIdMatchesLang(locale, "tr") || |
| LocaleIdMatchesLang(locale, "az"))) |
| case_map_locale_ = turkic_or_azeri_; |
| else if (UNLIKELY(LocaleIdMatchesLang(locale, "el"))) |
| case_map_locale_ = greek_; |
| else if (UNLIKELY(LocaleIdMatchesLang(locale, "lt"))) |
| case_map_locale_ = lithuanian_; |
| else |
| case_map_locale_ = nullptr; |
| } |
| |
| scoped_refptr<StringImpl> CaseMap::TryFastToLowerInvariant(StringImpl* source) { |
| DCHECK(source); |
| |
| // Note: This is a hot function in the Dromaeo benchmark, specifically the |
| // no-op code path up through the first 'return' statement. |
| |
| // First scan the string for uppercase and non-ASCII characters: |
| if (source->Is8Bit()) { |
| wtf_size_t first_index_to_be_lowered = source->length(); |
| for (wtf_size_t i = 0; i < source->length(); ++i) { |
| LChar ch = source->Characters8()[i]; |
| if (UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) { |
| first_index_to_be_lowered = i; |
| break; |
| } |
| } |
| |
| // Nothing to do if the string is all ASCII with no uppercase. |
| if (first_index_to_be_lowered == source->length()) |
| return source; |
| |
| LChar* data8; |
| scoped_refptr<StringImpl> new_impl = |
| StringImpl::CreateUninitialized(source->length(), data8); |
| memcpy(data8, source->Characters8(), first_index_to_be_lowered); |
| |
| for (wtf_size_t i = first_index_to_be_lowered; i < source->length(); ++i) { |
| LChar ch = source->Characters8()[i]; |
| data8[i] = UNLIKELY(ch & ~0x7F) ? static_cast<LChar>(unicode::ToLower(ch)) |
| : ToASCIILower(ch); |
| } |
| |
| return new_impl; |
| } |
| |
| bool no_upper = true; |
| UChar ored = 0; |
| |
| const UChar* end = source->Characters16() + source->length(); |
| for (const UChar* chp = source->Characters16(); chp != end; ++chp) { |
| if (UNLIKELY(IsASCIIUpper(*chp))) |
| no_upper = false; |
| ored |= *chp; |
| } |
| // Nothing to do if the string is all ASCII with no uppercase. |
| if (no_upper && !(ored & ~0x7F)) |
| return source; |
| |
| CHECK_LE(source->length(), |
| static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max())); |
| int32_t length = source->length(); |
| |
| if (!(ored & ~0x7F)) { |
| UChar* data16; |
| scoped_refptr<StringImpl> new_impl = |
| StringImpl::CreateUninitialized(source->length(), data16); |
| |
| for (int32_t i = 0; i < length; ++i) { |
| UChar c = source->Characters16()[i]; |
| data16[i] = ToASCIILower(c); |
| } |
| return new_impl; |
| } |
| |
| // The fast code path was not able to handle this case. |
| return nullptr; |
| } |
| |
| scoped_refptr<StringImpl> CaseMap::FastToLowerInvariant(StringImpl* source) { |
| // Note: This is a hot function in the Dromaeo benchmark. |
| DCHECK(source); |
| if (scoped_refptr<StringImpl> result = TryFastToLowerInvariant(source)) |
| return result; |
| const char* locale = ""; // "" = root locale. |
| return CaseConvert(CaseMapType::kLower, source, locale); |
| } |
| |
| scoped_refptr<StringImpl> CaseMap::ToLowerInvariant(StringImpl* source, |
| TextOffsetMap* offset_map) { |
| DCHECK(source); |
| DCHECK(!offset_map || offset_map->IsEmpty()); |
| if (scoped_refptr<StringImpl> result = TryFastToLowerInvariant(source)) |
| return result; |
| const char* locale = ""; // "" = root locale. |
| return CaseConvert(CaseMapType::kLower, source, locale, offset_map); |
| } |
| |
| scoped_refptr<StringImpl> CaseMap::ToUpperInvariant(StringImpl* source, |
| TextOffsetMap* offset_map) { |
| DCHECK(source); |
| DCHECK(!offset_map || offset_map->IsEmpty()); |
| |
| // This function could be optimized for no-op cases the way LowerUnicode() is, |
| // but in empirical testing, few actual calls to UpperUnicode() are no-ops, so |
| // it wouldn't be worth the extra time for pre-scanning. |
| |
| CHECK_LE(source->length(), |
| static_cast<wtf_size_t>(std::numeric_limits<int32_t>::max())); |
| int32_t length = source->length(); |
| |
| if (source->Is8Bit()) { |
| LChar* data8; |
| scoped_refptr<StringImpl> new_impl = |
| StringImpl::CreateUninitialized(source->length(), data8); |
| |
| // Do a faster loop for the case where all the characters are ASCII. |
| LChar ored = 0; |
| for (int i = 0; i < length; ++i) { |
| LChar c = source->Characters8()[i]; |
| ored |= c; |
| data8[i] = ToASCIIUpper(c); |
| } |
| if (!(ored & ~0x7F)) |
| return new_impl; |
| |
| // Do a slower implementation for cases that include non-ASCII Latin-1 |
| // characters. |
| int number_sharp_s_characters = 0; |
| |
| // There are two special cases. |
| // 1. latin-1 characters when converted to upper case are 16 bit |
| // characters. |
| // 2. Lower case sharp-S converts to "SS" (two characters) |
| for (int32_t i = 0; i < length; ++i) { |
| LChar c = source->Characters8()[i]; |
| if (UNLIKELY(c == kSmallLetterSharpSCharacter)) |
| ++number_sharp_s_characters; |
| UChar upper = static_cast<UChar>(unicode::ToUpper(c)); |
| if (UNLIKELY(upper > 0xff)) { |
| // Since this upper-cased character does not fit in an 8-bit string, we |
| // need to take the 16-bit path. |
| goto upconvert; |
| } |
| data8[i] = static_cast<LChar>(upper); |
| } |
| |
| if (!number_sharp_s_characters) |
| return new_impl; |
| |
| // We have numberSSCharacters sharp-s characters, but none of the other |
| // special characters. |
| new_impl = StringImpl::CreateUninitialized( |
| source->length() + number_sharp_s_characters, data8); |
| |
| LChar* dest = data8; |
| |
| for (int32_t i = 0; i < length; ++i) { |
| LChar c = source->Characters8()[i]; |
| if (c == kSmallLetterSharpSCharacter) { |
| *dest++ = 'S'; |
| *dest++ = 'S'; |
| if (offset_map) |
| offset_map->Append(i + 1, dest - data8); |
| } else { |
| *dest++ = static_cast<LChar>(unicode::ToUpper(c)); |
| } |
| } |
| |
| return new_impl; |
| } |
| |
| upconvert: |
| scoped_refptr<StringImpl> upconverted = source->UpconvertedString(); |
| const UChar* source16 = upconverted->Characters16(); |
| |
| UChar* data16; |
| scoped_refptr<StringImpl> new_impl = |
| StringImpl::CreateUninitialized(source->length(), data16); |
| |
| // Do a faster loop for the case where all the characters are ASCII. |
| UChar ored = 0; |
| for (int i = 0; i < length; ++i) { |
| UChar c = source16[i]; |
| ored |= c; |
| data16[i] = ToASCIIUpper(c); |
| } |
| if (!(ored & ~0x7F)) |
| return new_impl; |
| |
| // Do a slower implementation for cases that include non-ASCII characters. |
| const char* locale = ""; // "" = root locale. |
| return CaseConvert(CaseMapType::kUpper, source, locale, offset_map); |
| } |
| |
| scoped_refptr<StringImpl> CaseMap::ToLower(StringImpl* source, |
| TextOffsetMap* offset_map) const { |
| DCHECK(source); |
| DCHECK(!offset_map || offset_map->IsEmpty()); |
| |
| if (!case_map_locale_) |
| return ToLowerInvariant(source, offset_map); |
| return CaseConvert(CaseMapType::kLower, source, case_map_locale_, offset_map); |
| } |
| |
| scoped_refptr<StringImpl> CaseMap::ToUpper(StringImpl* source, |
| TextOffsetMap* offset_map) const { |
| DCHECK(source); |
| DCHECK(!offset_map || offset_map->IsEmpty()); |
| |
| if (!case_map_locale_) |
| return ToUpperInvariant(source, offset_map); |
| return CaseConvert(CaseMapType::kUpper, source, case_map_locale_, offset_map); |
| } |
| |
| String CaseMap::ToLower(const String& source, TextOffsetMap* offset_map) const { |
| DCHECK(!offset_map || offset_map->IsEmpty()); |
| |
| if (StringImpl* impl = source.Impl()) |
| return ToLower(impl, offset_map); |
| return String(); |
| } |
| |
| String CaseMap::ToUpper(const String& source, TextOffsetMap* offset_map) const { |
| DCHECK(!offset_map || offset_map->IsEmpty()); |
| |
| if (StringImpl* impl = source.Impl()) |
| return ToUpper(impl, offset_map); |
| return String(); |
| } |
| |
| } // namespace WTF |