| /* |
| * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All |
| * rights reserved. |
| * Copyright (C) 2005 Alexey Proskuryakov. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "third_party/blink/renderer/platform/text/unicode_utilities.h" |
| |
| #include <unicode/normalizer2.h> |
| #include "third_party/blink/renderer/platform/wtf/text/character_names.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_buffer.h" |
| |
| namespace blink { |
| |
| enum VoicedSoundMarkType { |
| kNoVoicedSoundMark, |
| kVoicedSoundMark, |
| kSemiVoicedSoundMark |
| }; |
| |
| template <typename CharType> |
| static inline CharType FoldQuoteMarkOrSoftHyphen(CharType c) { |
| switch (static_cast<UChar>(c)) { |
| case kHebrewPunctuationGershayimCharacter: |
| case kLeftDoubleQuotationMarkCharacter: |
| case kRightDoubleQuotationMarkCharacter: |
| return '"'; |
| case kHebrewPunctuationGereshCharacter: |
| case kLeftSingleQuotationMarkCharacter: |
| case kRightSingleQuotationMarkCharacter: |
| return '\''; |
| case kSoftHyphenCharacter: |
| // Replace soft hyphen with an ignorable character so that their presence |
| // or absence will |
| // not affect string comparison. |
| return 0; |
| default: |
| return c; |
| } |
| } |
| |
| void FoldQuoteMarksAndSoftHyphens(UChar* data, size_t length) { |
| for (size_t i = 0; i < length; ++i) |
| data[i] = FoldQuoteMarkOrSoftHyphen(data[i]); |
| } |
| |
| void FoldQuoteMarksAndSoftHyphens(String& s) { |
| s.Replace(kHebrewPunctuationGereshCharacter, '\''); |
| s.Replace(kHebrewPunctuationGershayimCharacter, '"'); |
| s.Replace(kLeftDoubleQuotationMarkCharacter, '"'); |
| s.Replace(kLeftSingleQuotationMarkCharacter, '\''); |
| s.Replace(kRightDoubleQuotationMarkCharacter, '"'); |
| s.Replace(kRightSingleQuotationMarkCharacter, '\''); |
| // Replace soft hyphen with an ignorable character so that their presence or |
| // absence will |
| // not affect string comparison. |
| s.Replace(kSoftHyphenCharacter, static_cast<UChar>('\0')); |
| } |
| |
| static bool IsNonLatin1Separator(UChar32 character) { |
| DCHECK_GE(character, 256); |
| return U_GET_GC_MASK(character) & |
| (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); |
| } |
| |
| bool IsSeparator(UChar32 character) { |
| // clang-format off |
| static const bool kLatin1SeparatorTable[256] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| // space ! " # $ % & ' ( ) * + , - . / |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| // : ; < = > ? |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, |
| // @ |
| 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| // [ \ ] ^ _ |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, |
| // ` |
| 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| // { | } ~ |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
| 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| // clang-format on |
| if (character < 256) |
| return kLatin1SeparatorTable[character]; |
| |
| return IsNonLatin1Separator(character); |
| } |
| |
| // ICU's search ignores the distinction between small kana letters and ones |
| // that are not small, and also characters that differ only in the voicing |
| // marks when considering only primary collation strength differences. |
| // This is not helpful for end users, since these differences make words |
| // distinct, so for our purposes we need these to be considered. |
| // The Unicode folks do not think the collation algorithm should be |
| // changed. To work around this, we would like to tailor the ICU searcher, |
| // but we can't get that to work yet. So instead, we check for cases where |
| // these differences occur, and skip those matches. |
| |
| // We refer to the above technique as the "kana workaround". The next few |
| // functions are helper functinos for the kana workaround. |
| |
| bool IsKanaLetter(UChar character) { |
| // Hiragana letters. |
| if (character >= 0x3041 && character <= 0x3096) |
| return true; |
| |
| // Katakana letters. |
| if (character >= 0x30A1 && character <= 0x30FA) |
| return true; |
| if (character >= 0x31F0 && character <= 0x31FF) |
| return true; |
| |
| // Halfwidth katakana letters. |
| if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70) |
| return true; |
| |
| return false; |
| } |
| |
| bool IsSmallKanaLetter(UChar character) { |
| DCHECK(IsKanaLetter(character)); |
| |
| switch (character) { |
| case 0x3041: // HIRAGANA LETTER SMALL A |
| case 0x3043: // HIRAGANA LETTER SMALL I |
| case 0x3045: // HIRAGANA LETTER SMALL U |
| case 0x3047: // HIRAGANA LETTER SMALL E |
| case 0x3049: // HIRAGANA LETTER SMALL O |
| case 0x3063: // HIRAGANA LETTER SMALL TU |
| case 0x3083: // HIRAGANA LETTER SMALL YA |
| case 0x3085: // HIRAGANA LETTER SMALL YU |
| case 0x3087: // HIRAGANA LETTER SMALL YO |
| case 0x308E: // HIRAGANA LETTER SMALL WA |
| case 0x3095: // HIRAGANA LETTER SMALL KA |
| case 0x3096: // HIRAGANA LETTER SMALL KE |
| case 0x30A1: // KATAKANA LETTER SMALL A |
| case 0x30A3: // KATAKANA LETTER SMALL I |
| case 0x30A5: // KATAKANA LETTER SMALL U |
| case 0x30A7: // KATAKANA LETTER SMALL E |
| case 0x30A9: // KATAKANA LETTER SMALL O |
| case 0x30C3: // KATAKANA LETTER SMALL TU |
| case 0x30E3: // KATAKANA LETTER SMALL YA |
| case 0x30E5: // KATAKANA LETTER SMALL YU |
| case 0x30E7: // KATAKANA LETTER SMALL YO |
| case 0x30EE: // KATAKANA LETTER SMALL WA |
| case 0x30F5: // KATAKANA LETTER SMALL KA |
| case 0x30F6: // KATAKANA LETTER SMALL KE |
| case 0x31F0: // KATAKANA LETTER SMALL KU |
| case 0x31F1: // KATAKANA LETTER SMALL SI |
| case 0x31F2: // KATAKANA LETTER SMALL SU |
| case 0x31F3: // KATAKANA LETTER SMALL TO |
| case 0x31F4: // KATAKANA LETTER SMALL NU |
| case 0x31F5: // KATAKANA LETTER SMALL HA |
| case 0x31F6: // KATAKANA LETTER SMALL HI |
| case 0x31F7: // KATAKANA LETTER SMALL HU |
| case 0x31F8: // KATAKANA LETTER SMALL HE |
| case 0x31F9: // KATAKANA LETTER SMALL HO |
| case 0x31FA: // KATAKANA LETTER SMALL MU |
| case 0x31FB: // KATAKANA LETTER SMALL RA |
| case 0x31FC: // KATAKANA LETTER SMALL RI |
| case 0x31FD: // KATAKANA LETTER SMALL RU |
| case 0x31FE: // KATAKANA LETTER SMALL RE |
| case 0x31FF: // KATAKANA LETTER SMALL RO |
| case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A |
| case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I |
| case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U |
| case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E |
| case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O |
| case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA |
| case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU |
| case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO |
| case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU |
| return true; |
| } |
| return false; |
| } |
| |
| static inline VoicedSoundMarkType ComposedVoicedSoundMark(UChar character) { |
| DCHECK(IsKanaLetter(character)); |
| |
| switch (character) { |
| case 0x304C: // HIRAGANA LETTER GA |
| case 0x304E: // HIRAGANA LETTER GI |
| case 0x3050: // HIRAGANA LETTER GU |
| case 0x3052: // HIRAGANA LETTER GE |
| case 0x3054: // HIRAGANA LETTER GO |
| case 0x3056: // HIRAGANA LETTER ZA |
| case 0x3058: // HIRAGANA LETTER ZI |
| case 0x305A: // HIRAGANA LETTER ZU |
| case 0x305C: // HIRAGANA LETTER ZE |
| case 0x305E: // HIRAGANA LETTER ZO |
| case 0x3060: // HIRAGANA LETTER DA |
| case 0x3062: // HIRAGANA LETTER DI |
| case 0x3065: // HIRAGANA LETTER DU |
| case 0x3067: // HIRAGANA LETTER DE |
| case 0x3069: // HIRAGANA LETTER DO |
| case 0x3070: // HIRAGANA LETTER BA |
| case 0x3073: // HIRAGANA LETTER BI |
| case 0x3076: // HIRAGANA LETTER BU |
| case 0x3079: // HIRAGANA LETTER BE |
| case 0x307C: // HIRAGANA LETTER BO |
| case 0x3094: // HIRAGANA LETTER VU |
| case 0x30AC: // KATAKANA LETTER GA |
| case 0x30AE: // KATAKANA LETTER GI |
| case 0x30B0: // KATAKANA LETTER GU |
| case 0x30B2: // KATAKANA LETTER GE |
| case 0x30B4: // KATAKANA LETTER GO |
| case 0x30B6: // KATAKANA LETTER ZA |
| case 0x30B8: // KATAKANA LETTER ZI |
| case 0x30BA: // KATAKANA LETTER ZU |
| case 0x30BC: // KATAKANA LETTER ZE |
| case 0x30BE: // KATAKANA LETTER ZO |
| case 0x30C0: // KATAKANA LETTER DA |
| case 0x30C2: // KATAKANA LETTER DI |
| case 0x30C5: // KATAKANA LETTER DU |
| case 0x30C7: // KATAKANA LETTER DE |
| case 0x30C9: // KATAKANA LETTER DO |
| case 0x30D0: // KATAKANA LETTER BA |
| case 0x30D3: // KATAKANA LETTER BI |
| case 0x30D6: // KATAKANA LETTER BU |
| case 0x30D9: // KATAKANA LETTER BE |
| case 0x30DC: // KATAKANA LETTER BO |
| case 0x30F4: // KATAKANA LETTER VU |
| case 0x30F7: // KATAKANA LETTER VA |
| case 0x30F8: // KATAKANA LETTER VI |
| case 0x30F9: // KATAKANA LETTER VE |
| case 0x30FA: // KATAKANA LETTER VO |
| return kVoicedSoundMark; |
| case 0x3071: // HIRAGANA LETTER PA |
| case 0x3074: // HIRAGANA LETTER PI |
| case 0x3077: // HIRAGANA LETTER PU |
| case 0x307A: // HIRAGANA LETTER PE |
| case 0x307D: // HIRAGANA LETTER PO |
| case 0x30D1: // KATAKANA LETTER PA |
| case 0x30D4: // KATAKANA LETTER PI |
| case 0x30D7: // KATAKANA LETTER PU |
| case 0x30DA: // KATAKANA LETTER PE |
| case 0x30DD: // KATAKANA LETTER PO |
| return kSemiVoicedSoundMark; |
| } |
| return kNoVoicedSoundMark; |
| } |
| |
| static inline bool IsCombiningVoicedSoundMark(UChar character) { |
| switch (character) { |
| case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK |
| case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK |
| return true; |
| } |
| return false; |
| } |
| |
| bool ContainsKanaLetters(const String& pattern) { |
| const unsigned length = pattern.length(); |
| for (unsigned i = 0; i < length; ++i) { |
| if (IsKanaLetter(pattern[i])) |
| return true; |
| } |
| return false; |
| } |
| |
| void NormalizeCharactersIntoNFCForm(const UChar* characters, |
| unsigned length, |
| Vector<UChar>& buffer) { |
| DCHECK(length); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| const icu::Normalizer2* normalizer = icu::Normalizer2::getNFCInstance(status); |
| DCHECK(U_SUCCESS(status)); |
| int32_t input_length = static_cast<int32_t>(length); |
| // copy-on-write. |
| icu::UnicodeString normalized(false, characters, input_length); |
| // In the vast majority of cases, input is already NFC. Run a quick check |
| // to avoid normalizing the entire input unnecessarily. |
| int32_t normalized_prefix_length = |
| normalizer->spanQuickCheckYes(normalized, status); |
| if (normalized_prefix_length < input_length) { |
| icu::UnicodeString un_normalized(normalized, normalized_prefix_length); |
| normalized.truncate(normalized_prefix_length); |
| normalizer->normalizeSecondAndAppend(normalized, un_normalized, status); |
| } |
| int32_t buffer_size = normalized.length(); |
| DCHECK(buffer_size); |
| |
| buffer.resize(static_cast<wtf_size_t>(buffer_size)); |
| normalized.extract(buffer.data(), buffer_size, status); |
| DCHECK(U_SUCCESS(status)); |
| } |
| |
| // This function returns kNotFound if |first| and |second| contain different |
| // Kana letters. If |first| and |second| contain the same Kana letter then |
| // function returns offset in characters from |first|. |
| // Pointers to both strings increase simultaneously so so it is possible to use |
| // one offset value. |
| static inline size_t CompareKanaLetterAndComposedVoicedSoundMarks( |
| const UChar* first, |
| const UChar* first_end, |
| const UChar* second, |
| const UChar* second_end) { |
| const UChar* start = first; |
| // Check for differences in the kana letter character itself. |
| if (IsSmallKanaLetter(*first) != IsSmallKanaLetter(*second)) |
| return kNotFound; |
| if (ComposedVoicedSoundMark(*first) != ComposedVoicedSoundMark(*second)) |
| return kNotFound; |
| ++first; |
| ++second; |
| |
| // Check for differences in combining voiced sound marks found after the |
| // letter. |
| while (true) { |
| const bool second_is_not_sound_mark = |
| second == second_end || !IsCombiningVoicedSoundMark(*second); |
| if (first == first_end || !IsCombiningVoicedSoundMark(*first)) { |
| return second_is_not_sound_mark ? first - start : kNotFound; |
| } |
| if (second_is_not_sound_mark) |
| return kNotFound; |
| if (*first != *second) |
| return kNotFound; |
| ++first; |
| ++second; |
| } |
| } |
| |
| bool CheckOnlyKanaLettersInStrings(const UChar* first_data, |
| unsigned first_length, |
| const UChar* second_data, |
| unsigned second_length) { |
| const UChar* a = first_data; |
| const UChar* a_end = first_data + first_length; |
| |
| const UChar* b = second_data; |
| const UChar* b_end = second_data + second_length; |
| while (true) { |
| // Skip runs of non-kana-letter characters. This is necessary so we can |
| // correctly handle strings where the |firstData| and |secondData| have |
| // different-length runs of characters that match, while still double |
| // checking the correctness of matches of kana letters with other kana |
| // letters. |
| while (a != a_end && !IsKanaLetter(*a)) |
| ++a; |
| while (b != b_end && !IsKanaLetter(*b)) |
| ++b; |
| |
| // If we reached the end of either the target or the match, we should have |
| // reached the end of both; both should have the same number of kana |
| // letters. |
| if (a == a_end || b == b_end) { |
| return a == a_end && b == b_end; |
| } |
| |
| // Check that single Kana letters in |a| and |b| are the same. |
| const size_t offset = |
| CompareKanaLetterAndComposedVoicedSoundMarks(a, a_end, b, b_end); |
| if (offset == kNotFound) |
| return false; |
| |
| // Update values of |a| and |b| after comparing. |
| a += offset; |
| b += offset; |
| } |
| } |
| |
| bool CheckKanaStringsEqual(const UChar* first_data, |
| unsigned first_length, |
| const UChar* second_data, |
| unsigned second_length) { |
| const UChar* a = first_data; |
| const UChar* a_end = first_data + first_length; |
| |
| const UChar* b = second_data; |
| const UChar* b_end = second_data + second_length; |
| while (true) { |
| // Check for non-kana-letter characters. |
| while (a != a_end && !IsKanaLetter(*a) && b != b_end && !IsKanaLetter(*b)) { |
| if (*a++ != *b++) |
| return false; |
| } |
| |
| // If we reached the end of either the target or the match, we should have |
| // reached the end of both; both should have the same number of kana |
| // letters. |
| if (a == a_end || b == b_end) { |
| return a == a_end && b == b_end; |
| } |
| |
| if (IsKanaLetter(*a) != IsKanaLetter(*b)) |
| return false; |
| |
| // Check that single Kana letters in |a| and |b| are the same. |
| const size_t offset = |
| CompareKanaLetterAndComposedVoicedSoundMarks(a, a_end, b, b_end); |
| if (offset == kNotFound) |
| return false; |
| |
| // Update values of |a| and |b| after comparing. |
| a += offset; |
| b += offset; |
| } |
| } |
| |
| } // namespace blink |