chromium/src/third_party/blink/renderer/platform/text/unicode_utilities.cc - manifest_repos/chromium_src - Git at Google

 /*
  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All
  * rights reserved.
  * Copyright (C) 2005 Alexey Proskuryakov.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "third_party/blink/renderer/platform/text/unicode_utilities.h"

 #include <unicode/normalizer2.h>
 #include "third_party/blink/renderer/platform/wtf/text/character_names.h"
 #include "third_party/blink/renderer/platform/wtf/text/string_buffer.h"

 namespace blink {

 // Describes which voicing mark, if any, is already part of a precomposed kana
 // letter. Used by the kana workaround to tell apart letters that differ only
 // in their voicing.
 enum VoicedSoundMarkType {
   // The letter has no composed voicing mark.
   kNoVoicedSoundMark = 0,
   // The letter includes a voiced sound mark (e.g. GA, ZA, DA, BA).
   kVoicedSoundMark = 1,
   // The letter includes a semi-voiced sound mark (e.g. PA).
   kSemiVoicedSoundMark = 2,
 };

 template <typename CharType>
 static inline CharType FoldQuoteMarkOrSoftHyphen(CharType c) {
   switch (static_cast<UChar>(c)) {
     case kHebrewPunctuationGershayimCharacter:
     case kLeftDoubleQuotationMarkCharacter:
     case kRightDoubleQuotationMarkCharacter:
       return '"';
     case kHebrewPunctuationGereshCharacter:
     case kLeftSingleQuotationMarkCharacter:
     case kRightSingleQuotationMarkCharacter:
       return '\'';
     case kSoftHyphenCharacter:
       // Replace soft hyphen with an ignorable character so that their presence
       // or absence will
       // not affect string comparison.
       return 0;
     default:
       return c;
   }
 }

 // Folds quote marks and soft hyphens in place across the |length| code units
 // starting at |data|, one code unit at a time.
 void FoldQuoteMarksAndSoftHyphens(UChar* data, size_t length) {
   UChar* const end = data + length;
   for (UChar* cursor = data; cursor != end; ++cursor)
     *cursor = FoldQuoteMarkOrSoftHyphen(*cursor);
 }

 // Folds quote marks and soft hyphens in |s| in place. The replacements target
 // disjoint characters, so they are grouped here by their replacement value.
 void FoldQuoteMarksAndSoftHyphens(String& s) {
   // Double-quote variants fold to the ASCII double quote.
   s.Replace(kHebrewPunctuationGershayimCharacter, '"');
   s.Replace(kLeftDoubleQuotationMarkCharacter, '"');
   s.Replace(kRightDoubleQuotationMarkCharacter, '"');

   // Single-quote variants fold to the ASCII apostrophe.
   s.Replace(kHebrewPunctuationGereshCharacter, '\'');
   s.Replace(kLeftSingleQuotationMarkCharacter, '\'');
   s.Replace(kRightSingleQuotationMarkCharacter, '\'');

   // Replace soft hyphen with an ignorable character so that its presence or
   // absence will not affect string comparison.
   s.Replace(kSoftHyphenCharacter, static_cast<UChar>('\0'));
 }

 // Returns true when |character| (which must be >= 256) falls in one of the
 // general-category groups selected below: S, P, Z, or Cf.
 static bool IsNonLatin1Separator(UChar32 character) {
   DCHECK_GE(character, 256);
   const auto separator_categories =
       U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK;
   return (U_GET_GC_MASK(character) & separator_categories) != 0;
 }

 // Returns true if |character| is treated as a separator.
 // Values below 256 are answered from a fixed lookup table (one entry per
 // value, 16 entries per row); everything else is delegated to
 // IsNonLatin1Separator(). Negative values are never separators.
 bool IsSeparator(UChar32 character) {
   // clang-format off
   static const bool kLatin1SeparatorTable[256] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       // space ! " # $ % & ' ( ) * + , - . /
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       //                         : ; < = > ?
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       //   @
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       //                         [ \ ] ^ _
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       //   `
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       //                           { | } ~
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
   };
   // clang-format on

   // UChar32 is a signed type in ICU, so a negative value would otherwise
   // satisfy the "< 256" test below and index the table out of bounds.
   if (character < 0)
     return false;

   if (character < 256)
     return kLatin1SeparatorTable[character];

   return IsNonLatin1Separator(character);
 }

 // ICU's search ignores the distinction between small kana letters and ones
 // that are not small, and also characters that differ only in the voicing
 // marks when considering only primary collation strength differences.
 // This is not helpful for end users, since these differences make words
 // distinct, so for our purposes we need these to be considered.
 // The Unicode folks do not think the collation algorithm should be
 // changed. To work around this, we would like to tailor the ICU searcher,
 // but we can't get that to work yet. So instead, we check for cases where
 // these differences occur, and skip those matches.

 // We refer to the above technique as the "kana workaround". The next few
 // functions are helper functions for the kana workaround.

 // Returns true if |character| lies in one of the kana letter ranges checked
 // below (hiragana, katakana, the 0x31F0-0x31FF katakana run, or halfwidth
 // katakana excluding 0xFF70).
 bool IsKanaLetter(UChar character) {
   const bool is_hiragana = 0x3041 <= character && character <= 0x3096;
   const bool is_katakana = 0x30A1 <= character && character <= 0x30FA;
   const bool is_katakana_31f0_run =
       0x31F0 <= character && character <= 0x31FF;
   // 0xFF70 sits inside the halfwidth range but is not counted as a letter.
   const bool is_halfwidth_katakana =
       0xFF66 <= character && character <= 0xFF9D && character != 0xFF70;

   return is_hiragana || is_katakana || is_katakana_31f0_run ||
          is_halfwidth_katakana;
 }

 // Returns true if |character| is one of the small kana letters.
 // |character| must already be a kana letter (DCHECKed).
 bool IsSmallKanaLetter(UChar character) {
   DCHECK(IsKanaLetter(character));

   // Every code point from KATAKANA LETTER SMALL KU (0x31F0) through
   // KATAKANA LETTER SMALL RO (0x31FF) is a small letter.
   if (character >= 0x31F0 && character <= 0x31FF)
     return true;

   // Every code point from HALFWIDTH KATAKANA LETTER SMALL A (0xFF67) through
   // HALFWIDTH KATAKANA LETTER SMALL TU (0xFF6F) is a small letter.
   if (character >= 0xFF67 && character <= 0xFF6F)
     return true;

   // The remaining small letters are scattered among the regular ones.
   switch (character) {
     // Hiragana.
     case 0x3041:  // HIRAGANA LETTER SMALL A
     case 0x3043:  // HIRAGANA LETTER SMALL I
     case 0x3045:  // HIRAGANA LETTER SMALL U
     case 0x3047:  // HIRAGANA LETTER SMALL E
     case 0x3049:  // HIRAGANA LETTER SMALL O
     case 0x3063:  // HIRAGANA LETTER SMALL TU
     case 0x3083:  // HIRAGANA LETTER SMALL YA
     case 0x3085:  // HIRAGANA LETTER SMALL YU
     case 0x3087:  // HIRAGANA LETTER SMALL YO
     case 0x308E:  // HIRAGANA LETTER SMALL WA
     case 0x3095:  // HIRAGANA LETTER SMALL KA
     case 0x3096:  // HIRAGANA LETTER SMALL KE
     // Katakana.
     case 0x30A1:  // KATAKANA LETTER SMALL A
     case 0x30A3:  // KATAKANA LETTER SMALL I
     case 0x30A5:  // KATAKANA LETTER SMALL U
     case 0x30A7:  // KATAKANA LETTER SMALL E
     case 0x30A9:  // KATAKANA LETTER SMALL O
     case 0x30C3:  // KATAKANA LETTER SMALL TU
     case 0x30E3:  // KATAKANA LETTER SMALL YA
     case 0x30E5:  // KATAKANA LETTER SMALL YU
     case 0x30E7:  // KATAKANA LETTER SMALL YO
     case 0x30EE:  // KATAKANA LETTER SMALL WA
     case 0x30F5:  // KATAKANA LETTER SMALL KA
     case 0x30F6:  // KATAKANA LETTER SMALL KE
       return true;
     default:
       return false;
   }
 }

 // Reports which voicing mark is already composed into the kana letter
 // |character|: semi-voiced (PA row), voiced (GA/ZA/DA/BA rows and the V
 // letters), or none. |character| must be a kana letter (DCHECKed).
 static inline VoicedSoundMarkType ComposedVoicedSoundMark(UChar character) {
   DCHECK(IsKanaLetter(character));

   switch (character) {
     // Letters with a composed semi-voiced sound mark.
     case 0x3071:  // HIRAGANA LETTER PA
     case 0x3074:  // HIRAGANA LETTER PI
     case 0x3077:  // HIRAGANA LETTER PU
     case 0x307A:  // HIRAGANA LETTER PE
     case 0x307D:  // HIRAGANA LETTER PO
     case 0x30D1:  // KATAKANA LETTER PA
     case 0x30D4:  // KATAKANA LETTER PI
     case 0x30D7:  // KATAKANA LETTER PU
     case 0x30DA:  // KATAKANA LETTER PE
     case 0x30DD:  // KATAKANA LETTER PO
       return kSemiVoicedSoundMark;

     // Letters with a composed voiced sound mark.
     case 0x304C:  // HIRAGANA LETTER GA
     case 0x304E:  // HIRAGANA LETTER GI
     case 0x3050:  // HIRAGANA LETTER GU
     case 0x3052:  // HIRAGANA LETTER GE
     case 0x3054:  // HIRAGANA LETTER GO
     case 0x3056:  // HIRAGANA LETTER ZA
     case 0x3058:  // HIRAGANA LETTER ZI
     case 0x305A:  // HIRAGANA LETTER ZU
     case 0x305C:  // HIRAGANA LETTER ZE
     case 0x305E:  // HIRAGANA LETTER ZO
     case 0x3060:  // HIRAGANA LETTER DA
     case 0x3062:  // HIRAGANA LETTER DI
     case 0x3065:  // HIRAGANA LETTER DU
     case 0x3067:  // HIRAGANA LETTER DE
     case 0x3069:  // HIRAGANA LETTER DO
     case 0x3070:  // HIRAGANA LETTER BA
     case 0x3073:  // HIRAGANA LETTER BI
     case 0x3076:  // HIRAGANA LETTER BU
     case 0x3079:  // HIRAGANA LETTER BE
     case 0x307C:  // HIRAGANA LETTER BO
     case 0x3094:  // HIRAGANA LETTER VU
     case 0x30AC:  // KATAKANA LETTER GA
     case 0x30AE:  // KATAKANA LETTER GI
     case 0x30B0:  // KATAKANA LETTER GU
     case 0x30B2:  // KATAKANA LETTER GE
     case 0x30B4:  // KATAKANA LETTER GO
     case 0x30B6:  // KATAKANA LETTER ZA
     case 0x30B8:  // KATAKANA LETTER ZI
     case 0x30BA:  // KATAKANA LETTER ZU
     case 0x30BC:  // KATAKANA LETTER ZE
     case 0x30BE:  // KATAKANA LETTER ZO
     case 0x30C0:  // KATAKANA LETTER DA
     case 0x30C2:  // KATAKANA LETTER DI
     case 0x30C5:  // KATAKANA LETTER DU
     case 0x30C7:  // KATAKANA LETTER DE
     case 0x30C9:  // KATAKANA LETTER DO
     case 0x30D0:  // KATAKANA LETTER BA
     case 0x30D3:  // KATAKANA LETTER BI
     case 0x30D6:  // KATAKANA LETTER BU
     case 0x30D9:  // KATAKANA LETTER BE
     case 0x30DC:  // KATAKANA LETTER BO
     case 0x30F4:  // KATAKANA LETTER VU
     case 0x30F7:  // KATAKANA LETTER VA
     case 0x30F8:  // KATAKANA LETTER VI
     case 0x30F9:  // KATAKANA LETTER VE
     case 0x30FA:  // KATAKANA LETTER VO
       return kVoicedSoundMark;

     default:
       return kNoVoicedSoundMark;
   }
 }

 // Returns true if |character| is one of the two combining kana sound marks.
 static inline bool IsCombiningVoicedSoundMark(UChar character) {
   // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
   constexpr UChar kCombiningVoicedMark = 0x3099;
   // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
   constexpr UChar kCombiningSemiVoicedMark = 0x309A;

   return character == kCombiningVoicedMark ||
          character == kCombiningSemiVoicedMark;
 }

 // Returns true if at least one code unit of |pattern| is a kana letter.
 bool ContainsKanaLetters(const String& pattern) {
   const unsigned count = pattern.length();
   unsigned index = 0;
   // Advance past everything that is not a kana letter; stopping before the
   // end means one was found.
   while (index < count && !IsKanaLetter(pattern[index]))
     ++index;
   return index < count;
 }

 // Normalizes the |length| code units starting at |characters| into NFC and
 // writes the result into |buffer|, which is resized to the normalized length.
 // |length| must be non-zero (DCHECKed). ICU errors are only checked through
 // DCHECK; no error is reported to the caller.
 void NormalizeCharactersIntoNFCForm(const UChar* characters,
                                     unsigned length,
                                     Vector<UChar>& buffer) {
   DCHECK(length);

   UErrorCode status = U_ZERO_ERROR;
   const icu::Normalizer2* normalizer = icu::Normalizer2::getNFCInstance(status);
   DCHECK(U_SUCCESS(status));
   // NOTE(review): |length| is unsigned; values above INT32_MAX would not
   // survive this cast. Presumably callers never pass such lengths - confirm.
   int32_t input_length = static_cast<int32_t>(length);
   // Wrap the caller's buffer rather than copying it up front (copy-on-write).
   // NOTE(review): this relies on ICU's aliasing UnicodeString constructor;
   // see the ICU documentation for the meaning of the leading |false|.
   icu::UnicodeString normalized(false, characters, input_length);
   // In the vast majority of cases, input is already NFC. Run a quick check
   // to avoid normalizing the entire input unnecessarily.
   int32_t normalized_prefix_length =
       normalizer->spanQuickCheckYes(normalized, status);
   if (normalized_prefix_length < input_length) {
     // Split at the end of the already-normalized prefix: keep the prefix in
     // |normalized|, then normalize only the remaining tail and append it.
     icu::UnicodeString un_normalized(normalized, normalized_prefix_length);
     normalized.truncate(normalized_prefix_length);
     normalizer->normalizeSecondAndAppend(normalized, un_normalized, status);
   }
   int32_t buffer_size = normalized.length();
   DCHECK(buffer_size);

   // Copy the normalized text out into the caller-provided vector.
   buffer.resize(static_cast<wtf_size_t>(buffer_size));
   normalized.extract(buffer.data(), buffer_size, status);
   DCHECK(U_SUCCESS(status));
 }

 // This function returns kNotFound if |first| and |second| contain different
 // Kana letters.  If |first| and |second| contain the same Kana letter then
 // function returns offset in characters from |first|.
 // Pointers to both strings increase simultaneously so so it is possible to use
 // one offset value.
 static inline size_t CompareKanaLetterAndComposedVoicedSoundMarks(
     const UChar* first,
     const UChar* first_end,
     const UChar* second,
     const UChar* second_end) {
   const UChar* start = first;
   // Check for differences in the kana letter character itself.
   if (IsSmallKanaLetter(*first) != IsSmallKanaLetter(*second))
     return kNotFound;
   if (ComposedVoicedSoundMark(*first) != ComposedVoicedSoundMark(*second))
     return kNotFound;
   ++first;
   ++second;

   // Check for differences in combining voiced sound marks found after the
   // letter.
   while (true) {
     const bool second_is_not_sound_mark =
         second == second_end || !IsCombiningVoicedSoundMark(*second);
     if (first == first_end || !IsCombiningVoicedSoundMark(*first)) {
       return second_is_not_sound_mark ? first - start : kNotFound;
     }
     if (second_is_not_sound_mark)
       return kNotFound;
     if (*first != *second)
       return kNotFound;
     ++first;
     ++second;
   }
 }

 bool CheckOnlyKanaLettersInStrings(const UChar* first_data,
                                    unsigned first_length,
                                    const UChar* second_data,
                                    unsigned second_length) {
   const UChar* a = first_data;
   const UChar* a_end = first_data + first_length;

   const UChar* b = second_data;
   const UChar* b_end = second_data + second_length;
   while (true) {
     // Skip runs of non-kana-letter characters. This is necessary so we can
     // correctly handle strings where the |firstData| and |secondData| have
     // different-length runs of characters that match, while still double
     // checking the correctness of matches of kana letters with other kana
     // letters.
     while (a != a_end && !IsKanaLetter(*a))
       ++a;
     while (b != b_end && !IsKanaLetter(*b))
       ++b;

     // If we reached the end of either the target or the match, we should have
     // reached the end of both; both should have the same number of kana
     // letters.
     if (a == a_end || b == b_end) {
       return a == a_end && b == b_end;
     }

     // Check that single Kana letters in |a| and |b| are the same.
     const size_t offset =
         CompareKanaLetterAndComposedVoicedSoundMarks(a, a_end, b, b_end);
     if (offset == kNotFound)
       return false;

     // Update values of |a| and |b| after comparing.
     a += offset;
     b += offset;
   }
 }

 bool CheckKanaStringsEqual(const UChar* first_data,
                            unsigned first_length,
                            const UChar* second_data,
                            unsigned second_length) {
   const UChar* a = first_data;
   const UChar* a_end = first_data + first_length;

   const UChar* b = second_data;
   const UChar* b_end = second_data + second_length;
   while (true) {
     // Check for non-kana-letter characters.
     while (a != a_end && !IsKanaLetter(*a) && b != b_end && !IsKanaLetter(*b)) {
       if (*a++ != *b++)
         return false;
     }

     // If we reached the end of either the target or the match, we should have
     // reached the end of both; both should have the same number of kana
     // letters.
     if (a == a_end || b == b_end) {
       return a == a_end && b == b_end;
     }

     if (IsKanaLetter(*a) != IsKanaLetter(*b))
       return false;

     // Check that single Kana letters in |a| and |b| are the same.
     const size_t offset =
         CompareKanaLetterAndComposedVoicedSoundMarks(a, a_end, b, b_end);
     if (offset == kNotFound)
       return false;

     // Update values of |a| and |b| after comparing.
     a += offset;
     b += offset;
   }
 }

 }  // namespace blink
	/*
	* Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All
	* rights reserved.
	* Copyright (C) 2005 Alexey Proskuryakov.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "third_party/blink/renderer/platform/text/unicode_utilities.h"

	#include <unicode/normalizer2.h>
	#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
	#include "third_party/blink/renderer/platform/wtf/text/string_buffer.h"

	namespace blink {

	// Describes which voicing mark, if any, is already part of a precomposed kana
	// letter. Used by the kana workaround to tell apart letters that differ only
	// in their voicing.
	enum VoicedSoundMarkType {
	  // The letter has no composed voicing mark.
	  kNoVoicedSoundMark = 0,
	  // The letter includes a voiced sound mark (e.g. GA, ZA, DA, BA).
	  kVoicedSoundMark = 1,
	  // The letter includes a semi-voiced sound mark (e.g. PA).
	  kSemiVoicedSoundMark = 2,
	};

	template <typename CharType>
	static inline CharType FoldQuoteMarkOrSoftHyphen(CharType c) {
	switch (static_cast<UChar>(c)) {
	case kHebrewPunctuationGershayimCharacter:
	case kLeftDoubleQuotationMarkCharacter:
	case kRightDoubleQuotationMarkCharacter:
	return '"';
	case kHebrewPunctuationGereshCharacter:
	case kLeftSingleQuotationMarkCharacter:
	case kRightSingleQuotationMarkCharacter:
	return '\'';
	case kSoftHyphenCharacter:
	// Replace soft hyphen with an ignorable character so that their presence
	// or absence will
	// not affect string comparison.
	return 0;
	default:
	return c;
	}
	}

	// Folds quote marks and soft hyphens in place across the |length| code units
	// starting at |data|, one code unit at a time.
	void FoldQuoteMarksAndSoftHyphens(UChar* data, size_t length) {
	  UChar* const end = data + length;
	  for (UChar* cursor = data; cursor != end; ++cursor)
	    *cursor = FoldQuoteMarkOrSoftHyphen(*cursor);
	}

	// Folds quote marks and soft hyphens in |s| in place. The replacements target
	// disjoint characters, so they are grouped here by their replacement value.
	void FoldQuoteMarksAndSoftHyphens(String& s) {
	  // Double-quote variants fold to the ASCII double quote.
	  s.Replace(kHebrewPunctuationGershayimCharacter, '"');
	  s.Replace(kLeftDoubleQuotationMarkCharacter, '"');
	  s.Replace(kRightDoubleQuotationMarkCharacter, '"');

	  // Single-quote variants fold to the ASCII apostrophe.
	  s.Replace(kHebrewPunctuationGereshCharacter, '\'');
	  s.Replace(kLeftSingleQuotationMarkCharacter, '\'');
	  s.Replace(kRightSingleQuotationMarkCharacter, '\'');

	  // Replace soft hyphen with an ignorable character so that its presence or
	  // absence will not affect string comparison.
	  s.Replace(kSoftHyphenCharacter, static_cast<UChar>('\0'));
	}

	// Returns true when |character| (which must be >= 256) falls in one of the
	// general-category groups selected below: S, P, Z, or Cf.
	static bool IsNonLatin1Separator(UChar32 character) {
	  DCHECK_GE(character, 256);
	  // The masks are combined with bitwise OR; the escaped "\|" sequences
	  // that stood here were not valid C++.
	  const auto separator_categories =
	      U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK;
	  return (U_GET_GC_MASK(character) & separator_categories) != 0;
	}

	// Returns true if |character| is treated as a separator.
	// Values below 256 are answered from a fixed lookup table (one entry per
	// value, 16 entries per row); everything else is delegated to
	// IsNonLatin1Separator(). Negative values are never separators.
	bool IsSeparator(UChar32 character) {
	  // clang-format off
	  static const bool kLatin1SeparatorTable[256] = {
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      // space ! " # $ % & ' ( ) * + , - . /
	      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	      //                         : ; < = > ?
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
	      //   @
	      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      //                         [ \ ] ^ _
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
	      //   `
	      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      //                           { | } ~
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
	      1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
	  };
	  // clang-format on

	  // UChar32 is a signed type in ICU, so a negative value would otherwise
	  // satisfy the "< 256" test below and index the table out of bounds.
	  if (character < 0)
	    return false;

	  if (character < 256)
	    return kLatin1SeparatorTable[character];

	  return IsNonLatin1Separator(character);
	}

	// ICU's search ignores the distinction between small kana letters and ones
	// that are not small, and also characters that differ only in the voicing
	// marks when considering only primary collation strength differences.
	// This is not helpful for end users, since these differences make words
	// distinct, so for our purposes we need these to be considered.
	// The Unicode folks do not think the collation algorithm should be
	// changed. To work around this, we would like to tailor the ICU searcher,
	// but we can't get that to work yet. So instead, we check for cases where
	// these differences occur, and skip those matches.

	// We refer to the above technique as the "kana workaround". The next few
	// functions are helper functions for the kana workaround.

	// Returns true if |character| lies in one of the kana letter ranges checked
	// below (hiragana, katakana, the 0x31F0-0x31FF katakana run, or halfwidth
	// katakana excluding 0xFF70).
	bool IsKanaLetter(UChar character) {
	  const bool is_hiragana = 0x3041 <= character && character <= 0x3096;
	  const bool is_katakana = 0x30A1 <= character && character <= 0x30FA;
	  const bool is_katakana_31f0_run =
	      0x31F0 <= character && character <= 0x31FF;
	  // 0xFF70 sits inside the halfwidth range but is not counted as a letter.
	  const bool is_halfwidth_katakana =
	      0xFF66 <= character && character <= 0xFF9D && character != 0xFF70;

	  return is_hiragana || is_katakana || is_katakana_31f0_run ||
	         is_halfwidth_katakana;
	}

	// Returns true if |character| is one of the small kana letters.
	// |character| must already be a kana letter (DCHECKed).
	bool IsSmallKanaLetter(UChar character) {
	  DCHECK(IsKanaLetter(character));

	  // Every code point from KATAKANA LETTER SMALL KU (0x31F0) through
	  // KATAKANA LETTER SMALL RO (0x31FF) is a small letter.
	  if (character >= 0x31F0 && character <= 0x31FF)
	    return true;

	  // Every code point from HALFWIDTH KATAKANA LETTER SMALL A (0xFF67) through
	  // HALFWIDTH KATAKANA LETTER SMALL TU (0xFF6F) is a small letter.
	  if (character >= 0xFF67 && character <= 0xFF6F)
	    return true;

	  // The remaining small letters are scattered among the regular ones.
	  switch (character) {
	    // Hiragana.
	    case 0x3041:  // HIRAGANA LETTER SMALL A
	    case 0x3043:  // HIRAGANA LETTER SMALL I
	    case 0x3045:  // HIRAGANA LETTER SMALL U
	    case 0x3047:  // HIRAGANA LETTER SMALL E
	    case 0x3049:  // HIRAGANA LETTER SMALL O
	    case 0x3063:  // HIRAGANA LETTER SMALL TU
	    case 0x3083:  // HIRAGANA LETTER SMALL YA
	    case 0x3085:  // HIRAGANA LETTER SMALL YU
	    case 0x3087:  // HIRAGANA LETTER SMALL YO
	    case 0x308E:  // HIRAGANA LETTER SMALL WA
	    case 0x3095:  // HIRAGANA LETTER SMALL KA
	    case 0x3096:  // HIRAGANA LETTER SMALL KE
	    // Katakana.
	    case 0x30A1:  // KATAKANA LETTER SMALL A
	    case 0x30A3:  // KATAKANA LETTER SMALL I
	    case 0x30A5:  // KATAKANA LETTER SMALL U
	    case 0x30A7:  // KATAKANA LETTER SMALL E
	    case 0x30A9:  // KATAKANA LETTER SMALL O
	    case 0x30C3:  // KATAKANA LETTER SMALL TU
	    case 0x30E3:  // KATAKANA LETTER SMALL YA
	    case 0x30E5:  // KATAKANA LETTER SMALL YU
	    case 0x30E7:  // KATAKANA LETTER SMALL YO
	    case 0x30EE:  // KATAKANA LETTER SMALL WA
	    case 0x30F5:  // KATAKANA LETTER SMALL KA
	    case 0x30F6:  // KATAKANA LETTER SMALL KE
	      return true;
	    default:
	      return false;
	  }
	}

	// Reports which voicing mark is already composed into the kana letter
	// |character|: semi-voiced (PA row), voiced (GA/ZA/DA/BA rows and the V
	// letters), or none. |character| must be a kana letter (DCHECKed).
	static inline VoicedSoundMarkType ComposedVoicedSoundMark(UChar character) {
	  DCHECK(IsKanaLetter(character));

	  switch (character) {
	    // Letters with a composed semi-voiced sound mark.
	    case 0x3071:  // HIRAGANA LETTER PA
	    case 0x3074:  // HIRAGANA LETTER PI
	    case 0x3077:  // HIRAGANA LETTER PU
	    case 0x307A:  // HIRAGANA LETTER PE
	    case 0x307D:  // HIRAGANA LETTER PO
	    case 0x30D1:  // KATAKANA LETTER PA
	    case 0x30D4:  // KATAKANA LETTER PI
	    case 0x30D7:  // KATAKANA LETTER PU
	    case 0x30DA:  // KATAKANA LETTER PE
	    case 0x30DD:  // KATAKANA LETTER PO
	      return kSemiVoicedSoundMark;

	    // Letters with a composed voiced sound mark.
	    case 0x304C:  // HIRAGANA LETTER GA
	    case 0x304E:  // HIRAGANA LETTER GI
	    case 0x3050:  // HIRAGANA LETTER GU
	    case 0x3052:  // HIRAGANA LETTER GE
	    case 0x3054:  // HIRAGANA LETTER GO
	    case 0x3056:  // HIRAGANA LETTER ZA
	    case 0x3058:  // HIRAGANA LETTER ZI
	    case 0x305A:  // HIRAGANA LETTER ZU
	    case 0x305C:  // HIRAGANA LETTER ZE
	    case 0x305E:  // HIRAGANA LETTER ZO
	    case 0x3060:  // HIRAGANA LETTER DA
	    case 0x3062:  // HIRAGANA LETTER DI
	    case 0x3065:  // HIRAGANA LETTER DU
	    case 0x3067:  // HIRAGANA LETTER DE
	    case 0x3069:  // HIRAGANA LETTER DO
	    case 0x3070:  // HIRAGANA LETTER BA
	    case 0x3073:  // HIRAGANA LETTER BI
	    case 0x3076:  // HIRAGANA LETTER BU
	    case 0x3079:  // HIRAGANA LETTER BE
	    case 0x307C:  // HIRAGANA LETTER BO
	    case 0x3094:  // HIRAGANA LETTER VU
	    case 0x30AC:  // KATAKANA LETTER GA
	    case 0x30AE:  // KATAKANA LETTER GI
	    case 0x30B0:  // KATAKANA LETTER GU
	    case 0x30B2:  // KATAKANA LETTER GE
	    case 0x30B4:  // KATAKANA LETTER GO
	    case 0x30B6:  // KATAKANA LETTER ZA
	    case 0x30B8:  // KATAKANA LETTER ZI
	    case 0x30BA:  // KATAKANA LETTER ZU
	    case 0x30BC:  // KATAKANA LETTER ZE
	    case 0x30BE:  // KATAKANA LETTER ZO
	    case 0x30C0:  // KATAKANA LETTER DA
	    case 0x30C2:  // KATAKANA LETTER DI
	    case 0x30C5:  // KATAKANA LETTER DU
	    case 0x30C7:  // KATAKANA LETTER DE
	    case 0x30C9:  // KATAKANA LETTER DO
	    case 0x30D0:  // KATAKANA LETTER BA
	    case 0x30D3:  // KATAKANA LETTER BI
	    case 0x30D6:  // KATAKANA LETTER BU
	    case 0x30D9:  // KATAKANA LETTER BE
	    case 0x30DC:  // KATAKANA LETTER BO
	    case 0x30F4:  // KATAKANA LETTER VU
	    case 0x30F7:  // KATAKANA LETTER VA
	    case 0x30F8:  // KATAKANA LETTER VI
	    case 0x30F9:  // KATAKANA LETTER VE
	    case 0x30FA:  // KATAKANA LETTER VO
	      return kVoicedSoundMark;

	    default:
	      return kNoVoicedSoundMark;
	  }
	}

	// Returns true if |character| is one of the two combining kana sound marks.
	static inline bool IsCombiningVoicedSoundMark(UChar character) {
	  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
	  constexpr UChar kCombiningVoicedMark = 0x3099;
	  // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
	  constexpr UChar kCombiningSemiVoicedMark = 0x309A;

	  return character == kCombiningVoicedMark ||
	         character == kCombiningSemiVoicedMark;
	}

	// Returns true if at least one code unit of |pattern| is a kana letter.
	bool ContainsKanaLetters(const String& pattern) {
	  const unsigned count = pattern.length();
	  unsigned index = 0;
	  // Advance past everything that is not a kana letter; stopping before the
	  // end means one was found.
	  while (index < count && !IsKanaLetter(pattern[index]))
	    ++index;
	  return index < count;
	}

	// Normalizes the |length| code units starting at |characters| into NFC and
	// writes the result into |buffer|, which is resized to the normalized length.
	// |length| must be non-zero (DCHECKed). ICU errors are only checked through
	// DCHECK; no error is reported to the caller.
	void NormalizeCharactersIntoNFCForm(const UChar* characters,
	unsigned length,
	Vector<UChar>& buffer) {
	DCHECK(length);

	UErrorCode status = U_ZERO_ERROR;
	const icu::Normalizer2* normalizer = icu::Normalizer2::getNFCInstance(status);
	DCHECK(U_SUCCESS(status));
	// NOTE(review): |length| is unsigned; values above INT32_MAX would not
	// survive this cast. Presumably callers never pass such lengths - confirm.
	int32_t input_length = static_cast<int32_t>(length);
	// Wrap the caller's buffer rather than copying it up front (copy-on-write).
	// NOTE(review): this relies on ICU's aliasing UnicodeString constructor;
	// see the ICU documentation for the meaning of the leading |false|.
	icu::UnicodeString normalized(false, characters, input_length);
	// In the vast majority of cases, input is already NFC. Run a quick check
	// to avoid normalizing the entire input unnecessarily.
	int32_t normalized_prefix_length =
	normalizer->spanQuickCheckYes(normalized, status);
	if (normalized_prefix_length < input_length) {
	// Split at the end of the already-normalized prefix: keep the prefix in
	// |normalized|, then normalize only the remaining tail and append it.
	icu::UnicodeString un_normalized(normalized, normalized_prefix_length);
	normalized.truncate(normalized_prefix_length);
	normalizer->normalizeSecondAndAppend(normalized, un_normalized, status);
	}
	int32_t buffer_size = normalized.length();
	DCHECK(buffer_size);

	// Copy the normalized text out into the caller-provided vector.
	buffer.resize(static_cast<wtf_size_t>(buffer_size));
	normalized.extract(buffer.data(), buffer_size, status);
	DCHECK(U_SUCCESS(status));
	}

	// This function returns kNotFound if \|first\| and \|second\| contain different
	// Kana letters. If \|first\| and \|second\| contain the same Kana letter then
	// function returns offset in characters from \|first\|.
	// Pointers to both strings increase simultaneously so so it is possible to use
	// one offset value.
	static inline size_t CompareKanaLetterAndComposedVoicedSoundMarks(
	const UChar* first,
	const UChar* first_end,
	const UChar* second,
	const UChar* second_end) {
	const UChar* start = first;
	// Check for differences in the kana letter character itself.
	if (IsSmallKanaLetter(first) != IsSmallKanaLetter(second))
	return kNotFound;
	if (ComposedVoicedSoundMark(first) != ComposedVoicedSoundMark(second))
	return kNotFound;
	++first;
	++second;

	// Check for differences in combining voiced sound marks found after the
	// letter.
	while (true) {
	const bool second_is_not_sound_mark =
	second == second_end \|\| !IsCombiningVoicedSoundMark(*second);
	if (first == first_end \|\| !IsCombiningVoicedSoundMark(*first)) {
	return second_is_not_sound_mark ? first - start : kNotFound;
	}
	if (second_is_not_sound_mark)
	return kNotFound;
	if (first != second)
	return kNotFound;
	++first;
	++second;
	}
	}

	bool CheckOnlyKanaLettersInStrings(const UChar* first_data,
	unsigned first_length,
	const UChar* second_data,
	unsigned second_length) {
	const UChar* a = first_data;
	const UChar* a_end = first_data + first_length;

	const UChar* b = second_data;
	const UChar* b_end = second_data + second_length;
	while (true) {
	// Skip runs of non-kana-letter characters. This is necessary so we can
	// correctly handle strings where the \|firstData\| and \|secondData\| have
	// different-length runs of characters that match, while still double
	// checking the correctness of matches of kana letters with other kana
	// letters.
	while (a != a_end && !IsKanaLetter(*a))
	++a;
	while (b != b_end && !IsKanaLetter(*b))
	++b;

	// If we reached the end of either the target or the match, we should have
	// reached the end of both; both should have the same number of kana
	// letters.
	if (a == a_end \|\| b == b_end) {
	return a == a_end && b == b_end;
	}

	// Check that single Kana letters in \|a\| and \|b\| are the same.
	const size_t offset =
	CompareKanaLetterAndComposedVoicedSoundMarks(a, a_end, b, b_end);
	if (offset == kNotFound)
	return false;

	// Update values of \|a\| and \|b\| after comparing.
	a += offset;
	b += offset;
	}
	}

	bool CheckKanaStringsEqual(const UChar* first_data,
	unsigned first_length,
	const UChar* second_data,
	unsigned second_length) {
	const UChar* a = first_data;
	const UChar* a_end = first_data + first_length;

	const UChar* b = second_data;
	const UChar* b_end = second_data + second_length;
	while (true) {
	// Check for non-kana-letter characters.
	while (a != a_end && !IsKanaLetter(a) && b != b_end && !IsKanaLetter(b)) {
	if (a++ != b++)
	return false;
	}

	// If we reached the end of either the target or the match, we should have
	// reached the end of both; both should have the same number of kana
	// letters.
	if (a == a_end \|\| b == b_end) {
	return a == a_end && b == b_end;
	}

	if (IsKanaLetter(a) != IsKanaLetter(b))
	return false;

	// Check that single Kana letters in \|a\| and \|b\| are the same.
	const size_t offset =
	CompareKanaLetterAndComposedVoicedSoundMarks(a, a_end, b, b_end);
	if (offset == kNotFound)
	return false;

	// Update values of \|a\| and \|b\| after comparing.
	a += offset;
	b += offset;
	}
	}

	} // namespace blink