Igor Sarkisov | 7a4d6f0 | 2020-10-06 04:17:58 -0700 | [diff] [blame] | 1 | /* |
| 2 | ******************************************************************************* |
| 3 | * |
| 4 | * Copyright (C) 2001-2010, International Business Machines |
| 5 | * Corporation and others. All Rights Reserved. |
| 6 | * |
| 7 | ******************************************************************************* |
| 8 | * file name: ucol_tok.cpp |
| 9 | * encoding: US-ASCII |
| 10 | * tab size: 8 (not used) |
| 11 | * indentation:4 |
| 12 | * |
| 13 | * created 02/22/2001 |
| 14 | * created by: Vladimir Weinstein |
| 15 | * |
| 16 | * This module reads a tailoring rule string and produces a list of |
| 17 | * tokens that will be turned into collation elements |
| 18 | * |
| 19 | */ |
| 20 | |
| 21 | #include "unicode/utypes.h" |
| 22 | |
| 23 | #if !UCONFIG_NO_COLLATION |
| 24 | |
| 25 | #include "unicode/ustring.h" |
| 26 | #include "unicode/uchar.h" |
| 27 | #include "unicode/uniset.h" |
| 28 | |
| 29 | #include "ucol_tok.h" |
| 30 | #include "ucol_bld.h" |
| 31 | #include "cmemory.h" |
| 32 | #include "util.h" |
| 33 | |
| 34 | U_CDECL_BEGIN |
| 35 | static int32_t U_CALLCONV |
| 36 | uhash_hashTokens(const UHashTok k) |
| 37 | { |
| 38 | int32_t hash = 0; |
| 39 | //uint32_t key = (uint32_t)k.integer; |
| 40 | UColToken *key = (UColToken *)k.pointer; |
| 41 | if (key != 0) { |
| 42 | //int32_t len = (key & 0xFF000000)>>24; |
| 43 | int32_t len = (key->source & 0xFF000000)>>24; |
| 44 | int32_t inc = ((len - 32) / 32) + 1; |
| 45 | |
| 46 | //const UChar *p = (key & 0x00FFFFFF) + rulesToParse; |
| 47 | const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse; |
| 48 | const UChar *limit = p + len; |
| 49 | |
| 50 | while (p<limit) { |
| 51 | hash = (hash * 37) + *p; |
| 52 | p += inc; |
| 53 | } |
| 54 | } |
| 55 | return hash; |
| 56 | } |
| 57 | |
| 58 | static UBool U_CALLCONV |
| 59 | uhash_compareTokens(const UHashTok key1, const UHashTok key2) |
| 60 | { |
| 61 | //uint32_t p1 = (uint32_t) key1.integer; |
| 62 | //uint32_t p2 = (uint32_t) key2.integer; |
| 63 | UColToken *p1 = (UColToken *)key1.pointer; |
| 64 | UColToken *p2 = (UColToken *)key2.pointer; |
| 65 | const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse; |
| 66 | const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse; |
| 67 | uint32_t s1L = ((p1->source & 0xFF000000) >> 24); |
| 68 | uint32_t s2L = ((p2->source & 0xFF000000) >> 24); |
| 69 | const UChar *end = s1+s1L-1; |
| 70 | |
| 71 | if (p1 == p2) { |
| 72 | return TRUE; |
| 73 | } |
| 74 | if (p1->source == 0 || p2->source == 0) { |
| 75 | return FALSE; |
| 76 | } |
| 77 | if(s1L != s2L) { |
| 78 | return FALSE; |
| 79 | } |
| 80 | if(p1->source == p2->source) { |
| 81 | return TRUE; |
| 82 | } |
| 83 | while((s1 < end) && *s1 == *s2) { |
| 84 | ++s1; |
| 85 | ++s2; |
| 86 | } |
| 87 | if(*s1 == *s2) { |
| 88 | return TRUE; |
| 89 | } else { |
| 90 | return FALSE; |
| 91 | } |
| 92 | } |
| 93 | U_CDECL_END |
| 94 | |
| 95 | /*static inline void U_CALLCONV |
| 96 | uhash_freeBlockWrapper(void *obj) { |
| 97 | uhash_freeBlock(obj); |
| 98 | }*/ |
| 99 | |
| 100 | |
/*
 * CE values recorded for one indirect (symbolic) reset position.
 * setIndirectBoundaries() fills an entry; ucol_tok_doSetTop() reads the
 * start values and treats startContCE == 0 as "no second CE".
 */
typedef struct {
    uint32_t startCE;      /* first CE of the position */
    uint32_t startContCE;  /* second CE of the position; presumably a continuation CE, 0 if none */
    uint32_t limitCE;      /* first CE of the limit, 0 when no limit was supplied */
    uint32_t limitContCE;  /* second CE of the limit, 0 when no limit was supplied */
} indirectBoundaries;
| 107 | |
/* These values are used for finding CE values for indirect positioning.  */
/* Indirect positioning is a mechanism for allowing resets on symbolic    */
/* values. It only works for resets, and you cannot tailor indirect       */
/* names. An indirect name can define either an anchor point or a range.  */
/* An anchor point behaves in exactly the same way as a code point in a   */
/* reset would, except that it cannot be tailored. A range (we currently  */
/* only know of the [top] range) explicitly sets the upper bound for      */
/* generated CEs, thus allowing better control over how many CEs can be   */
/* squeezed into the range without a performance penalty.                 */
/* In that respect, we use [top] for tailoring of locales that use CJK    */
/* characters. Other indirect values are currently a pure convenience:    */
/* they can be used to ensure that the CEs will always be positioned in   */
/* the same place relative to a point with known properties (e.g. first   */
/* primary ignorable).                                                    */
/* The table is indexed by src->parsedToken.indirectIndex and filled at   */
/* run time through setIndirectBoundaries().                              */
static indirectBoundaries ucolIndirectBoundaries[15];
| 123 | /* |
| 124 | static indirectBoundaries ucolIndirectBoundaries[11] = { |
| 125 | { UCOL_RESET_TOP_VALUE, 0, |
| 126 | UCOL_NEXT_TOP_VALUE, 0 }, |
| 127 | { UCOL_FIRST_PRIMARY_IGNORABLE, 0, |
| 128 | 0, 0 }, |
| 129 | { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT, |
| 130 | 0, 0 }, |
| 131 | { UCOL_FIRST_SECONDARY_IGNORABLE, 0, |
| 132 | 0, 0 }, |
| 133 | { UCOL_LAST_SECONDARY_IGNORABLE, 0, |
| 134 | 0, 0 }, |
| 135 | { UCOL_FIRST_TERTIARY_IGNORABLE, 0, |
| 136 | 0, 0 }, |
| 137 | { UCOL_LAST_TERTIARY_IGNORABLE, 0, |
| 138 | 0, 0 }, |
| 139 | { UCOL_FIRST_VARIABLE, 0, |
| 140 | 0, 0 }, |
| 141 | { UCOL_LAST_VARIABLE, 0, |
| 142 | 0, 0 }, |
| 143 | { UCOL_FIRST_NON_VARIABLE, 0, |
| 144 | 0, 0 }, |
| 145 | { UCOL_LAST_NON_VARIABLE, 0, |
| 146 | 0, 0 }, |
| 147 | }; |
| 148 | */ |
| 149 | |
| 150 | static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) { |
| 151 | |
| 152 | // Set values for the top - TODO: once we have values for all the indirects, we are going |
| 153 | // to initalize here. |
| 154 | ucolIndirectBoundaries[indexR].startCE = start[0]; |
| 155 | ucolIndirectBoundaries[indexR].startContCE = start[1]; |
| 156 | if(end) { |
| 157 | ucolIndirectBoundaries[indexR].limitCE = end[0]; |
| 158 | ucolIndirectBoundaries[indexR].limitContCE = end[1]; |
| 159 | } else { |
| 160 | ucolIndirectBoundaries[indexR].limitCE = 0; |
| 161 | ucolIndirectBoundaries[indexR].limitContCE = 0; |
| 162 | } |
| 163 | } |
| 164 | |
| 165 | |
| 166 | static inline |
| 167 | void syntaxError(const UChar* rules, |
| 168 | int32_t pos, |
| 169 | int32_t rulesLen, |
| 170 | UParseError* parseError) |
| 171 | { |
| 172 | parseError->offset = pos; |
| 173 | parseError->line = 0 ; /* we are not using line numbers */ |
| 174 | |
| 175 | // for pre-context |
| 176 | int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1)); |
| 177 | int32_t stop = pos; |
| 178 | |
| 179 | u_memcpy(parseError->preContext,rules+start,stop-start); |
| 180 | //null terminate the buffer |
| 181 | parseError->preContext[stop-start] = 0; |
| 182 | |
| 183 | //for post-context |
| 184 | start = pos+1; |
| 185 | stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) : |
| 186 | rulesLen; |
| 187 | |
| 188 | if(start < stop) { |
| 189 | u_memcpy(parseError->postContext,rules+start,stop-start); |
| 190 | //null terminate the buffer |
| 191 | parseError->postContext[stop-start]= 0; |
| 192 | } else { |
| 193 | parseError->postContext[0] = 0; |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | static |
| 198 | void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) { |
| 199 | switch(attrib) { |
| 200 | case UCOL_HIRAGANA_QUATERNARY_MODE: |
| 201 | opts->hiraganaQ = value; |
| 202 | break; |
| 203 | case UCOL_FRENCH_COLLATION: |
| 204 | opts->frenchCollation = value; |
| 205 | break; |
| 206 | case UCOL_ALTERNATE_HANDLING: |
| 207 | opts->alternateHandling = value; |
| 208 | break; |
| 209 | case UCOL_CASE_FIRST: |
| 210 | opts->caseFirst = value; |
| 211 | break; |
| 212 | case UCOL_CASE_LEVEL: |
| 213 | opts->caseLevel = value; |
| 214 | break; |
| 215 | case UCOL_NORMALIZATION_MODE: |
| 216 | opts->normalizationMode = value; |
| 217 | break; |
| 218 | case UCOL_STRENGTH: |
| 219 | opts->strength = value; |
| 220 | break; |
| 221 | case UCOL_NUMERIC_COLLATION: |
| 222 | opts->numericCollation = value; |
| 223 | break; |
| 224 | case UCOL_ATTRIBUTE_COUNT: |
| 225 | default: |
| 226 | break; |
| 227 | } |
| 228 | } |
| 229 | |
/* Number of entries in rulesOptions (one per option_NN string below). */
#define UTOK_OPTION_COUNT 20

/* Set to TRUE once ucol_uprv_tok_initData() has run U_STRING_INIT on every
   string declared below. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */

/* Suboption (argument) names. Each U_STRING_DECL here is paired with a
   U_STRING_INIT of the same text and length in ucol_uprv_tok_initData(). */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted", 7);

U_STRING_DECL(suboption_02, "lower", 5);
U_STRING_DECL(suboption_03, "upper", 5);
U_STRING_DECL(suboption_04, "off", 3);
U_STRING_DECL(suboption_05, "on", 2);
U_STRING_DECL(suboption_06, "1", 1);
U_STRING_DECL(suboption_07, "2", 1);
U_STRING_DECL(suboption_08, "3", 1);
U_STRING_DECL(suboption_09, "4", 1);
U_STRING_DECL(suboption_10, "I", 1);

U_STRING_DECL(suboption_11, "primary", 7);
U_STRING_DECL(suboption_12, "secondary", 9);
U_STRING_DECL(suboption_13, "tertiary", 8);
U_STRING_DECL(suboption_14, "variable", 8);
U_STRING_DECL(suboption_15, "regular", 7);
U_STRING_DECL(suboption_16, "implicit", 8);
U_STRING_DECL(suboption_17, "trailing", 8);


/* Option names. The numbering follows declaration order only; the order in
   which options are matched is the order of the rulesOptions table. */
U_STRING_DECL(option_00, "undefined", 9);
U_STRING_DECL(option_01, "rearrange", 9);
U_STRING_DECL(option_02, "alternate", 9);
U_STRING_DECL(option_03, "backwards", 9);
U_STRING_DECL(option_04, "variable top", 12);
U_STRING_DECL(option_05, "top", 3);
U_STRING_DECL(option_06, "normalization", 13);
U_STRING_DECL(option_07, "caseLevel", 9);
U_STRING_DECL(option_08, "caseFirst", 9);
U_STRING_DECL(option_09, "scriptOrder", 11);
U_STRING_DECL(option_10, "charsetname", 11);
U_STRING_DECL(option_11, "charset", 7);
U_STRING_DECL(option_12, "before", 6);
U_STRING_DECL(option_13, "hiraganaQ", 9);
U_STRING_DECL(option_14, "strength", 8);
U_STRING_DECL(option_15, "first", 5);
U_STRING_DECL(option_16, "last", 4);
U_STRING_DECL(option_17, "optimize", 8);
U_STRING_DECL(option_18, "suppressContractions", 20);
U_STRING_DECL(option_19, "numericOrdering", 15);
| 278 | |
| 279 | |
| 280 | /* |
| 281 | [last variable] last variable value |
| 282 | [last primary ignorable] largest CE for primary ignorable |
| 283 | [last secondary ignorable] largest CE for secondary ignorable |
| 284 | [last tertiary ignorable] largest CE for tertiary ignorable |
| 285 | [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) |
| 286 | */ |
| 287 | |
| 288 | |
/* Suboption tables: each entry is { name, name length, attribute value }.
   They are attached to options through the rulesOptions table. */

/* arguments of "alternate" */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01, 7, UCOL_SHIFTED}
};

/* arguments of "caseFirst" */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03, 5, UCOL_UPPER_FIRST},
    {suboption_04, 3, UCOL_OFF},
};

/* arguments shared by the on/off options */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* argument of "backwards": only "2" is accepted and it maps to UCOL_ON */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* arguments of "before": "1", "2", "3" */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* arguments of "strength": "1".."4" and "I" */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* arguments of "first" and "last". Every entry carries UCOL_PRIMARY;
   ucol_uprv_tok_readAndSetOption() derives the indirect index from the
   position of the matching entry in this table, not from the value. */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
| 332 | |
/* Indexes into the rulesOptions table; the enumerator order must match the
   order of the entries there. ucol_uprv_tok_readAndSetOption() switches on
   these values. */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* alias of the last option that is stored via ucol_uprv_tok_setOptionInImage() */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    /* OPTION_FIRST and OPTION_LAST must stay adjacent: the indirect index
       computation in ucol_uprv_tok_readAndSetOption() relies on it */
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET
} ;
| 356 | |
/* Option table, indexed by OptionNumber. Each entry is
   { option name, name length, suboption table (or NULL), suboption count,
     collation attribute set by the option }.
   Entries are matched in table order with a prefix comparison, so a name
   that begins with another name must come first ("variable top" before
   "top", "charsetname" before "charset"). */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
    /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
    /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
    /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
    /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
    /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
    /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
    /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
    /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
};
| 379 | |
| 380 | static |
| 381 | int32_t u_strncmpNoCase(const UChar *s1, |
| 382 | const UChar *s2, |
| 383 | int32_t n) |
| 384 | { |
| 385 | if(n > 0) { |
| 386 | int32_t rc; |
| 387 | for(;;) { |
| 388 | rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2); |
| 389 | if(rc != 0 || *s1 == 0 || --n == 0) { |
| 390 | return rc; |
| 391 | } |
| 392 | ++s1; |
| 393 | ++s2; |
| 394 | } |
| 395 | } |
| 396 | return 0; |
| 397 | } |
| 398 | |
| 399 | static |
| 400 | void ucol_uprv_tok_initData() { |
| 401 | if(!didInit) { |
| 402 | U_STRING_INIT(suboption_00, "non-ignorable", 13); |
| 403 | U_STRING_INIT(suboption_01, "shifted", 7); |
| 404 | |
| 405 | U_STRING_INIT(suboption_02, "lower", 5); |
| 406 | U_STRING_INIT(suboption_03, "upper", 5); |
| 407 | U_STRING_INIT(suboption_04, "off", 3); |
| 408 | U_STRING_INIT(suboption_05, "on", 2); |
| 409 | |
| 410 | U_STRING_INIT(suboption_06, "1", 1); |
| 411 | U_STRING_INIT(suboption_07, "2", 1); |
| 412 | U_STRING_INIT(suboption_08, "3", 1); |
| 413 | U_STRING_INIT(suboption_09, "4", 1); |
| 414 | U_STRING_INIT(suboption_10, "I", 1); |
| 415 | |
| 416 | U_STRING_INIT(suboption_11, "primary", 7); |
| 417 | U_STRING_INIT(suboption_12, "secondary", 9); |
| 418 | U_STRING_INIT(suboption_13, "tertiary", 8); |
| 419 | U_STRING_INIT(suboption_14, "variable", 8); |
| 420 | U_STRING_INIT(suboption_15, "regular", 7); |
| 421 | U_STRING_INIT(suboption_16, "implicit", 8); |
| 422 | U_STRING_INIT(suboption_17, "trailing", 8); |
| 423 | |
| 424 | |
| 425 | U_STRING_INIT(option_00, "undefined", 9); |
| 426 | U_STRING_INIT(option_01, "rearrange", 9); |
| 427 | U_STRING_INIT(option_02, "alternate", 9); |
| 428 | U_STRING_INIT(option_03, "backwards", 9); |
| 429 | U_STRING_INIT(option_04, "variable top", 12); |
| 430 | U_STRING_INIT(option_05, "top", 3); |
| 431 | U_STRING_INIT(option_06, "normalization", 13); |
| 432 | U_STRING_INIT(option_07, "caseLevel", 9); |
| 433 | U_STRING_INIT(option_08, "caseFirst", 9); |
| 434 | U_STRING_INIT(option_09, "scriptOrder", 11); |
| 435 | U_STRING_INIT(option_10, "charsetname", 11); |
| 436 | U_STRING_INIT(option_11, "charset", 7); |
| 437 | U_STRING_INIT(option_12, "before", 6); |
| 438 | U_STRING_INIT(option_13, "hiraganaQ", 9); |
| 439 | U_STRING_INIT(option_14, "strength", 8); |
| 440 | U_STRING_INIT(option_15, "first", 5); |
| 441 | U_STRING_INIT(option_16, "last", 4); |
| 442 | U_STRING_INIT(option_17, "optimize", 8); |
| 443 | U_STRING_INIT(option_18, "suppressContractions", 20); |
| 444 | U_STRING_INIT(option_19, "numericOrdering", 15); |
| 445 | didInit = TRUE; |
| 446 | } |
| 447 | } |
| 448 | |
| 449 | |
| 450 | // This function reads basic options to set in the runtime collator |
| 451 | // used by data driven tests. Should not support build time options |
| 452 | U_CAPI const UChar * U_EXPORT2 |
| 453 | ucol_tok_getNextArgument(const UChar *start, const UChar *end, |
| 454 | UColAttribute *attrib, UColAttributeValue *value, |
| 455 | UErrorCode *status) |
| 456 | { |
| 457 | uint32_t i = 0; |
| 458 | int32_t j=0; |
| 459 | UBool foundOption = FALSE; |
| 460 | const UChar *optionArg = NULL; |
| 461 | |
| 462 | ucol_uprv_tok_initData(); |
| 463 | |
| 464 | while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */ |
| 465 | start++; |
| 466 | } |
| 467 | if(start >= end) { |
| 468 | return NULL; |
| 469 | } |
| 470 | /* skip opening '[' */ |
| 471 | if(*start == 0x005b) { |
| 472 | start++; |
| 473 | } else { |
| 474 | *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '[' |
| 475 | return NULL; |
| 476 | } |
| 477 | |
| 478 | while(i < UTOK_OPTION_COUNT) { |
| 479 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { |
| 480 | foundOption = TRUE; |
| 481 | if(end - start > rulesOptions[i].optionLen) { |
| 482 | optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */ |
| 483 | while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */ |
| 484 | optionArg++; |
| 485 | } |
| 486 | } |
| 487 | break; |
| 488 | } |
| 489 | i++; |
| 490 | } |
| 491 | |
| 492 | if(!foundOption) { |
| 493 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 494 | return NULL; |
| 495 | } |
| 496 | |
| 497 | if(optionArg) { |
| 498 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 499 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
| 500 | //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); |
| 501 | *attrib = rulesOptions[i].attr; |
| 502 | *value = rulesOptions[i].subopts[j].attrVal; |
| 503 | optionArg += rulesOptions[i].subopts[j].subLen; |
| 504 | while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */ |
| 505 | optionArg++; |
| 506 | } |
| 507 | if(*optionArg == 0x005d) { |
| 508 | optionArg++; |
| 509 | return optionArg; |
| 510 | } else { |
| 511 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 512 | return NULL; |
| 513 | } |
| 514 | } |
| 515 | } |
| 516 | } |
| 517 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 518 | return NULL; |
| 519 | } |
| 520 | |
| 521 | static |
| 522 | USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) { |
| 523 | while(*start != 0x005b) { /* advance while we find the first '[' */ |
| 524 | start++; |
| 525 | } |
| 526 | // now we need to get a balanced set of '[]'. The problem is that a set can have |
| 527 | // many, and *end point to the first closing '[' |
| 528 | int32_t noOpenBraces = 1; |
| 529 | int32_t current = 1; // skip the opening brace |
| 530 | while(start+current < end && noOpenBraces != 0) { |
| 531 | if(start[current] == 0x005b) { |
| 532 | noOpenBraces++; |
| 533 | } else if(start[current] == 0x005D) { // closing brace |
| 534 | noOpenBraces--; |
| 535 | } |
| 536 | current++; |
| 537 | } |
| 538 | |
| 539 | if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) { |
| 540 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 541 | return NULL; |
| 542 | } |
| 543 | return uset_openPattern(start, current, status); |
| 544 | } |
| 545 | |
| 546 | static |
| 547 | int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) { |
| 548 | int32_t i = 0; |
| 549 | ucol_uprv_tok_initData(); |
| 550 | |
| 551 | while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */ |
| 552 | start++; |
| 553 | } |
| 554 | while(i < UTOK_OPTION_COUNT) { |
| 555 | if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) { |
| 556 | if(end - start > rulesOptions[i].optionLen) { |
| 557 | *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/ |
| 558 | while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */ |
| 559 | (*optionArg)++; |
| 560 | } |
| 561 | } |
| 562 | break; |
| 563 | } |
| 564 | i++; |
| 565 | } |
| 566 | if(i == UTOK_OPTION_COUNT) { |
| 567 | i = -1; // didn't find an option |
| 568 | } |
| 569 | return i; |
| 570 | } |
| 571 | |
| 572 | |
| 573 | // reads and conforms to various options in rules |
| 574 | // end is the position of the first closing ']' |
| 575 | // However, some of the options take an UnicodeSet definition |
| 576 | // which needs to duplicate the closing ']' |
| 577 | // for example: '[copy [\uAC00-\uD7FF]]' |
| 578 | // These options will move end to the second ']' and the |
| 579 | // caller will set the current to it. |
| 580 | static |
| 581 | uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) { |
| 582 | const UChar* start = src->current; |
| 583 | int32_t i = 0; |
| 584 | int32_t j=0; |
| 585 | const UChar *optionArg = NULL; |
| 586 | |
| 587 | uint8_t result = 0; |
| 588 | |
| 589 | start++; /*skip opening '['*/ |
| 590 | i = ucol_uprv_tok_readOption(start, src->end, &optionArg); |
| 591 | if(optionArg) { |
| 592 | src->current = optionArg; |
| 593 | } |
| 594 | |
| 595 | if(i < 0) { |
| 596 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 597 | } else { |
| 598 | int32_t noOpenBraces = 1; |
| 599 | switch(i) { |
| 600 | case OPTION_ALTERNATE_HANDLING: |
| 601 | case OPTION_FRENCH_COLLATION: |
| 602 | case OPTION_CASE_LEVEL: |
| 603 | case OPTION_CASE_FIRST: |
| 604 | case OPTION_NORMALIZATION_MODE: |
| 605 | case OPTION_HIRAGANA_QUATERNARY: |
| 606 | case OPTION_STRENGTH: |
| 607 | case OPTION_NUMERIC_COLLATION: |
| 608 | if(optionArg) { |
| 609 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 610 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
| 611 | ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal); |
| 612 | result = UCOL_TOK_SUCCESS; |
| 613 | } |
| 614 | } |
| 615 | } |
| 616 | if(result == 0) { |
| 617 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 618 | } |
| 619 | break; |
| 620 | case OPTION_VARIABLE_TOP: |
| 621 | result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP; |
| 622 | break; |
| 623 | case OPTION_REARRANGE: |
| 624 | result = UCOL_TOK_SUCCESS; |
| 625 | break; |
| 626 | case OPTION_BEFORE: |
| 627 | if(optionArg) { |
| 628 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 629 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
| 630 | result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1; |
| 631 | } |
| 632 | } |
| 633 | } |
| 634 | if(result == 0) { |
| 635 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 636 | } |
| 637 | break; |
| 638 | case OPTION_TOP: /* we are going to have an array with structures of limit CEs */ |
| 639 | /* index to this array will be src->parsedToken.indirectIndex*/ |
| 640 | src->parsedToken.indirectIndex = 0; |
| 641 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP; |
| 642 | break; |
| 643 | case OPTION_FIRST: |
| 644 | case OPTION_LAST: /* first, last */ |
| 645 | for(j = 0; j<rulesOptions[i].subSize; j++) { |
| 646 | if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) { |
| 647 | // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first |
| 648 | // element of indirect boundaries is reserved for top. |
| 649 | src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2); |
| 650 | result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;; |
| 651 | } |
| 652 | } |
| 653 | if(result == 0) { |
| 654 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 655 | } |
| 656 | break; |
| 657 | case OPTION_OPTIMIZE: |
| 658 | case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization |
| 659 | // we need to move end here |
| 660 | src->current++; // skip opening brace |
| 661 | while(src->current < src->end && noOpenBraces != 0) { |
| 662 | if(*src->current == 0x005b) { |
| 663 | noOpenBraces++; |
| 664 | } else if(*src->current == 0x005D) { // closing brace |
| 665 | noOpenBraces--; |
| 666 | } |
| 667 | src->current++; |
| 668 | } |
| 669 | result = UCOL_TOK_SUCCESS; |
| 670 | break; |
| 671 | default: |
| 672 | *status = U_UNSUPPORTED_ERROR; |
| 673 | break; |
| 674 | } |
| 675 | } |
| 676 | src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current)); |
| 677 | return result; |
| 678 | } |
| 679 | |
| 680 | |
| 681 | inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) { |
| 682 | if (stuff == NULL || len <= 0) { |
| 683 | return; |
| 684 | } |
| 685 | UChar *tempStuff = (UChar *)stuff; |
| 686 | if(src->extraCurrent+len >= src->extraEnd) { |
| 687 | /* reallocate */ |
| 688 | if (stuff >= src->source && stuff <= src->end) { |
| 689 | // Copy stuff to a new buffer if stuff points to an address within |
| 690 | // src->source buffer. |
| 691 | tempStuff = (UChar*)uprv_malloc(len*sizeof(UChar)); |
| 692 | if (tempStuff == NULL) { |
| 693 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 694 | return; |
| 695 | } |
| 696 | uprv_memcpy(tempStuff, stuff, len*sizeof(UChar)); |
| 697 | } |
| 698 | UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar)); |
| 699 | if(newSrc != NULL) { |
| 700 | src->current = newSrc + (src->current - src->source); |
| 701 | src->extraCurrent = newSrc + (src->extraCurrent - src->source); |
| 702 | src->end = newSrc + (src->end - src->source); |
| 703 | src->extraEnd = newSrc + (src->extraEnd-src->source)*2; |
| 704 | src->sourceCurrent = newSrc + (src->sourceCurrent-src->source); |
| 705 | src->source = newSrc; |
| 706 | } else { |
| 707 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 708 | if (tempStuff != stuff) { |
| 709 | uprv_free(tempStuff); |
| 710 | } |
| 711 | return; |
| 712 | } |
| 713 | } |
| 714 | if(len == 1) { |
| 715 | *src->extraCurrent++ = *tempStuff; |
| 716 | } else { |
| 717 | uprv_memcpy(src->extraCurrent, tempStuff, len*sizeof(UChar)); |
| 718 | src->extraCurrent += len; |
| 719 | } |
| 720 | if (tempStuff != stuff) { |
| 721 | uprv_free(tempStuff); |
| 722 | } |
| 723 | } |
| 724 | |
| 725 | inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) { |
| 726 | /* |
| 727 | top = TRUE; |
| 728 | */ |
| 729 | UChar buff[5]; |
| 730 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 731 | buff[0] = 0xFFFE; |
| 732 | buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); |
| 733 | buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); |
| 734 | if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) { |
| 735 | src->parsedToken.charsLen = 3; |
| 736 | ucol_tok_addToExtraCurrent(src, buff, 3, status); |
| 737 | } else { |
| 738 | buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); |
| 739 | buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); |
| 740 | src->parsedToken.charsLen = 5; |
| 741 | ucol_tok_addToExtraCurrent(src, buff, 5, status); |
| 742 | } |
| 743 | return TRUE; |
| 744 | } |
| 745 | |
| 746 | static UBool isCharNewLine(UChar c){ |
| 747 | switch(c){ |
| 748 | case 0x000A: /* LF */ |
| 749 | case 0x000D: /* CR */ |
| 750 | case 0x000C: /* FF */ |
| 751 | case 0x0085: /* NEL */ |
| 752 | case 0x2028: /* LS */ |
| 753 | case 0x2029: /* PS */ |
| 754 | return TRUE; |
| 755 | default: |
| 756 | return FALSE; |
| 757 | } |
| 758 | } |
| 759 | |
| 760 | U_CAPI const UChar* U_EXPORT2 |
| 761 | ucol_tok_parseNextToken(UColTokenParser *src, |
| 762 | UBool startOfRules, |
| 763 | UParseError *parseError, |
| 764 | UErrorCode *status) |
| 765 | { |
| 766 | /* parsing part */ |
| 767 | UBool variableTop = FALSE; |
| 768 | UBool top = FALSE; |
| 769 | UBool inChars = TRUE; |
| 770 | UBool inQuote = FALSE; |
| 771 | UBool wasInQuote = FALSE; |
| 772 | uint8_t before = 0; |
| 773 | UBool isEscaped = FALSE; |
| 774 | // TODO: replace these variables with src->parsedToken counterparts |
| 775 | // no need to use them anymore since we have src->parsedToken. |
| 776 | // Ideally, token parser would be a nice class... Once, when I have |
| 777 | // more time (around 2020 probably). |
| 778 | uint32_t newExtensionLen = 0; |
| 779 | uint32_t extensionOffset = 0; |
| 780 | uint32_t newStrength = UCOL_TOK_UNSET; |
| 781 | UChar buff[10]; |
| 782 | UChar32 codepoint; |
| 783 | |
| 784 | src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; |
| 785 | src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; |
| 786 | src->parsedToken.indirectIndex = 0; |
| 787 | |
| 788 | while (src->current < src->end) { |
| 789 | UChar ch = *(src->current); |
| 790 | |
| 791 | if (inQuote) { |
| 792 | if (ch == 0x0027/*'\''*/) { |
| 793 | inQuote = FALSE; |
| 794 | } else { |
| 795 | if ((src->parsedToken.charsLen == 0) || inChars) { |
| 796 | if(src->parsedToken.charsLen == 0) { |
| 797 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 798 | } |
| 799 | src->parsedToken.charsLen++; |
| 800 | } else { |
| 801 | if(newExtensionLen == 0) { |
| 802 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); |
| 803 | } |
| 804 | newExtensionLen++; |
| 805 | } |
| 806 | } |
| 807 | }else if(isEscaped){ |
| 808 | isEscaped =FALSE; |
| 809 | if (newStrength == UCOL_TOK_UNSET) { |
| 810 | *status = U_INVALID_FORMAT_ERROR; |
| 811 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 812 | return NULL; |
| 813 | // enabling rules to start with non-tokens a < b |
| 814 | // newStrength = UCOL_TOK_RESET; |
| 815 | } |
| 816 | if(ch != 0x0000 && src->current != src->end) { |
| 817 | if (inChars) { |
| 818 | if(src->parsedToken.charsLen == 0) { |
| 819 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); |
| 820 | } |
| 821 | src->parsedToken.charsLen++; |
| 822 | } else { |
| 823 | if(newExtensionLen == 0) { |
| 824 | extensionOffset = (uint32_t)(src->current - src->source); |
| 825 | } |
| 826 | newExtensionLen++; |
| 827 | } |
| 828 | } |
| 829 | }else { |
| 830 | if(!uprv_isRuleWhiteSpace(ch)) { |
| 831 | /* Sets the strength for this entry */ |
| 832 | switch (ch) { |
| 833 | case 0x003D/*'='*/ : |
| 834 | if (newStrength != UCOL_TOK_UNSET) { |
| 835 | goto EndOfLoop; |
| 836 | } |
| 837 | |
| 838 | /* if we start with strength, we'll reset to top */ |
| 839 | if(startOfRules == TRUE) { |
| 840 | src->parsedToken.indirectIndex = 5; |
| 841 | top = ucol_tok_doSetTop(src, status); |
| 842 | newStrength = UCOL_TOK_RESET; |
| 843 | goto EndOfLoop; |
| 844 | } |
| 845 | newStrength = UCOL_IDENTICAL; |
| 846 | if(*(src->current+1) == 0x002A) {/*'*'*/ |
| 847 | src->current++; |
| 848 | src->prevStrength = newStrength; |
| 849 | }else{ |
| 850 | src->prevStrength = UCOL_TOK_UNSET; |
| 851 | } |
| 852 | break; |
| 853 | |
| 854 | case 0x002C/*','*/: |
| 855 | if (newStrength != UCOL_TOK_UNSET) { |
| 856 | goto EndOfLoop; |
| 857 | } |
| 858 | |
| 859 | /* if we start with strength, we'll reset to top */ |
| 860 | if(startOfRules == TRUE) { |
| 861 | src->parsedToken.indirectIndex = 5; |
| 862 | top = ucol_tok_doSetTop(src, status); |
| 863 | newStrength = UCOL_TOK_RESET; |
| 864 | goto EndOfLoop; |
| 865 | } |
| 866 | newStrength = UCOL_TERTIARY; |
| 867 | src->prevStrength = UCOL_TOK_UNSET; |
| 868 | break; |
| 869 | |
| 870 | case 0x003B/*';'*/: |
| 871 | if (newStrength != UCOL_TOK_UNSET) { |
| 872 | goto EndOfLoop; |
| 873 | } |
| 874 | |
| 875 | /* if we start with strength, we'll reset to top */ |
| 876 | if(startOfRules == TRUE) { |
| 877 | src->parsedToken.indirectIndex = 5; |
| 878 | top = ucol_tok_doSetTop(src, status); |
| 879 | newStrength = UCOL_TOK_RESET; |
| 880 | goto EndOfLoop; |
| 881 | } |
| 882 | newStrength = UCOL_SECONDARY; |
| 883 | src->prevStrength = UCOL_TOK_UNSET; |
| 884 | break; |
| 885 | |
| 886 | case 0x003C/*'<'*/: |
| 887 | if (newStrength != UCOL_TOK_UNSET) { |
| 888 | goto EndOfLoop; |
| 889 | } |
| 890 | |
| 891 | /* if we start with strength, we'll reset to top */ |
| 892 | if(startOfRules == TRUE) { |
| 893 | src->parsedToken.indirectIndex = 5; |
| 894 | top = ucol_tok_doSetTop(src, status); |
| 895 | newStrength = UCOL_TOK_RESET; |
| 896 | goto EndOfLoop; |
| 897 | } |
| 898 | /* before this, do a scan to verify whether this is */ |
| 899 | /* another strength */ |
| 900 | if(*(src->current+1) == 0x003C) { |
| 901 | src->current++; |
| 902 | if(*(src->current+1) == 0x003C) { |
| 903 | src->current++; /* three in a row! */ |
| 904 | newStrength = UCOL_TERTIARY; |
| 905 | } else { /* two in a row */ |
| 906 | newStrength = UCOL_SECONDARY; |
| 907 | } |
| 908 | } else { /* just one */ |
| 909 | newStrength = UCOL_PRIMARY; |
| 910 | } |
| 911 | if(*(src->current+1) == 0x002A) {/*'*'*/ |
| 912 | src->current++; |
| 913 | src->prevStrength = newStrength; |
| 914 | }else{ |
| 915 | src->prevStrength = UCOL_TOK_UNSET; |
| 916 | } |
| 917 | break; |
| 918 | |
| 919 | case 0x0026/*'&'*/: |
| 920 | if (newStrength != UCOL_TOK_UNSET) { |
| 921 | /**/ |
| 922 | goto EndOfLoop; |
| 923 | } |
| 924 | |
| 925 | newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ |
| 926 | src->prevStrength = UCOL_TOK_UNSET; |
| 927 | break; |
| 928 | |
| 929 | case 0x005b/*'['*/: |
| 930 | /* options - read an option, analyze it */ |
| 931 | if(u_strchr(src->current, 0x005d /*']'*/) != NULL) { |
| 932 | uint8_t result = ucol_uprv_tok_readAndSetOption(src, status); |
| 933 | if(U_SUCCESS(*status)) { |
| 934 | if(result & UCOL_TOK_TOP) { |
| 935 | if(newStrength == UCOL_TOK_RESET) { |
| 936 | top = ucol_tok_doSetTop(src, status); |
| 937 | if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b' |
| 938 | src->parsedToken.charsLen+=2; |
| 939 | buff[0] = 0x002d; |
| 940 | buff[1] = before; |
| 941 | ucol_tok_addToExtraCurrent(src, buff, 2, status); |
| 942 | } |
| 943 | |
| 944 | src->current++; |
| 945 | goto EndOfLoop; |
| 946 | } else { |
| 947 | *status = U_INVALID_FORMAT_ERROR; |
| 948 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 949 | } |
| 950 | } else if(result & UCOL_TOK_VARIABLE_TOP) { |
| 951 | if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { |
| 952 | variableTop = TRUE; |
| 953 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 954 | src->parsedToken.charsLen = 1; |
| 955 | buff[0] = 0xFFFF; |
| 956 | ucol_tok_addToExtraCurrent(src, buff, 1, status); |
| 957 | src->current++; |
| 958 | goto EndOfLoop; |
| 959 | } else { |
| 960 | *status = U_INVALID_FORMAT_ERROR; |
| 961 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 962 | } |
| 963 | } else if (result & UCOL_TOK_BEFORE){ |
| 964 | if(newStrength == UCOL_TOK_RESET) { |
| 965 | before = result & UCOL_TOK_BEFORE; |
| 966 | } else { |
| 967 | *status = U_INVALID_FORMAT_ERROR; |
| 968 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 969 | |
| 970 | } |
| 971 | } |
| 972 | } else { |
| 973 | *status = U_INVALID_FORMAT_ERROR; |
| 974 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 975 | return NULL; |
| 976 | } |
| 977 | } |
| 978 | break; |
| 979 | case 0x0021/*! skip java thai modifier reordering*/: |
| 980 | break; |
| 981 | case 0x002F/*'/'*/: |
| 982 | wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ |
| 983 | inChars = FALSE; /* we're now processing expansion */ |
| 984 | break; |
| 985 | case 0x005C /* back slash for escaped chars */: |
| 986 | isEscaped = TRUE; |
| 987 | break; |
| 988 | /* found a quote, we're gonna start copying */ |
| 989 | case 0x0027/*'\''*/: |
| 990 | if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ |
| 991 | if(src->prevStrength == UCOL_TOK_UNSET){ |
| 992 | *status = U_INVALID_FORMAT_ERROR; |
| 993 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 994 | return NULL; |
| 995 | // enabling rules to start with a non-token character a < b |
| 996 | // newStrength = UCOL_TOK_RESET; |
| 997 | }else{ |
| 998 | newStrength = src->prevStrength; |
| 999 | } |
| 1000 | } |
| 1001 | |
| 1002 | inQuote = TRUE; |
| 1003 | |
| 1004 | if(inChars) { /* we're doing characters */ |
| 1005 | if(wasInQuote == FALSE) { |
| 1006 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 1007 | } |
| 1008 | if (src->parsedToken.charsLen != 0) { |
| 1009 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
| 1010 | } |
| 1011 | src->parsedToken.charsLen++; |
| 1012 | } else { /* we're doing an expansion */ |
| 1013 | if(wasInQuote == FALSE) { |
| 1014 | extensionOffset = (uint32_t)(src->extraCurrent - src->source); |
| 1015 | } |
| 1016 | if (newExtensionLen != 0) { |
| 1017 | ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status); |
| 1018 | } |
| 1019 | newExtensionLen++; |
| 1020 | } |
| 1021 | |
| 1022 | wasInQuote = TRUE; |
| 1023 | |
| 1024 | ch = *(++(src->current)); |
| 1025 | if(ch == 0x0027) { /* copy the double quote */ |
| 1026 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
| 1027 | inQuote = FALSE; |
| 1028 | } |
| 1029 | break; |
| 1030 | |
| 1031 | /* '@' is french only if the strength is not currently set */ |
| 1032 | /* if it is, it's just a regular character in collation rules */ |
| 1033 | case 0x0040/*'@'*/: |
| 1034 | if (newStrength == UCOL_TOK_UNSET) { |
| 1035 | src->opts->frenchCollation = UCOL_ON; |
| 1036 | break; |
| 1037 | } |
| 1038 | |
| 1039 | case 0x007C /*|*/: /* this means we have actually been reading prefix part */ |
| 1040 | // we want to store read characters to the prefix part and continue reading |
| 1041 | // the characters (proper way would be to restart reading the chars, but in |
| 1042 | // that case we would have to complicate the token hasher, which I do not |
| 1043 | // intend to play with. Instead, we will do prefixes when prefixes are due |
| 1044 | // (before adding the elements). |
| 1045 | src->parsedToken.prefixOffset = src->parsedToken.charsOffset; |
| 1046 | src->parsedToken.prefixLen = src->parsedToken.charsLen; |
| 1047 | |
| 1048 | if(inChars) { /* we're doing characters */ |
| 1049 | if(wasInQuote == FALSE) { |
| 1050 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 1051 | } |
| 1052 | if (src->parsedToken.charsLen != 0) { |
| 1053 | ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status); |
| 1054 | } |
| 1055 | src->parsedToken.charsLen++; |
| 1056 | } |
| 1057 | |
| 1058 | wasInQuote = TRUE; |
| 1059 | |
| 1060 | do { |
| 1061 | ch = *(++(src->current)); |
| 1062 | // skip whitespace between '|' and the character |
| 1063 | } while (uprv_isRuleWhiteSpace(ch)); |
| 1064 | break; |
| 1065 | |
| 1066 | //charsOffset = 0; |
| 1067 | //newCharsLen = 0; |
| 1068 | //break; // We want to store the whole prefix/character sequence. If we break |
| 1069 | // the '|' is going to get lost. |
| 1070 | case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */ |
| 1071 | do { |
| 1072 | ch = *(++(src->current)); |
| 1073 | } while (!isCharNewLine(ch)); |
| 1074 | |
| 1075 | break; |
| 1076 | default: |
| 1077 | if (newStrength == UCOL_TOK_UNSET) { |
| 1078 | if(src->prevStrength == UCOL_TOK_UNSET){ |
| 1079 | *status = U_INVALID_FORMAT_ERROR; |
| 1080 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 1081 | return NULL; |
| 1082 | }else{ |
| 1083 | newStrength = src->prevStrength; |
| 1084 | } |
| 1085 | } |
| 1086 | |
| 1087 | if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { |
| 1088 | *status = U_INVALID_FORMAT_ERROR; |
| 1089 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 1090 | return NULL; |
| 1091 | } |
| 1092 | |
| 1093 | if(ch == 0x0000 && src->current+1 == src->end) { |
| 1094 | break; |
| 1095 | } |
| 1096 | |
| 1097 | if (inChars) { |
| 1098 | if(src->parsedToken.charsLen == 0) { |
| 1099 | src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); |
| 1100 | } |
| 1101 | src->parsedToken.charsLen++; |
| 1102 | if(src->prevStrength != UCOL_TOK_UNSET){ |
| 1103 | U16_NEXT(0, src->current, src->end, codepoint); |
| 1104 | src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1; |
| 1105 | goto EndOfLoop; |
| 1106 | } |
| 1107 | } else { |
| 1108 | if(newExtensionLen == 0) { |
| 1109 | extensionOffset = (uint32_t)(src->current - src->source); |
| 1110 | } |
| 1111 | newExtensionLen++; |
| 1112 | } |
| 1113 | |
| 1114 | break; |
| 1115 | } |
| 1116 | } |
| 1117 | } |
| 1118 | |
| 1119 | if(wasInQuote) { |
| 1120 | if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){ |
| 1121 | src->current++; |
| 1122 | goto EndOfLoop; |
| 1123 | } |
| 1124 | if(ch != 0x27) { |
| 1125 | if(inQuote || !uprv_isRuleWhiteSpace(ch)) { |
| 1126 | ucol_tok_addToExtraCurrent(src, &ch, 1, status); |
| 1127 | } |
| 1128 | } |
| 1129 | } |
| 1130 | |
| 1131 | src->current++; |
| 1132 | } |
| 1133 | |
| 1134 | EndOfLoop: |
| 1135 | wasInQuote = FALSE; |
| 1136 | if (newStrength == UCOL_TOK_UNSET) { |
| 1137 | return NULL; |
| 1138 | } |
| 1139 | |
| 1140 | if (src->parsedToken.charsLen == 0 && top == FALSE) { |
| 1141 | syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); |
| 1142 | *status = U_INVALID_FORMAT_ERROR; |
| 1143 | return NULL; |
| 1144 | } |
| 1145 | |
| 1146 | src->parsedToken.strength = newStrength; |
| 1147 | src->parsedToken.extensionOffset = extensionOffset; |
| 1148 | src->parsedToken.extensionLen = newExtensionLen; |
| 1149 | src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before; |
| 1150 | |
| 1151 | return src->current; |
| 1152 | } |
| 1153 | |
| 1154 | /* |
| 1155 | Processing Description |
| 1156 | 1 Build a ListList. Each list has a header, which contains two lists (positive |
| 1157 | and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and |
| 1158 | reset may be null. |
| 1159 | 2 As you process, you keep a LAST pointer that points to the last token you |
| 1160 | handled. |
| 1161 | */ |
| 1162 | |
| 1163 | static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext, |
| 1164 | UParseError *parseError, UErrorCode *status) |
| 1165 | { |
| 1166 | if(src->resultLen == src->listCapacity) { |
| 1167 | // Unfortunately, this won't work, as we store addresses of lhs in token |
| 1168 | src->listCapacity *= 2; |
| 1169 | src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader)); |
| 1170 | if(src->lh == NULL) { |
| 1171 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 1172 | return NULL; |
| 1173 | } |
| 1174 | } |
| 1175 | /* do the reset thing */ |
| 1176 | UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
| 1177 | /* test for NULL */ |
| 1178 | if (sourceToken == NULL) { |
| 1179 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 1180 | return NULL; |
| 1181 | } |
| 1182 | sourceToken->rulesToParse = src->source; |
| 1183 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
| 1184 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; |
| 1185 | |
| 1186 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); |
| 1187 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); |
| 1188 | |
| 1189 | // keep the flags around so that we know about before |
| 1190 | sourceToken->flags = src->parsedToken.flags; |
| 1191 | |
| 1192 | if(src->parsedToken.prefixOffset != 0) { |
| 1193 | // this is a syntax error |
| 1194 | *status = U_INVALID_FORMAT_ERROR; |
| 1195 | syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError); |
| 1196 | uprv_free(sourceToken); |
| 1197 | return 0; |
| 1198 | } else { |
| 1199 | sourceToken->prefix = 0; |
| 1200 | } |
| 1201 | |
| 1202 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
| 1203 | sourceToken->strength = UCOL_TOK_RESET; |
| 1204 | sourceToken->next = NULL; |
| 1205 | sourceToken->previous = NULL; |
| 1206 | sourceToken->noOfCEs = 0; |
| 1207 | sourceToken->noOfExpCEs = 0; |
| 1208 | sourceToken->listHeader = &src->lh[src->resultLen]; |
| 1209 | |
| 1210 | src->lh[src->resultLen].first = NULL; |
| 1211 | src->lh[src->resultLen].last = NULL; |
| 1212 | src->lh[src->resultLen].first = NULL; |
| 1213 | src->lh[src->resultLen].last = NULL; |
| 1214 | |
| 1215 | src->lh[src->resultLen].reset = sourceToken; |
| 1216 | |
| 1217 | /* |
| 1218 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
| 1219 | First convert all expansions into normal form. Examples: |
| 1220 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |
| 1221 | d * ... into &x * c/y * d * ... |
| 1222 | Note: reset values can never have expansions, although they can cause the |
| 1223 | very next item to have one. They may be contractions, if they are found |
| 1224 | earlier in the list. |
| 1225 | */ |
| 1226 | *expandNext = 0; |
| 1227 | if(expand != NULL) { |
| 1228 | /* check to see if there is an expansion */ |
| 1229 | if(src->parsedToken.charsLen > 1) { |
| 1230 | uint32_t resetCharsOffset; |
| 1231 | resetCharsOffset = (uint32_t)(expand - src->source); |
| 1232 | sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset; |
| 1233 | *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset); |
| 1234 | } |
| 1235 | } |
| 1236 | |
| 1237 | src->resultLen++; |
| 1238 | |
| 1239 | uhash_put(src->tailored, sourceToken, sourceToken, status); |
| 1240 | |
| 1241 | return sourceToken; |
| 1242 | } |
| 1243 | |
| 1244 | static |
| 1245 | inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) { |
| 1246 | if(U_FAILURE(*status)) { |
| 1247 | return NULL; |
| 1248 | } |
| 1249 | /* this is a virgin before - we need to fish the anchor from the UCA */ |
| 1250 | collIterate s; |
| 1251 | uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND; |
| 1252 | uint32_t CE, SecondCE; |
| 1253 | uint32_t invPos; |
| 1254 | if(sourceToken != NULL) { |
| 1255 | uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status); |
| 1256 | } else { |
| 1257 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status); |
| 1258 | } |
| 1259 | if(U_FAILURE(*status)) { |
| 1260 | return NULL; |
| 1261 | } |
| 1262 | |
| 1263 | baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F; |
| 1264 | baseContCE = ucol_getNextCE(src->UCA, &s, status); |
| 1265 | if(baseContCE == UCOL_NO_MORE_CES) { |
| 1266 | baseContCE = 0; |
| 1267 | } |
| 1268 | |
| 1269 | |
| 1270 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
| 1271 | uint32_t ch = 0; |
| 1272 | uint32_t expandNext = 0; |
| 1273 | UColToken key; |
| 1274 | |
| 1275 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
| 1276 | uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16; |
| 1277 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
| 1278 | ch = uprv_uca_getCodePointFromRaw(raw-1); |
| 1279 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); |
| 1280 | CE = primaryCE & UCOL_PRIMARYMASK | 0x0505; |
| 1281 | SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER; |
| 1282 | |
| 1283 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source); |
| 1284 | *src->extraCurrent++ = 0xFFFE; |
| 1285 | *src->extraCurrent++ = (UChar)ch; |
| 1286 | src->parsedToken.charsLen++; |
| 1287 | |
| 1288 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; |
| 1289 | key.rulesToParse = src->source; |
| 1290 | |
| 1291 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
| 1292 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
| 1293 | |
| 1294 | if(sourceToken == NULL) { |
| 1295 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
| 1296 | if(isContinuation(SecondCE)) { |
| 1297 | src->lh[src->resultLen].baseContCE = SecondCE; |
| 1298 | } else { |
| 1299 | src->lh[src->resultLen].baseContCE = 0; |
| 1300 | } |
| 1301 | src->lh[src->resultLen].nextCE = 0; |
| 1302 | src->lh[src->resultLen].nextContCE = 0; |
| 1303 | src->lh[src->resultLen].previousCE = 0; |
| 1304 | src->lh[src->resultLen].previousContCE = 0; |
| 1305 | |
| 1306 | src->lh[src->resultLen].indirect = FALSE; |
| 1307 | |
| 1308 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
| 1309 | } |
| 1310 | |
| 1311 | } else { |
| 1312 | invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); |
| 1313 | |
| 1314 | // we got the previous CE. Now we need to see if the difference between |
| 1315 | // the two CEs is really of the requested strength. |
| 1316 | // if it's a bigger difference (we asked for secondary and got primary), we |
| 1317 | // need to modify the CE. |
| 1318 | if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) { |
| 1319 | // adjust the strength |
| 1320 | // now we are in the situation where our baseCE should actually be modified in |
| 1321 | // order to get the CE in the right position. |
| 1322 | if(strength == UCOL_SECONDARY) { |
| 1323 | CE = baseCE - 0x0200; |
| 1324 | } else { // strength == UCOL_TERTIARY |
| 1325 | CE = baseCE - 0x02; |
| 1326 | } |
| 1327 | if(baseContCE) { |
| 1328 | if(strength == UCOL_SECONDARY) { |
| 1329 | SecondCE = baseContCE - 0x0200; |
| 1330 | } else { // strength == UCOL_TERTIARY |
| 1331 | SecondCE = baseContCE - 0x02; |
| 1332 | } |
| 1333 | } |
| 1334 | } |
| 1335 | |
| 1336 | #if 0 |
| 1337 | // the code below relies on getting a code point from the inverse table, in order to be |
| 1338 | // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: |
| 1339 | // 1. There are many code points that have the same CE |
| 1340 | // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. |
| 1341 | // Also, in case when there is no equivalent strength before an element, we have to actually |
| 1342 | // construct one. For example, &[before 2]a << x won't result in x << a, because the element |
| 1343 | // before a is a primary difference. |
| 1344 | |
| 1345 | //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); |
| 1346 | |
| 1347 | |
| 1348 | ch = CETable[3*invPos+2]; |
| 1349 | |
| 1350 | if((ch & UCOL_INV_SIZEMASK) != 0) { |
| 1351 | uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts); |
| 1352 | uint32_t offset = (ch & UCOL_INV_OFFSETMASK); |
| 1353 | ch = conts[offset]; |
| 1354 | } |
| 1355 | |
| 1356 | *src->extraCurrent++ = (UChar)ch; |
| 1357 | src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1); |
| 1358 | src->parsedToken.charsLen = 1; |
| 1359 | |
| 1360 | // We got an UCA before. However, this might have been tailored. |
| 1361 | // example: |
| 1362 | // &\u30ca = \u306a |
| 1363 | // &[before 3]\u306a<<<\u306a|\u309d |
| 1364 | |
| 1365 | |
| 1366 | // uint32_t key = (*newCharsLen << 24) | *charsOffset; |
| 1367 | key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/; |
| 1368 | key.rulesToParse = src->source; |
| 1369 | |
| 1370 | //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key); |
| 1371 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
| 1372 | #endif |
| 1373 | |
| 1374 | // here is how it should be. The situation such as &[before 1]a < x, should be |
| 1375 | // resolved exactly as if we wrote &a > x. |
| 1376 | // therefore, I don't really care if the UCA value before a has been changed. |
| 1377 | // However, I do care if the strength between my element and the previous element |
| 1378 | // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll |
| 1379 | // have to construct the base CE. |
| 1380 | |
| 1381 | |
| 1382 | |
| 1383 | // if we found a tailored thing, we have to use the UCA value and construct |
| 1384 | // a new reset token with constructed name |
| 1385 | //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
| 1386 | // character to which we want to anchor is already tailored. |
| 1387 | // We need to construct a new token which will be the anchor |
| 1388 | // point |
| 1389 | //*(src->extraCurrent-1) = 0xFFFE; |
| 1390 | //*src->extraCurrent++ = (UChar)ch; |
| 1391 | // grab before |
| 1392 | src->parsedToken.charsOffset -= 10; |
| 1393 | src->parsedToken.charsLen += 10; |
| 1394 | src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
| 1395 | if(isContinuation(SecondCE)) { |
| 1396 | src->lh[src->resultLen].baseContCE = SecondCE; |
| 1397 | } else { |
| 1398 | src->lh[src->resultLen].baseContCE = 0; |
| 1399 | } |
| 1400 | src->lh[src->resultLen].nextCE = 0; |
| 1401 | src->lh[src->resultLen].nextContCE = 0; |
| 1402 | src->lh[src->resultLen].previousCE = 0; |
| 1403 | src->lh[src->resultLen].previousContCE = 0; |
| 1404 | |
| 1405 | src->lh[src->resultLen].indirect = FALSE; |
| 1406 | |
| 1407 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
| 1408 | //} |
| 1409 | } |
| 1410 | |
| 1411 | return sourceToken; |
| 1412 | |
| 1413 | } |
| 1414 | |
| 1415 | uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) { |
| 1416 | UColToken *lastToken = NULL; |
| 1417 | const UChar *parseEnd = NULL; |
| 1418 | uint32_t expandNext = 0; |
| 1419 | UBool variableTop = FALSE; |
| 1420 | UBool top = FALSE; |
| 1421 | uint16_t specs = 0; |
| 1422 | UColTokListHeader *ListList = NULL; |
| 1423 | |
| 1424 | src->parsedToken.strength = UCOL_TOK_UNSET; |
| 1425 | |
| 1426 | ListList = src->lh; |
| 1427 | |
| 1428 | if(U_FAILURE(*status)) { |
| 1429 | return 0; |
| 1430 | } |
| 1431 | |
| 1432 | while(src->current < src->end) { |
| 1433 | src->parsedToken.prefixOffset = 0; |
| 1434 | |
| 1435 | parseEnd = ucol_tok_parseNextToken(src, |
| 1436 | (UBool)(lastToken == NULL), |
| 1437 | parseError, |
| 1438 | status); |
| 1439 | |
| 1440 | specs = src->parsedToken.flags; |
| 1441 | |
| 1442 | |
| 1443 | variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0); |
| 1444 | top = ((specs & UCOL_TOK_TOP) != 0); |
| 1445 | |
| 1446 | if(U_SUCCESS(*status) && parseEnd != NULL) { |
| 1447 | UColToken *sourceToken = NULL; |
| 1448 | //uint32_t key = 0; |
| 1449 | uint32_t lastStrength = UCOL_TOK_UNSET; |
| 1450 | |
| 1451 | if(lastToken != NULL ) { |
| 1452 | lastStrength = lastToken->strength; |
| 1453 | } |
| 1454 | |
| 1455 | //key = newCharsLen << 24 | charsOffset; |
| 1456 | UColToken key; |
| 1457 | key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
| 1458 | key.rulesToParse = src->source; |
| 1459 | |
| 1460 | /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */ |
| 1461 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
| 1462 | |
| 1463 | if(src->parsedToken.strength != UCOL_TOK_RESET) { |
| 1464 | if(lastToken == NULL) { /* this means that rules haven't started properly */ |
| 1465 | *status = U_INVALID_FORMAT_ERROR; |
| 1466 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); |
| 1467 | return 0; |
| 1468 | } |
| 1469 | /* 6 Otherwise (when relation != reset) */ |
| 1470 | if(sourceToken == NULL) { |
| 1471 | /* If sourceToken is null, create new one, */ |
| 1472 | sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken)); |
| 1473 | /* test for NULL */ |
| 1474 | if (sourceToken == NULL) { |
| 1475 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 1476 | return 0; |
| 1477 | } |
| 1478 | sourceToken->rulesToParse = src->source; |
| 1479 | sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset; |
| 1480 | |
| 1481 | sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset); |
| 1482 | |
| 1483 | sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset; |
| 1484 | sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset); |
| 1485 | |
| 1486 | sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */ |
| 1487 | sourceToken->next = NULL; |
| 1488 | sourceToken->previous = NULL; |
| 1489 | sourceToken->noOfCEs = 0; |
| 1490 | sourceToken->noOfExpCEs = 0; |
| 1491 | // keep the flags around so that we know about before |
| 1492 | sourceToken->flags = src->parsedToken.flags; |
| 1493 | uhash_put(src->tailored, sourceToken, sourceToken, status); |
| 1494 | if(U_FAILURE(*status)) { |
| 1495 | return 0; |
| 1496 | } |
| 1497 | } else { |
| 1498 | /* we could have fished out a reset here */ |
| 1499 | if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) { |
| 1500 | /* otherwise remove sourceToken from where it was. */ |
| 1501 | if(sourceToken->next != NULL) { |
| 1502 | if(sourceToken->next->strength > sourceToken->strength) { |
| 1503 | sourceToken->next->strength = sourceToken->strength; |
| 1504 | } |
| 1505 | sourceToken->next->previous = sourceToken->previous; |
| 1506 | } else { |
| 1507 | sourceToken->listHeader->last = sourceToken->previous; |
| 1508 | } |
| 1509 | |
| 1510 | if(sourceToken->previous != NULL) { |
| 1511 | sourceToken->previous->next = sourceToken->next; |
| 1512 | } else { |
| 1513 | sourceToken->listHeader->first = sourceToken->next; |
| 1514 | } |
| 1515 | sourceToken->next = NULL; |
| 1516 | sourceToken->previous = NULL; |
| 1517 | } |
| 1518 | } |
| 1519 | |
| 1520 | sourceToken->strength = src->parsedToken.strength; |
| 1521 | sourceToken->listHeader = lastToken->listHeader; |
| 1522 | |
| 1523 | /* |
| 1524 | 1. Find the strongest strength in each list, and set strongestP and strongestN |
| 1525 | accordingly in the headers. |
| 1526 | */ |
| 1527 | if(lastStrength == UCOL_TOK_RESET |
| 1528 | || sourceToken->listHeader->first == 0) { |
| 1529 | /* If LAST is a reset |
| 1530 | insert sourceToken in the list. */ |
| 1531 | if(sourceToken->listHeader->first == 0) { |
| 1532 | sourceToken->listHeader->first = sourceToken; |
| 1533 | sourceToken->listHeader->last = sourceToken; |
| 1534 | } else { /* we need to find a place for us */ |
| 1535 | /* and we'll get in front of the same strength */ |
| 1536 | if(sourceToken->listHeader->first->strength <= sourceToken->strength) { |
| 1537 | sourceToken->next = sourceToken->listHeader->first; |
| 1538 | sourceToken->next->previous = sourceToken; |
| 1539 | sourceToken->listHeader->first = sourceToken; |
| 1540 | sourceToken->previous = NULL; |
| 1541 | } else { |
| 1542 | lastToken = sourceToken->listHeader->first; |
| 1543 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |
| 1544 | lastToken = lastToken->next; |
| 1545 | } |
| 1546 | if(lastToken->next != NULL) { |
| 1547 | lastToken->next->previous = sourceToken; |
| 1548 | } else { |
| 1549 | sourceToken->listHeader->last = sourceToken; |
| 1550 | } |
| 1551 | sourceToken->previous = lastToken; |
| 1552 | sourceToken->next = lastToken->next; |
| 1553 | lastToken->next = sourceToken; |
| 1554 | } |
| 1555 | } |
| 1556 | } else { |
| 1557 | /* Otherwise (when LAST is not a reset) |
| 1558 | if polarity (LAST) == polarity(relation), insert sourceToken after LAST, |
| 1559 | otherwise insert before. |
| 1560 | when inserting after or before, search to the next position with the same |
| 1561 | strength in that direction. (This is called postpone insertion). */ |
| 1562 | if(sourceToken != lastToken) { |
| 1563 | if(lastToken->polarity == sourceToken->polarity) { |
| 1564 | while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) { |
| 1565 | lastToken = lastToken->next; |
| 1566 | } |
| 1567 | sourceToken->previous = lastToken; |
| 1568 | if(lastToken->next != NULL) { |
| 1569 | lastToken->next->previous = sourceToken; |
| 1570 | } else { |
| 1571 | sourceToken->listHeader->last = sourceToken; |
| 1572 | } |
| 1573 | |
| 1574 | sourceToken->next = lastToken->next; |
| 1575 | lastToken->next = sourceToken; |
| 1576 | } else { |
| 1577 | while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) { |
| 1578 | lastToken = lastToken->previous; |
| 1579 | } |
| 1580 | sourceToken->next = lastToken; |
| 1581 | if(lastToken->previous != NULL) { |
| 1582 | lastToken->previous->next = sourceToken; |
| 1583 | } else { |
| 1584 | sourceToken->listHeader->first = sourceToken; |
| 1585 | } |
| 1586 | sourceToken->previous = lastToken->previous; |
| 1587 | lastToken->previous = sourceToken; |
| 1588 | } |
| 1589 | } else { /* repeated one thing twice in rules, stay with the stronger strength */ |
| 1590 | if(lastStrength < sourceToken->strength) { |
| 1591 | sourceToken->strength = lastStrength; |
| 1592 | } |
| 1593 | } |
| 1594 | } |
| 1595 | |
| 1596 | /* if the token was a variable top, we're gonna put it in */ |
| 1597 | if(variableTop == TRUE && src->varTop == NULL) { |
| 1598 | variableTop = FALSE; |
| 1599 | src->varTop = sourceToken; |
| 1600 | } |
| 1601 | |
| 1602 | // Treat the expansions. |
| 1603 | // There are two types of expansions: explicit (x / y) and reset based propagating expansions |
| 1604 | // (&abc * d * e <=> &ab * d / c * e / c) |
| 1605 | // if both of them are in effect for a token, they are combined. |
| 1606 | |
| 1607 | sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset; |
| 1608 | |
| 1609 | if(expandNext != 0) { |
| 1610 | if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */ |
| 1611 | expandNext = 0; |
| 1612 | } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */ |
| 1613 | sourceToken->expansion = expandNext; |
| 1614 | } else { /* there is both explicit and implicit expansion. We need to make a combination */ |
| 1615 | uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar)); |
| 1616 | uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar)); |
| 1617 | sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source)); |
| 1618 | src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen; |
| 1619 | } |
| 1620 | } |
| 1621 | |
| 1622 | // This is just for debugging purposes |
| 1623 | if(sourceToken->expansion != 0) { |
| 1624 | sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset); |
| 1625 | } else { |
| 1626 | sourceToken->debugExpansion = 0; |
| 1627 | } |
| 1628 | // if the previous token was a reset before, the strength of this |
| 1629 | // token must match the strength of before. Otherwise we have an |
| 1630 | // undefined situation. |
| 1631 | // In other words, we currently have a cludge which we use to |
| 1632 | // represent &a >> x. This is written as &[before 2]a << x. |
| 1633 | if((lastToken->flags & UCOL_TOK_BEFORE) != 0) { |
| 1634 | uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1; |
| 1635 | if(beforeStrength != sourceToken->strength) { |
| 1636 | *status = U_INVALID_FORMAT_ERROR; |
| 1637 | syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError); |
| 1638 | return 0; |
| 1639 | } |
| 1640 | } |
| 1641 | } else { |
| 1642 | if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) { |
| 1643 | /* if the previous token was also a reset, */ |
| 1644 | /*this means that we have two consecutive resets */ |
| 1645 | /* and we want to remove the previous one if empty*/ |
| 1646 | if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |
| 1647 | src->resultLen--; |
| 1648 | } |
| 1649 | } |
| 1650 | |
| 1651 | if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ |
| 1652 | uint32_t searchCharsLen = src->parsedToken.charsLen; |
| 1653 | while(searchCharsLen > 1 && sourceToken == NULL) { |
| 1654 | searchCharsLen--; |
| 1655 | //key = searchCharsLen << 24 | charsOffset; |
| 1656 | UColToken key; |
| 1657 | key.source = searchCharsLen << 24 | src->parsedToken.charsOffset; |
| 1658 | key.rulesToParse = src->source; |
| 1659 | sourceToken = (UColToken *)uhash_get(src->tailored, &key); |
| 1660 | } |
| 1661 | if(sourceToken != NULL) { |
| 1662 | expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen); |
| 1663 | } |
| 1664 | } |
| 1665 | |
| 1666 | if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ |
| 1667 | if(top == FALSE) { /* there is no indirection */ |
| 1668 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |
| 1669 | if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { |
| 1670 | /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ |
| 1671 | while(sourceToken->strength > strength && sourceToken->previous != NULL) { |
| 1672 | sourceToken = sourceToken->previous; |
| 1673 | } |
| 1674 | /* here, either we hit the strength or NULL */ |
| 1675 | if(sourceToken->strength == strength) { |
| 1676 | if(sourceToken->previous != NULL) { |
| 1677 | sourceToken = sourceToken->previous; |
| 1678 | } else { /* start of list */ |
| 1679 | sourceToken = sourceToken->listHeader->reset; |
| 1680 | } |
| 1681 | } else { /* we hit NULL */ |
| 1682 | /* we should be doing the else part */ |
| 1683 | sourceToken = sourceToken->listHeader->reset; |
| 1684 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); |
| 1685 | } |
| 1686 | } else { |
| 1687 | sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status); |
| 1688 | } |
| 1689 | } else { /* this is both before and indirection */ |
| 1690 | top = FALSE; |
| 1691 | ListList[src->resultLen].previousCE = 0; |
| 1692 | ListList[src->resultLen].previousContCE = 0; |
| 1693 | ListList[src->resultLen].indirect = TRUE; |
| 1694 | /* we need to do slightly more work. we need to get the baseCE using the */ |
| 1695 | /* inverse UCA & getPrevious. The next bound is not set, and will be decided */ |
| 1696 | /* in ucol_bld */ |
| 1697 | uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; |
| 1698 | uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; |
| 1699 | uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F; |
| 1700 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |
| 1701 | |
| 1702 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
| 1703 | if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ |
| 1704 | uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16; |
| 1705 | uint32_t raw = uprv_uca_getRawFromImplicit(primary); |
| 1706 | uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1); |
| 1707 | CE = primaryCE & UCOL_PRIMARYMASK | 0x0505; |
| 1708 | SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER; |
| 1709 | } else { |
| 1710 | /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/ |
| 1711 | ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength); |
| 1712 | } |
| 1713 | |
| 1714 | ListList[src->resultLen].baseCE = CE; |
| 1715 | ListList[src->resultLen].baseContCE = SecondCE; |
| 1716 | ListList[src->resultLen].nextCE = 0; |
| 1717 | ListList[src->resultLen].nextContCE = 0; |
| 1718 | |
| 1719 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
| 1720 | } |
| 1721 | } |
| 1722 | |
| 1723 | |
| 1724 | /* 5 If the relation is a reset: |
| 1725 | If sourceToken is null |
| 1726 | Create new list, create new sourceToken, make the baseCE from source, put |
| 1727 | the sourceToken in ListHeader of the new list */ |
| 1728 | if(sourceToken == NULL) { |
| 1729 | /* |
| 1730 | 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ... |
| 1731 | First convert all expansions into normal form. Examples: |
| 1732 | If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * |
| 1733 | d * ... into &x * c/y * d * ... |
| 1734 | Note: reset values can never have expansions, although they can cause the |
| 1735 | very next item to have one. They may be contractions, if they are found |
| 1736 | earlier in the list. |
| 1737 | */ |
| 1738 | if(top == FALSE) { |
| 1739 | collIterate s; |
| 1740 | uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND; |
| 1741 | |
| 1742 | uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status); |
| 1743 | |
| 1744 | CE = ucol_getNextCE(src->UCA, &s, status); |
| 1745 | const UChar *expand = s.pos; |
| 1746 | SecondCE = ucol_getNextCE(src->UCA, &s, status); |
| 1747 | |
| 1748 | ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F; |
| 1749 | if(isContinuation(SecondCE)) { |
| 1750 | ListList[src->resultLen].baseContCE = SecondCE; |
| 1751 | } else { |
| 1752 | ListList[src->resultLen].baseContCE = 0; |
| 1753 | } |
| 1754 | ListList[src->resultLen].nextCE = 0; |
| 1755 | ListList[src->resultLen].nextContCE = 0; |
| 1756 | ListList[src->resultLen].previousCE = 0; |
| 1757 | ListList[src->resultLen].previousContCE = 0; |
| 1758 | ListList[src->resultLen].indirect = FALSE; |
| 1759 | sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); |
| 1760 | } else { /* top == TRUE */ |
| 1761 | /* just use the supplied values */ |
| 1762 | top = FALSE; |
| 1763 | ListList[src->resultLen].previousCE = 0; |
| 1764 | ListList[src->resultLen].previousContCE = 0; |
| 1765 | ListList[src->resultLen].indirect = TRUE; |
| 1766 | ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE; |
| 1767 | ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE; |
| 1768 | ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE; |
| 1769 | ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE; |
| 1770 | |
| 1771 | sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); |
| 1772 | |
| 1773 | } |
| 1774 | } else { /* reset to something already in rules */ |
| 1775 | top = FALSE; |
| 1776 | } |
| 1777 | } |
| 1778 | /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ |
| 1779 | lastToken = sourceToken; |
| 1780 | } else { |
| 1781 | if(U_FAILURE(*status)) { |
| 1782 | return 0; |
| 1783 | } |
| 1784 | } |
| 1785 | } |
| 1786 | |
| 1787 | if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) { |
| 1788 | src->resultLen--; |
| 1789 | } |
| 1790 | return src->resultLen; |
| 1791 | } |
| 1792 | |
| 1793 | void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) { |
| 1794 | U_NAMESPACE_USE |
| 1795 | |
| 1796 | uint32_t nSize = 0; |
| 1797 | uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE); |
| 1798 | if(U_FAILURE(*status)) { |
| 1799 | return; |
| 1800 | } |
| 1801 | |
| 1802 | // set everything to zero, so that we can clean up gracefully |
| 1803 | uprv_memset(src, 0, sizeof(UColTokenParser)); |
| 1804 | |
| 1805 | // first we need to find options that don't like to be normalized, |
| 1806 | // like copy and remove... |
| 1807 | //const UChar *openBrace = rules; |
| 1808 | int32_t optionNumber = -1; |
| 1809 | const UChar *setStart = NULL; |
| 1810 | uint32_t i = 0; |
| 1811 | while(i < rulesLength) { |
| 1812 | if(rules[i] == 0x005B) { |
| 1813 | // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces |
| 1814 | //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart); |
| 1815 | optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart); |
| 1816 | if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */ |
| 1817 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); |
| 1818 | if(U_SUCCESS(*status)) { |
| 1819 | if(src->copySet == NULL) { |
| 1820 | src->copySet = newSet; |
| 1821 | } else { |
| 1822 | uset_addAll(src->copySet, newSet); |
| 1823 | uset_close(newSet); |
| 1824 | } |
| 1825 | } else { |
| 1826 | return; |
| 1827 | } |
| 1828 | } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) { |
| 1829 | USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status); |
| 1830 | if(U_SUCCESS(*status)) { |
| 1831 | if(src->removeSet == NULL) { |
| 1832 | src->removeSet = newSet; |
| 1833 | } else { |
| 1834 | uset_addAll(src->removeSet, newSet); |
| 1835 | uset_close(newSet); |
| 1836 | } |
| 1837 | } else { |
| 1838 | return; |
| 1839 | } |
| 1840 | } |
| 1841 | } |
| 1842 | //openBrace++; |
| 1843 | i++; |
| 1844 | } |
| 1845 | |
| 1846 | src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar)); |
| 1847 | /* test for NULL */ |
| 1848 | if (src->source == NULL) { |
| 1849 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 1850 | return; |
| 1851 | } |
| 1852 | uprv_memset(src->source, 0, estimatedSize*sizeof(UChar)); |
| 1853 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status); |
| 1854 | if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) { |
| 1855 | *status = U_ZERO_ERROR; |
| 1856 | src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); |
| 1857 | /* test for NULL */ |
| 1858 | if (src->source == NULL) { |
| 1859 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 1860 | return; |
| 1861 | } |
| 1862 | nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status); |
| 1863 | } |
| 1864 | src->current = src->source; |
| 1865 | src->end = src->source+nSize; |
| 1866 | src->sourceCurrent = src->source; |
| 1867 | src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly |
| 1868 | src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; |
| 1869 | src->varTop = NULL; |
| 1870 | src->UCA = UCA; |
| 1871 | src->invUCA = ucol_initInverseUCA(status); |
| 1872 | src->parsedToken.charsLen = 0; |
| 1873 | src->parsedToken.charsOffset = 0; |
| 1874 | src->parsedToken.extensionLen = 0; |
| 1875 | src->parsedToken.extensionOffset = 0; |
| 1876 | src->parsedToken.prefixLen = 0; |
| 1877 | src->parsedToken.prefixOffset = 0; |
| 1878 | src->parsedToken.flags = 0; |
| 1879 | src->parsedToken.strength = UCOL_TOK_UNSET; |
| 1880 | src->buildCCTabFlag = FALSE; |
| 1881 | src->prevStrength = UCOL_TOK_UNSET; |
| 1882 | |
| 1883 | if(U_FAILURE(*status)) { |
| 1884 | return; |
| 1885 | } |
| 1886 | src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status); |
| 1887 | if(U_FAILURE(*status)) { |
| 1888 | return; |
| 1889 | } |
| 1890 | uhash_setValueDeleter(src->tailored, uhash_freeBlock); |
| 1891 | |
| 1892 | src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); |
| 1893 | /* test for NULL */ |
| 1894 | if (src->opts == NULL) { |
| 1895 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 1896 | return; |
| 1897 | } |
| 1898 | |
| 1899 | uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet)); |
| 1900 | |
| 1901 | // rulesToParse = src->source; |
| 1902 | src->lh = 0; |
| 1903 | src->listCapacity = 1024; |
| 1904 | src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader)); |
| 1905 | //Test for NULL |
| 1906 | if (src->lh == NULL) { |
| 1907 | *status = U_MEMORY_ALLOCATION_ERROR; |
| 1908 | return; |
| 1909 | } |
| 1910 | uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader)); |
| 1911 | src->resultLen = 0; |
| 1912 | |
| 1913 | UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); |
| 1914 | |
| 1915 | // UCOL_RESET_TOP_VALUE |
| 1916 | setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); |
| 1917 | // UCOL_FIRST_PRIMARY_IGNORABLE |
| 1918 | setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0); |
| 1919 | // UCOL_LAST_PRIMARY_IGNORABLE |
| 1920 | setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0); |
| 1921 | // UCOL_FIRST_SECONDARY_IGNORABLE |
| 1922 | setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0); |
| 1923 | // UCOL_LAST_SECONDARY_IGNORABLE |
| 1924 | setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0); |
| 1925 | // UCOL_FIRST_TERTIARY_IGNORABLE |
| 1926 | setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0); |
| 1927 | // UCOL_LAST_TERTIARY_IGNORABLE |
| 1928 | setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0); |
| 1929 | // UCOL_FIRST_VARIABLE |
| 1930 | setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0); |
| 1931 | // UCOL_LAST_VARIABLE |
| 1932 | setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0); |
| 1933 | // UCOL_FIRST_NON_VARIABLE |
| 1934 | setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0); |
| 1935 | // UCOL_LAST_NON_VARIABLE |
| 1936 | setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT); |
| 1937 | // UCOL_FIRST_IMPLICIT |
| 1938 | setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0); |
| 1939 | // UCOL_LAST_IMPLICIT |
| 1940 | setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING); |
| 1941 | // UCOL_FIRST_TRAILING |
| 1942 | setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0); |
| 1943 | // UCOL_LAST_TRAILING |
| 1944 | setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0); |
| 1945 | ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24); |
| 1946 | } |
| 1947 | |
| 1948 | |
| 1949 | void ucol_tok_closeTokenList(UColTokenParser *src) { |
| 1950 | if(src->copySet != NULL) { |
| 1951 | uset_close(src->copySet); |
| 1952 | } |
| 1953 | if(src->removeSet != NULL) { |
| 1954 | uset_close(src->removeSet); |
| 1955 | } |
| 1956 | if(src->tailored != NULL) { |
| 1957 | uhash_close(src->tailored); |
| 1958 | } |
| 1959 | if(src->lh != NULL) { |
| 1960 | uprv_free(src->lh); |
| 1961 | } |
| 1962 | if(src->source != NULL) { |
| 1963 | uprv_free(src->source); |
| 1964 | } |
| 1965 | if(src->opts != NULL) { |
| 1966 | uprv_free(src->opts); |
| 1967 | } |
| 1968 | } |
| 1969 | |
| 1970 | #endif /* #if !UCONFIG_NO_COLLATION */ |