blob: 06cc48bb250891224d70770c1f105671920f1d18 [file] [log] [blame]
Igor Sarkisov7a4d6f02020-10-06 04:17:58 -07001/*
2*******************************************************************************
3*
4* Copyright (C) 2001-2010, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_tok.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
16* This module reads a tailoring rule string and produces a list of
17* tokens that will be turned into collation elements
18*
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "unicode/ustring.h"
26#include "unicode/uchar.h"
27#include "unicode/uniset.h"
28
29#include "ucol_tok.h"
30#include "ucol_bld.h"
31#include "cmemory.h"
32#include "util.h"
33
34U_CDECL_BEGIN
35static int32_t U_CALLCONV
36uhash_hashTokens(const UHashTok k)
37{
38 int32_t hash = 0;
39 //uint32_t key = (uint32_t)k.integer;
40 UColToken *key = (UColToken *)k.pointer;
41 if (key != 0) {
42 //int32_t len = (key & 0xFF000000)>>24;
43 int32_t len = (key->source & 0xFF000000)>>24;
44 int32_t inc = ((len - 32) / 32) + 1;
45
46 //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
47 const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
48 const UChar *limit = p + len;
49
50 while (p<limit) {
51 hash = (hash * 37) + *p;
52 p += inc;
53 }
54 }
55 return hash;
56}
57
58static UBool U_CALLCONV
59uhash_compareTokens(const UHashTok key1, const UHashTok key2)
60{
61 //uint32_t p1 = (uint32_t) key1.integer;
62 //uint32_t p2 = (uint32_t) key2.integer;
63 UColToken *p1 = (UColToken *)key1.pointer;
64 UColToken *p2 = (UColToken *)key2.pointer;
65 const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
66 const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
67 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
68 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
69 const UChar *end = s1+s1L-1;
70
71 if (p1 == p2) {
72 return TRUE;
73 }
74 if (p1->source == 0 || p2->source == 0) {
75 return FALSE;
76 }
77 if(s1L != s2L) {
78 return FALSE;
79 }
80 if(p1->source == p2->source) {
81 return TRUE;
82 }
83 while((s1 < end) && *s1 == *s2) {
84 ++s1;
85 ++s2;
86 }
87 if(*s1 == *s2) {
88 return TRUE;
89 } else {
90 return FALSE;
91 }
92}
93U_CDECL_END
94
95/*static inline void U_CALLCONV
96uhash_freeBlockWrapper(void *obj) {
97 uhash_freeBlock(obj);
98}*/
99
100
/* One entry of the indirect positioning table: a start CE pair and a limit
 * CE pair. The limit pair is stored as 0 when setIndirectBoundaries() is
 * called without an end value. */
typedef struct {
    uint32_t startCE;      /* start[0] passed to setIndirectBoundaries() */
    uint32_t startContCE;  /* start[1]; 0 means no continuation is emitted by ucol_tok_doSetTop() */
    uint32_t limitCE;      /* end[0], or 0 when no end was given */
    uint32_t limitContCE;  /* end[1], or 0 when no end was given */
} indirectBoundaries;
107
/* These values are used for finding CE values for indirect positioning.  */
/* Indirect positioning is a mechanism for allowing resets on symbolic    */
/* values. It only works for resets, and indirect names cannot be         */
/* tailored. An indirect name can define either an anchor point or a      */
/* range. An anchor point behaves in exactly the same way as a code point */
/* in a reset would, except that it cannot be tailored. A range           */
/* (currently only the [top] range is known) explicitly sets the upper    */
/* bound for generated CEs, thus allowing for better control over how     */
/* many CEs can be squeezed into the range without a performance penalty. */
/* In that respect, we use [top] for tailoring of locales that use CJK    */
/* characters. Other indirect values are currently a pure convenience:    */
/* they can be used to ensure that the CEs will always be positioned in   */
/* the same place relative to a point with known properties (e.g. first   */
/* primary ignorable).                                                    */
static indirectBoundaries ucolIndirectBoundaries[15];
123/*
124static indirectBoundaries ucolIndirectBoundaries[11] = {
125{ UCOL_RESET_TOP_VALUE, 0,
126UCOL_NEXT_TOP_VALUE, 0 },
127{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,
1280, 0 },
129{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
1300, 0 },
131{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,
1320, 0 },
133{ UCOL_LAST_SECONDARY_IGNORABLE, 0,
1340, 0 },
135{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
1360, 0 },
137{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
1380, 0 },
139{ UCOL_FIRST_VARIABLE, 0,
1400, 0 },
141{ UCOL_LAST_VARIABLE, 0,
1420, 0 },
143{ UCOL_FIRST_NON_VARIABLE, 0,
1440, 0 },
145{ UCOL_LAST_NON_VARIABLE, 0,
1460, 0 },
147};
148*/
149
150static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
151
152 // Set values for the top - TODO: once we have values for all the indirects, we are going
153 // to initalize here.
154 ucolIndirectBoundaries[indexR].startCE = start[0];
155 ucolIndirectBoundaries[indexR].startContCE = start[1];
156 if(end) {
157 ucolIndirectBoundaries[indexR].limitCE = end[0];
158 ucolIndirectBoundaries[indexR].limitContCE = end[1];
159 } else {
160 ucolIndirectBoundaries[indexR].limitCE = 0;
161 ucolIndirectBoundaries[indexR].limitContCE = 0;
162 }
163}
164
165
166static inline
167void syntaxError(const UChar* rules,
168 int32_t pos,
169 int32_t rulesLen,
170 UParseError* parseError)
171{
172 parseError->offset = pos;
173 parseError->line = 0 ; /* we are not using line numbers */
174
175 // for pre-context
176 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
177 int32_t stop = pos;
178
179 u_memcpy(parseError->preContext,rules+start,stop-start);
180 //null terminate the buffer
181 parseError->preContext[stop-start] = 0;
182
183 //for post-context
184 start = pos+1;
185 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
186 rulesLen;
187
188 if(start < stop) {
189 u_memcpy(parseError->postContext,rules+start,stop-start);
190 //null terminate the buffer
191 parseError->postContext[stop-start]= 0;
192 } else {
193 parseError->postContext[0] = 0;
194 }
195}
196
197static
198void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
199 switch(attrib) {
200 case UCOL_HIRAGANA_QUATERNARY_MODE:
201 opts->hiraganaQ = value;
202 break;
203 case UCOL_FRENCH_COLLATION:
204 opts->frenchCollation = value;
205 break;
206 case UCOL_ALTERNATE_HANDLING:
207 opts->alternateHandling = value;
208 break;
209 case UCOL_CASE_FIRST:
210 opts->caseFirst = value;
211 break;
212 case UCOL_CASE_LEVEL:
213 opts->caseLevel = value;
214 break;
215 case UCOL_NORMALIZATION_MODE:
216 opts->normalizationMode = value;
217 break;
218 case UCOL_STRENGTH:
219 opts->strength = value;
220 break;
221 case UCOL_NUMERIC_COLLATION:
222 opts->numericCollation = value;
223 break;
224 case UCOL_ATTRIBUTE_COUNT:
225 default:
226 break;
227 }
228}
229
/* Number of entries in rulesOptions[]; also the number of option_NN strings. */
#define UTOK_OPTION_COUNT 20

/* Set to TRUE by ucol_uprv_tok_initData() after the strings declared below
 * have been initialized with U_STRING_INIT. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */
/* Option argument names. Each U_STRING_DECL here has a matching
 * U_STRING_INIT in ucol_uprv_tok_initData(). */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted", 7);

U_STRING_DECL(suboption_02, "lower", 5);
U_STRING_DECL(suboption_03, "upper", 5);
U_STRING_DECL(suboption_04, "off", 3);
U_STRING_DECL(suboption_05, "on", 2);
U_STRING_DECL(suboption_06, "1", 1);
U_STRING_DECL(suboption_07, "2", 1);
U_STRING_DECL(suboption_08, "3", 1);
U_STRING_DECL(suboption_09, "4", 1);
U_STRING_DECL(suboption_10, "I", 1);

U_STRING_DECL(suboption_11, "primary", 7);
U_STRING_DECL(suboption_12, "secondary", 9);
U_STRING_DECL(suboption_13, "tertiary", 8);
U_STRING_DECL(suboption_14, "variable", 8);
U_STRING_DECL(suboption_15, "regular", 7);
U_STRING_DECL(suboption_16, "implicit", 8);
U_STRING_DECL(suboption_17, "trailing", 8);


/* Option names; referenced from the rulesOptions[] table. */
U_STRING_DECL(option_00, "undefined", 9);
U_STRING_DECL(option_01, "rearrange", 9);
U_STRING_DECL(option_02, "alternate", 9);
U_STRING_DECL(option_03, "backwards", 9);
U_STRING_DECL(option_04, "variable top", 12);
U_STRING_DECL(option_05, "top", 3);
U_STRING_DECL(option_06, "normalization", 13);
U_STRING_DECL(option_07, "caseLevel", 9);
U_STRING_DECL(option_08, "caseFirst", 9);
U_STRING_DECL(option_09, "scriptOrder", 11);
U_STRING_DECL(option_10, "charsetname", 11);
U_STRING_DECL(option_11, "charset", 7);
U_STRING_DECL(option_12, "before", 6);
U_STRING_DECL(option_13, "hiraganaQ", 9);
U_STRING_DECL(option_14, "strength", 8);
U_STRING_DECL(option_15, "first", 5);
U_STRING_DECL(option_16, "last", 4);
U_STRING_DECL(option_17, "optimize", 8);
U_STRING_DECL(option_18, "suppressContractions", 20);
U_STRING_DECL(option_19, "numericOrdering", 15);
278
279
280/*
281[last variable] last variable value
282[last primary ignorable] largest CE for primary ignorable
283[last secondary ignorable] largest CE for secondary ignorable
284[last tertiary ignorable] largest CE for tertiary ignorable
285[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
286*/
287
288
/* Argument tables for the options in rulesOptions[]. Each entry is
 * { argument name, length of the name, attribute value }. */

/* arguments of "alternate" */
static const ucolTokSuboption alternateSub[2] = {
    {suboption_00, 13, UCOL_NON_IGNORABLE},
    {suboption_01, 7, UCOL_SHIFTED}
};

/* arguments of "caseFirst" */
static const ucolTokSuboption caseFirstSub[3] = {
    {suboption_02, 5, UCOL_LOWER_FIRST},
    {suboption_03, 5, UCOL_UPPER_FIRST},
    {suboption_04, 3, UCOL_OFF},
};

/* shared by the options that only take "off" or "on" */
static const ucolTokSuboption onOffSub[2] = {
    {suboption_04, 3, UCOL_OFF},
    {suboption_05, 2, UCOL_ON}
};

/* "backwards" only accepts the argument "2", which maps to UCOL_ON */
static const ucolTokSuboption frenchSub[1] = {
    {suboption_07, 1, UCOL_ON}
};

/* arguments of "before": "1", "2", "3" */
static const ucolTokSuboption beforeSub[3] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY}
};

/* arguments of "strength": "1", "2", "3", "4", "I" */
static const ucolTokSuboption strengthSub[5] = {
    {suboption_06, 1, UCOL_PRIMARY},
    {suboption_07, 1, UCOL_SECONDARY},
    {suboption_08, 1, UCOL_TERTIARY},
    {suboption_09, 1, UCOL_QUATERNARY},
    {suboption_10, 1, UCOL_IDENTICAL},
};

/* Arguments of "first" and "last". ucol_uprv_tok_readAndSetOption() uses the
 * position of the matched entry in this table (not the attribute value) to
 * compute the indirect boundary index, so the order of the entries matters. */
static const ucolTokSuboption firstLastSub[7] = {
    {suboption_11, 7, UCOL_PRIMARY},
    {suboption_12, 9, UCOL_PRIMARY},
    {suboption_13, 8, UCOL_PRIMARY},
    {suboption_14, 8, UCOL_PRIMARY},
    {suboption_15, 7, UCOL_PRIMARY},
    {suboption_16, 8, UCOL_PRIMARY},
    {suboption_17, 8, UCOL_PRIMARY},
};
332
/* Indices into rulesOptions[]; the enumerators must stay in the same order
 * as the entries of that table. ucol_uprv_tok_readAndSetOption() switches on
 * these values. */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    /* alias of the last option that is handled through an attribute value table */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET
} ;
356
/* Table of the options recognized inside '[...]' in collation rules.
 * Each entry is { option name, length of the name, table of accepted
 * arguments (or NULL), number of arguments, attribute set by the option }.
 * Options without a directly corresponding attribute use
 * UCOL_ATTRIBUTE_COUNT. Names are matched as prefixes in table order, so
 * "charsetname" must come before "charset". */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
    /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
    /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
    /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
    /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
    /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
    /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
    /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
    /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
    /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
    /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
    /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
    /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
    /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
    /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
    /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
    /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
    /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
    /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
    /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
    /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
};
379
380static
381int32_t u_strncmpNoCase(const UChar *s1,
382 const UChar *s2,
383 int32_t n)
384{
385 if(n > 0) {
386 int32_t rc;
387 for(;;) {
388 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
389 if(rc != 0 || *s1 == 0 || --n == 0) {
390 return rc;
391 }
392 ++s1;
393 ++s2;
394 }
395 }
396 return 0;
397}
398
399static
400void ucol_uprv_tok_initData() {
401 if(!didInit) {
402 U_STRING_INIT(suboption_00, "non-ignorable", 13);
403 U_STRING_INIT(suboption_01, "shifted", 7);
404
405 U_STRING_INIT(suboption_02, "lower", 5);
406 U_STRING_INIT(suboption_03, "upper", 5);
407 U_STRING_INIT(suboption_04, "off", 3);
408 U_STRING_INIT(suboption_05, "on", 2);
409
410 U_STRING_INIT(suboption_06, "1", 1);
411 U_STRING_INIT(suboption_07, "2", 1);
412 U_STRING_INIT(suboption_08, "3", 1);
413 U_STRING_INIT(suboption_09, "4", 1);
414 U_STRING_INIT(suboption_10, "I", 1);
415
416 U_STRING_INIT(suboption_11, "primary", 7);
417 U_STRING_INIT(suboption_12, "secondary", 9);
418 U_STRING_INIT(suboption_13, "tertiary", 8);
419 U_STRING_INIT(suboption_14, "variable", 8);
420 U_STRING_INIT(suboption_15, "regular", 7);
421 U_STRING_INIT(suboption_16, "implicit", 8);
422 U_STRING_INIT(suboption_17, "trailing", 8);
423
424
425 U_STRING_INIT(option_00, "undefined", 9);
426 U_STRING_INIT(option_01, "rearrange", 9);
427 U_STRING_INIT(option_02, "alternate", 9);
428 U_STRING_INIT(option_03, "backwards", 9);
429 U_STRING_INIT(option_04, "variable top", 12);
430 U_STRING_INIT(option_05, "top", 3);
431 U_STRING_INIT(option_06, "normalization", 13);
432 U_STRING_INIT(option_07, "caseLevel", 9);
433 U_STRING_INIT(option_08, "caseFirst", 9);
434 U_STRING_INIT(option_09, "scriptOrder", 11);
435 U_STRING_INIT(option_10, "charsetname", 11);
436 U_STRING_INIT(option_11, "charset", 7);
437 U_STRING_INIT(option_12, "before", 6);
438 U_STRING_INIT(option_13, "hiraganaQ", 9);
439 U_STRING_INIT(option_14, "strength", 8);
440 U_STRING_INIT(option_15, "first", 5);
441 U_STRING_INIT(option_16, "last", 4);
442 U_STRING_INIT(option_17, "optimize", 8);
443 U_STRING_INIT(option_18, "suppressContractions", 20);
444 U_STRING_INIT(option_19, "numericOrdering", 15);
445 didInit = TRUE;
446 }
447}
448
449
450// This function reads basic options to set in the runtime collator
451// used by data driven tests. Should not support build time options
452U_CAPI const UChar * U_EXPORT2
453ucol_tok_getNextArgument(const UChar *start, const UChar *end,
454 UColAttribute *attrib, UColAttributeValue *value,
455 UErrorCode *status)
456{
457 uint32_t i = 0;
458 int32_t j=0;
459 UBool foundOption = FALSE;
460 const UChar *optionArg = NULL;
461
462 ucol_uprv_tok_initData();
463
464 while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */
465 start++;
466 }
467 if(start >= end) {
468 return NULL;
469 }
470 /* skip opening '[' */
471 if(*start == 0x005b) {
472 start++;
473 } else {
474 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
475 return NULL;
476 }
477
478 while(i < UTOK_OPTION_COUNT) {
479 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
480 foundOption = TRUE;
481 if(end - start > rulesOptions[i].optionLen) {
482 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
483 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
484 optionArg++;
485 }
486 }
487 break;
488 }
489 i++;
490 }
491
492 if(!foundOption) {
493 *status = U_ILLEGAL_ARGUMENT_ERROR;
494 return NULL;
495 }
496
497 if(optionArg) {
498 for(j = 0; j<rulesOptions[i].subSize; j++) {
499 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
500 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
501 *attrib = rulesOptions[i].attr;
502 *value = rulesOptions[i].subopts[j].attrVal;
503 optionArg += rulesOptions[i].subopts[j].subLen;
504 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
505 optionArg++;
506 }
507 if(*optionArg == 0x005d) {
508 optionArg++;
509 return optionArg;
510 } else {
511 *status = U_ILLEGAL_ARGUMENT_ERROR;
512 return NULL;
513 }
514 }
515 }
516 }
517 *status = U_ILLEGAL_ARGUMENT_ERROR;
518 return NULL;
519}
520
521static
522USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
523 while(*start != 0x005b) { /* advance while we find the first '[' */
524 start++;
525 }
526 // now we need to get a balanced set of '[]'. The problem is that a set can have
527 // many, and *end point to the first closing '['
528 int32_t noOpenBraces = 1;
529 int32_t current = 1; // skip the opening brace
530 while(start+current < end && noOpenBraces != 0) {
531 if(start[current] == 0x005b) {
532 noOpenBraces++;
533 } else if(start[current] == 0x005D) { // closing brace
534 noOpenBraces--;
535 }
536 current++;
537 }
538
539 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
540 *status = U_ILLEGAL_ARGUMENT_ERROR;
541 return NULL;
542 }
543 return uset_openPattern(start, current, status);
544}
545
546static
547int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
548 int32_t i = 0;
549 ucol_uprv_tok_initData();
550
551 while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */
552 start++;
553 }
554 while(i < UTOK_OPTION_COUNT) {
555 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
556 if(end - start > rulesOptions[i].optionLen) {
557 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
558 while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */
559 (*optionArg)++;
560 }
561 }
562 break;
563 }
564 i++;
565 }
566 if(i == UTOK_OPTION_COUNT) {
567 i = -1; // didn't find an option
568 }
569 return i;
570}
571
572
573// reads and conforms to various options in rules
574// end is the position of the first closing ']'
575// However, some of the options take an UnicodeSet definition
576// which needs to duplicate the closing ']'
577// for example: '[copy [\uAC00-\uD7FF]]'
578// These options will move end to the second ']' and the
579// caller will set the current to it.
580static
581uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
582 const UChar* start = src->current;
583 int32_t i = 0;
584 int32_t j=0;
585 const UChar *optionArg = NULL;
586
587 uint8_t result = 0;
588
589 start++; /*skip opening '['*/
590 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
591 if(optionArg) {
592 src->current = optionArg;
593 }
594
595 if(i < 0) {
596 *status = U_ILLEGAL_ARGUMENT_ERROR;
597 } else {
598 int32_t noOpenBraces = 1;
599 switch(i) {
600 case OPTION_ALTERNATE_HANDLING:
601 case OPTION_FRENCH_COLLATION:
602 case OPTION_CASE_LEVEL:
603 case OPTION_CASE_FIRST:
604 case OPTION_NORMALIZATION_MODE:
605 case OPTION_HIRAGANA_QUATERNARY:
606 case OPTION_STRENGTH:
607 case OPTION_NUMERIC_COLLATION:
608 if(optionArg) {
609 for(j = 0; j<rulesOptions[i].subSize; j++) {
610 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
611 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
612 result = UCOL_TOK_SUCCESS;
613 }
614 }
615 }
616 if(result == 0) {
617 *status = U_ILLEGAL_ARGUMENT_ERROR;
618 }
619 break;
620 case OPTION_VARIABLE_TOP:
621 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
622 break;
623 case OPTION_REARRANGE:
624 result = UCOL_TOK_SUCCESS;
625 break;
626 case OPTION_BEFORE:
627 if(optionArg) {
628 for(j = 0; j<rulesOptions[i].subSize; j++) {
629 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
630 result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
631 }
632 }
633 }
634 if(result == 0) {
635 *status = U_ILLEGAL_ARGUMENT_ERROR;
636 }
637 break;
638 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
639 /* index to this array will be src->parsedToken.indirectIndex*/
640 src->parsedToken.indirectIndex = 0;
641 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
642 break;
643 case OPTION_FIRST:
644 case OPTION_LAST: /* first, last */
645 for(j = 0; j<rulesOptions[i].subSize; j++) {
646 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
647 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
648 // element of indirect boundaries is reserved for top.
649 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
650 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
651 }
652 }
653 if(result == 0) {
654 *status = U_ILLEGAL_ARGUMENT_ERROR;
655 }
656 break;
657 case OPTION_OPTIMIZE:
658 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
659 // we need to move end here
660 src->current++; // skip opening brace
661 while(src->current < src->end && noOpenBraces != 0) {
662 if(*src->current == 0x005b) {
663 noOpenBraces++;
664 } else if(*src->current == 0x005D) { // closing brace
665 noOpenBraces--;
666 }
667 src->current++;
668 }
669 result = UCOL_TOK_SUCCESS;
670 break;
671 default:
672 *status = U_UNSUPPORTED_ERROR;
673 break;
674 }
675 }
676 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
677 return result;
678}
679
680
681inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
682 if (stuff == NULL || len <= 0) {
683 return;
684 }
685 UChar *tempStuff = (UChar *)stuff;
686 if(src->extraCurrent+len >= src->extraEnd) {
687 /* reallocate */
688 if (stuff >= src->source && stuff <= src->end) {
689 // Copy stuff to a new buffer if stuff points to an address within
690 // src->source buffer.
691 tempStuff = (UChar*)uprv_malloc(len*sizeof(UChar));
692 if (tempStuff == NULL) {
693 *status = U_MEMORY_ALLOCATION_ERROR;
694 return;
695 }
696 uprv_memcpy(tempStuff, stuff, len*sizeof(UChar));
697 }
698 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
699 if(newSrc != NULL) {
700 src->current = newSrc + (src->current - src->source);
701 src->extraCurrent = newSrc + (src->extraCurrent - src->source);
702 src->end = newSrc + (src->end - src->source);
703 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
704 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
705 src->source = newSrc;
706 } else {
707 *status = U_MEMORY_ALLOCATION_ERROR;
708 if (tempStuff != stuff) {
709 uprv_free(tempStuff);
710 }
711 return;
712 }
713 }
714 if(len == 1) {
715 *src->extraCurrent++ = *tempStuff;
716 } else {
717 uprv_memcpy(src->extraCurrent, tempStuff, len*sizeof(UChar));
718 src->extraCurrent += len;
719 }
720 if (tempStuff != stuff) {
721 uprv_free(tempStuff);
722 }
723}
724
725inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
726 /*
727 top = TRUE;
728 */
729 UChar buff[5];
730 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
731 buff[0] = 0xFFFE;
732 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
733 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
734 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
735 src->parsedToken.charsLen = 3;
736 ucol_tok_addToExtraCurrent(src, buff, 3, status);
737 } else {
738 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
739 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
740 src->parsedToken.charsLen = 5;
741 ucol_tok_addToExtraCurrent(src, buff, 5, status);
742 }
743 return TRUE;
744}
745
746static UBool isCharNewLine(UChar c){
747 switch(c){
748 case 0x000A: /* LF */
749 case 0x000D: /* CR */
750 case 0x000C: /* FF */
751 case 0x0085: /* NEL */
752 case 0x2028: /* LS */
753 case 0x2029: /* PS */
754 return TRUE;
755 default:
756 return FALSE;
757 }
758}
759
760U_CAPI const UChar* U_EXPORT2
761ucol_tok_parseNextToken(UColTokenParser *src,
762 UBool startOfRules,
763 UParseError *parseError,
764 UErrorCode *status)
765{
766 /* parsing part */
767 UBool variableTop = FALSE;
768 UBool top = FALSE;
769 UBool inChars = TRUE;
770 UBool inQuote = FALSE;
771 UBool wasInQuote = FALSE;
772 uint8_t before = 0;
773 UBool isEscaped = FALSE;
774 // TODO: replace these variables with src->parsedToken counterparts
775 // no need to use them anymore since we have src->parsedToken.
776 // Ideally, token parser would be a nice class... Once, when I have
777 // more time (around 2020 probably).
778 uint32_t newExtensionLen = 0;
779 uint32_t extensionOffset = 0;
780 uint32_t newStrength = UCOL_TOK_UNSET;
781 UChar buff[10];
782 UChar32 codepoint;
783
784 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
785 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
786 src->parsedToken.indirectIndex = 0;
787
788 while (src->current < src->end) {
789 UChar ch = *(src->current);
790
791 if (inQuote) {
792 if (ch == 0x0027/*'\''*/) {
793 inQuote = FALSE;
794 } else {
795 if ((src->parsedToken.charsLen == 0) || inChars) {
796 if(src->parsedToken.charsLen == 0) {
797 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
798 }
799 src->parsedToken.charsLen++;
800 } else {
801 if(newExtensionLen == 0) {
802 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
803 }
804 newExtensionLen++;
805 }
806 }
807 }else if(isEscaped){
808 isEscaped =FALSE;
809 if (newStrength == UCOL_TOK_UNSET) {
810 *status = U_INVALID_FORMAT_ERROR;
811 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
812 return NULL;
813 // enabling rules to start with non-tokens a < b
814 // newStrength = UCOL_TOK_RESET;
815 }
816 if(ch != 0x0000 && src->current != src->end) {
817 if (inChars) {
818 if(src->parsedToken.charsLen == 0) {
819 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
820 }
821 src->parsedToken.charsLen++;
822 } else {
823 if(newExtensionLen == 0) {
824 extensionOffset = (uint32_t)(src->current - src->source);
825 }
826 newExtensionLen++;
827 }
828 }
829 }else {
830 if(!uprv_isRuleWhiteSpace(ch)) {
831 /* Sets the strength for this entry */
832 switch (ch) {
833 case 0x003D/*'='*/ :
834 if (newStrength != UCOL_TOK_UNSET) {
835 goto EndOfLoop;
836 }
837
838 /* if we start with strength, we'll reset to top */
839 if(startOfRules == TRUE) {
840 src->parsedToken.indirectIndex = 5;
841 top = ucol_tok_doSetTop(src, status);
842 newStrength = UCOL_TOK_RESET;
843 goto EndOfLoop;
844 }
845 newStrength = UCOL_IDENTICAL;
846 if(*(src->current+1) == 0x002A) {/*'*'*/
847 src->current++;
848 src->prevStrength = newStrength;
849 }else{
850 src->prevStrength = UCOL_TOK_UNSET;
851 }
852 break;
853
854 case 0x002C/*','*/:
855 if (newStrength != UCOL_TOK_UNSET) {
856 goto EndOfLoop;
857 }
858
859 /* if we start with strength, we'll reset to top */
860 if(startOfRules == TRUE) {
861 src->parsedToken.indirectIndex = 5;
862 top = ucol_tok_doSetTop(src, status);
863 newStrength = UCOL_TOK_RESET;
864 goto EndOfLoop;
865 }
866 newStrength = UCOL_TERTIARY;
867 src->prevStrength = UCOL_TOK_UNSET;
868 break;
869
870 case 0x003B/*';'*/:
871 if (newStrength != UCOL_TOK_UNSET) {
872 goto EndOfLoop;
873 }
874
875 /* if we start with strength, we'll reset to top */
876 if(startOfRules == TRUE) {
877 src->parsedToken.indirectIndex = 5;
878 top = ucol_tok_doSetTop(src, status);
879 newStrength = UCOL_TOK_RESET;
880 goto EndOfLoop;
881 }
882 newStrength = UCOL_SECONDARY;
883 src->prevStrength = UCOL_TOK_UNSET;
884 break;
885
886 case 0x003C/*'<'*/:
887 if (newStrength != UCOL_TOK_UNSET) {
888 goto EndOfLoop;
889 }
890
891 /* if we start with strength, we'll reset to top */
892 if(startOfRules == TRUE) {
893 src->parsedToken.indirectIndex = 5;
894 top = ucol_tok_doSetTop(src, status);
895 newStrength = UCOL_TOK_RESET;
896 goto EndOfLoop;
897 }
898 /* before this, do a scan to verify whether this is */
899 /* another strength */
900 if(*(src->current+1) == 0x003C) {
901 src->current++;
902 if(*(src->current+1) == 0x003C) {
903 src->current++; /* three in a row! */
904 newStrength = UCOL_TERTIARY;
905 } else { /* two in a row */
906 newStrength = UCOL_SECONDARY;
907 }
908 } else { /* just one */
909 newStrength = UCOL_PRIMARY;
910 }
911 if(*(src->current+1) == 0x002A) {/*'*'*/
912 src->current++;
913 src->prevStrength = newStrength;
914 }else{
915 src->prevStrength = UCOL_TOK_UNSET;
916 }
917 break;
918
919 case 0x0026/*'&'*/:
920 if (newStrength != UCOL_TOK_UNSET) {
921 /**/
922 goto EndOfLoop;
923 }
924
925 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
926 src->prevStrength = UCOL_TOK_UNSET;
927 break;
928
929 case 0x005b/*'['*/:
930 /* options - read an option, analyze it */
931 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
932 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
933 if(U_SUCCESS(*status)) {
934 if(result & UCOL_TOK_TOP) {
935 if(newStrength == UCOL_TOK_RESET) {
936 top = ucol_tok_doSetTop(src, status);
937 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
938 src->parsedToken.charsLen+=2;
939 buff[0] = 0x002d;
940 buff[1] = before;
941 ucol_tok_addToExtraCurrent(src, buff, 2, status);
942 }
943
944 src->current++;
945 goto EndOfLoop;
946 } else {
947 *status = U_INVALID_FORMAT_ERROR;
948 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
949 }
950 } else if(result & UCOL_TOK_VARIABLE_TOP) {
951 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
952 variableTop = TRUE;
953 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
954 src->parsedToken.charsLen = 1;
955 buff[0] = 0xFFFF;
956 ucol_tok_addToExtraCurrent(src, buff, 1, status);
957 src->current++;
958 goto EndOfLoop;
959 } else {
960 *status = U_INVALID_FORMAT_ERROR;
961 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
962 }
963 } else if (result & UCOL_TOK_BEFORE){
964 if(newStrength == UCOL_TOK_RESET) {
965 before = result & UCOL_TOK_BEFORE;
966 } else {
967 *status = U_INVALID_FORMAT_ERROR;
968 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
969
970 }
971 }
972 } else {
973 *status = U_INVALID_FORMAT_ERROR;
974 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
975 return NULL;
976 }
977 }
978 break;
979 case 0x0021/*! skip java thai modifier reordering*/:
980 break;
981 case 0x002F/*'/'*/:
982 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
983 inChars = FALSE; /* we're now processing expansion */
984 break;
985 case 0x005C /* back slash for escaped chars */:
986 isEscaped = TRUE;
987 break;
988 /* found a quote, we're gonna start copying */
989 case 0x0027/*'\''*/:
990 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
991 if(src->prevStrength == UCOL_TOK_UNSET){
992 *status = U_INVALID_FORMAT_ERROR;
993 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
994 return NULL;
995 // enabling rules to start with a non-token character a < b
996 // newStrength = UCOL_TOK_RESET;
997 }else{
998 newStrength = src->prevStrength;
999 }
1000 }
1001
1002 inQuote = TRUE;
1003
1004 if(inChars) { /* we're doing characters */
1005 if(wasInQuote == FALSE) {
1006 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1007 }
1008 if (src->parsedToken.charsLen != 0) {
1009 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1010 }
1011 src->parsedToken.charsLen++;
1012 } else { /* we're doing an expansion */
1013 if(wasInQuote == FALSE) {
1014 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1015 }
1016 if (newExtensionLen != 0) {
1017 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
1018 }
1019 newExtensionLen++;
1020 }
1021
1022 wasInQuote = TRUE;
1023
1024 ch = *(++(src->current));
1025 if(ch == 0x0027) { /* copy the double quote */
1026 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1027 inQuote = FALSE;
1028 }
1029 break;
1030
1031 /* '@' is french only if the strength is not currently set */
1032 /* if it is, it's just a regular character in collation rules */
1033 case 0x0040/*'@'*/:
1034 if (newStrength == UCOL_TOK_UNSET) {
1035 src->opts->frenchCollation = UCOL_ON;
1036 break;
1037 }
1038
1039 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1040 // we want to store read characters to the prefix part and continue reading
1041 // the characters (proper way would be to restart reading the chars, but in
1042 // that case we would have to complicate the token hasher, which I do not
1043 // intend to play with. Instead, we will do prefixes when prefixes are due
1044 // (before adding the elements).
1045 src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1046 src->parsedToken.prefixLen = src->parsedToken.charsLen;
1047
1048 if(inChars) { /* we're doing characters */
1049 if(wasInQuote == FALSE) {
1050 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1051 }
1052 if (src->parsedToken.charsLen != 0) {
1053 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1054 }
1055 src->parsedToken.charsLen++;
1056 }
1057
1058 wasInQuote = TRUE;
1059
1060 do {
1061 ch = *(++(src->current));
1062 // skip whitespace between '|' and the character
1063 } while (uprv_isRuleWhiteSpace(ch));
1064 break;
1065
1066 //charsOffset = 0;
1067 //newCharsLen = 0;
1068 //break; // We want to store the whole prefix/character sequence. If we break
1069 // the '|' is going to get lost.
1070 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1071 do {
1072 ch = *(++(src->current));
1073 } while (!isCharNewLine(ch));
1074
1075 break;
1076 default:
1077 if (newStrength == UCOL_TOK_UNSET) {
1078 if(src->prevStrength == UCOL_TOK_UNSET){
1079 *status = U_INVALID_FORMAT_ERROR;
1080 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1081 return NULL;
1082 }else{
1083 newStrength = src->prevStrength;
1084 }
1085 }
1086
1087 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1088 *status = U_INVALID_FORMAT_ERROR;
1089 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1090 return NULL;
1091 }
1092
1093 if(ch == 0x0000 && src->current+1 == src->end) {
1094 break;
1095 }
1096
1097 if (inChars) {
1098 if(src->parsedToken.charsLen == 0) {
1099 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1100 }
1101 src->parsedToken.charsLen++;
1102 if(src->prevStrength != UCOL_TOK_UNSET){
1103 U16_NEXT(0, src->current, src->end, codepoint);
1104 src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1;
1105 goto EndOfLoop;
1106 }
1107 } else {
1108 if(newExtensionLen == 0) {
1109 extensionOffset = (uint32_t)(src->current - src->source);
1110 }
1111 newExtensionLen++;
1112 }
1113
1114 break;
1115 }
1116 }
1117 }
1118
1119 if(wasInQuote) {
1120 if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){
1121 src->current++;
1122 goto EndOfLoop;
1123 }
1124 if(ch != 0x27) {
1125 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1126 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1127 }
1128 }
1129 }
1130
1131 src->current++;
1132 }
1133
1134EndOfLoop:
1135 wasInQuote = FALSE;
1136 if (newStrength == UCOL_TOK_UNSET) {
1137 return NULL;
1138 }
1139
1140 if (src->parsedToken.charsLen == 0 && top == FALSE) {
1141 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1142 *status = U_INVALID_FORMAT_ERROR;
1143 return NULL;
1144 }
1145
1146 src->parsedToken.strength = newStrength;
1147 src->parsedToken.extensionOffset = extensionOffset;
1148 src->parsedToken.extensionLen = newExtensionLen;
1149 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1150
1151 return src->current;
1152}
1153
1154/*
1155Processing Description
11561 Build a ListList. Each list has a header, which contains two lists (positive
1157and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1158reset may be null.
11592 As you process, you keep a LAST pointer that points to the last token you
1160handled.
1161*/
1162
1163static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
1164 UParseError *parseError, UErrorCode *status)
1165{
1166 if(src->resultLen == src->listCapacity) {
1167 // Unfortunately, this won't work, as we store addresses of lhs in token
1168 src->listCapacity *= 2;
1169 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1170 if(src->lh == NULL) {
1171 *status = U_MEMORY_ALLOCATION_ERROR;
1172 return NULL;
1173 }
1174 }
1175 /* do the reset thing */
1176 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1177 /* test for NULL */
1178 if (sourceToken == NULL) {
1179 *status = U_MEMORY_ALLOCATION_ERROR;
1180 return NULL;
1181 }
1182 sourceToken->rulesToParse = src->source;
1183 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1184 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1185
1186 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1187 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1188
1189 // keep the flags around so that we know about before
1190 sourceToken->flags = src->parsedToken.flags;
1191
1192 if(src->parsedToken.prefixOffset != 0) {
1193 // this is a syntax error
1194 *status = U_INVALID_FORMAT_ERROR;
1195 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1196 uprv_free(sourceToken);
1197 return 0;
1198 } else {
1199 sourceToken->prefix = 0;
1200 }
1201
1202 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1203 sourceToken->strength = UCOL_TOK_RESET;
1204 sourceToken->next = NULL;
1205 sourceToken->previous = NULL;
1206 sourceToken->noOfCEs = 0;
1207 sourceToken->noOfExpCEs = 0;
1208 sourceToken->listHeader = &src->lh[src->resultLen];
1209
1210 src->lh[src->resultLen].first = NULL;
1211 src->lh[src->resultLen].last = NULL;
1212 src->lh[src->resultLen].first = NULL;
1213 src->lh[src->resultLen].last = NULL;
1214
1215 src->lh[src->resultLen].reset = sourceToken;
1216
1217 /*
1218 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1219 First convert all expansions into normal form. Examples:
1220 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1221 d * ... into &x * c/y * d * ...
1222 Note: reset values can never have expansions, although they can cause the
1223 very next item to have one. They may be contractions, if they are found
1224 earlier in the list.
1225 */
1226 *expandNext = 0;
1227 if(expand != NULL) {
1228 /* check to see if there is an expansion */
1229 if(src->parsedToken.charsLen > 1) {
1230 uint32_t resetCharsOffset;
1231 resetCharsOffset = (uint32_t)(expand - src->source);
1232 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1233 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1234 }
1235 }
1236
1237 src->resultLen++;
1238
1239 uhash_put(src->tailored, sourceToken, sourceToken, status);
1240
1241 return sourceToken;
1242}
1243
1244static
1245inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1246 if(U_FAILURE(*status)) {
1247 return NULL;
1248 }
1249 /* this is a virgin before - we need to fish the anchor from the UCA */
1250 collIterate s;
1251 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1252 uint32_t CE, SecondCE;
1253 uint32_t invPos;
1254 if(sourceToken != NULL) {
1255 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
1256 } else {
1257 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
1258 }
1259 if(U_FAILURE(*status)) {
1260 return NULL;
1261 }
1262
1263 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1264 baseContCE = ucol_getNextCE(src->UCA, &s, status);
1265 if(baseContCE == UCOL_NO_MORE_CES) {
1266 baseContCE = 0;
1267 }
1268
1269
1270 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1271 uint32_t ch = 0;
1272 uint32_t expandNext = 0;
1273 UColToken key;
1274
1275 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1276 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1277 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1278 ch = uprv_uca_getCodePointFromRaw(raw-1);
1279 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1280 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1281 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1282
1283 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1284 *src->extraCurrent++ = 0xFFFE;
1285 *src->extraCurrent++ = (UChar)ch;
1286 src->parsedToken.charsLen++;
1287
1288 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1289 key.rulesToParse = src->source;
1290
1291 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1292 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1293
1294 if(sourceToken == NULL) {
1295 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1296 if(isContinuation(SecondCE)) {
1297 src->lh[src->resultLen].baseContCE = SecondCE;
1298 } else {
1299 src->lh[src->resultLen].baseContCE = 0;
1300 }
1301 src->lh[src->resultLen].nextCE = 0;
1302 src->lh[src->resultLen].nextContCE = 0;
1303 src->lh[src->resultLen].previousCE = 0;
1304 src->lh[src->resultLen].previousContCE = 0;
1305
1306 src->lh[src->resultLen].indirect = FALSE;
1307
1308 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1309 }
1310
1311 } else {
1312 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1313
1314 // we got the previous CE. Now we need to see if the difference between
1315 // the two CEs is really of the requested strength.
1316 // if it's a bigger difference (we asked for secondary and got primary), we
1317 // need to modify the CE.
1318 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1319 // adjust the strength
1320 // now we are in the situation where our baseCE should actually be modified in
1321 // order to get the CE in the right position.
1322 if(strength == UCOL_SECONDARY) {
1323 CE = baseCE - 0x0200;
1324 } else { // strength == UCOL_TERTIARY
1325 CE = baseCE - 0x02;
1326 }
1327 if(baseContCE) {
1328 if(strength == UCOL_SECONDARY) {
1329 SecondCE = baseContCE - 0x0200;
1330 } else { // strength == UCOL_TERTIARY
1331 SecondCE = baseContCE - 0x02;
1332 }
1333 }
1334 }
1335
1336#if 0
1337 // the code below relies on getting a code point from the inverse table, in order to be
1338 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1339 // 1. There are many code points that have the same CE
1340 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1341 // Also, in case when there is no equivalent strength before an element, we have to actually
1342 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1343 // before a is a primary difference.
1344
1345 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1346
1347
1348 ch = CETable[3*invPos+2];
1349
1350 if((ch & UCOL_INV_SIZEMASK) != 0) {
1351 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1352 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1353 ch = conts[offset];
1354 }
1355
1356 *src->extraCurrent++ = (UChar)ch;
1357 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1358 src->parsedToken.charsLen = 1;
1359
1360 // We got an UCA before. However, this might have been tailored.
1361 // example:
1362 // &\u30ca = \u306a
1363 // &[before 3]\u306a<<<\u306a|\u309d
1364
1365
1366 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1367 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1368 key.rulesToParse = src->source;
1369
1370 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1371 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1372#endif
1373
1374 // here is how it should be. The situation such as &[before 1]a < x, should be
1375 // resolved exactly as if we wrote &a > x.
1376 // therefore, I don't really care if the UCA value before a has been changed.
1377 // However, I do care if the strength between my element and the previous element
1378 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1379 // have to construct the base CE.
1380
1381
1382
1383 // if we found a tailored thing, we have to use the UCA value and construct
1384 // a new reset token with constructed name
1385 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1386 // character to which we want to anchor is already tailored.
1387 // We need to construct a new token which will be the anchor
1388 // point
1389 //*(src->extraCurrent-1) = 0xFFFE;
1390 //*src->extraCurrent++ = (UChar)ch;
1391 // grab before
1392 src->parsedToken.charsOffset -= 10;
1393 src->parsedToken.charsLen += 10;
1394 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1395 if(isContinuation(SecondCE)) {
1396 src->lh[src->resultLen].baseContCE = SecondCE;
1397 } else {
1398 src->lh[src->resultLen].baseContCE = 0;
1399 }
1400 src->lh[src->resultLen].nextCE = 0;
1401 src->lh[src->resultLen].nextContCE = 0;
1402 src->lh[src->resultLen].previousCE = 0;
1403 src->lh[src->resultLen].previousContCE = 0;
1404
1405 src->lh[src->resultLen].indirect = FALSE;
1406
1407 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1408 //}
1409 }
1410
1411 return sourceToken;
1412
1413}
1414
1415uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1416 UColToken *lastToken = NULL;
1417 const UChar *parseEnd = NULL;
1418 uint32_t expandNext = 0;
1419 UBool variableTop = FALSE;
1420 UBool top = FALSE;
1421 uint16_t specs = 0;
1422 UColTokListHeader *ListList = NULL;
1423
1424 src->parsedToken.strength = UCOL_TOK_UNSET;
1425
1426 ListList = src->lh;
1427
1428 if(U_FAILURE(*status)) {
1429 return 0;
1430 }
1431
1432 while(src->current < src->end) {
1433 src->parsedToken.prefixOffset = 0;
1434
1435 parseEnd = ucol_tok_parseNextToken(src,
1436 (UBool)(lastToken == NULL),
1437 parseError,
1438 status);
1439
1440 specs = src->parsedToken.flags;
1441
1442
1443 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1444 top = ((specs & UCOL_TOK_TOP) != 0);
1445
1446 if(U_SUCCESS(*status) && parseEnd != NULL) {
1447 UColToken *sourceToken = NULL;
1448 //uint32_t key = 0;
1449 uint32_t lastStrength = UCOL_TOK_UNSET;
1450
1451 if(lastToken != NULL ) {
1452 lastStrength = lastToken->strength;
1453 }
1454
1455 //key = newCharsLen << 24 | charsOffset;
1456 UColToken key;
1457 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1458 key.rulesToParse = src->source;
1459
1460 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1461 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1462
1463 if(src->parsedToken.strength != UCOL_TOK_RESET) {
1464 if(lastToken == NULL) { /* this means that rules haven't started properly */
1465 *status = U_INVALID_FORMAT_ERROR;
1466 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1467 return 0;
1468 }
1469 /* 6 Otherwise (when relation != reset) */
1470 if(sourceToken == NULL) {
1471 /* If sourceToken is null, create new one, */
1472 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1473 /* test for NULL */
1474 if (sourceToken == NULL) {
1475 *status = U_MEMORY_ALLOCATION_ERROR;
1476 return 0;
1477 }
1478 sourceToken->rulesToParse = src->source;
1479 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1480
1481 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1482
1483 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1484 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1485
1486 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1487 sourceToken->next = NULL;
1488 sourceToken->previous = NULL;
1489 sourceToken->noOfCEs = 0;
1490 sourceToken->noOfExpCEs = 0;
1491 // keep the flags around so that we know about before
1492 sourceToken->flags = src->parsedToken.flags;
1493 uhash_put(src->tailored, sourceToken, sourceToken, status);
1494 if(U_FAILURE(*status)) {
1495 return 0;
1496 }
1497 } else {
1498 /* we could have fished out a reset here */
1499 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1500 /* otherwise remove sourceToken from where it was. */
1501 if(sourceToken->next != NULL) {
1502 if(sourceToken->next->strength > sourceToken->strength) {
1503 sourceToken->next->strength = sourceToken->strength;
1504 }
1505 sourceToken->next->previous = sourceToken->previous;
1506 } else {
1507 sourceToken->listHeader->last = sourceToken->previous;
1508 }
1509
1510 if(sourceToken->previous != NULL) {
1511 sourceToken->previous->next = sourceToken->next;
1512 } else {
1513 sourceToken->listHeader->first = sourceToken->next;
1514 }
1515 sourceToken->next = NULL;
1516 sourceToken->previous = NULL;
1517 }
1518 }
1519
1520 sourceToken->strength = src->parsedToken.strength;
1521 sourceToken->listHeader = lastToken->listHeader;
1522
1523 /*
1524 1. Find the strongest strength in each list, and set strongestP and strongestN
1525 accordingly in the headers.
1526 */
1527 if(lastStrength == UCOL_TOK_RESET
1528 || sourceToken->listHeader->first == 0) {
1529 /* If LAST is a reset
1530 insert sourceToken in the list. */
1531 if(sourceToken->listHeader->first == 0) {
1532 sourceToken->listHeader->first = sourceToken;
1533 sourceToken->listHeader->last = sourceToken;
1534 } else { /* we need to find a place for us */
1535 /* and we'll get in front of the same strength */
1536 if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1537 sourceToken->next = sourceToken->listHeader->first;
1538 sourceToken->next->previous = sourceToken;
1539 sourceToken->listHeader->first = sourceToken;
1540 sourceToken->previous = NULL;
1541 } else {
1542 lastToken = sourceToken->listHeader->first;
1543 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1544 lastToken = lastToken->next;
1545 }
1546 if(lastToken->next != NULL) {
1547 lastToken->next->previous = sourceToken;
1548 } else {
1549 sourceToken->listHeader->last = sourceToken;
1550 }
1551 sourceToken->previous = lastToken;
1552 sourceToken->next = lastToken->next;
1553 lastToken->next = sourceToken;
1554 }
1555 }
1556 } else {
1557 /* Otherwise (when LAST is not a reset)
1558 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1559 otherwise insert before.
1560 when inserting after or before, search to the next position with the same
1561 strength in that direction. (This is called postpone insertion). */
1562 if(sourceToken != lastToken) {
1563 if(lastToken->polarity == sourceToken->polarity) {
1564 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1565 lastToken = lastToken->next;
1566 }
1567 sourceToken->previous = lastToken;
1568 if(lastToken->next != NULL) {
1569 lastToken->next->previous = sourceToken;
1570 } else {
1571 sourceToken->listHeader->last = sourceToken;
1572 }
1573
1574 sourceToken->next = lastToken->next;
1575 lastToken->next = sourceToken;
1576 } else {
1577 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1578 lastToken = lastToken->previous;
1579 }
1580 sourceToken->next = lastToken;
1581 if(lastToken->previous != NULL) {
1582 lastToken->previous->next = sourceToken;
1583 } else {
1584 sourceToken->listHeader->first = sourceToken;
1585 }
1586 sourceToken->previous = lastToken->previous;
1587 lastToken->previous = sourceToken;
1588 }
1589 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1590 if(lastStrength < sourceToken->strength) {
1591 sourceToken->strength = lastStrength;
1592 }
1593 }
1594 }
1595
1596 /* if the token was a variable top, we're gonna put it in */
1597 if(variableTop == TRUE && src->varTop == NULL) {
1598 variableTop = FALSE;
1599 src->varTop = sourceToken;
1600 }
1601
1602 // Treat the expansions.
1603 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1604 // (&abc * d * e <=> &ab * d / c * e / c)
1605 // if both of them are in effect for a token, they are combined.
1606
1607 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1608
1609 if(expandNext != 0) {
1610 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1611 expandNext = 0;
1612 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1613 sourceToken->expansion = expandNext;
1614 } else { /* there is both explicit and implicit expansion. We need to make a combination */
1615 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1616 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1617 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1618 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1619 }
1620 }
1621
1622 // This is just for debugging purposes
1623 if(sourceToken->expansion != 0) {
1624 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1625 } else {
1626 sourceToken->debugExpansion = 0;
1627 }
1628 // if the previous token was a reset before, the strength of this
1629 // token must match the strength of before. Otherwise we have an
1630 // undefined situation.
1631 // In other words, we currently have a cludge which we use to
1632 // represent &a >> x. This is written as &[before 2]a << x.
1633 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1634 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1635 if(beforeStrength != sourceToken->strength) {
1636 *status = U_INVALID_FORMAT_ERROR;
1637 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1638 return 0;
1639 }
1640 }
1641 } else {
1642 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1643 /* if the previous token was also a reset, */
1644 /*this means that we have two consecutive resets */
1645 /* and we want to remove the previous one if empty*/
1646 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1647 src->resultLen--;
1648 }
1649 }
1650
1651 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1652 uint32_t searchCharsLen = src->parsedToken.charsLen;
1653 while(searchCharsLen > 1 && sourceToken == NULL) {
1654 searchCharsLen--;
1655 //key = searchCharsLen << 24 | charsOffset;
1656 UColToken key;
1657 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1658 key.rulesToParse = src->source;
1659 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1660 }
1661 if(sourceToken != NULL) {
1662 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1663 }
1664 }
1665
1666 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1667 if(top == FALSE) { /* there is no indirection */
1668 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1669 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1670 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1671 while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1672 sourceToken = sourceToken->previous;
1673 }
1674 /* here, either we hit the strength or NULL */
1675 if(sourceToken->strength == strength) {
1676 if(sourceToken->previous != NULL) {
1677 sourceToken = sourceToken->previous;
1678 } else { /* start of list */
1679 sourceToken = sourceToken->listHeader->reset;
1680 }
1681 } else { /* we hit NULL */
1682 /* we should be doing the else part */
1683 sourceToken = sourceToken->listHeader->reset;
1684 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1685 }
1686 } else {
1687 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1688 }
1689 } else { /* this is both before and indirection */
1690 top = FALSE;
1691 ListList[src->resultLen].previousCE = 0;
1692 ListList[src->resultLen].previousContCE = 0;
1693 ListList[src->resultLen].indirect = TRUE;
1694 /* we need to do slightly more work. we need to get the baseCE using the */
1695 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1696 /* in ucol_bld */
1697 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1698 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1699 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1700 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1701
1702 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1703 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1704 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1705 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1706 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1707 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1708 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1709 } else {
1710 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1711 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1712 }
1713
1714 ListList[src->resultLen].baseCE = CE;
1715 ListList[src->resultLen].baseContCE = SecondCE;
1716 ListList[src->resultLen].nextCE = 0;
1717 ListList[src->resultLen].nextContCE = 0;
1718
1719 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1720 }
1721 }
1722
1723
1724 /* 5 If the relation is a reset:
1725 If sourceToken is null
1726 Create new list, create new sourceToken, make the baseCE from source, put
1727 the sourceToken in ListHeader of the new list */
1728 if(sourceToken == NULL) {
1729 /*
1730 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1731 First convert all expansions into normal form. Examples:
1732 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1733 d * ... into &x * c/y * d * ...
1734 Note: reset values can never have expansions, although they can cause the
1735 very next item to have one. They may be contractions, if they are found
1736 earlier in the list.
1737 */
1738 if(top == FALSE) {
1739 collIterate s;
1740 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1741
1742 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
1743
1744 CE = ucol_getNextCE(src->UCA, &s, status);
1745 const UChar *expand = s.pos;
1746 SecondCE = ucol_getNextCE(src->UCA, &s, status);
1747
1748 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1749 if(isContinuation(SecondCE)) {
1750 ListList[src->resultLen].baseContCE = SecondCE;
1751 } else {
1752 ListList[src->resultLen].baseContCE = 0;
1753 }
1754 ListList[src->resultLen].nextCE = 0;
1755 ListList[src->resultLen].nextContCE = 0;
1756 ListList[src->resultLen].previousCE = 0;
1757 ListList[src->resultLen].previousContCE = 0;
1758 ListList[src->resultLen].indirect = FALSE;
1759 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1760 } else { /* top == TRUE */
1761 /* just use the supplied values */
1762 top = FALSE;
1763 ListList[src->resultLen].previousCE = 0;
1764 ListList[src->resultLen].previousContCE = 0;
1765 ListList[src->resultLen].indirect = TRUE;
1766 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1767 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1768 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1769 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1770
1771 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1772
1773 }
1774 } else { /* reset to something already in rules */
1775 top = FALSE;
1776 }
1777 }
1778 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
1779 lastToken = sourceToken;
1780 } else {
1781 if(U_FAILURE(*status)) {
1782 return 0;
1783 }
1784 }
1785 }
1786
1787 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1788 src->resultLen--;
1789 }
1790 return src->resultLen;
1791}
1792
1793void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
1794 U_NAMESPACE_USE
1795
1796 uint32_t nSize = 0;
1797 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1798 if(U_FAILURE(*status)) {
1799 return;
1800 }
1801
1802 // set everything to zero, so that we can clean up gracefully
1803 uprv_memset(src, 0, sizeof(UColTokenParser));
1804
1805 // first we need to find options that don't like to be normalized,
1806 // like copy and remove...
1807 //const UChar *openBrace = rules;
1808 int32_t optionNumber = -1;
1809 const UChar *setStart = NULL;
1810 uint32_t i = 0;
1811 while(i < rulesLength) {
1812 if(rules[i] == 0x005B) {
1813 // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1814 //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1815 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1816 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1817 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1818 if(U_SUCCESS(*status)) {
1819 if(src->copySet == NULL) {
1820 src->copySet = newSet;
1821 } else {
1822 uset_addAll(src->copySet, newSet);
1823 uset_close(newSet);
1824 }
1825 } else {
1826 return;
1827 }
1828 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1829 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1830 if(U_SUCCESS(*status)) {
1831 if(src->removeSet == NULL) {
1832 src->removeSet = newSet;
1833 } else {
1834 uset_addAll(src->removeSet, newSet);
1835 uset_close(newSet);
1836 }
1837 } else {
1838 return;
1839 }
1840 }
1841 }
1842 //openBrace++;
1843 i++;
1844 }
1845
1846 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1847 /* test for NULL */
1848 if (src->source == NULL) {
1849 *status = U_MEMORY_ALLOCATION_ERROR;
1850 return;
1851 }
1852 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
1853 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1854 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1855 *status = U_ZERO_ERROR;
1856 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1857 /* test for NULL */
1858 if (src->source == NULL) {
1859 *status = U_MEMORY_ALLOCATION_ERROR;
1860 return;
1861 }
1862 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1863 }
1864 src->current = src->source;
1865 src->end = src->source+nSize;
1866 src->sourceCurrent = src->source;
1867 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
1868 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1869 src->varTop = NULL;
1870 src->UCA = UCA;
1871 src->invUCA = ucol_initInverseUCA(status);
1872 src->parsedToken.charsLen = 0;
1873 src->parsedToken.charsOffset = 0;
1874 src->parsedToken.extensionLen = 0;
1875 src->parsedToken.extensionOffset = 0;
1876 src->parsedToken.prefixLen = 0;
1877 src->parsedToken.prefixOffset = 0;
1878 src->parsedToken.flags = 0;
1879 src->parsedToken.strength = UCOL_TOK_UNSET;
1880 src->buildCCTabFlag = FALSE;
1881 src->prevStrength = UCOL_TOK_UNSET;
1882
1883 if(U_FAILURE(*status)) {
1884 return;
1885 }
1886 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
1887 if(U_FAILURE(*status)) {
1888 return;
1889 }
1890 uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1891
1892 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
1893 /* test for NULL */
1894 if (src->opts == NULL) {
1895 *status = U_MEMORY_ALLOCATION_ERROR;
1896 return;
1897 }
1898
1899 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1900
1901 // rulesToParse = src->source;
1902 src->lh = 0;
1903 src->listCapacity = 1024;
1904 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1905 //Test for NULL
1906 if (src->lh == NULL) {
1907 *status = U_MEMORY_ALLOCATION_ERROR;
1908 return;
1909 }
1910 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
1911 src->resultLen = 0;
1912
1913 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1914
1915 // UCOL_RESET_TOP_VALUE
1916 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1917 // UCOL_FIRST_PRIMARY_IGNORABLE
1918 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1919 // UCOL_LAST_PRIMARY_IGNORABLE
1920 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1921 // UCOL_FIRST_SECONDARY_IGNORABLE
1922 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1923 // UCOL_LAST_SECONDARY_IGNORABLE
1924 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1925 // UCOL_FIRST_TERTIARY_IGNORABLE
1926 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1927 // UCOL_LAST_TERTIARY_IGNORABLE
1928 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1929 // UCOL_FIRST_VARIABLE
1930 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1931 // UCOL_LAST_VARIABLE
1932 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1933 // UCOL_FIRST_NON_VARIABLE
1934 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1935 // UCOL_LAST_NON_VARIABLE
1936 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1937 // UCOL_FIRST_IMPLICIT
1938 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1939 // UCOL_LAST_IMPLICIT
1940 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1941 // UCOL_FIRST_TRAILING
1942 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1943 // UCOL_LAST_TRAILING
1944 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1945 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
1946}
1947
1948
1949void ucol_tok_closeTokenList(UColTokenParser *src) {
1950 if(src->copySet != NULL) {
1951 uset_close(src->copySet);
1952 }
1953 if(src->removeSet != NULL) {
1954 uset_close(src->removeSet);
1955 }
1956 if(src->tailored != NULL) {
1957 uhash_close(src->tailored);
1958 }
1959 if(src->lh != NULL) {
1960 uprv_free(src->lh);
1961 }
1962 if(src->source != NULL) {
1963 uprv_free(src->source);
1964 }
1965 if(src->opts != NULL) {
1966 uprv_free(src->opts);
1967 }
1968}
1969
1970#endif /* #if !UCONFIG_NO_COLLATION */