Blame - icu/source/i18n/ucol_tok.cpp - nest-learning-thermostat/5.0.1/icu

blob: 06cc48bb250891224d70770c1f105671920f1d18 [file] [log] [blame]

Igor Sarkisov	7a4d6f0	2020-10-06 04:17:58 -0700	[diff] [blame]	1	/*
				2	*******************************************************************************
				3	*
				4	* Copyright (C) 2001-2010, International Business Machines
				5	* Corporation and others. All Rights Reserved.
				6	*
				7	*******************************************************************************
				8	* file name: ucol_tok.cpp
				9	* encoding: US-ASCII
				10	* tab size: 8 (not used)
				11	* indentation:4
				12	*
				13	* created 02/22/2001
				14	* created by: Vladimir Weinstein
				15	*
				16	* This module reads a tailoring rule string and produces a list of
				17	* tokens that will be turned into collation elements
				18	*
				19	*/
				20
				21	#include "unicode/utypes.h"
				22
				23	#if !UCONFIG_NO_COLLATION
				24
				25	#include "unicode/ustring.h"
				26	#include "unicode/uchar.h"
				27	#include "unicode/uniset.h"
				28
				29	#include "ucol_tok.h"
				30	#include "ucol_bld.h"
				31	#include "cmemory.h"
				32	#include "util.h"
				33
				34	U_CDECL_BEGIN
				35	static int32_t U_CALLCONV
				36	uhash_hashTokens(const UHashTok k)
				37	{
				38	int32_t hash = 0;
				39	//uint32_t key = (uint32_t)k.integer;
				40	UColToken key = (UColToken )k.pointer;
				41	if (key != 0) {
				42	//int32_t len = (key & 0xFF000000)>>24;
				43	int32_t len = (key->source & 0xFF000000)>>24;
				44	int32_t inc = ((len - 32) / 32) + 1;
				45
				46	//const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
				47	const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
				48	const UChar *limit = p + len;
				49
				50	while (p<limit) {
				51	hash = (hash * 37) + *p;
				52	p += inc;
				53	}
				54	}
				55	return hash;
				56	}
				57
				58	static UBool U_CALLCONV
				59	uhash_compareTokens(const UHashTok key1, const UHashTok key2)
				60	{
				61	//uint32_t p1 = (uint32_t) key1.integer;
				62	//uint32_t p2 = (uint32_t) key2.integer;
				63	UColToken p1 = (UColToken )key1.pointer;
				64	UColToken p2 = (UColToken )key2.pointer;
				65	const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
				66	const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
				67	uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
				68	uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
				69	const UChar *end = s1+s1L-1;
				70
				71	if (p1 == p2) {
				72	return TRUE;
				73	}
				74	if (p1->source == 0 \|\| p2->source == 0) {
				75	return FALSE;
				76	}
				77	if(s1L != s2L) {
				78	return FALSE;
				79	}
				80	if(p1->source == p2->source) {
				81	return TRUE;
				82	}
				83	while((s1 < end) && s1 == s2) {
				84	++s1;
				85	++s2;
				86	}
				87	if(s1 == s2) {
				88	return TRUE;
				89	} else {
				90	return FALSE;
				91	}
				92	}
				93	U_CDECL_END
				94
				95	/*static inline void U_CALLCONV
				96	uhash_freeBlockWrapper(void *obj) {
				97	uhash_freeBlock(obj);
				98	}*/
				99
				100
				/**
				 * One entry of the indirect-positioning table: the collation element (and
				 * its continuation) where an indirect reset starts, plus an optional limit
				 * CE/continuation pair (0 when no limit is set).
				 */
				typedef struct {
				    uint32_t startCE;      /* CE at which the indirect position starts */
				    uint32_t startContCE;  /* continuation of startCE */
				    uint32_t limitCE;      /* limit CE, 0 when not set */
				    uint32_t limitContCE;  /* continuation of limitCE, 0 when not set */
				} indirectBoundaries;
				107
				108	/* these values are used for finding CE values for indirect positioning. */
				109	/* Indirect positioning is a mechanism for allowing resets on symbolic */
				110	/* values. It only works for resets and you cannot tailor indirect names */
				111	/* An indirect name can define either an anchor point or a range. An */
				112	/* anchor point behaves in exactly the same way as a code point in reset */
				113	/* would, except that it cannot be tailored. A range (we currently only */
				114	/* know of the [top] range) will explicitly set the upper bound for */
				115	/* generated CEs, thus allowing for better control over how many CEs can */
				116	/* be squeezed between in the range without performance penalty. */
				117	/* In that respect, we use [top] for tailoring of locales that use CJK */
				118	/* characters. Other indirect values are currently a pure convenience, */
				119	/* they can be used to assure that the CEs will be always positioned in */
				120	/* the same place relative to a point with known properties (e.g. first */
				121	/* primary ignorable). */
				122	static indirectBoundaries ucolIndirectBoundaries[15];
				123	/*
				124	static indirectBoundaries ucolIndirectBoundaries[11] = {
				125	{ UCOL_RESET_TOP_VALUE, 0,
				126	UCOL_NEXT_TOP_VALUE, 0 },
				127	{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,
				128	0, 0 },
				129	{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
				130	0, 0 },
				131	{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,
				132	0, 0 },
				133	{ UCOL_LAST_SECONDARY_IGNORABLE, 0,
				134	0, 0 },
				135	{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
				136	0, 0 },
				137	{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
				138	0, 0 },
				139	{ UCOL_FIRST_VARIABLE, 0,
				140	0, 0 },
				141	{ UCOL_LAST_VARIABLE, 0,
				142	0, 0 },
				143	{ UCOL_FIRST_NON_VARIABLE, 0,
				144	0, 0 },
				145	{ UCOL_LAST_NON_VARIABLE, 0,
				146	0, 0 },
				147	};
				148	*/
				149
				150	static void setIndirectBoundaries(uint32_t indexR, uint32_t start, uint32_t end) {
				151
				152	// Set values for the top - TODO: once we have values for all the indirects, we are going
				153	// to initalize here.
				154	ucolIndirectBoundaries[indexR].startCE = start[0];
				155	ucolIndirectBoundaries[indexR].startContCE = start[1];
				156	if(end) {
				157	ucolIndirectBoundaries[indexR].limitCE = end[0];
				158	ucolIndirectBoundaries[indexR].limitContCE = end[1];
				159	} else {
				160	ucolIndirectBoundaries[indexR].limitCE = 0;
				161	ucolIndirectBoundaries[indexR].limitContCE = 0;
				162	}
				163	}
				164
				165
				166	static inline
				167	void syntaxError(const UChar* rules,
				168	int32_t pos,
				169	int32_t rulesLen,
				170	UParseError* parseError)
				171	{
				172	parseError->offset = pos;
				173	parseError->line = 0 ; /* we are not using line numbers */
				174
				175	// for pre-context
				176	int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
				177	int32_t stop = pos;
				178
				179	u_memcpy(parseError->preContext,rules+start,stop-start);
				180	//null terminate the buffer
				181	parseError->preContext[stop-start] = 0;
				182
				183	//for post-context
				184	start = pos+1;
				185	stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
				186	rulesLen;
				187
				188	if(start < stop) {
				189	u_memcpy(parseError->postContext,rules+start,stop-start);
				190	//null terminate the buffer
				191	parseError->postContext[stop-start]= 0;
				192	} else {
				193	parseError->postContext[0] = 0;
				194	}
				195	}
				196
				197	static
				198	void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
				199	switch(attrib) {
				200	case UCOL_HIRAGANA_QUATERNARY_MODE:
				201	opts->hiraganaQ = value;
				202	break;
				203	case UCOL_FRENCH_COLLATION:
				204	opts->frenchCollation = value;
				205	break;
				206	case UCOL_ALTERNATE_HANDLING:
				207	opts->alternateHandling = value;
				208	break;
				209	case UCOL_CASE_FIRST:
				210	opts->caseFirst = value;
				211	break;
				212	case UCOL_CASE_LEVEL:
				213	opts->caseLevel = value;
				214	break;
				215	case UCOL_NORMALIZATION_MODE:
				216	opts->normalizationMode = value;
				217	break;
				218	case UCOL_STRENGTH:
				219	opts->strength = value;
				220	break;
				221	case UCOL_NUMERIC_COLLATION:
				222	opts->numericCollation = value;
				223	break;
				224	case UCOL_ATTRIBUTE_COUNT:
				225	default:
				226	break;
				227	}
				228	}
				229
				230	#define UTOK_OPTION_COUNT 20
				231
				232	static UBool didInit = FALSE;
				233	/* we can be strict, or we can be lenient */
				234	/* I'd surely be lenient with the option arguments */
				235	/* maybe even with options */
				236	U_STRING_DECL(suboption_00, "non-ignorable", 13);
				237	U_STRING_DECL(suboption_01, "shifted", 7);
				238
				239	U_STRING_DECL(suboption_02, "lower", 5);
				240	U_STRING_DECL(suboption_03, "upper", 5);
				241	U_STRING_DECL(suboption_04, "off", 3);
				242	U_STRING_DECL(suboption_05, "on", 2);
				243	U_STRING_DECL(suboption_06, "1", 1);
				244	U_STRING_DECL(suboption_07, "2", 1);
				245	U_STRING_DECL(suboption_08, "3", 1);
				246	U_STRING_DECL(suboption_09, "4", 1);
				247	U_STRING_DECL(suboption_10, "I", 1);
				248
				249	U_STRING_DECL(suboption_11, "primary", 7);
				250	U_STRING_DECL(suboption_12, "secondary", 9);
				251	U_STRING_DECL(suboption_13, "tertiary", 8);
				252	U_STRING_DECL(suboption_14, "variable", 8);
				253	U_STRING_DECL(suboption_15, "regular", 7);
				254	U_STRING_DECL(suboption_16, "implicit", 8);
				255	U_STRING_DECL(suboption_17, "trailing", 8);
				256
				257
				258	U_STRING_DECL(option_00, "undefined", 9);
				259	U_STRING_DECL(option_01, "rearrange", 9);
				260	U_STRING_DECL(option_02, "alternate", 9);
				261	U_STRING_DECL(option_03, "backwards", 9);
				262	U_STRING_DECL(option_04, "variable top", 12);
				263	U_STRING_DECL(option_05, "top", 3);
				264	U_STRING_DECL(option_06, "normalization", 13);
				265	U_STRING_DECL(option_07, "caseLevel", 9);
				266	U_STRING_DECL(option_08, "caseFirst", 9);
				267	U_STRING_DECL(option_09, "scriptOrder", 11);
				268	U_STRING_DECL(option_10, "charsetname", 11);
				269	U_STRING_DECL(option_11, "charset", 7);
				270	U_STRING_DECL(option_12, "before", 6);
				271	U_STRING_DECL(option_13, "hiraganaQ", 9);
				272	U_STRING_DECL(option_14, "strength", 8);
				273	U_STRING_DECL(option_15, "first", 5);
				274	U_STRING_DECL(option_16, "last", 4);
				275	U_STRING_DECL(option_17, "optimize", 8);
				276	U_STRING_DECL(option_18, "suppressContractions", 20);
				277	U_STRING_DECL(option_19, "numericOrdering", 15);
				278
				279
				280	/*
				281	[last variable] last variable value
				282	[last primary ignorable] largest CE for primary ignorable
				283	[last secondary ignorable] largest CE for secondary ignorable
				284	[last tertiary ignorable] largest CE for tertiary ignorable
				285	[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
				286	*/
				287
				288
				289	static const ucolTokSuboption alternateSub[2] = {
				290	{suboption_00, 13, UCOL_NON_IGNORABLE},
				291	{suboption_01, 7, UCOL_SHIFTED}
				292	};
				293
				294	static const ucolTokSuboption caseFirstSub[3] = {
				295	{suboption_02, 5, UCOL_LOWER_FIRST},
				296	{suboption_03, 5, UCOL_UPPER_FIRST},
				297	{suboption_04, 3, UCOL_OFF},
				298	};
				299
				300	static const ucolTokSuboption onOffSub[2] = {
				301	{suboption_04, 3, UCOL_OFF},
				302	{suboption_05, 2, UCOL_ON}
				303	};
				304
				305	static const ucolTokSuboption frenchSub[1] = {
				306	{suboption_07, 1, UCOL_ON}
				307	};
				308
				309	static const ucolTokSuboption beforeSub[3] = {
				310	{suboption_06, 1, UCOL_PRIMARY},
				311	{suboption_07, 1, UCOL_SECONDARY},
				312	{suboption_08, 1, UCOL_TERTIARY}
				313	};
				314
				315	static const ucolTokSuboption strengthSub[5] = {
				316	{suboption_06, 1, UCOL_PRIMARY},
				317	{suboption_07, 1, UCOL_SECONDARY},
				318	{suboption_08, 1, UCOL_TERTIARY},
				319	{suboption_09, 1, UCOL_QUATERNARY},
				320	{suboption_10, 1, UCOL_IDENTICAL},
				321	};
				322
				323	static const ucolTokSuboption firstLastSub[7] = {
				324	{suboption_11, 7, UCOL_PRIMARY},
				325	{suboption_12, 9, UCOL_PRIMARY},
				326	{suboption_13, 8, UCOL_PRIMARY},
				327	{suboption_14, 8, UCOL_PRIMARY},
				328	{suboption_15, 7, UCOL_PRIMARY},
				329	{suboption_16, 8, UCOL_PRIMARY},
				330	{suboption_17, 8, UCOL_PRIMARY},
				331	};
				332
				/*
				 * Indexes into rulesOptions; the enumerator order must stay in step with
				 * the order of the rulesOptions entries. The options up to and including
				 * OPTION_NORMAL_OPTIONS_LIMIT are the ones that carry a collator attribute
				 * in that table.
				 */
				enum OptionNumber {
				    OPTION_ALTERNATE_HANDLING = 0,
				    OPTION_FRENCH_COLLATION,
				    OPTION_CASE_LEVEL,
				    OPTION_CASE_FIRST,
				    OPTION_NORMALIZATION_MODE,
				    OPTION_HIRAGANA_QUATERNARY,
				    OPTION_STRENGTH,
				    OPTION_NUMERIC_COLLATION,
				    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
				    OPTION_VARIABLE_TOP,
				    OPTION_REARRANGE,
				    OPTION_BEFORE,
				    OPTION_TOP,
				    OPTION_FIRST,
				    OPTION_LAST,
				    OPTION_OPTIMIZE,
				    OPTION_SUPPRESS_CONTRACTIONS,
				    OPTION_UNDEFINED,
				    OPTION_SCRIPT_ORDER,
				    OPTION_CHARSET_NAME,
				    OPTION_CHARSET
				} ;
				356
				357	static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
				358	/00/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /"alternate" /
				359	/01/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /"backwards" /
				360	/02/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /"caseLevel" /
				361	/03/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /"caseFirst" /
				362	/04/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /"normalization" /
				363	/05/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /"hiraganaQ" /
				364	/06/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /"strength" /
				365	/07/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /"numericOrdering"/
				366	/08/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"variable top" /
				367	/09/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"rearrange" /
				368	/10/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /"before" /
				369	/11/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"top" /
				370	/12/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /"first" /
				371	/13/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /"last" /
				372	/14/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"optimize" /
				373	/15/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"suppressContractions" /
				374	/16/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"undefined" /
				375	/17/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"scriptOrder" /
				376	/18/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"charsetname" /
				377	/19/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /"charset" /
				378	};
				379
				380	static
				381	int32_t u_strncmpNoCase(const UChar *s1,
				382	const UChar *s2,
				383	int32_t n)
				384	{
				385	if(n > 0) {
				386	int32_t rc;
				387	for(;;) {
				388	rc = (int32_t)u_tolower(s1) - (int32_t)u_tolower(s2);
				389	if(rc != 0 \|\| *s1 == 0 \|\| --n == 0) {
				390	return rc;
				391	}
				392	++s1;
				393	++s2;
				394	}
				395	}
				396	return 0;
				397	}
				398
				399	static
				400	void ucol_uprv_tok_initData() {
				401	if(!didInit) {
				402	U_STRING_INIT(suboption_00, "non-ignorable", 13);
				403	U_STRING_INIT(suboption_01, "shifted", 7);
				404
				405	U_STRING_INIT(suboption_02, "lower", 5);
				406	U_STRING_INIT(suboption_03, "upper", 5);
				407	U_STRING_INIT(suboption_04, "off", 3);
				408	U_STRING_INIT(suboption_05, "on", 2);
				409
				410	U_STRING_INIT(suboption_06, "1", 1);
				411	U_STRING_INIT(suboption_07, "2", 1);
				412	U_STRING_INIT(suboption_08, "3", 1);
				413	U_STRING_INIT(suboption_09, "4", 1);
				414	U_STRING_INIT(suboption_10, "I", 1);
				415
				416	U_STRING_INIT(suboption_11, "primary", 7);
				417	U_STRING_INIT(suboption_12, "secondary", 9);
				418	U_STRING_INIT(suboption_13, "tertiary", 8);
				419	U_STRING_INIT(suboption_14, "variable", 8);
				420	U_STRING_INIT(suboption_15, "regular", 7);
				421	U_STRING_INIT(suboption_16, "implicit", 8);
				422	U_STRING_INIT(suboption_17, "trailing", 8);
				423
				424
				425	U_STRING_INIT(option_00, "undefined", 9);
				426	U_STRING_INIT(option_01, "rearrange", 9);
				427	U_STRING_INIT(option_02, "alternate", 9);
				428	U_STRING_INIT(option_03, "backwards", 9);
				429	U_STRING_INIT(option_04, "variable top", 12);
				430	U_STRING_INIT(option_05, "top", 3);
				431	U_STRING_INIT(option_06, "normalization", 13);
				432	U_STRING_INIT(option_07, "caseLevel", 9);
				433	U_STRING_INIT(option_08, "caseFirst", 9);
				434	U_STRING_INIT(option_09, "scriptOrder", 11);
				435	U_STRING_INIT(option_10, "charsetname", 11);
				436	U_STRING_INIT(option_11, "charset", 7);
				437	U_STRING_INIT(option_12, "before", 6);
				438	U_STRING_INIT(option_13, "hiraganaQ", 9);
				439	U_STRING_INIT(option_14, "strength", 8);
				440	U_STRING_INIT(option_15, "first", 5);
				441	U_STRING_INIT(option_16, "last", 4);
				442	U_STRING_INIT(option_17, "optimize", 8);
				443	U_STRING_INIT(option_18, "suppressContractions", 20);
				444	U_STRING_INIT(option_19, "numericOrdering", 15);
				445	didInit = TRUE;
				446	}
				447	}
				448
				449
				450	// This function reads basic options to set in the runtime collator
				451	// used by data driven tests. Should not support build time options
				452	U_CAPI const UChar * U_EXPORT2
				453	ucol_tok_getNextArgument(const UChar start, const UChar end,
				454	UColAttribute attrib, UColAttributeValue value,
				455	UErrorCode *status)
				456	{
				457	uint32_t i = 0;
				458	int32_t j=0;
				459	UBool foundOption = FALSE;
				460	const UChar *optionArg = NULL;
				461
				462	ucol_uprv_tok_initData();
				463
				464	while(start < end && (u_isWhitespace(start) \|\| uprv_isRuleWhiteSpace(start))) { /* eat whitespace */
				465	start++;
				466	}
				467	if(start >= end) {
				468	return NULL;
				469	}
				470	/* skip opening '[' */
				471	if(*start == 0x005b) {
				472	start++;
				473	} else {
				474	*status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
				475	return NULL;
				476	}
				477
				478	while(i < UTOK_OPTION_COUNT) {
				479	if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
				480	foundOption = TRUE;
				481	if(end - start > rulesOptions[i].optionLen) {
				482	optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
				483	while(u_isWhitespace(optionArg) \|\| uprv_isRuleWhiteSpace(optionArg)) { /* eat whitespace */
				484	optionArg++;
				485	}
				486	}
				487	break;
				488	}
				489	i++;
				490	}
				491
				492	if(!foundOption) {
				493	*status = U_ILLEGAL_ARGUMENT_ERROR;
				494	return NULL;
				495	}
				496
				497	if(optionArg) {
				498	for(j = 0; j<rulesOptions[i].subSize; j++) {
				499	if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
				500	//ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
				501	*attrib = rulesOptions[i].attr;
				502	*value = rulesOptions[i].subopts[j].attrVal;
				503	optionArg += rulesOptions[i].subopts[j].subLen;
				504	while(u_isWhitespace(optionArg) \|\| uprv_isRuleWhiteSpace(optionArg)) { /* eat whitespace */
				505	optionArg++;
				506	}
				507	if(*optionArg == 0x005d) {
				508	optionArg++;
				509	return optionArg;
				510	} else {
				511	*status = U_ILLEGAL_ARGUMENT_ERROR;
				512	return NULL;
				513	}
				514	}
				515	}
				516	}
				517	*status = U_ILLEGAL_ARGUMENT_ERROR;
				518	return NULL;
				519	}
				520
				521	static
				522	USet ucol_uprv_tok_readAndSetUnicodeSet(const UChar start, const UChar end, UErrorCode status) {
				523	while(start != 0x005b) { / advance while we find the first '[' */
				524	start++;
				525	}
				526	// now we need to get a balanced set of '[]'. The problem is that a set can have
				527	// many, and *end point to the first closing '['
				528	int32_t noOpenBraces = 1;
				529	int32_t current = 1; // skip the opening brace
				530	while(start+current < end && noOpenBraces != 0) {
				531	if(start[current] == 0x005b) {
				532	noOpenBraces++;
				533	} else if(start[current] == 0x005D) { // closing brace
				534	noOpenBraces--;
				535	}
				536	current++;
				537	}
				538
				539	if(noOpenBraces != 0 \|\| u_strchr(start+current, 0x005d /']'/) == NULL) {
				540	*status = U_ILLEGAL_ARGUMENT_ERROR;
				541	return NULL;
				542	}
				543	return uset_openPattern(start, current, status);
				544	}
				545
				546	static
				547	int32_t ucol_uprv_tok_readOption(const UChar start, const UChar end, const UChar **optionArg) {
				548	int32_t i = 0;
				549	ucol_uprv_tok_initData();
				550
				551	while(u_isWhitespace(start) \|\| uprv_isRuleWhiteSpace(start)) { /* eat whitespace */
				552	start++;
				553	}
				554	while(i < UTOK_OPTION_COUNT) {
				555	if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
				556	if(end - start > rulesOptions[i].optionLen) {
				557	optionArg = start+rulesOptions[i].optionLen; / start of the options*/
				558	while(u_isWhitespace(optionArg) \|\| uprv_isRuleWhiteSpace(optionArg)) { /* eat whitespace */
				559	(*optionArg)++;
				560	}
				561	}
				562	break;
				563	}
				564	i++;
				565	}
				566	if(i == UTOK_OPTION_COUNT) {
				567	i = -1; // didn't find an option
				568	}
				569	return i;
				570	}
				571
				572
				573	// reads and conforms to various options in rules
				574	// end is the position of the first closing ']'
				575	// However, some of the options take an UnicodeSet definition
				576	// which needs to duplicate the closing ']'
				577	// for example: '[copy [\uAC00-\uD7FF]]'
				578	// These options will move end to the second ']' and the
				579	// caller will set the current to it.
				580	static
				581	uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser src, UErrorCode status) {
				582	const UChar* start = src->current;
				583	int32_t i = 0;
				584	int32_t j=0;
				585	const UChar *optionArg = NULL;
				586
				587	uint8_t result = 0;
				588
				589	start++; /skip opening '['/
				590	i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
				591	if(optionArg) {
				592	src->current = optionArg;
				593	}
				594
				595	if(i < 0) {
				596	*status = U_ILLEGAL_ARGUMENT_ERROR;
				597	} else {
				598	int32_t noOpenBraces = 1;
				599	switch(i) {
				600	case OPTION_ALTERNATE_HANDLING:
				601	case OPTION_FRENCH_COLLATION:
				602	case OPTION_CASE_LEVEL:
				603	case OPTION_CASE_FIRST:
				604	case OPTION_NORMALIZATION_MODE:
				605	case OPTION_HIRAGANA_QUATERNARY:
				606	case OPTION_STRENGTH:
				607	case OPTION_NUMERIC_COLLATION:
				608	if(optionArg) {
				609	for(j = 0; j<rulesOptions[i].subSize; j++) {
				610	if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
				611	ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
				612	result = UCOL_TOK_SUCCESS;
				613	}
				614	}
				615	}
				616	if(result == 0) {
				617	*status = U_ILLEGAL_ARGUMENT_ERROR;
				618	}
				619	break;
				620	case OPTION_VARIABLE_TOP:
				621	result = UCOL_TOK_SUCCESS \| UCOL_TOK_VARIABLE_TOP;
				622	break;
				623	case OPTION_REARRANGE:
				624	result = UCOL_TOK_SUCCESS;
				625	break;
				626	case OPTION_BEFORE:
				627	if(optionArg) {
				628	for(j = 0; j<rulesOptions[i].subSize; j++) {
				629	if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
				630	result = UCOL_TOK_SUCCESS \| rulesOptions[i].subopts[j].attrVal + 1;
				631	}
				632	}
				633	}
				634	if(result == 0) {
				635	*status = U_ILLEGAL_ARGUMENT_ERROR;
				636	}
				637	break;
				638	case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
				639	/* index to this array will be src->parsedToken.indirectIndex*/
				640	src->parsedToken.indirectIndex = 0;
				641	result = UCOL_TOK_SUCCESS \| UCOL_TOK_TOP;
				642	break;
				643	case OPTION_FIRST:
				644	case OPTION_LAST: /* first, last */
				645	for(j = 0; j<rulesOptions[i].subSize; j++) {
				646	if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
				647	// the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
				648	// element of indirect boundaries is reserved for top.
				649	src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
				650	result = UCOL_TOK_SUCCESS \| UCOL_TOK_TOP;;
				651	}
				652	}
				653	if(result == 0) {
				654	*status = U_ILLEGAL_ARGUMENT_ERROR;
				655	}
				656	break;
				657	case OPTION_OPTIMIZE:
				658	case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
				659	// we need to move end here
				660	src->current++; // skip opening brace
				661	while(src->current < src->end && noOpenBraces != 0) {
				662	if(*src->current == 0x005b) {
				663	noOpenBraces++;
				664	} else if(*src->current == 0x005D) { // closing brace
				665	noOpenBraces--;
				666	}
				667	src->current++;
				668	}
				669	result = UCOL_TOK_SUCCESS;
				670	break;
				671	default:
				672	*status = U_UNSUPPORTED_ERROR;
				673	break;
				674	}
				675	}
				676	src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
				677	return result;
				678	}
				679
				680
				681	inline void ucol_tok_addToExtraCurrent(UColTokenParser src, const UChar stuff, int32_t len, UErrorCode *status) {
				682	if (stuff == NULL \|\| len <= 0) {
				683	return;
				684	}
				685	UChar tempStuff = (UChar )stuff;
				686	if(src->extraCurrent+len >= src->extraEnd) {
				687	/* reallocate */
				688	if (stuff >= src->source && stuff <= src->end) {
				689	// Copy stuff to a new buffer if stuff points to an address within
				690	// src->source buffer.
				691	tempStuff = (UChar)uprv_malloc(lensizeof(UChar));
				692	if (tempStuff == NULL) {
				693	*status = U_MEMORY_ALLOCATION_ERROR;
				694	return;
				695	}
				696	uprv_memcpy(tempStuff, stuff, len*sizeof(UChar));
				697	}
				698	UChar newSrc = (UChar )uprv_realloc(src->source, (src->extraEnd-src->source)2sizeof(UChar));
				699	if(newSrc != NULL) {
				700	src->current = newSrc + (src->current - src->source);
				701	src->extraCurrent = newSrc + (src->extraCurrent - src->source);
				702	src->end = newSrc + (src->end - src->source);
				703	src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
				704	src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
				705	src->source = newSrc;
				706	} else {
				707	*status = U_MEMORY_ALLOCATION_ERROR;
				708	if (tempStuff != stuff) {
				709	uprv_free(tempStuff);
				710	}
				711	return;
				712	}
				713	}
				714	if(len == 1) {
				715	src->extraCurrent++ = tempStuff;
				716	} else {
				717	uprv_memcpy(src->extraCurrent, tempStuff, len*sizeof(UChar));
				718	src->extraCurrent += len;
				719	}
				720	if (tempStuff != stuff) {
				721	uprv_free(tempStuff);
				722	}
				723	}
				724
				725	inline UBool ucol_tok_doSetTop(UColTokenParser src, UErrorCode status) {
				726	/*
				727	top = TRUE;
				728	*/
				729	UChar buff[5];
				730	src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
				731	buff[0] = 0xFFFE;
				732	buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
				733	buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
				734	if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
				735	src->parsedToken.charsLen = 3;
				736	ucol_tok_addToExtraCurrent(src, buff, 3, status);
				737	} else {
				738	buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
				739	buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
				740	src->parsedToken.charsLen = 5;
				741	ucol_tok_addToExtraCurrent(src, buff, 5, status);
				742	}
				743	return TRUE;
				744	}
				745
				746	static UBool isCharNewLine(UChar c){
				747	switch(c){
				748	case 0x000A: /* LF */
				749	case 0x000D: /* CR */
				750	case 0x000C: /* FF */
				751	case 0x0085: /* NEL */
				752	case 0x2028: /* LS */
				753	case 0x2029: /* PS */
				754	return TRUE;
				755	default:
				756	return FALSE;
				757	}
				758	}
				759
				760	U_CAPI const UChar* U_EXPORT2
				761	ucol_tok_parseNextToken(UColTokenParser *src,
				762	UBool startOfRules,
				763	UParseError *parseError,
				764	UErrorCode *status)
				765	{
				766	/* parsing part */
				767	UBool variableTop = FALSE;
				768	UBool top = FALSE;
				769	UBool inChars = TRUE;
				770	UBool inQuote = FALSE;
				771	UBool wasInQuote = FALSE;
				772	uint8_t before = 0;
				773	UBool isEscaped = FALSE;
				774	// TODO: replace these variables with src->parsedToken counterparts
				775	// no need to use them anymore since we have src->parsedToken.
				776	// Ideally, token parser would be a nice class... Once, when I have
				777	// more time (around 2020 probably).
				778	uint32_t newExtensionLen = 0;
				779	uint32_t extensionOffset = 0;
				780	uint32_t newStrength = UCOL_TOK_UNSET;
				781	UChar buff[10];
				782	UChar32 codepoint;
				783
				784	src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
				785	src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
				786	src->parsedToken.indirectIndex = 0;
				787
				788	while (src->current < src->end) {
				789	UChar ch = *(src->current);
				790
				791	if (inQuote) {
				792	if (ch == 0x0027/'\''/) {
				793	inQuote = FALSE;
				794	} else {
				795	if ((src->parsedToken.charsLen == 0) \|\| inChars) {
				796	if(src->parsedToken.charsLen == 0) {
				797	src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
				798	}
				799	src->parsedToken.charsLen++;
				800	} else {
				801	if(newExtensionLen == 0) {
				802	extensionOffset = (uint32_t)(src->extraCurrent - src->source);
				803	}
				804	newExtensionLen++;
				805	}
				806	}
				807	}else if(isEscaped){
				808	isEscaped =FALSE;
				809	if (newStrength == UCOL_TOK_UNSET) {
				810	*status = U_INVALID_FORMAT_ERROR;
				811	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				812	return NULL;
				813	// enabling rules to start with non-tokens a < b
				814	// newStrength = UCOL_TOK_RESET;
				815	}
				816	if(ch != 0x0000 && src->current != src->end) {
				817	if (inChars) {
				818	if(src->parsedToken.charsLen == 0) {
				819	src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
				820	}
				821	src->parsedToken.charsLen++;
				822	} else {
				823	if(newExtensionLen == 0) {
				824	extensionOffset = (uint32_t)(src->current - src->source);
				825	}
				826	newExtensionLen++;
				827	}
				828	}
				829	}else {
				830	if(!uprv_isRuleWhiteSpace(ch)) {
				831	/* Sets the strength for this entry */
				832	switch (ch) {
				833	case 0x003D/'='/ :
				834	if (newStrength != UCOL_TOK_UNSET) {
				835	goto EndOfLoop;
				836	}
				837
				838	/* if we start with strength, we'll reset to top */
				839	if(startOfRules == TRUE) {
				840	src->parsedToken.indirectIndex = 5;
				841	top = ucol_tok_doSetTop(src, status);
				842	newStrength = UCOL_TOK_RESET;
				843	goto EndOfLoop;
				844	}
				845	newStrength = UCOL_IDENTICAL;
				846	if((src->current+1) == 0x002A) {/''/
				847	src->current++;
				848	src->prevStrength = newStrength;
				849	}else{
				850	src->prevStrength = UCOL_TOK_UNSET;
				851	}
				852	break;
				853
				854	case 0x002C/','/:
				855	if (newStrength != UCOL_TOK_UNSET) {
				856	goto EndOfLoop;
				857	}
				858
				859	/* if we start with strength, we'll reset to top */
				860	if(startOfRules == TRUE) {
				861	src->parsedToken.indirectIndex = 5;
				862	top = ucol_tok_doSetTop(src, status);
				863	newStrength = UCOL_TOK_RESET;
				864	goto EndOfLoop;
				865	}
				866	newStrength = UCOL_TERTIARY;
				867	src->prevStrength = UCOL_TOK_UNSET;
				868	break;
				869
				870	case 0x003B/';'/:
				871	if (newStrength != UCOL_TOK_UNSET) {
				872	goto EndOfLoop;
				873	}
				874
				875	/* if we start with strength, we'll reset to top */
				876	if(startOfRules == TRUE) {
				877	src->parsedToken.indirectIndex = 5;
				878	top = ucol_tok_doSetTop(src, status);
				879	newStrength = UCOL_TOK_RESET;
				880	goto EndOfLoop;
				881	}
				882	newStrength = UCOL_SECONDARY;
				883	src->prevStrength = UCOL_TOK_UNSET;
				884	break;
				885
				886	case 0x003C/'<'/:
				887	if (newStrength != UCOL_TOK_UNSET) {
				888	goto EndOfLoop;
				889	}
				890
				891	/* if we start with strength, we'll reset to top */
				892	if(startOfRules == TRUE) {
				893	src->parsedToken.indirectIndex = 5;
				894	top = ucol_tok_doSetTop(src, status);
				895	newStrength = UCOL_TOK_RESET;
				896	goto EndOfLoop;
				897	}
				898	/* before this, do a scan to verify whether this is */
				899	/* another strength */
				900	if(*(src->current+1) == 0x003C) {
				901	src->current++;
				902	if(*(src->current+1) == 0x003C) {
				903	src->current++; /* three in a row! */
				904	newStrength = UCOL_TERTIARY;
				905	} else { /* two in a row */
				906	newStrength = UCOL_SECONDARY;
				907	}
				908	} else { /* just one */
				909	newStrength = UCOL_PRIMARY;
				910	}
				911	if((src->current+1) == 0x002A) {/''/
				912	src->current++;
				913	src->prevStrength = newStrength;
				914	}else{
				915	src->prevStrength = UCOL_TOK_UNSET;
				916	}
				917	break;
				918
				919	case 0x0026/'&'/:
				920	if (newStrength != UCOL_TOK_UNSET) {
				921	/**/
				922	goto EndOfLoop;
				923	}
				924
				925	newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
				926	src->prevStrength = UCOL_TOK_UNSET;
				927	break;
				928
				929	case 0x005b/'['/:
				930	/* options - read an option, analyze it */
				931	if(u_strchr(src->current, 0x005d /']'/) != NULL) {
				932	uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
				933	if(U_SUCCESS(*status)) {
				934	if(result & UCOL_TOK_TOP) {
				935	if(newStrength == UCOL_TOK_RESET) {
				936	top = ucol_tok_doSetTop(src, status);
				937	if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
				938	src->parsedToken.charsLen+=2;
				939	buff[0] = 0x002d;
				940	buff[1] = before;
				941	ucol_tok_addToExtraCurrent(src, buff, 2, status);
				942	}
				943
				944	src->current++;
				945	goto EndOfLoop;
				946	} else {
				947	*status = U_INVALID_FORMAT_ERROR;
				948	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				949	}
				950	} else if(result & UCOL_TOK_VARIABLE_TOP) {
				951	if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
				952	variableTop = TRUE;
				953	src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
				954	src->parsedToken.charsLen = 1;
				955	buff[0] = 0xFFFF;
				956	ucol_tok_addToExtraCurrent(src, buff, 1, status);
				957	src->current++;
				958	goto EndOfLoop;
				959	} else {
				960	*status = U_INVALID_FORMAT_ERROR;
				961	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				962	}
				963	} else if (result & UCOL_TOK_BEFORE){
				964	if(newStrength == UCOL_TOK_RESET) {
				965	before = result & UCOL_TOK_BEFORE;
				966	} else {
				967	*status = U_INVALID_FORMAT_ERROR;
				968	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				969
				970	}
				971	}
				972	} else {
				973	*status = U_INVALID_FORMAT_ERROR;
				974	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				975	return NULL;
				976	}
				977	}
				978	break;
				979	case 0x0021/! skip java thai modifier reordering/:
				980	break;
				981	case 0x002F/'/'/:
				982	wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
				983	inChars = FALSE; /* we're now processing expansion */
				984	break;
				985	case 0x005C /* back slash for escaped chars */:
				986	isEscaped = TRUE;
				987	break;
				988	/* found a quote, we're gonna start copying */
				989	case 0x0027/'\''/:
				990	if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
				991	if(src->prevStrength == UCOL_TOK_UNSET){
				992	*status = U_INVALID_FORMAT_ERROR;
				993	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				994	return NULL;
				995	// enabling rules to start with a non-token character a < b
				996	// newStrength = UCOL_TOK_RESET;
				997	}else{
				998	newStrength = src->prevStrength;
				999	}
				1000	}
				1001
				1002	inQuote = TRUE;
				1003
				1004	if(inChars) { /* we're doing characters */
				1005	if(wasInQuote == FALSE) {
				1006	src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
				1007	}
				1008	if (src->parsedToken.charsLen != 0) {
				1009	ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
				1010	}
				1011	src->parsedToken.charsLen++;
				1012	} else { /* we're doing an expansion */
				1013	if(wasInQuote == FALSE) {
				1014	extensionOffset = (uint32_t)(src->extraCurrent - src->source);
				1015	}
				1016	if (newExtensionLen != 0) {
				1017	ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
				1018	}
				1019	newExtensionLen++;
				1020	}
				1021
				1022	wasInQuote = TRUE;
				1023
				1024	ch = *(++(src->current));
				1025	if(ch == 0x0027) { /* copy the double quote */
				1026	ucol_tok_addToExtraCurrent(src, &ch, 1, status);
				1027	inQuote = FALSE;
				1028	}
				1029	break;
				1030
				1031	/* '@' is french only if the strength is not currently set */
				1032	/* if it is, it's just a regular character in collation rules */
				1033	case 0x0040/'@'/:
				1034	if (newStrength == UCOL_TOK_UNSET) {
				1035	src->opts->frenchCollation = UCOL_ON;
				1036	break;
				1037	}
				1038
				1039	case 0x007C /\|/: /* this means we have actually been reading prefix part */
				1040	// we want to store read characters to the prefix part and continue reading
				1041	// the characters (proper way would be to restart reading the chars, but in
				1042	// that case we would have to complicate the token hasher, which I do not
				1043	// intend to play with. Instead, we will do prefixes when prefixes are due
				1044	// (before adding the elements).
				1045	src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
				1046	src->parsedToken.prefixLen = src->parsedToken.charsLen;
				1047
				1048	if(inChars) { /* we're doing characters */
				1049	if(wasInQuote == FALSE) {
				1050	src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
				1051	}
				1052	if (src->parsedToken.charsLen != 0) {
				1053	ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
				1054	}
				1055	src->parsedToken.charsLen++;
				1056	}
				1057
				1058	wasInQuote = TRUE;
				1059
				1060	do {
				1061	ch = *(++(src->current));
				1062	// skip whitespace between '\|' and the character
				1063	} while (uprv_isRuleWhiteSpace(ch));
				1064	break;
				1065
				1066	//charsOffset = 0;
				1067	//newCharsLen = 0;
				1068	//break; // We want to store the whole prefix/character sequence. If we break
				1069	// the '\|' is going to get lost.
				1070	case 0x0023 /#/: /* this is a comment, skip everything through the end of line */
				1071	do {
				1072	ch = *(++(src->current));
				1073	} while (!isCharNewLine(ch));
				1074
				1075	break;
				1076	default:
				1077	if (newStrength == UCOL_TOK_UNSET) {
				1078	if(src->prevStrength == UCOL_TOK_UNSET){
				1079	*status = U_INVALID_FORMAT_ERROR;
				1080	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				1081	return NULL;
				1082	}else{
				1083	newStrength = src->prevStrength;
				1084	}
				1085	}
				1086
				1087	if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
				1088	*status = U_INVALID_FORMAT_ERROR;
				1089	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				1090	return NULL;
				1091	}
				1092
				1093	if(ch == 0x0000 && src->current+1 == src->end) {
				1094	break;
				1095	}
				1096
				1097	if (inChars) {
				1098	if(src->parsedToken.charsLen == 0) {
				1099	src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
				1100	}
				1101	src->parsedToken.charsLen++;
				1102	if(src->prevStrength != UCOL_TOK_UNSET){
				1103	U16_NEXT(0, src->current, src->end, codepoint);
				1104	src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1;
				1105	goto EndOfLoop;
				1106	}
				1107	} else {
				1108	if(newExtensionLen == 0) {
				1109	extensionOffset = (uint32_t)(src->current - src->source);
				1110	}
				1111	newExtensionLen++;
				1112	}
				1113
				1114	break;
				1115	}
				1116	}
				1117	}
				1118
				1119	if(wasInQuote) {
				1120	if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){
				1121	src->current++;
				1122	goto EndOfLoop;
				1123	}
				1124	if(ch != 0x27) {
				1125	if(inQuote \|\| !uprv_isRuleWhiteSpace(ch)) {
				1126	ucol_tok_addToExtraCurrent(src, &ch, 1, status);
				1127	}
				1128	}
				1129	}
				1130
				1131	src->current++;
				1132	}
				1133
				1134	EndOfLoop:
				1135	wasInQuote = FALSE;
				1136	if (newStrength == UCOL_TOK_UNSET) {
				1137	return NULL;
				1138	}
				1139
				1140	if (src->parsedToken.charsLen == 0 && top == FALSE) {
				1141	syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
				1142	*status = U_INVALID_FORMAT_ERROR;
				1143	return NULL;
				1144	}
				1145
				1146	src->parsedToken.strength = newStrength;
				1147	src->parsedToken.extensionOffset = extensionOffset;
				1148	src->parsedToken.extensionLen = newExtensionLen;
				1149	src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) \| (UCOL_TOK_TOP * (top?1:0)) \| before;
				1150
				1151	return src->current;
				1152	}
				1153
				1154	/*
				1155	Processing Description
				1156	1 Build a ListList. Each list has a header, which contains two lists (positive
				1157	and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
				1158	reset may be null.
				1159	2 As you process, you keep a LAST pointer that points to the last token you
				1160	handled.
				1161	*/
				1162
				1163	static UColToken ucol_tok_initAReset(UColTokenParser src, const UChar expand, uint32_t expandNext,
				1164	UParseError parseError, UErrorCode status)
				1165	{
				1166	if(src->resultLen == src->listCapacity) {
				1167	// Unfortunately, this won't work, as we store addresses of lhs in token
				1168	src->listCapacity *= 2;
				1169	src->lh = (UColTokListHeader )uprv_realloc(src->lh, src->listCapacitysizeof(UColTokListHeader));
				1170	if(src->lh == NULL) {
				1171	*status = U_MEMORY_ALLOCATION_ERROR;
				1172	return NULL;
				1173	}
				1174	}
				1175	/* do the reset thing */
				1176	UColToken sourceToken = (UColToken )uprv_malloc(sizeof(UColToken));
				1177	/* test for NULL */
				1178	if (sourceToken == NULL) {
				1179	*status = U_MEMORY_ALLOCATION_ERROR;
				1180	return NULL;
				1181	}
				1182	sourceToken->rulesToParse = src->source;
				1183	sourceToken->source = src->parsedToken.charsLen << 24 \| src->parsedToken.charsOffset;
				1184	sourceToken->expansion = src->parsedToken.extensionLen << 24 \| src->parsedToken.extensionOffset;
				1185
				1186	sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
				1187	sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
				1188
				1189	// keep the flags around so that we know about before
				1190	sourceToken->flags = src->parsedToken.flags;
				1191
				1192	if(src->parsedToken.prefixOffset != 0) {
				1193	// this is a syntax error
				1194	*status = U_INVALID_FORMAT_ERROR;
				1195	syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
				1196	uprv_free(sourceToken);
				1197	return 0;
				1198	} else {
				1199	sourceToken->prefix = 0;
				1200	}
				1201
				1202	sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
				1203	sourceToken->strength = UCOL_TOK_RESET;
				1204	sourceToken->next = NULL;
				1205	sourceToken->previous = NULL;
				1206	sourceToken->noOfCEs = 0;
				1207	sourceToken->noOfExpCEs = 0;
				1208	sourceToken->listHeader = &src->lh[src->resultLen];
				1209
				1210	src->lh[src->resultLen].first = NULL;
				1211	src->lh[src->resultLen].last = NULL;
				1212	src->lh[src->resultLen].first = NULL;
				1213	src->lh[src->resultLen].last = NULL;
				1214
				1215	src->lh[src->resultLen].reset = sourceToken;
				1216
				1217	/*
				1218	3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
				1219	First convert all expansions into normal form. Examples:
				1220	If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
				1221	d * ... into &x * c/y * d * ...
				1222	Note: reset values can never have expansions, although they can cause the
				1223	very next item to have one. They may be contractions, if they are found
				1224	earlier in the list.
				1225	*/
				1226	*expandNext = 0;
				1227	if(expand != NULL) {
				1228	/* check to see if there is an expansion */
				1229	if(src->parsedToken.charsLen > 1) {
				1230	uint32_t resetCharsOffset;
				1231	resetCharsOffset = (uint32_t)(expand - src->source);
				1232	sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) \| src->parsedToken.charsOffset;
				1233	*expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) \| (resetCharsOffset);
				1234	}
				1235	}
				1236
				1237	src->resultLen++;
				1238
				1239	uhash_put(src->tailored, sourceToken, sourceToken, status);
				1240
				1241	return sourceToken;
				1242	}
				1243
				1244	static
				1245	inline UColToken getVirginBefore(UColTokenParser src, UColToken sourceToken, uint8_t strength, UParseError parseError, UErrorCode *status) {
				1246	if(U_FAILURE(*status)) {
				1247	return NULL;
				1248	}
				1249	/* this is a virgin before - we need to fish the anchor from the UCA */
				1250	collIterate s;
				1251	uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
				1252	uint32_t CE, SecondCE;
				1253	uint32_t invPos;
				1254	if(sourceToken != NULL) {
				1255	uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
				1256	} else {
				1257	uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /*charsOffset/, 1, &s, status);
				1258	}
				1259	if(U_FAILURE(*status)) {
				1260	return NULL;
				1261	}
				1262
				1263	baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
				1264	baseContCE = ucol_getNextCE(src->UCA, &s, status);
				1265	if(baseContCE == UCOL_NO_MORE_CES) {
				1266	baseContCE = 0;
				1267	}
				1268
				1269
				1270	UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
				1271	uint32_t ch = 0;
				1272	uint32_t expandNext = 0;
				1273	UColToken key;
				1274
				1275	if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
				1276	uint32_t primary = baseCE & UCOL_PRIMARYMASK \| (baseContCE & UCOL_PRIMARYMASK) >> 16;
				1277	uint32_t raw = uprv_uca_getRawFromImplicit(primary);
				1278	ch = uprv_uca_getCodePointFromRaw(raw-1);
				1279	uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
				1280	CE = primaryCE & UCOL_PRIMARYMASK \| 0x0505;
				1281	SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK \| UCOL_CONTINUATION_MARKER;
				1282
				1283	src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
				1284	*src->extraCurrent++ = 0xFFFE;
				1285	*src->extraCurrent++ = (UChar)ch;
				1286	src->parsedToken.charsLen++;
				1287
				1288	key.source = (src->parsedToken.charsLen/*newCharsLen/ << 24) \| src->parsedToken.charsOffset/*charsOffset/;
				1289	key.rulesToParse = src->source;
				1290
				1291	//sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
				1292	sourceToken = (UColToken *)uhash_get(src->tailored, &key);
				1293
				1294	if(sourceToken == NULL) {
				1295	src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
				1296	if(isContinuation(SecondCE)) {
				1297	src->lh[src->resultLen].baseContCE = SecondCE;
				1298	} else {
				1299	src->lh[src->resultLen].baseContCE = 0;
				1300	}
				1301	src->lh[src->resultLen].nextCE = 0;
				1302	src->lh[src->resultLen].nextContCE = 0;
				1303	src->lh[src->resultLen].previousCE = 0;
				1304	src->lh[src->resultLen].previousContCE = 0;
				1305
				1306	src->lh[src->resultLen].indirect = FALSE;
				1307
				1308	sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
				1309	}
				1310
				1311	} else {
				1312	invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
				1313
				1314	// we got the previous CE. Now we need to see if the difference between
				1315	// the two CEs is really of the requested strength.
				1316	// if it's a bigger difference (we asked for secondary and got primary), we
				1317	// need to modify the CE.
				1318	if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
				1319	// adjust the strength
				1320	// now we are in the situation where our baseCE should actually be modified in
				1321	// order to get the CE in the right position.
				1322	if(strength == UCOL_SECONDARY) {
				1323	CE = baseCE - 0x0200;
				1324	} else { // strength == UCOL_TERTIARY
				1325	CE = baseCE - 0x02;
				1326	}
				1327	if(baseContCE) {
				1328	if(strength == UCOL_SECONDARY) {
				1329	SecondCE = baseContCE - 0x0200;
				1330	} else { // strength == UCOL_TERTIARY
				1331	SecondCE = baseContCE - 0x02;
				1332	}
				1333	}
				1334	}
				1335
				1336	#if 0
				1337	// the code below relies on getting a code point from the inverse table, in order to be
				1338	// able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
				1339	// 1. There are many code points that have the same CE
				1340	// 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
				1341	// Also, in case when there is no equivalent strength before an element, we have to actually
				1342	// construct one. For example, &[before 2]a << x won't result in x << a, because the element
				1343	// before a is a primary difference.
				1344
				1345	//uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->table);
				1346
				1347
				1348	ch = CETable[3*invPos+2];
				1349
				1350	if((ch & UCOL_INV_SIZEMASK) != 0) {
				1351	uint16_t conts = (uint16_t )((uint8_t *)src->invUCA+src->invUCA->conts);
				1352	uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
				1353	ch = conts[offset];
				1354	}
				1355
				1356	*src->extraCurrent++ = (UChar)ch;
				1357	src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
				1358	src->parsedToken.charsLen = 1;
				1359
				1360	// We got an UCA before. However, this might have been tailored.
				1361	// example:
				1362	// &\u30ca = \u306a
				1363	// &[before 3]\u306a<<<\u306a\|\u309d
				1364
				1365
				1366	// uint32_t key = (newCharsLen << 24) \| charsOffset;
				1367	key.source = (src->parsedToken.charsLen/*newCharsLen/ << 24) \| src->parsedToken.charsOffset/*charsOffset/;
				1368	key.rulesToParse = src->source;
				1369
				1370	//sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
				1371	sourceToken = (UColToken *)uhash_get(src->tailored, &key);
				1372	#endif
				1373
				1374	// here is how it should be. The situation such as &[before 1]a < x, should be
				1375	// resolved exactly as if we wrote &a > x.
				1376	// therefore, I don't really care if the UCA value before a has been changed.
				1377	// However, I do care if the strength between my element and the previous element
				1378	// is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
				1379	// have to construct the base CE.
				1380
				1381
				1382
				1383	// if we found a tailored thing, we have to use the UCA value and construct
				1384	// a new reset token with constructed name
				1385	//if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
				1386	// character to which we want to anchor is already tailored.
				1387	// We need to construct a new token which will be the anchor
				1388	// point
				1389	//*(src->extraCurrent-1) = 0xFFFE;
				1390	//*src->extraCurrent++ = (UChar)ch;
				1391	// grab before
				1392	src->parsedToken.charsOffset -= 10;
				1393	src->parsedToken.charsLen += 10;
				1394	src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
				1395	if(isContinuation(SecondCE)) {
				1396	src->lh[src->resultLen].baseContCE = SecondCE;
				1397	} else {
				1398	src->lh[src->resultLen].baseContCE = 0;
				1399	}
				1400	src->lh[src->resultLen].nextCE = 0;
				1401	src->lh[src->resultLen].nextContCE = 0;
				1402	src->lh[src->resultLen].previousCE = 0;
				1403	src->lh[src->resultLen].previousContCE = 0;
				1404
				1405	src->lh[src->resultLen].indirect = FALSE;
				1406
				1407	sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
				1408	//}
				1409	}
				1410
				1411	return sourceToken;
				1412
				1413	}
				1414
				1415	uint32_t ucol_tok_assembleTokenList(UColTokenParser src, UParseError parseError, UErrorCode *status) {
				1416	UColToken *lastToken = NULL;
				1417	const UChar *parseEnd = NULL;
				1418	uint32_t expandNext = 0;
				1419	UBool variableTop = FALSE;
				1420	UBool top = FALSE;
				1421	uint16_t specs = 0;
				1422	UColTokListHeader *ListList = NULL;
				1423
				1424	src->parsedToken.strength = UCOL_TOK_UNSET;
				1425
				1426	ListList = src->lh;
				1427
				1428	if(U_FAILURE(*status)) {
				1429	return 0;
				1430	}
				1431
				1432	while(src->current < src->end) {
				1433	src->parsedToken.prefixOffset = 0;
				1434
				1435	parseEnd = ucol_tok_parseNextToken(src,
				1436	(UBool)(lastToken == NULL),
				1437	parseError,
				1438	status);
				1439
				1440	specs = src->parsedToken.flags;
				1441
				1442
				1443	variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
				1444	top = ((specs & UCOL_TOK_TOP) != 0);
				1445
				1446	if(U_SUCCESS(*status) && parseEnd != NULL) {
				1447	UColToken *sourceToken = NULL;
				1448	//uint32_t key = 0;
				1449	uint32_t lastStrength = UCOL_TOK_UNSET;
				1450
				1451	if(lastToken != NULL ) {
				1452	lastStrength = lastToken->strength;
				1453	}
				1454
				1455	//key = newCharsLen << 24 \| charsOffset;
				1456	UColToken key;
				1457	key.source = src->parsedToken.charsLen << 24 \| src->parsedToken.charsOffset;
				1458	key.rulesToParse = src->source;
				1459
				1460	/* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
				1461	sourceToken = (UColToken *)uhash_get(src->tailored, &key);
				1462
				1463	if(src->parsedToken.strength != UCOL_TOK_RESET) {
				1464	if(lastToken == NULL) { /* this means that rules haven't started properly */
				1465	*status = U_INVALID_FORMAT_ERROR;
				1466	syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
				1467	return 0;
				1468	}
				1469	/* 6 Otherwise (when relation != reset) */
				1470	if(sourceToken == NULL) {
				1471	/* If sourceToken is null, create new one, */
				1472	sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
				1473	/* test for NULL */
				1474	if (sourceToken == NULL) {
				1475	*status = U_MEMORY_ALLOCATION_ERROR;
				1476	return 0;
				1477	}
				1478	sourceToken->rulesToParse = src->source;
				1479	sourceToken->source = src->parsedToken.charsLen << 24 \| src->parsedToken.charsOffset;
				1480
				1481	sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
				1482
				1483	sourceToken->prefix = src->parsedToken.prefixLen << 24 \| src->parsedToken.prefixOffset;
				1484	sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
				1485
				1486	sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
				1487	sourceToken->next = NULL;
				1488	sourceToken->previous = NULL;
				1489	sourceToken->noOfCEs = 0;
				1490	sourceToken->noOfExpCEs = 0;
				1491	// keep the flags around so that we know about before
				1492	sourceToken->flags = src->parsedToken.flags;
				1493	uhash_put(src->tailored, sourceToken, sourceToken, status);
				1494	if(U_FAILURE(*status)) {
				1495	return 0;
				1496	}
				1497	} else {
				1498	/* we could have fished out a reset here */
				1499	if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
				1500	/* otherwise remove sourceToken from where it was. */
				1501	if(sourceToken->next != NULL) {
				1502	if(sourceToken->next->strength > sourceToken->strength) {
				1503	sourceToken->next->strength = sourceToken->strength;
				1504	}
				1505	sourceToken->next->previous = sourceToken->previous;
				1506	} else {
				1507	sourceToken->listHeader->last = sourceToken->previous;
				1508	}
				1509
				1510	if(sourceToken->previous != NULL) {
				1511	sourceToken->previous->next = sourceToken->next;
				1512	} else {
				1513	sourceToken->listHeader->first = sourceToken->next;
				1514	}
				1515	sourceToken->next = NULL;
				1516	sourceToken->previous = NULL;
				1517	}
				1518	}
				1519
				1520	sourceToken->strength = src->parsedToken.strength;
				1521	sourceToken->listHeader = lastToken->listHeader;
				1522
				1523	/*
				1524	1. Find the strongest strength in each list, and set strongestP and strongestN
				1525	accordingly in the headers.
				1526	*/
				1527	if(lastStrength == UCOL_TOK_RESET
				1528	\|\| sourceToken->listHeader->first == 0) {
				1529	/* If LAST is a reset
				1530	insert sourceToken in the list. */
				1531	if(sourceToken->listHeader->first == 0) {
				1532	sourceToken->listHeader->first = sourceToken;
				1533	sourceToken->listHeader->last = sourceToken;
				1534	} else { /* we need to find a place for us */
				1535	/* and we'll get in front of the same strength */
				1536	if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
				1537	sourceToken->next = sourceToken->listHeader->first;
				1538	sourceToken->next->previous = sourceToken;
				1539	sourceToken->listHeader->first = sourceToken;
				1540	sourceToken->previous = NULL;
				1541	} else {
				1542	lastToken = sourceToken->listHeader->first;
				1543	while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
				1544	lastToken = lastToken->next;
				1545	}
				1546	if(lastToken->next != NULL) {
				1547	lastToken->next->previous = sourceToken;
				1548	} else {
				1549	sourceToken->listHeader->last = sourceToken;
				1550	}
				1551	sourceToken->previous = lastToken;
				1552	sourceToken->next = lastToken->next;
				1553	lastToken->next = sourceToken;
				1554	}
				1555	}
				1556	} else {
				1557	/* Otherwise (when LAST is not a reset)
				1558	if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
				1559	otherwise insert before.
				1560	when inserting after or before, search to the next position with the same
				1561	strength in that direction. (This is called postpone insertion). */
				1562	if(sourceToken != lastToken) {
				1563	if(lastToken->polarity == sourceToken->polarity) {
				1564	while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
				1565	lastToken = lastToken->next;
				1566	}
				1567	sourceToken->previous = lastToken;
				1568	if(lastToken->next != NULL) {
				1569	lastToken->next->previous = sourceToken;
				1570	} else {
				1571	sourceToken->listHeader->last = sourceToken;
				1572	}
				1573
				1574	sourceToken->next = lastToken->next;
				1575	lastToken->next = sourceToken;
				1576	} else {
				1577	while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
				1578	lastToken = lastToken->previous;
				1579	}
				1580	sourceToken->next = lastToken;
				1581	if(lastToken->previous != NULL) {
				1582	lastToken->previous->next = sourceToken;
				1583	} else {
				1584	sourceToken->listHeader->first = sourceToken;
				1585	}
				1586	sourceToken->previous = lastToken->previous;
				1587	lastToken->previous = sourceToken;
				1588	}
				1589	} else { /* repeated one thing twice in rules, stay with the stronger strength */
				1590	if(lastStrength < sourceToken->strength) {
				1591	sourceToken->strength = lastStrength;
				1592	}
				1593	}
				1594	}
				1595
				1596	/* if the token was a variable top, we're gonna put it in */
				1597	if(variableTop == TRUE && src->varTop == NULL) {
				1598	variableTop = FALSE;
				1599	src->varTop = sourceToken;
				1600	}
				1601
				1602	// Treat the expansions.
				1603	// There are two types of expansions: explicit (x / y) and reset based propagating expansions
				1604	// (&abc * d * e <=> &ab * d / c * e / c)
				1605	// if both of them are in effect for a token, they are combined.
				1606
				1607	sourceToken->expansion = src->parsedToken.extensionLen << 24 \| src->parsedToken.extensionOffset;
				1608
				1609	if(expandNext != 0) {
				1610	if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
				1611	expandNext = 0;
				1612	} else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
				1613	sourceToken->expansion = expandNext;
				1614	} else { /* there is both explicit and implicit expansion. We need to make a combination */
				1615	uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
				1616	uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
				1617	sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 \| (uint32_t)(src->extraCurrent - src->source));
				1618	src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
				1619	}
				1620	}
				1621
				1622	// This is just for debugging purposes
				1623	if(sourceToken->expansion != 0) {
				1624	sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
				1625	} else {
				1626	sourceToken->debugExpansion = 0;
				1627	}
				1628	// if the previous token was a reset before, the strength of this
				1629	// token must match the strength of before. Otherwise we have an
				1630	// undefined situation.
				1631	// In other words, we currently have a cludge which we use to
				1632	// represent &a >> x. This is written as &[before 2]a << x.
				1633	if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
				1634	uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
				1635	if(beforeStrength != sourceToken->strength) {
				1636	*status = U_INVALID_FORMAT_ERROR;
				1637	syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
				1638	return 0;
				1639	}
				1640	}
				1641	} else {
				1642	if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
				1643	/* if the previous token was also a reset, */
				1644	/this means that we have two consecutive resets /
				1645	/* and we want to remove the previous one if empty*/
				1646	if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
				1647	src->resultLen--;
				1648	}
				1649	}
				1650
				1651	if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
				1652	uint32_t searchCharsLen = src->parsedToken.charsLen;
				1653	while(searchCharsLen > 1 && sourceToken == NULL) {
				1654	searchCharsLen--;
				1655	//key = searchCharsLen << 24 \| charsOffset;
				1656	UColToken key;
				1657	key.source = searchCharsLen << 24 \| src->parsedToken.charsOffset;
				1658	key.rulesToParse = src->source;
				1659	sourceToken = (UColToken *)uhash_get(src->tailored, &key);
				1660	}
				1661	if(sourceToken != NULL) {
				1662	expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 \| (src->parsedToken.charsOffset + searchCharsLen);
				1663	}
				1664	}
				1665
				1666	if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
				1667	if(top == FALSE) { /* there is no indirection */
				1668	uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
				1669	if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
				1670	/* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
				1671	while(sourceToken->strength > strength && sourceToken->previous != NULL) {
				1672	sourceToken = sourceToken->previous;
				1673	}
				1674	/* here, either we hit the strength or NULL */
				1675	if(sourceToken->strength == strength) {
				1676	if(sourceToken->previous != NULL) {
				1677	sourceToken = sourceToken->previous;
				1678	} else { /* start of list */
				1679	sourceToken = sourceToken->listHeader->reset;
				1680	}
				1681	} else { /* we hit NULL */
				1682	/* we should be doing the else part */
				1683	sourceToken = sourceToken->listHeader->reset;
				1684	sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
				1685	}
				1686	} else {
				1687	sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
				1688	}
				1689	} else { /* this is both before and indirection */
				1690	top = FALSE;
				1691	ListList[src->resultLen].previousCE = 0;
				1692	ListList[src->resultLen].previousContCE = 0;
				1693	ListList[src->resultLen].indirect = TRUE;
				1694	/* we need to do slightly more work. we need to get the baseCE using the */
				1695	/* inverse UCA & getPrevious. The next bound is not set, and will be decided */
				1696	/* in ucol_bld */
				1697	uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
				1698	uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
				1699	uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
				1700	uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
				1701
				1702	UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
				1703	if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
				1704	uint32_t primary = baseCE & UCOL_PRIMARYMASK \| (baseContCE & UCOL_PRIMARYMASK) >> 16;
				1705	uint32_t raw = uprv_uca_getRawFromImplicit(primary);
				1706	uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
				1707	CE = primaryCE & UCOL_PRIMARYMASK \| 0x0505;
				1708	SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK \| UCOL_CONTINUATION_MARKER;
				1709	} else {
				1710	/*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
				1711	ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
				1712	}
				1713
				1714	ListList[src->resultLen].baseCE = CE;
				1715	ListList[src->resultLen].baseContCE = SecondCE;
				1716	ListList[src->resultLen].nextCE = 0;
				1717	ListList[src->resultLen].nextContCE = 0;
				1718
				1719	sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
				1720	}
				1721	}
				1722
				1723
				1724	/* 5 If the relation is a reset:
				1725	If sourceToken is null
				1726	Create new list, create new sourceToken, make the baseCE from source, put
				1727	the sourceToken in ListHeader of the new list */
				1728	if(sourceToken == NULL) {
				1729	/*
				1730	3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
				1731	First convert all expansions into normal form. Examples:
				1732	If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
				1733	d * ... into &x * c/y * d * ...
				1734	Note: reset values can never have expansions, although they can cause the
				1735	very next item to have one. They may be contractions, if they are found
				1736	earlier in the list.
				1737	*/
				1738	if(top == FALSE) {
				1739	collIterate s;
				1740	uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
				1741
				1742	uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
				1743
				1744	CE = ucol_getNextCE(src->UCA, &s, status);
				1745	const UChar *expand = s.pos;
				1746	SecondCE = ucol_getNextCE(src->UCA, &s, status);
				1747
				1748	ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
				1749	if(isContinuation(SecondCE)) {
				1750	ListList[src->resultLen].baseContCE = SecondCE;
				1751	} else {
				1752	ListList[src->resultLen].baseContCE = 0;
				1753	}
				1754	ListList[src->resultLen].nextCE = 0;
				1755	ListList[src->resultLen].nextContCE = 0;
				1756	ListList[src->resultLen].previousCE = 0;
				1757	ListList[src->resultLen].previousContCE = 0;
				1758	ListList[src->resultLen].indirect = FALSE;
				1759	sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
				1760	} else { /* top == TRUE */
				1761	/* just use the supplied values */
				1762	top = FALSE;
				1763	ListList[src->resultLen].previousCE = 0;
				1764	ListList[src->resultLen].previousContCE = 0;
				1765	ListList[src->resultLen].indirect = TRUE;
				1766	ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
				1767	ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
				1768	ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
				1769	ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
				1770
				1771	sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
				1772
				1773	}
				1774	} else { /* reset to something already in rules */
				1775	top = FALSE;
				1776	}
				1777	}
				1778	/* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
				1779	lastToken = sourceToken;
				1780	} else {
				1781	if(U_FAILURE(*status)) {
				1782	return 0;
				1783	}
				1784	}
				1785	}
				1786
				1787	if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
				1788	src->resultLen--;
				1789	}
				1790	return src->resultLen;
				1791	}
				1792
				1793	void ucol_tok_initTokenList(UColTokenParser src, const UChar rules, const uint32_t rulesLength, const UCollator UCA, UErrorCode status) {
				1794	U_NAMESPACE_USE
				1795
				1796	uint32_t nSize = 0;
				1797	uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
				1798	if(U_FAILURE(*status)) {
				1799	return;
				1800	}
				1801
				1802	// set everything to zero, so that we can clean up gracefully
				1803	uprv_memset(src, 0, sizeof(UColTokenParser));
				1804
				1805	// first we need to find options that don't like to be normalized,
				1806	// like copy and remove...
				1807	//const UChar *openBrace = rules;
				1808	int32_t optionNumber = -1;
				1809	const UChar *setStart = NULL;
				1810	uint32_t i = 0;
				1811	while(i < rulesLength) {
				1812	if(rules[i] == 0x005B) {
				1813	// while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
				1814	//optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
				1815	optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
				1816	if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
				1817	USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
				1818	if(U_SUCCESS(*status)) {
				1819	if(src->copySet == NULL) {
				1820	src->copySet = newSet;
				1821	} else {
				1822	uset_addAll(src->copySet, newSet);
				1823	uset_close(newSet);
				1824	}
				1825	} else {
				1826	return;
				1827	}
				1828	} else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
				1829	USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
				1830	if(U_SUCCESS(*status)) {
				1831	if(src->removeSet == NULL) {
				1832	src->removeSet = newSet;
				1833	} else {
				1834	uset_addAll(src->removeSet, newSet);
				1835	uset_close(newSet);
				1836	}
				1837	} else {
				1838	return;
				1839	}
				1840	}
				1841	}
				1842	//openBrace++;
				1843	i++;
				1844	}
				1845
				1846	src->source = (UChar )uprv_malloc(estimatedSizesizeof(UChar));
				1847	/* test for NULL */
				1848	if (src->source == NULL) {
				1849	*status = U_MEMORY_ALLOCATION_ERROR;
				1850	return;
				1851	}
				1852	uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
				1853	nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
				1854	if(nSize > estimatedSize \|\| *status == U_BUFFER_OVERFLOW_ERROR) {
				1855	*status = U_ZERO_ERROR;
				1856	src->source = (UChar )uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)sizeof(UChar));
				1857	/* test for NULL */
				1858	if (src->source == NULL) {
				1859	*status = U_MEMORY_ALLOCATION_ERROR;
				1860	return;
				1861	}
				1862	nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
				1863	}
				1864	src->current = src->source;
				1865	src->end = src->source+nSize;
				1866	src->sourceCurrent = src->source;
				1867	src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
				1868	src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
				1869	src->varTop = NULL;
				1870	src->UCA = UCA;
				1871	src->invUCA = ucol_initInverseUCA(status);
				1872	src->parsedToken.charsLen = 0;
				1873	src->parsedToken.charsOffset = 0;
				1874	src->parsedToken.extensionLen = 0;
				1875	src->parsedToken.extensionOffset = 0;
				1876	src->parsedToken.prefixLen = 0;
				1877	src->parsedToken.prefixOffset = 0;
				1878	src->parsedToken.flags = 0;
				1879	src->parsedToken.strength = UCOL_TOK_UNSET;
				1880	src->buildCCTabFlag = FALSE;
				1881	src->prevStrength = UCOL_TOK_UNSET;
				1882
				1883	if(U_FAILURE(*status)) {
				1884	return;
				1885	}
				1886	src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
				1887	if(U_FAILURE(*status)) {
				1888	return;
				1889	}
				1890	uhash_setValueDeleter(src->tailored, uhash_freeBlock);
				1891
				1892	src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
				1893	/* test for NULL */
				1894	if (src->opts == NULL) {
				1895	*status = U_MEMORY_ALLOCATION_ERROR;
				1896	return;
				1897	}
				1898
				1899	uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
				1900
				1901	// rulesToParse = src->source;
				1902	src->lh = 0;
				1903	src->listCapacity = 1024;
				1904	src->lh = (UColTokListHeader )uprv_malloc(src->listCapacitysizeof(UColTokListHeader));
				1905	//Test for NULL
				1906	if (src->lh == NULL) {
				1907	*status = U_MEMORY_ALLOCATION_ERROR;
				1908	return;
				1909	}
				1910	uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
				1911	src->resultLen = 0;
				1912
				1913	UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
				1914
				1915	// UCOL_RESET_TOP_VALUE
				1916	setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
				1917	// UCOL_FIRST_PRIMARY_IGNORABLE
				1918	setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
				1919	// UCOL_LAST_PRIMARY_IGNORABLE
				1920	setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
				1921	// UCOL_FIRST_SECONDARY_IGNORABLE
				1922	setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
				1923	// UCOL_LAST_SECONDARY_IGNORABLE
				1924	setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
				1925	// UCOL_FIRST_TERTIARY_IGNORABLE
				1926	setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
				1927	// UCOL_LAST_TERTIARY_IGNORABLE
				1928	setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
				1929	// UCOL_FIRST_VARIABLE
				1930	setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
				1931	// UCOL_LAST_VARIABLE
				1932	setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
				1933	// UCOL_FIRST_NON_VARIABLE
				1934	setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
				1935	// UCOL_LAST_NON_VARIABLE
				1936	setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
				1937	// UCOL_FIRST_IMPLICIT
				1938	setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
				1939	// UCOL_LAST_IMPLICIT
				1940	setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
				1941	// UCOL_FIRST_TRAILING
				1942	setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
				1943	// UCOL_LAST_TRAILING
				1944	setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
				1945	ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
				1946	}
				1947
				1948
				1949	void ucol_tok_closeTokenList(UColTokenParser *src) {
				1950	if(src->copySet != NULL) {
				1951	uset_close(src->copySet);
				1952	}
				1953	if(src->removeSet != NULL) {
				1954	uset_close(src->removeSet);
				1955	}
				1956	if(src->tailored != NULL) {
				1957	uhash_close(src->tailored);
				1958	}
				1959	if(src->lh != NULL) {
				1960	uprv_free(src->lh);
				1961	}
				1962	if(src->source != NULL) {
				1963	uprv_free(src->source);
				1964	}
				1965	if(src->opts != NULL) {
				1966	uprv_free(src->opts);
				1967	}
				1968	}
				1969
				1970	#endif /* #if !UCONFIG_NO_COLLATION */