| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * file name: ucol.cpp |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * Modification history |
| * Date Name Comments |
| * 1996-1999 various members of ICU team maintained C API for collation framework |
| * 02/16/2001 synwee Added internal method getPrevSpecialCE |
| * 03/01/2001 synwee Added maxexpansion functionality. |
| * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/coleitr.h" |
| #include "unicode/unorm.h" |
| #include "unicode/udata.h" |
| #include "unicode/ustring.h" |
| |
| #include "ucol_imp.h" |
| #include "bocsu.h" |
| |
| #include "normalizer2impl.h" |
| #include "unorm_it.h" |
| #include "umutex.h" |
| #include "cmemory.h" |
| #include "ucln_in.h" |
| #include "cstring.h" |
| #include "utracimp.h" |
| #include "putilimp.h" |
| #include "uassert.h" |
| |
| #ifdef UCOL_DEBUG |
| #include <stdio.h> |
| #endif |
| |
| U_NAMESPACE_USE |
| |
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it always behaves as a single
 * operand, e.g. when it is used as the operand of sizeof.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Mask that keeps only the lowest byte of a value. */
#define LAST_BYTE_MASK_ 0xFF
/* Shift count that brings the second-lowest byte down to the lowest byte. */
#define SECOND_LAST_BYTE_SHIFT_ 8

/* NOTE(review): presumably the limit below which characters have a zero
 * combining class - confirm against the use sites. */
#define ZERO_CC_LIMIT_ 0xC0
| |
| // this is static pointer to the normalizer fcdTrieIndex |
| // it is always the same between calls to u_cleanup |
| // and therefore writing to it is not synchronized. |
| // It is cleaned in ucol_cleanup |
| static const uint16_t *fcdTrieIndex=NULL; |
| // Code points at fcdHighStart and above have a zero FCD value. |
| static UChar32 fcdHighStart = 0; |
| |
| // These are values from UCA required for |
| // implicit generation and supressing sort key compression |
| // they should regularly be in the UCA, but if one |
| // is running without UCA, it could be a problem |
| static const int32_t maxRegularPrimary = 0xA0; |
| static const int32_t minImplicitPrimary = 0xE0; |
| static const int32_t maxImplicitPrimary = 0xE4; |
| |
| U_CDECL_BEGIN |
| static UBool U_CALLCONV |
| ucol_cleanup(void) |
| { |
| fcdTrieIndex = NULL; |
| return TRUE; |
| } |
| |
| static int32_t U_CALLCONV |
| _getFoldingOffset(uint32_t data) { |
| return (int32_t)(data&0xFFFFFF); |
| } |
| |
| U_CDECL_END |
| |
| static |
| inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, |
| int32_t sourceLen, collIterate *s, |
| UErrorCode *status) |
| { |
| (s)->string = (s)->pos = sourceString; |
| (s)->origFlags = 0; |
| (s)->flags = 0; |
| if (sourceLen >= 0) { |
| s->flags |= UCOL_ITER_HASLEN; |
| (s)->endp = (UChar *)sourceString+sourceLen; |
| } |
| else { |
| /* change to enable easier checking for end of string for fcdpositon */ |
| (s)->endp = NULL; |
| } |
| (s)->extendCEs = NULL; |
| (s)->extendCEsSize = 0; |
| (s)->CEpos = (s)->toReturn = (s)->CEs; |
| (s)->offsetBuffer = NULL; |
| (s)->offsetBufferSize = 0; |
| (s)->offsetReturn = (s)->offsetStore = NULL; |
| (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; |
| (s)->coll = (collator); |
| (s)->nfd = Normalizer2Factory::getNFDInstance(*status); |
| (s)->fcdPosition = 0; |
| if(collator->normalizationMode == UCOL_ON) { |
| (s)->flags |= UCOL_ITER_NORM; |
| } |
| if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { |
| (s)->flags |= UCOL_HIRAGANA_Q; |
| } |
| (s)->iterator = NULL; |
| //(s)->iteratorIndex = 0; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, |
| int32_t sourceLen, collIterate *s, |
| UErrorCode *status) { |
| /* Out-of-line version for use from other files. */ |
| IInit_collIterate(collator, sourceString, sourceLen, s, status); |
| } |
| |
| U_CAPI collIterate * U_EXPORT2 |
| uprv_new_collIterate(UErrorCode *status) { |
| if(U_FAILURE(*status)) { |
| return NULL; |
| } |
| collIterate *s = new collIterate; |
| if(s == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| return s; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| uprv_delete_collIterate(collIterate *s) { |
| delete s; |
| } |
| |
| U_CAPI UBool U_EXPORT2 |
| uprv_collIterateAtEnd(collIterate *s) { |
| return s == NULL || s->pos == s->endp; |
| } |
| |
| /** |
| * Backup the state of the collIterate struct data |
| * @param data collIterate to backup |
| * @param backup storage |
| */ |
| static |
| inline void backupState(const collIterate *data, collIterateState *backup) |
| { |
| backup->fcdPosition = data->fcdPosition; |
| backup->flags = data->flags; |
| backup->origFlags = data->origFlags; |
| backup->pos = data->pos; |
| backup->bufferaddress = data->writableBuffer.getBuffer(); |
| backup->buffersize = data->writableBuffer.length(); |
| backup->iteratorMove = 0; |
| backup->iteratorIndex = 0; |
| if(data->iterator != NULL) { |
| //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); |
| backup->iteratorIndex = data->iterator->getState(data->iterator); |
| // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE |
| if(backup->iteratorIndex == UITER_NO_STATE) { |
| while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { |
| backup->iteratorMove++; |
| data->iterator->move(data->iterator, -1, UITER_CURRENT); |
| } |
| data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); |
| } |
| } |
| } |
| |
| /** |
| * Loads the state into the collIterate struct data |
| * @param data collIterate to backup |
| * @param backup storage |
| * @param forwards boolean to indicate if forwards iteration is used, |
| * false indicates backwards iteration |
| */ |
| static |
| inline void loadState(collIterate *data, const collIterateState *backup, |
| UBool forwards) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| data->flags = backup->flags; |
| data->origFlags = backup->origFlags; |
| if(data->iterator != NULL) { |
| //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); |
| data->iterator->setState(data->iterator, backup->iteratorIndex, &status); |
| if(backup->iteratorMove != 0) { |
| data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); |
| } |
| } |
| data->pos = backup->pos; |
| |
| if ((data->flags & UCOL_ITER_INNORMBUF) && |
| data->writableBuffer.getBuffer() != backup->bufferaddress) { |
| /* |
| this is when a new buffer has been reallocated and we'll have to |
| calculate the new position. |
| note the new buffer has to contain the contents of the old buffer. |
| */ |
| if (forwards) { |
| data->pos = data->writableBuffer.getTerminatedBuffer() + |
| (data->pos - backup->bufferaddress); |
| } |
| else { |
| /* backwards direction */ |
| int32_t temp = backup->buffersize - |
| (int32_t)(data->pos - backup->bufferaddress); |
| data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); |
| } |
| } |
| if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| /* |
| this is alittle tricky. |
| if we are initially not in the normalization buffer, even if we |
| normalize in the later stage, the data in the buffer will be |
| ignored, since we skip back up to the data string. |
| however if we are already in the normalization buffer, any |
| further normalization will pull data into the normalization |
| buffer and modify the fcdPosition. |
| since we are keeping the data in the buffer for use, the |
| fcdPosition can not be reverted back. |
| arrgghh.... |
| */ |
| data->fcdPosition = backup->fcdPosition; |
| } |
| } |
| |
| static UBool |
| reallocCEs(collIterate *data, int32_t newCapacity) { |
| uint32_t *oldCEs = data->extendCEs; |
| if(oldCEs == NULL) { |
| oldCEs = data->CEs; |
| } |
| int32_t length = data->CEpos - oldCEs; |
| uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); |
| if(newCEs == NULL) { |
| return FALSE; |
| } |
| uprv_memcpy(newCEs, oldCEs, length * 4); |
| uprv_free(data->extendCEs); |
| data->extendCEs = newCEs; |
| data->extendCEsSize = newCapacity; |
| data->CEpos = newCEs + length; |
| return TRUE; |
| } |
| |
| static UBool |
| increaseCEsCapacity(collIterate *data) { |
| int32_t oldCapacity; |
| if(data->extendCEs != NULL) { |
| oldCapacity = data->extendCEsSize; |
| } else { |
| oldCapacity = LENGTHOF(data->CEs); |
| } |
| return reallocCEs(data, 2 * oldCapacity); |
| } |
| |
| static UBool |
| ensureCEsCapacity(collIterate *data, int32_t minCapacity) { |
| int32_t oldCapacity; |
| if(data->extendCEs != NULL) { |
| oldCapacity = data->extendCEsSize; |
| } else { |
| oldCapacity = LENGTHOF(data->CEs); |
| } |
| if(minCapacity <= oldCapacity) { |
| return TRUE; |
| } |
| oldCapacity *= 2; |
| return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); |
| } |
| |
| /* |
| * collIter_eos() |
| * Checks for a collIterate being positioned at the end of |
| * its source string. |
| * |
| */ |
| static |
| inline UBool collIter_eos(collIterate *s) { |
| if(s->flags & UCOL_USE_ITERATOR) { |
| return !(s->iterator->hasNext(s->iterator)); |
| } |
| if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { |
| // Null terminated string, but not at null, so not at end. |
| // Whether in main or normalization buffer doesn't matter. |
| return FALSE; |
| } |
| |
| // String with length. Can't be in normalization buffer, which is always |
| // null termintated. |
| if (s->flags & UCOL_ITER_HASLEN) { |
| return (s->pos == s->endp); |
| } |
| |
| // We are at a null termination, could be either normalization buffer or main string. |
| if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { |
| // At null at end of main string. |
| return TRUE; |
| } |
| |
| // At null at end of normalization buffer. Need to check whether there there are |
| // any characters left in the main buffer. |
| if(s->origFlags & UCOL_USE_ITERATOR) { |
| return !(s->iterator->hasNext(s->iterator)); |
| } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { |
| // Null terminated main string. fcdPosition is the 'return' position into main buf. |
| return (*s->fcdPosition == 0); |
| } |
| else { |
| // Main string with an end pointer. |
| return s->fcdPosition == s->endp; |
| } |
| } |
| |
| /* |
| * collIter_bos() |
| * Checks for a collIterate being positioned at the start of |
| * its source string. |
| * |
| */ |
| static |
| inline UBool collIter_bos(collIterate *source) { |
| // if we're going backwards, we need to know whether there is more in the |
| // iterator, even if we are in the side buffer |
| if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { |
| return !source->iterator->hasPrevious(source->iterator); |
| } |
| if (source->pos <= source->string || |
| ((source->flags & UCOL_ITER_INNORMBUF) && |
| *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| /*static |
| inline UBool collIter_SimpleBos(collIterate *source) { |
| // if we're going backwards, we need to know whether there is more in the |
| // iterator, even if we are in the side buffer |
| if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { |
| return !source->iterator->hasPrevious(source->iterator); |
| } |
| if (source->pos == source->string) { |
| return TRUE; |
| } |
| return FALSE; |
| }*/ |
| //return (data->pos == data->string) || |
| |
| |
| /****************************************************************************/ |
| /* Following are the open/close functions */ |
| /* */ |
| /****************************************************************************/ |
| |
| static UCollator* |
| ucol_initFromBinary(const uint8_t *bin, int32_t length, |
| const UCollator *base, |
| UCollator *fillIn, |
| UErrorCode *status) |
| { |
| UCollator *result = fillIn; |
| if(U_FAILURE(*status)) { |
| return NULL; |
| } |
| /* |
| if(base == NULL) { |
| // we don't support null base yet |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| */ |
| // We need these and we could be running without UCA |
| uprv_uca_initImplicitConstants(status); |
| UCATableHeader *colData = (UCATableHeader *)bin; |
| // do we want version check here? We're trying to figure out whether collators are compatible |
| if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || |
| uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || |
| colData->version[0] != UCOL_BUILDER_VERSION) |
| { |
| *status = U_COLLATOR_VERSION_MISMATCH; |
| return NULL; |
| } |
| else { |
| if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { |
| result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); |
| if(U_FAILURE(*status)){ |
| return NULL; |
| } |
| result->hasRealData = TRUE; |
| } |
| else { |
| if(base) { |
| result = ucol_initCollator(base->image, result, base, status); |
| ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); |
| if(U_FAILURE(*status)){ |
| return NULL; |
| } |
| result->hasRealData = FALSE; |
| } |
| else { |
| *status = U_USELESS_COLLATOR_ERROR; |
| return NULL; |
| } |
| } |
| result->freeImageOnClose = FALSE; |
| } |
| result->actualLocale = NULL; |
| result->validLocale = NULL; |
| result->requestedLocale = NULL; |
| result->rules = NULL; |
| result->rulesLength = 0; |
| result->freeRulesOnClose = FALSE; |
| result->ucaRules = NULL; |
| return result; |
| } |
| |
| U_CAPI UCollator* U_EXPORT2 |
| ucol_openBinary(const uint8_t *bin, int32_t length, |
| const UCollator *base, |
| UErrorCode *status) |
| { |
| return ucol_initFromBinary(bin, length, base, NULL, status); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_cloneBinary(const UCollator *coll, |
| uint8_t *buffer, int32_t capacity, |
| UErrorCode *status) |
| { |
| int32_t length = 0; |
| if(U_FAILURE(*status)) { |
| return length; |
| } |
| if(capacity < 0) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return length; |
| } |
| if(coll->hasRealData == TRUE) { |
| length = coll->image->size; |
| if(length <= capacity) { |
| uprv_memcpy(buffer, coll->image, length); |
| } else { |
| *status = U_BUFFER_OVERFLOW_ERROR; |
| } |
| } else { |
| length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); |
| if(length <= capacity) { |
| /* build the UCATableHeader with minimal entries */ |
| /* do not copy the header from the UCA file because its values are wrong! */ |
| /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ |
| |
| /* reset everything */ |
| uprv_memset(buffer, 0, length); |
| |
| /* set the tailoring-specific values */ |
| UCATableHeader *myData = (UCATableHeader *)buffer; |
| myData->size = length; |
| |
| /* offset for the options, the only part of the data that is present after the header */ |
| myData->options = sizeof(UCATableHeader); |
| |
| /* need to always set the expansion value for an upper bound of the options */ |
| myData->expansion = myData->options + sizeof(UColOptionSet); |
| |
| myData->magic = UCOL_HEADER_MAGIC; |
| myData->isBigEndian = U_IS_BIG_ENDIAN; |
| myData->charSetFamily = U_CHARSET_FAMILY; |
| |
| /* copy UCA's version; genrb will override all but the builder version with tailoring data */ |
| uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); |
| |
| uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); |
| uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); |
| uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); |
| myData->jamoSpecial = coll->image->jamoSpecial; |
| |
| /* copy the collator options */ |
| uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); |
| } else { |
| *status = U_BUFFER_OVERFLOW_ERROR; |
| } |
| } |
| return length; |
| } |
| |
| U_CAPI UCollator* U_EXPORT2 |
| ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) |
| { |
| UCollator * localCollator; |
| int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); |
| char *stackBufferChars = (char *)stackBuffer; |
| int32_t imageSize = 0; |
| int32_t rulesSize = 0; |
| int32_t rulesPadding = 0; |
| uint8_t *image; |
| UChar *rules; |
| UBool colAllocated = FALSE; |
| UBool imageAllocated = FALSE; |
| |
| if (status == NULL || U_FAILURE(*status)){ |
| return 0; |
| } |
| if ((stackBuffer && !pBufferSize) || !coll){ |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| if (coll->rules && coll->freeRulesOnClose) { |
| rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); |
| rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); |
| bufferSizeNeeded += rulesSize + rulesPadding; |
| } |
| |
| if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ |
| *pBufferSize = bufferSizeNeeded; |
| return 0; |
| } |
| |
| /* Pointers on 64-bit platforms need to be aligned |
| * on a 64-bit boundry in memory. |
| */ |
| if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { |
| int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); |
| if (*pBufferSize > offsetUp) { |
| *pBufferSize -= offsetUp; |
| stackBufferChars += offsetUp; |
| } |
| else { |
| /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ |
| *pBufferSize = 1; |
| } |
| } |
| stackBuffer = (void *)stackBufferChars; |
| |
| if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { |
| /* allocate one here...*/ |
| stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); |
| // Null pointer check. |
| if (stackBufferChars == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| colAllocated = TRUE; |
| if (U_SUCCESS(*status)) { |
| *status = U_SAFECLONE_ALLOCATED_WARNING; |
| } |
| } |
| localCollator = (UCollator *)stackBufferChars; |
| rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); |
| { |
| UErrorCode tempStatus = U_ZERO_ERROR; |
| imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); |
| } |
| if (coll->freeImageOnClose) { |
| image = (uint8_t *)uprv_malloc(imageSize); |
| // Null pointer check |
| if (image == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| ucol_cloneBinary(coll, image, imageSize, status); |
| imageAllocated = TRUE; |
| } |
| else { |
| image = (uint8_t *)coll->image; |
| } |
| localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| |
| if (coll->rules) { |
| if (coll->freeRulesOnClose) { |
| localCollator->rules = u_strcpy(rules, coll->rules); |
| //bufferEnd += rulesSize; |
| } |
| else { |
| localCollator->rules = coll->rules; |
| } |
| localCollator->freeRulesOnClose = FALSE; |
| localCollator->rulesLength = coll->rulesLength; |
| } |
| |
| int32_t i; |
| for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { |
| ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); |
| } |
| // zero copies of pointers |
| localCollator->actualLocale = NULL; |
| localCollator->validLocale = NULL; |
| localCollator->requestedLocale = NULL; |
| localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. |
| localCollator->freeOnClose = colAllocated; |
| localCollator->freeImageOnClose = imageAllocated; |
| return localCollator; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucol_close(UCollator *coll) |
| { |
| UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); |
| UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); |
| if(coll != NULL) { |
| // these are always owned by each UCollator struct, |
| // so we always free them |
| if(coll->validLocale != NULL) { |
| uprv_free(coll->validLocale); |
| } |
| if(coll->actualLocale != NULL) { |
| uprv_free(coll->actualLocale); |
| } |
| if(coll->requestedLocale != NULL) { |
| uprv_free(coll->requestedLocale); |
| } |
| if(coll->latinOneCEs != NULL) { |
| uprv_free(coll->latinOneCEs); |
| } |
| if(coll->options != NULL && coll->freeOptionsOnClose) { |
| uprv_free(coll->options); |
| } |
| if(coll->rules != NULL && coll->freeRulesOnClose) { |
| uprv_free((UChar *)coll->rules); |
| } |
| if(coll->image != NULL && coll->freeImageOnClose) { |
| uprv_free((UCATableHeader *)coll->image); |
| } |
| |
| /* Here, it would be advisable to close: */ |
| /* - UData for UCA (unless we stuff it in the root resb */ |
| /* Again, do we need additional housekeeping... HMMM! */ |
| UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); |
| if(coll->freeOnClose){ |
| /* for safeClone, if freeOnClose is FALSE, |
| don't free the other instance data */ |
| uprv_free(coll); |
| } |
| } |
| UTRACE_EXIT(); |
| } |
| |
| /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ |
| /* you should be able to get the binary chunk to write out... Doesn't look very full now */ |
| U_CFUNC uint8_t* U_EXPORT2 |
| ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) |
| { |
| uint8_t *result = NULL; |
| if(U_FAILURE(*status)) { |
| return NULL; |
| } |
| if(coll->hasRealData == TRUE) { |
| *length = coll->image->size; |
| result = (uint8_t *)uprv_malloc(*length); |
| /* test for NULL */ |
| if (result == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| uprv_memcpy(result, coll->image, *length); |
| } else { |
| *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); |
| result = (uint8_t *)uprv_malloc(*length); |
| /* test for NULL */ |
| if (result == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| |
| /* build the UCATableHeader with minimal entries */ |
| /* do not copy the header from the UCA file because its values are wrong! */ |
| /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ |
| |
| /* reset everything */ |
| uprv_memset(result, 0, *length); |
| |
| /* set the tailoring-specific values */ |
| UCATableHeader *myData = (UCATableHeader *)result; |
| myData->size = *length; |
| |
| /* offset for the options, the only part of the data that is present after the header */ |
| myData->options = sizeof(UCATableHeader); |
| |
| /* need to always set the expansion value for an upper bound of the options */ |
| myData->expansion = myData->options + sizeof(UColOptionSet); |
| |
| myData->magic = UCOL_HEADER_MAGIC; |
| myData->isBigEndian = U_IS_BIG_ENDIAN; |
| myData->charSetFamily = U_CHARSET_FAMILY; |
| |
| /* copy UCA's version; genrb will override all but the builder version with tailoring data */ |
| uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); |
| |
| uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); |
| uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); |
| uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); |
| myData->jamoSpecial = coll->image->jamoSpecial; |
| |
| /* copy the collator options */ |
| uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); |
| } |
| return result; |
| } |
| |
| void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { |
| if(U_FAILURE(*status)) { |
| return; |
| } |
| result->caseFirst = (UColAttributeValue)opts->caseFirst; |
| result->caseLevel = (UColAttributeValue)opts->caseLevel; |
| result->frenchCollation = (UColAttributeValue)opts->frenchCollation; |
| result->normalizationMode = (UColAttributeValue)opts->normalizationMode; |
| result->strength = (UColAttributeValue)opts->strength; |
| result->variableTopValue = opts->variableTopValue; |
| result->alternateHandling = (UColAttributeValue)opts->alternateHandling; |
| result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; |
| result->numericCollation = (UColAttributeValue)opts->numericCollation; |
| |
| result->caseFirstisDefault = TRUE; |
| result->caseLevelisDefault = TRUE; |
| result->frenchCollationisDefault = TRUE; |
| result->normalizationModeisDefault = TRUE; |
| result->strengthisDefault = TRUE; |
| result->variableTopValueisDefault = TRUE; |
| result->hiraganaQisDefault = TRUE; |
| result->numericCollationisDefault = TRUE; |
| |
| ucol_updateInternalState(result, status); |
| |
| result->options = opts; |
| } |
| |
| |
| /** |
| * Approximate determination if a character is at a contraction end. |
| * Guaranteed to be TRUE if a character is at the end of a contraction, |
| * otherwise it is not deterministic. |
| * @param c character to be determined |
| * @param coll collator |
| */ |
| static |
| inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { |
| if (c < coll->minContrEndCP) { |
| return FALSE; |
| } |
| |
| int32_t hash = c; |
| uint8_t htbyte; |
| if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { |
| if (U16_IS_TRAIL(c)) { |
| return TRUE; |
| } |
| hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; |
| } |
| htbyte = coll->contrEndCP[hash>>3]; |
| return (((htbyte >> (hash & 7)) & 1) == 1); |
| } |
| |
| |
| |
| /* |
| * i_getCombiningClass() |
| * A fast, at least partly inline version of u_getCombiningClass() |
| * This is a candidate for further optimization. Used heavily |
| * in contraction processing. |
| */ |
| static |
| inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { |
| uint8_t sCC = 0; |
| if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { |
| sCC = u_getCombiningClass(c); |
| } |
| return sCC; |
| } |
| |
| UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { |
| UChar c; |
| UCollator *result = fillIn; |
| if(U_FAILURE(*status) || image == NULL) { |
| return NULL; |
| } |
| |
| if(result == NULL) { |
| result = (UCollator *)uprv_malloc(sizeof(UCollator)); |
| if(result == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return result; |
| } |
| result->freeOnClose = TRUE; |
| } else { |
| result->freeOnClose = FALSE; |
| } |
| |
| // init FCD data |
| if (fcdTrieIndex == NULL) { |
| // The result is constant, until the library is reloaded. |
| fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); |
| ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); |
| } |
| |
| result->image = image; |
| result->mapping.getFoldingOffset = _getFoldingOffset; |
| const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; |
| utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); |
| if(U_FAILURE(*status)) { |
| if(result->freeOnClose == TRUE) { |
| uprv_free(result); |
| result = NULL; |
| } |
| return result; |
| } |
| |
| /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/ |
| result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); |
| result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); |
| result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); |
| result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); |
| |
| result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options); |
| result->freeOptionsOnClose = FALSE; |
| |
| /* set attributes */ |
| result->caseFirst = (UColAttributeValue)result->options->caseFirst; |
| result->caseLevel = (UColAttributeValue)result->options->caseLevel; |
| result->frenchCollation = (UColAttributeValue)result->options->frenchCollation; |
| result->normalizationMode = (UColAttributeValue)result->options->normalizationMode; |
| result->strength = (UColAttributeValue)result->options->strength; |
| result->variableTopValue = result->options->variableTopValue; |
| result->alternateHandling = (UColAttributeValue)result->options->alternateHandling; |
| result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ; |
| result->numericCollation = (UColAttributeValue)result->options->numericCollation; |
| |
| result->caseFirstisDefault = TRUE; |
| result->caseLevelisDefault = TRUE; |
| result->frenchCollationisDefault = TRUE; |
| result->normalizationModeisDefault = TRUE; |
| result->strengthisDefault = TRUE; |
| result->variableTopValueisDefault = TRUE; |
| result->alternateHandlingisDefault = TRUE; |
| result->hiraganaQisDefault = TRUE; |
| result->numericCollationisDefault = TRUE; |
| |
| /*result->scriptOrder = NULL;*/ |
| |
| result->rules = NULL; |
| result->rulesLength = 0; |
| result->freeRulesOnClose = FALSE; |
| |
| /* get the version info from UCATableHeader and populate the Collator struct*/ |
| result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ |
| result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ |
| result->dataVersion[2] = 0; |
| result->dataVersion[3] = 0; |
| |
| result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; |
| result->minUnsafeCP = 0; |
| for (c=0; c<0x300; c++) { // Find the smallest unsafe char. |
| if (ucol_unsafeCP(c, result)) break; |
| } |
| result->minUnsafeCP = c; |
| |
| result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; |
| result->minContrEndCP = 0; |
| for (c=0; c<0x300; c++) { // Find the Contraction-ending char. |
| if (ucol_contractionEndCP(c, result)) break; |
| } |
| result->minContrEndCP = c; |
| |
| /* max expansion tables */ |
| result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + |
| result->image->endExpansionCE); |
| result->lastEndExpansionCE = result->endExpansionCE + |
| result->image->endExpansionCECount - 1; |
| result->expansionCESize = (uint8_t*)result->image + |
| result->image->expansionCESize; |
| |
| |
| //result->errorCode = *status; |
| |
| result->latinOneCEs = NULL; |
| |
| result->latinOneRegenTable = FALSE; |
| result->latinOneFailed = FALSE; |
| result->UCA = UCA; |
| |
| ucol_updateInternalState(result, status); |
| |
| /* Normally these will be set correctly later. This is the default if you use UCA or the default. */ |
| result->ucaRules = NULL; |
| result->actualLocale = NULL; |
| result->validLocale = NULL; |
| result->requestedLocale = NULL; |
| result->hasRealData = FALSE; // real data lives in .dat file... |
| result->freeImageOnClose = FALSE; |
| |
| return result; |
| } |
| |
| /* new Mark's code */ |
| |
| /** |
| * For generation of Implicit CEs |
| * @author Davis |
| * |
| * Cleaned up so that changes can be made more easily. |
| * Old values: |
| # First Implicit: E26A792D |
| # Last Implicit: E3DC70C0 |
| # First CJK: E0030300 |
| # Last CJK: E0A9DD00 |
| # First CJK_A: E0A9DF00 |
| # Last CJK_A: E0DE3100 |
| */ |
| /* The following is a port of Mark's code for the new treatment of implicits. |
| * It is positioned here since ucol_initUCA needs to initialize the |
| * variables below according to the data in the fractional UCA. |
| */ |
| |
| /** |
| * Function used to: |
| * a) collapse the 2 different Han ranges from UCA into one (in the right order), and |
| * b) bump any non-CJK characters by 10FFFF. |
| * The relevant blocks are: |
| * A: 4E00..9FFF; CJK Unified Ideographs |
| * F900..FAFF; CJK Compatibility Ideographs |
| * B: 3400..4DBF; CJK Unified Ideographs Extension A |
| * 20000..XX; CJK Unified Ideographs Extension B (and others later on) |
| * As long as |
| * no new B characters are allocated between 4E00 and FAFF, and |
| * no new A characters are outside of this range, |
| * (very high probability) this simple code will work. |
| * The reordered blocks are: |
| * Block1 is CJK |
| * Block2 is CJK_COMPAT_USED |
| * Block3 is CJK_A |
| * (all contiguous) |
| * Any other CJK gets its normal code point |
| * Any non-CJK gets +10FFFF |
| * When we reorder Block1, we make sure that it is at the very start, |
| * so that it will use a 3-byte form. |
| * Warning: we only pick up the compatibility characters that are |
| * NOT decomposed, so that block is smaller! |
| */ |
| |
| // CONSTANTS |
| static const UChar32 |
| NON_CJK_OFFSET = 0x110000, |
| UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 |
| |
/**
 * Parameters of the implicit-weight byte layout.
 * Every value starts out as 0 and is precomputed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
| |
| static const UChar32 |
| CJK_BASE = 0x4E00, |
| CJK_LIMIT = 0x9FFF+1, |
| CJK_COMPAT_USED_BASE = 0xFA0E, |
| CJK_COMPAT_USED_LIMIT = 0xFA2F+1, |
| CJK_A_BASE = 0x3400, |
| CJK_A_LIMIT = 0x4DBF+1, |
| CJK_B_BASE = 0x20000, |
| CJK_B_LIMIT = 0x2A6DF+1; |
| |
| static UChar32 swapCJK(UChar32 i) { |
| |
| if (i >= CJK_BASE) { |
| if (i < CJK_LIMIT) return i - CJK_BASE; |
| |
| if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE |
| + (CJK_LIMIT - CJK_BASE); |
| if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_B_LIMIT) return i; // non-BMP-CJK |
| |
| return i + NON_CJK_OFFSET; // non-CJK |
| } |
| if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; |
| |
| if (i < CJK_A_LIMIT) return i - CJK_A_BASE |
| + (CJK_LIMIT - CJK_BASE) |
| + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
| return i + NON_CJK_OFFSET; // non-CJK |
| } |
| |
| U_CAPI UChar32 U_EXPORT2 |
| uprv_uca_getRawFromCodePoint(UChar32 i) { |
| return swapCJK(i)+1; |
| } |
| |
| U_CAPI UChar32 U_EXPORT2 |
| uprv_uca_getCodePointFromRaw(UChar32 i) { |
| i--; |
| UChar32 result = 0; |
| if(i >= NON_CJK_OFFSET) { |
| result = i - NON_CJK_OFFSET; |
| } else if(i >= CJK_B_BASE) { |
| result = i; |
| } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted |
| if(i < CJK_LIMIT - CJK_BASE) { |
| result = i + CJK_BASE; |
| } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { |
| result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); |
| } else { |
| result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); |
| } |
| } else { |
| result = -1; |
| } |
| return result; |
| } |
| |
| // GET IMPLICIT PRIMARY WEIGHTS |
| // Return value is left justified primary key |
| U_CAPI uint32_t U_EXPORT2 |
| uprv_uca_getImplicitFromRaw(UChar32 cp) { |
| /* |
| if (cp < 0 || cp > UCOL_MAX_INPUT) { |
| throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); |
| } |
| */ |
| int32_t last0 = cp - min4Boundary; |
| if (last0 < 0) { |
| int32_t last1 = cp / final3Count; |
| last0 = cp % final3Count; |
| |
| int32_t last2 = last1 / medialCount; |
| last1 %= medialCount; |
| |
| last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start |
| last1 = minTrail + last1; // offset |
| last2 = min3Primary + last2; // offset |
| /* |
| if (last2 >= min4Primary) { |
| throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); |
| } |
| */ |
| return (last2 << 24) + (last1 << 16) + (last0 << 8); |
| } else { |
| int32_t last1 = last0 / final4Count; |
| last0 %= final4Count; |
| |
| int32_t last2 = last1 / medialCount; |
| last1 %= medialCount; |
| |
| int32_t last3 = last2 / medialCount; |
| last2 %= medialCount; |
| |
| last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start |
| last1 = minTrail + last1; // offset |
| last2 = minTrail + last2; // offset |
| last3 = min4Primary + last3; // offset |
| /* |
| if (last3 > max4Primary) { |
| throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); |
| } |
| */ |
| return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; |
| } |
| } |
| |
| static uint32_t U_EXPORT2 |
| uprv_uca_getImplicitPrimary(UChar32 cp) { |
| //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); |
| |
| cp = swapCJK(cp); |
| cp++; |
| // we now have a range of numbers from 0 to 21FFFF. |
| |
| //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); |
| |
| return uprv_uca_getImplicitFromRaw(cp); |
| } |
| |
| /** |
| * Converts implicit CE into raw integer ("code point") |
| * @param implicit |
| * @return -1 if illegal format |
| */ |
| U_CAPI UChar32 U_EXPORT2 |
| uprv_uca_getRawFromImplicit(uint32_t implicit) { |
| UChar32 result; |
| UChar32 b3 = implicit & 0xFF; |
| UChar32 b2 = (implicit >> 8) & 0xFF; |
| UChar32 b1 = (implicit >> 16) & 0xFF; |
| UChar32 b0 = (implicit >> 24) & 0xFF; |
| |
| // simple parameter checks |
| if (b0 < min3Primary || b0 > max4Primary |
| || b1 < minTrail || b1 > maxTrail) |
| return -1; |
| // normal offsets |
| b1 -= minTrail; |
| |
| // take care of the final values, and compose |
| if (b0 < min4Primary) { |
| if (b2 < minTrail || b2 > max3Trail || b3 != 0) |
| return -1; |
| b2 -= minTrail; |
| UChar32 remainder = b2 % final3Multiplier; |
| if (remainder != 0) |
| return -1; |
| b0 -= min3Primary; |
| b2 /= final3Multiplier; |
| result = ((b0 * medialCount) + b1) * final3Count + b2; |
| } else { |
| if (b2 < minTrail || b2 > maxTrail |
| || b3 < minTrail || b3 > max4Trail) |
| return -1; |
| b2 -= minTrail; |
| b3 -= minTrail; |
| UChar32 remainder = b3 % final4Multiplier; |
| if (remainder != 0) |
| return -1; |
| b3 /= final4Multiplier; |
| b0 -= min4Primary; |
| result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; |
| } |
| // final check |
| if (result < 0 || result > UCOL_MAX_INPUT) |
| return -1; |
| return result; |
| } |
| |
| |
/**
 * Integer division rounding towards positive infinity.
 * The previous form, 1 + (a-1)/b, returned 1 for a == 0 and was off by one
 * for negative multiples of b; results for a > 0 are unchanged.
 * @param a dividend
 * @param b divisor, must be > 0
 * @return ceil(a / b)
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    /* a / b truncates towards zero, so only a positive remainder rounds up. */
    return a / b + (a % b > 0 ? 1 : 0);
}
| |
| /* this function is either called from initUCA or from genUCA before |
| * doing canonical closure for the UCA. |
| */ |
| |
| /** |
| * Set up to generate implicits. |
| * Maintenance Note: this function may end up being called more than once, due |
| * to threading races during initialization. Make sure that |
| * none of the Constants is ever transiently assigned an |
| * incorrect value. |
| * @param minPrimary |
| * @param maxPrimary |
| * @param minTrail final byte |
| * @param maxTrail final byte |
| * @param gap3 the gap we leave for tailoring for 3-byte forms |
| * @param gap4 the gap we leave for tailoring for 4-byte forms |
| */ |
| static void initImplicitConstants(int minPrimary, int maxPrimary, |
| int minTrailIn, int maxTrailIn, |
| int gap3, int primaries3count, |
| UErrorCode *status) { |
| // some simple parameter checks |
| if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) |
| || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) |
| || (primaries3count < 1)) |
| { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| }; |
| |
| minTrail = minTrailIn; |
| maxTrail = maxTrailIn; |
| |
| min3Primary = minPrimary; |
| max4Primary = maxPrimary; |
| // compute constants for use later. |
| // number of values we can use in trailing bytes |
| // leave room for empty values between AND above, e.g. if gap = 2 |
| // range 3..7 => +3 -4 -5 -6 -7: so 1 value |
| // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values |
| // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values |
| final3Multiplier = gap3 + 1; |
| final3Count = (maxTrail - minTrail + 1) / final3Multiplier; |
| max3Trail = minTrail + (final3Count - 1) * final3Multiplier; |
| |
| // medials can use full range |
| medialCount = (maxTrail - minTrail + 1); |
| // find out how many values fit in each form |
| int32_t threeByteCount = medialCount * final3Count; |
| // now determine where the 3/4 boundary is. |
| // we use 3 bytes below the boundary, and 4 above |
| int32_t primariesAvailable = maxPrimary - minPrimary + 1; |
| int32_t primaries4count = primariesAvailable - primaries3count; |
| |
| |
| int32_t min3ByteCoverage = primaries3count * threeByteCount; |
| min4Primary = minPrimary + primaries3count; |
| min4Boundary = min3ByteCoverage; |
| // Now expand out the multiplier for the 4 bytes, and redo. |
| |
| int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; |
| int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); |
| int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); |
| int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; |
| if (gap4 < 1) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| final4Multiplier = gap4 + 1; |
| final4Count = neededPerFinalByte; |
| max4Trail = minTrail + (final4Count - 1) * final4Multiplier; |
| } |
| |
| /** |
| * Supply parameters for generating implicit CEs |
| */ |
| U_CAPI void U_EXPORT2 |
| uprv_uca_initImplicitConstants(UErrorCode *status) { |
| // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. |
| //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); |
| initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); |
| } |
| |
| |
| /* collIterNormalize Incremental Normalization happens here. */ |
| /* pick up the range of chars identifed by FCD, */ |
| /* normalize it into the collIterate's writable buffer, */ |
| /* switch the collIterate's state to use the writable buffer. */ |
| /* */ |
| static |
| void collIterNormalize(collIterate *collationSource) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ |
| const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ |
| |
| collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), |
| collationSource->writableBuffer, |
| status); |
| if (U_FAILURE(status)) { |
| #ifdef UCOL_DEBUG |
| fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); |
| #endif |
| return; |
| } |
| |
| collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); |
| collationSource->origFlags = collationSource->flags; |
| collationSource->flags |= UCOL_ITER_INNORMBUF; |
| collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
| } |
| |
| |
| // This function takes the iterator and extracts normalized stuff up to the next boundary |
| // It is similar in the end results to the collIterNormalize, but for the cases when we |
| // use an iterator |
| /*static |
| inline void normalizeIterator(collIterate *collationSource) { |
| UErrorCode status = U_ZERO_ERROR; |
| UBool wasNormalized = FALSE; |
| //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); |
| uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); |
| int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
| (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
| if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { |
| // reallocate and terminate |
| if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, |
| &collationSource->writableBuffer, |
| (int32_t *)&collationSource->writableBufSize, normLen + 1, |
| 0) |
| ) { |
| #ifdef UCOL_DEBUG |
| fprintf(stderr, "normalizeIterator(), out of memory\n"); |
| #endif |
| return; |
| } |
| status = U_ZERO_ERROR; |
| //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); |
| collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); |
| normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, |
| (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); |
| } |
| // Terminate the buffer - we already checked that it is big enough |
| collationSource->writableBuffer[normLen] = 0; |
| if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { |
| collationSource->flags |= UCOL_ITER_ALLOCATED; |
| } |
| collationSource->pos = collationSource->writableBuffer; |
| collationSource->origFlags = collationSource->flags; |
| collationSource->flags |= UCOL_ITER_INNORMBUF; |
| collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
| }*/ |
| |
| |
| /* Incremental FCD check and normalize */ |
| /* Called from getNextCE when normalization state is suspect. */ |
| /* When entering, the state is known to be this: */ |
| /* o We are working in the main buffer of the collIterate, not the side */ |
| /* writable buffer. When in the side buffer, normalization mode is always off, */ |
| /* so we won't get here. */ |
| /* o The leading combining class from the current character is 0 or */ |
| /* the trailing combining class of the previous char was zero. */ |
| /* True because the previous call to this function will have always exited */ |
| /* that way, and we get called for every char where cc might be non-zero. */ |
| static |
| inline UBool collIterFCD(collIterate *collationSource) { |
| const UChar *srcP, *endP; |
| uint8_t leadingCC; |
| uint8_t prevTrailingCC = 0; |
| uint16_t fcd; |
| UBool needNormalize = FALSE; |
| |
| srcP = collationSource->pos-1; |
| |
| if (collationSource->flags & UCOL_ITER_HASLEN) { |
| endP = collationSource->endp; |
| } else { |
| endP = NULL; |
| } |
| |
| // Get the trailing combining class of the current character. If it's zero, |
| // we are OK. |
| /* trie access */ |
| fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); |
| if (fcd != 0) { |
| prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| |
| if (prevTrailingCC != 0) { |
| // The current char has a non-zero trailing CC. Scan forward until we find |
| // a char with a leading cc of zero. |
| while (endP == NULL || srcP != endP) |
| { |
| const UChar *savedSrcP = srcP; |
| |
| /* trie access */ |
| fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); |
| leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| if (leadingCC == 0) { |
| srcP = savedSrcP; // Hit char that is not part of combining sequence. |
| // back up over it. (Could be surrogate pair!) |
| break; |
| } |
| |
| if (leadingCC < prevTrailingCC) { |
| needNormalize = TRUE; |
| } |
| |
| prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| } |
| } |
| } |
| |
| collationSource->fcdPosition = (UChar *)srcP; |
| |
| return needNormalize; |
| } |
| |
| /****************************************************************************/ |
| /* Following are the CE retrieval functions */ |
| /* */ |
| /****************************************************************************/ |
| |
// Forward declarations. getImplicit() is the last-resort fallback used by
// ucol_IGetNextCE() when no CE was found for a character; getPrevImplicit()
// is presumably its counterpart for backward iteration (its use is not
// visible in this part of the file).
| static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); |
| static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); |
| |
| /* there should be a macro version of this function in the header file */ |
| /* This is the first function that tries to fetch a collation element */ |
| /* If it's not successful or it encounters a more difficult situation */ |
| /* some more sophisticated and slower functions are invoked */ |
/*
 * Returns the next collation element (CE) for collationSource, or
 * UCOL_NO_MORE_CES when the input is exhausted.
 *
 * Steps, in order:
 *   1. If CEs are still buffered (toReturn < CEpos), return the next one.
 *   2. Fetch the next code unit from the main string (NUL-terminated or
 *      length-delimited), the character iterator, or the normalization side
 *      buffer. When UCOL_ITER_NORM is set, run the incremental FCD check and,
 *      if required, normalize into the side buffer and fetch again.
 *   3. Map the code unit to a CE: latinOneMapping for ch <= 0xFF, otherwise
 *      the collator's trie and then the UCA trie; special CEs are resolved by
 *      ucol_prv_getSpecialCE(), and getImplicit() is used if nothing is found.
 *
 * coll             collator whose mappings are consulted
 * collationSource  iteration state; updated in place
 * status           passed through to ucol_prv_getSpecialCE()
 */
| static |
| inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { |
| uint32_t order = 0; |
| if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ |
| order = *(collationSource->toReturn++); /* if so, return them */ |
// Buffer drained: rewind both pointers to the start of whichever CE buffer is in use.
| if(collationSource->CEpos == collationSource->toReturn) { |
| collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs; |
| } |
| return order; |
| } |
| |
| UChar ch = 0; |
| collationSource->offsetReturn = NULL; |
| |
| for (;;) /* Loop handles case when incremental normalize switches */ |
| { /* to or from the side buffer / original string, and we */ |
| /* need to start again to get the next character. */ |
| |
| if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) |
| { |
| // The source string is null terminated and we're not working from the side buffer, |
| // and we're not normalizing. This is the fast path. |
| // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.) |
| ch = *collationSource->pos++; |
| if (ch != 0) { |
| break; |
| } |
| else { |
| return UCOL_NO_MORE_CES; |
| } |
| } |
| |
| if (collationSource->flags & UCOL_ITER_HASLEN) { |
| // Normal path for strings when length is specified. |
| // (We can't be in side buffer because it is always null terminated.) |
| if (collationSource->pos >= collationSource->endp) { |
| // Ran off of the end of the main source string. We're done. |
| return UCOL_NO_MORE_CES; |
| } |
| ch = *collationSource->pos++; |
| } |
| else if(collationSource->flags & UCOL_USE_ITERATOR) { |
| UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); |
| if(iterCh == U_SENTINEL) { |
| return UCOL_NO_MORE_CES; |
| } |
| ch = (UChar)iterCh; |
| } |
| else |
| { |
| // Null terminated string. |
| ch = *collationSource->pos++; |
| if (ch == 0) { |
| // Ran off end of buffer. |
| if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
| // Ran off end of main string. backing up one character. |
| collationSource->pos--; |
| return UCOL_NO_MORE_CES; |
| } |
| else |
| { |
| // Hit null in the normalize side buffer. |
| // Usually this means the end of the normalized data, |
| // except for one odd case: a null followed by combining chars, |
| // which is the case if we are at the start of the buffer. |
| if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) { |
| break; |
| } |
| |
| // Null marked end of side buffer. |
| // Revert to the main string and |
| // loop back to top to try again to get a character. |
| collationSource->pos = collationSource->fcdPosition; |
| collationSource->flags = collationSource->origFlags; |
| continue; |
| } |
| } |
| } |
| |
| if(collationSource->flags&UCOL_HIRAGANA_Q) { |
| /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag |
| * based on whether the previous codepoint was Hiragana or Katakana. |
| */ |
| if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) || |
| ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) { |
| collationSource->flags |= UCOL_WAS_HIRAGANA; |
| } else { |
| collationSource->flags &= ~UCOL_WAS_HIRAGANA; |
| } |
| } |
| |
| // We've got a character. See if there's any fcd and/or normalization stuff to do. |
| // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer. |
| if ((collationSource->flags & UCOL_ITER_NORM) == 0) { |
| break; |
| } |
| |
| if (collationSource->fcdPosition >= collationSource->pos) { |
| // An earlier FCD check has already covered the current character. |
| // We can go ahead and process this char. |
| break; |
| } |
| |
| if (ch < ZERO_CC_LIMIT_ ) { |
| // Fast fcd safe path. Trailing combining class == 0. This char is OK. |
| break; |
| } |
| |
| if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| // We need to peek at the next character in order to tell if we are FCD |
| if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) { |
| // We are at the last char of source string. |
| // It is always OK for FCD check. |
| break; |
| } |
| |
| // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test |
| if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| break; |
| } |
| } |
| |
| |
| // Need a more complete FCD check and possible normalization. |
| if (collIterFCD(collationSource)) { |
| collIterNormalize(collationSource); |
| } |
| if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { |
| // No normalization was needed. Go ahead and process the char we already had. |
| break; |
| } |
| |
| // Some normalization happened. Next loop iteration will pick up a char |
| // from the normalization buffer. |
| |
| } // end for (;;) |
| |
| |
// 'ch' now holds the code unit to map to a CE.
| if (ch <= 0xFF) { |
| /* For latin-1 characters we never need to fall back to the UCA table */ |
| /* because all of the UCA data is replicated in the latinOneMapping array */ |
| order = coll->latinOneMapping[ch]; |
| if (order > UCOL_NOT_FOUND) { |
| order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); |
| } |
| } |
| else |
| { |
| // Always use UCA for Han, Hangul |
| // (Han extension A is before main Han block) |
| // **** Han compatibility chars ?? **** |
| if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && |
| (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { |
| if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { |
| // between the two target ranges; do normal lookup |
| // **** this range is YI, Modifier tone letters, **** |
| // **** Latin-D, Syloti Nagari, Phagas-pa. **** |
| // **** Latin-D might be tailored, so we need to **** |
| // **** do the normal lookup for these guys. **** |
| order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| } else { |
| // in one of the target ranges; use UCA |
| order = UCOL_NOT_FOUND; |
| } |
| } else { |
| order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| } |
| |
| if(order > UCOL_NOT_FOUND) { /* if a CE is special */ |
| order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ |
| } |
| |
| if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ |
| /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ |
| order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
| |
| if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ |
| order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); |
| } |
| } |
| } |
// Still nothing: fall back to getImplicit().
| if(order == UCOL_NOT_FOUND) { |
| order = getImplicit(ch, collationSource); |
| } |
| return order; /* return the CE */ |
| } |
| |
| /* ucol_getNextCE, out-of-line version for use from other files. */ |
| U_CAPI uint32_t U_EXPORT2 |
| ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { |
| return ucol_IGetNextCE(coll, collationSource, status); |
| } |
| |
| |
| /** |
| * Incremental previous normalization happens here. Pick up the range of chars |
| * identifed by FCD, normalize it into the collIterate's writable buffer, |
| * switch the collIterate's state to use the writable buffer. |
| * @param data collation iterator data |
| */ |
| static |
| void collPrevIterNormalize(collIterate *data) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| const UChar *pEnd = data->pos; /* End normalize + 1 */ |
| const UChar *pStart; |
| |
| /* Start normalize */ |
| if (data->fcdPosition == NULL) { |
| pStart = data->string; |
| } |
| else { |
| pStart = data->fcdPosition + 1; |
| } |
| |
| int32_t normLen = |
| data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)), |
| data->writableBuffer, |
| status). |
| length(); |
| if(U_FAILURE(status)) { |
| return; |
| } |
| /* |
| this puts the null termination infront of the normalized string instead |
| of the end |
| */ |
| data->writableBuffer.insert(0, (UChar)0); |
| |
| if (data->offsetBuffer == NULL) { |
| int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE; |
| |
| data->offsetBufferSize = len; |
| data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len); |
| data->offsetStore = data->offsetBuffer; |
| } else if(data->offsetBufferSize < normLen) { |
| int32_t storeIX = (int32_t)(data->offsetStore - data->offsetBuffer); |
| int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1)); |
| |
| if (tob != NULL) { |
| data->offsetBuffer = tob; |
| data->offsetStore = &data->offsetBuffer[storeIX]; |
| data->offsetBufferSize = normLen + 1; |
| } |
| } |
| |
| /* |
| * The usual case at this point is that we've got a base |
| * character followed by marks that were normalized. If |
| * fcdPosition is NULL, that means that we backed up to |
| * the beginning of the string and there's no base character. |
| * |
| * Forward processing will usually normalize when it sees |
| * the first mark, so that mark will get it's natural offset |
| * and the rest will get the offset of the character following |
| * the marks. The base character will also get its natural offset. |
| * |
| * We write the offset of the base character, if there is one, |
| * followed by the offset of the first mark and then the offsets |
| * of the rest of the marks. |
| */ |
| int32_t firstMarkOffset = 0; |
| int32_t trailOffset = (int32_t)(data->pos - data->string + 1); |
| int32_t trailCount = normLen - 1; |
| |
| if (data->fcdPosition != NULL) { |
| int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); |
| UChar baseChar = *data->fcdPosition; |
| |
| firstMarkOffset = baseOffset + 1; |
| |
| /* |
| * If the base character is the start of a contraction, forward processing |
| * will normalize the marks while checking for the contraction, which means |
| * that the offset of the first mark will the same as the other marks. |
| * |
| * **** THIS IS PROBABLY NOT A COMPLETE TEST **** |
| */ |
| if (baseChar >= 0x100) { |
| uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); |
| |
| if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { |
| baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); |
| } |
| |
| if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { |
| firstMarkOffset = trailOffset; |
| } |
| } |
| |
| *(data->offsetStore++) = baseOffset; |
| } |
| |
| *(data->offsetStore++) = firstMarkOffset; |
| |
| for (int32_t i = 0; i < trailCount; i += 1) { |
| *(data->offsetStore++) = trailOffset; |
| } |
| |
| data->offsetRepeatValue = trailOffset; |
| |
| data->offsetReturn = data->offsetStore - 1; |
| if (data->offsetReturn == data->offsetBuffer) { |
| data->offsetStore = data->offsetBuffer; |
| } |
| |
| data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; |
| data->origFlags = data->flags; |
| data->flags |= UCOL_ITER_INNORMBUF; |
| data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| } |
| |
| |
| /** |
| * Incremental FCD check for previous iteration and normalize. Called from |
| * getPrevCE when normalization state is suspect. |
| * When entering, the state is known to be this: |
| * o We are working in the main buffer of the collIterate, not the side |
| * writable buffer. When in the side buffer, normalization mode is always |
| * off, so we won't get here. |
| * o The leading combining class from the current character is 0 or the |
| * trailing combining class of the previous char was zero. |
| * True because the previous call to this function will have always exited |
| * that way, and we get called for every char where cc might be non-zero. |
| * @param data collation iterate struct |
| * @return normalization status, TRUE for normalization to be done, FALSE |
| * otherwise |
| */ |
| static |
| inline UBool collPrevIterFCD(collIterate *data) |
| { |
| const UChar *src, *start; |
| uint8_t leadingCC; |
| uint8_t trailingCC = 0; |
| uint16_t fcd; |
| UBool result = FALSE; |
| |
| start = data->string; |
| src = data->pos + 1; |
| |
| /* Get the trailing combining class of the current character. */ |
| fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); |
| |
| leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| |
| if (leadingCC != 0) { |
| /* |
| The current char has a non-zero leading combining class. |
| Scan backward until we find a char with a trailing cc of zero. |
| */ |
| for (;;) |
| { |
| if (start == src) { |
| data->fcdPosition = NULL; |
| return result; |
| } |
| |
| fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); |
| |
| trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); |
| |
| if (trailingCC == 0) { |
| break; |
| } |
| |
| if (leadingCC < trailingCC) { |
| result = TRUE; |
| } |
| |
| leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); |
| } |
| } |
| |
| data->fcdPosition = (UChar *)src; |
| |
| return result; |
| } |
| |
| /** gets a character from the string at a given offset |
| * Handles both normal and iterative cases. |
| * No error checking - caller beware! |
| */ |
| inline static |
| UChar peekCharacter(collIterate *source, int32_t offset) { |
| if(source->pos != NULL) { |
| return *(source->pos + offset); |
| } else if(source->iterator != NULL) { |
| if(offset != 0) { |
| source->iterator->move(source->iterator, offset, UITER_CURRENT); |
| UChar toReturn = (UChar)source->iterator->next(source->iterator); |
| source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); |
| return toReturn; |
| } else { |
| return (UChar)source->iterator->current(source->iterator); |
| } |
| } else { |
| return (UChar)U_SENTINEL; |
| } |
| } |
| |
| /** |
| * Determines if we are at the start of the data string in the backwards |
| * collation iterator |
| * @param data collation iterator |
| * @return TRUE if we are at the start |
| */ |
| static |
| inline UBool isAtStartPrevIterate(collIterate *data) { |
| if(data->pos == NULL && data->iterator != NULL) { |
| return !data->iterator->hasPrevious(data->iterator); |
| } |
| //return (collIter_bos(data)) || |
| return (data->pos == data->string) || |
| ((data->flags & UCOL_ITER_INNORMBUF) && |
| *(data->pos - 1) == 0 && data->fcdPosition == NULL); |
| } |
| |
| static |
| inline void goBackOne(collIterate *data) { |
| # if 0 |
| // somehow, it looks like we need to keep iterator synced up |
| // at all times, as above. |
| if(data->pos) { |
| data->pos--; |
| } |
| if(data->iterator) { |
| data->iterator->previous(data->iterator); |
| } |
| #endif |
| if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { |
| data->iterator->previous(data->iterator); |
| } |
| if(data->pos) { |
| data->pos --; |
| } |
| } |
| |
| /** |
| * Inline function that gets the previous collation element (CE) while |
| * iterating backwards. |
| * It first steps the offset bookkeeping back by one entry (either by |
| * consuming one offsetRepeatCount or by moving offsetReturn inside |
| * offsetBuffer). |
| * Then it checks the CE buffer. If CEs are still pending, ie the toReturn |
| * pointer is past the start of the active buffer (extendCEs when it is set, |
| * CEs otherwise), toReturn is decremented and the collation element it now |
| * points at is returned. |
| * Otherwise the previous code unit is fetched (from the source string, the |
| * character iterator or the normalization side buffer), the FCD check and |
| * normalization are run when UCOL_ITER_NORM is set, and the code unit is |
| * looked up in the collator mappings. |
| * For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE; code |
| * units that are still UCOL_NOT_FOUND after all lookups go to getPrevImplicit. |
| * @param coll collator data |
| * @param data collation iterator struct |
| * @param status error status |
| * @return the previous collation element, or UCOL_NO_MORE_CES when there is |
| * nothing left before the current position |
| */ |
| static |
| inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, |
| UErrorCode *status) |
| { |
| uint32_t result = (uint32_t)UCOL_NULLORDER; |
| |
| if (data->offsetReturn != NULL) { |
| if (data->offsetRepeatCount > 0) { /* consume one pending repeat before moving offsetReturn */ |
| data->offsetRepeatCount -= 1; |
| } else { |
| if (data->offsetReturn == data->offsetBuffer) { /* at the start of offsetBuffer: reset the bookkeeping */ |
| data->offsetReturn = NULL; |
| data->offsetStore = data->offsetBuffer; |
| } else { |
| data->offsetReturn -= 1; |
| } |
| } |
| } |
| |
| if ((data->extendCEs && data->toReturn > data->extendCEs) || |
| (!data->extendCEs && data->toReturn > data->CEs)) |
| { |
| data->toReturn -= 1; |
| result = *(data->toReturn); |
| if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) { /* back at the buffer start: rewind CEpos too */ |
| data->CEpos = data->toReturn; |
| } |
| } |
| else { |
| UChar ch = 0; |
| |
| /* |
| Loop handles case when incremental normalize switches to or from the |
| side buffer / original string, and we need to start again to get the |
| previous character. |
| */ |
| for (;;) { |
| if (data->flags & UCOL_ITER_HASLEN) { |
| /* |
| Normal path for strings when length is specified. |
| Not in side buffer because it is always null terminated. |
| */ |
| if (data->pos <= data->string) { |
| /* Start of the main source string, nothing left to go back over */ |
| return UCOL_NO_MORE_CES; |
| } |
| data->pos --; |
| ch = *data->pos; |
| } |
| // we are using an iterator to go back. Pray for us! |
| else if (data->flags & UCOL_USE_ITERATOR) { |
| UChar32 iterCh = data->iterator->previous(data->iterator); |
| if(iterCh == U_SENTINEL) { |
| return UCOL_NO_MORE_CES; |
| } else { |
| ch = (UChar)iterCh; |
| } |
| } |
| else { |
| data->pos --; |
| ch = *data->pos; |
| /* we are in the side buffer. */ |
| if (ch == 0) { |
| /* |
| At the start of the normalize side buffer. |
| Go back to string: the original flags are restored and pos is |
| moved back into the source string. |
| Because pointer points to the last accessed character, |
| hence we have to increment it by one here. |
| */ |
| data->flags = data->origFlags; |
| data->offsetRepeatValue = 0; |
| |
| if (data->fcdPosition == NULL) { |
| data->pos = data->string; |
| return UCOL_NO_MORE_CES; |
| } |
| else { |
| data->pos = data->fcdPosition + 1; |
| } |
| |
| continue; |
| } |
| } |
| |
| if(data->flags&UCOL_HIRAGANA_Q) { /* track whether ch lies in 0x3040..0x309f */ |
| if(ch>=0x3040 && ch<=0x309f) { |
| data->flags |= UCOL_WAS_HIRAGANA; |
| } else { |
| data->flags &= ~UCOL_WAS_HIRAGANA; |
| } |
| } |
| |
| /* |
| * got a character to determine if there's fcd and/or normalization |
| * stuff to do. |
| * The loop is left right away, without any FCD work, when: |
| * - ch is below ZERO_CC_LIMIT_, |
| * - normalization is not requested (UCOL_ITER_NORM is not set), |
| * - fcdPosition is set and pos has not gone below it, or |
| * - the current character is at the start of the string. |
| * Note if pos is in the writablebuffer, norm is always 0 |
| */ |
| if (ch < ZERO_CC_LIMIT_ || |
| // this should propel us out of the loop in the iterator case |
| (data->flags & UCOL_ITER_NORM) == 0 || |
| (data->fcdPosition != NULL && data->fcdPosition <= data->pos) |
| || data->string == data->pos) { |
| break; |
| } |
| |
| if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| /* if next character is FCD */ |
| if (data->pos == data->string) { |
| /* First char of string is always OK for FCD check */ |
| break; |
| } |
| |
| /* Not first char of string, do the FCD fast test */ |
| if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) { |
| break; |
| } |
| } |
| |
| /* Need a more complete FCD check and possible normalization. */ |
| if (collPrevIterFCD(data)) { |
| collPrevIterNormalize(data); |
| } |
| |
| if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| /* No normalization. Go ahead and process the char. */ |
| break; |
| } |
| |
| /* |
| Some normalization happened. |
| Next loop picks up a char from the normalization buffer. |
| */ |
| } |
| |
| /* attempt to handle contractions, after removal of the backwards |
| contraction |
| */ |
| if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) { |
| result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status); |
| } else { |
| if (ch <= 0xFF) { /* direct table lookup for code units up to 0xFF */ |
| result = coll->latinOneMapping[ch]; |
| } |
| else { |
| // Always use UCA for [3400..9FFF], [AC00..D7AF] |
| // **** [FA0E..FA2F] ?? **** |
| if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && |
| (ch >= 0x3400 && ch <= 0xD7AF)) { |
| if (ch > 0x9FFF && ch < 0xAC00) { |
| // between the two target ranges; do normal lookup |
| // **** this range is YI, Modifier tone letters, **** |
| // **** Latin-D, Syloti Nagari, Phagas-pa. **** |
| // **** Latin-D might be tailored, so we need to **** |
| // **** do the normal lookup for these guys. **** |
| result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| } else { |
| result = UCOL_NOT_FOUND; |
| } |
| } else { |
| result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); |
| } |
| } |
| if (result > UCOL_NOT_FOUND) { /* hand the value to the special-CE handler */ |
| result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); |
| } |
| if (result == UCOL_NOT_FOUND) { // Not found in master list |
| if (!isAtStartPrevIterate(data) && |
| ucol_contractionEndCP(ch, data->coll)) |
| { |
| result = UCOL_CONTRACTION; |
| } else { |
| if(coll->UCA) { /* fall back to the UCA mapping when one is attached */ |
| result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); |
| } |
| } |
| |
| if (result > UCOL_NOT_FOUND) { |
| if(coll->UCA) { |
| result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status); |
| } |
| } |
| } |
| } |
| |
| if(result == UCOL_NOT_FOUND) { /* still nothing: last resort is getPrevImplicit */ |
| result = getPrevImplicit(ch, data); |
| } |
| } |
| |
| return result; |
| } |
| |
| |
| /* ucol_getPrevCE, out-of-line version for use from other files. */ |
| U_CFUNC uint32_t U_EXPORT2 |
| ucol_getPrevCE(const UCollator *coll, collIterate *data, |
| UErrorCode *status) { |
| return ucol_IGetPrevCE(coll, data, status); |
| } |
| |
| |
| /* this should be connected to special Jamo handling */ |
| U_CFUNC uint32_t U_EXPORT2 |
| ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { |
| collIterate colIt; |
| IInit_collIterate(coll, &u, 1, &colIt, status); |
| if(U_FAILURE(*status)) { |
| return 0; |
| } |
| return ucol_IGetNextCE(coll, &colIt, status); |
| } |
| |
| /** |
| * Inserts the argument character into the end of the buffer pushing back the |
| * null terminator. |
| * @param data collIterate struct data |
| * @param ch character to be appended |
| * @return the position of the new addition |
| */ |
| static |
| inline const UChar * insertBufferEnd(collIterate *data, UChar ch) |
| { |
| int32_t oldLength = data->writableBuffer.length(); |
| return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; |
| } |
| |
| /** |
| * Inserts the argument string into the end of the buffer pushing back the |
| * null terminator. |
| * @param data collIterate struct data |
| * @param string to be appended |
| * @param length of the string to be appended |
| * @return the position of the new addition |
| */ |
| static |
| inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) |
| { |
| int32_t oldLength = data->writableBuffer.length(); |
| return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; |
| } |
| |
| /** |
| * Special normalization function for contraction in the forwards iterator. |
| * This normalization sequence will place the current character at source->pos |
| * and its following normalized sequence into the buffer. |
| * The fcd position, pos will be changed. |
| * pos will now point to positions in the buffer. |
| * Flags will be changed accordingly. |
| * @param data collation iterator data |
| */ |
| static |
| inline void normalizeNextContraction(collIterate *data) |
| { |
| int32_t strsize; |
| UErrorCode status = U_ZERO_ERROR; |
| /* because the pointer points to the next character */ |
| const UChar *pStart = data->pos - 1; |
| const UChar *pEnd; |
| |
| if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { |
| data->writableBuffer.setTo(*(pStart - 1)); |
| strsize = 1; |
| } |
| else { |
| strsize = data->writableBuffer.length(); |
| } |
| |
| pEnd = data->fcdPosition; |
| |
| data->writableBuffer.append( |
| data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status)); |
| if(U_FAILURE(status)) { |
| return; |
| } |
| |
| data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; |
| data->origFlags = data->flags; |
| data->flags |= UCOL_ITER_INNORMBUF; |
| data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| } |
| |
| /** |
| * Contraction character management function that returns the next character |
| * for the forwards iterator. |
| * Does nothing if the next character is in buffer and not the first character |
| * in it. |
| * Else it checks next character in data string to see if it is normalizable. |
| * If it is not, the character is simply copied into the buffer, else |
| * the whole normalized substring is copied into the buffer, including the |
| * current character. |
| * @param data collation element iterator data |
| * @return next character |
| */ |
| static |
| inline UChar getNextNormalizedChar(collIterate *data) |
| { |
| UChar nextch; |
| UChar ch; |
| // Here we need to add the iterator code. One problem is the way |
| // end of string is handled. If we just return next char, it could |
| // be the sentinel. Most of the cases already check for this, but we |
| // need to be sure. |
| if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { |
| /* if no normalization and not in buffer. */ |
| if(data->flags & UCOL_USE_ITERATOR) { |
| return (UChar)data->iterator->next(data->iterator); |
| } else { |
| return *(data->pos ++); |
| } |
| } |
| |
| //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { |
| //normalizeIterator(data); |
| //} |
| |
| UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); |
| if ((innormbuf && *data->pos != 0) || |
| (data->fcdPosition != NULL && !innormbuf && |
| data->pos < data->fcdPosition)) { |
| /* |
| if next character is in normalized buffer, no further normalization |
| is required |
| */ |
| return *(data->pos ++); |
| } |
| |
| if (data->flags & UCOL_ITER_HASLEN) { |
| /* in data string */ |
| if (data->pos + 1 == data->endp) { |
| return *(data->pos ++); |
| } |
| } |
| else { |
| if (innormbuf) { |
| // inside the normalization buffer, but at the end |
| // (since we encountered zero). This means, in the |
| // case we're using char iterator, that we need to |
| // do another round of normalization. |
| //if(data->origFlags & UCOL_USE_ITERATOR) { |
| // we need to restore original flags, |
| // otherwise, we'll lose them |
| //data->flags = data->origFlags; |
| //normalizeIterator(data); |
| //return *(data->pos++); |
| //} else { |
| /* |
| in writable buffer, at this point fcdPosition can not be |
| pointing to the end of the data string. see contracting tag. |
| */ |
| if(data->fcdPosition) { |
| if (*(data->fcdPosition + 1) == 0 || |
| data->fcdPosition + 1 == data->endp) { |
| /* at the end of the string, dump it into the normalizer */ |
| data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; |
| // Check if data->pos received a null pointer |
| if (data->pos == NULL) { |
| return (UChar)-1; // Return to indicate error. |
| } |
| return *(data->fcdPosition ++); |
| } |
| data->pos = data->fcdPosition; |
| } else if(data->origFlags & UCOL_USE_ITERATOR) { |
| // if we are here, we're using a normalizing iterator. |
| // we should just continue further. |
| data->flags = data->origFlags; |
| data->pos = NULL; |
| return (UChar)data->iterator->next(data->iterator); |
| } |
| //} |
| } |
| else { |
| if (*(data->pos + 1) == 0) { |
| return *(data->pos ++); |
| } |
| } |
| } |
| |
| ch = *data->pos ++; |
| nextch = *data->pos; |
| |
| /* |
| * if the current character is not fcd. |
| * Trailing combining class == 0. |
| */ |
| if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && |
| (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || |
| ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { |
| /* |
| Need a more complete FCD check and possible normalization. |
| normalize substring will be appended to buffer |
| */ |
| if (collIterFCD(data)) { |
| normalizeNextContraction(data); |
| return *(data->pos ++); |
| } |
| else if (innormbuf) { |
| /* fcdposition shifted even when there's no normalization, if we |
| don't input the rest into this, we'll get the wrong position when |
| we reach the end of the writableBuffer */ |
| int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); |
| data->pos = insertBufferEnd(data, data->pos - 1, length); |
| // Check if data->pos received a null pointer |
| if (data->pos == NULL) { |
| return (UChar)-1; // Return to indicate error. |
| } |
| return *(data->pos ++); |
| } |
| } |
| |
| if (innormbuf) { |
| /* |
| no normalization is to be done hence only one character will be |
| appended to the buffer. |
| */ |
| data->pos = insertBufferEnd(data, ch) + 1; |
| // Check if data->pos received a null pointer |
| if (data->pos == NULL) { |
| return (UChar)-1; // Return to indicate error. |
| } |
| } |
| |
| /* points back to the pos in string */ |
| return ch; |
| } |
| |
| |
| |
| /** |
| * Function to copy the buffer into writableBuffer and sets the fcd position to |
| * the correct position |
| * @param source data string source |
| * @param buffer character buffer |
| */ |
| static |
| inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer) |
| { |
| /* okay confusing part here. to ensure that the skipped characters are |
| considered later, we need to place it in the appropriate position in the |
| normalization buffer and reassign the pos pointer. simple case if pos |
| reside in string, simply copy to normalization buffer and |
| fcdposition = pos, pos = start of normalization buffer. if pos in |
| normalization buffer, we'll insert the copy infront of pos and point pos |
| to the start of the normalization buffer. why am i doing these copies? |
| well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does |
| not require any changes, which be really painful. */ |
| if (source->flags & UCOL_ITER_INNORMBUF) { |
| int32_t replaceLength = source->pos - source->writableBuffer.getBuffer(); |
| source->writableBuffer.replace(0, replaceLength, buffer); |
| } |
| else { |
| source->fcdPosition = source->pos; |
| source->origFlags = source->flags; |
| source->flags |= UCOL_ITER_INNORMBUF; |
| source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); |
| source->writableBuffer = buffer; |
| } |
| |
| source->pos = source->writableBuffer.getTerminatedBuffer(); |
| } |
| |
| /** |
| * Function to get the discontiguous collation element within the source. |
| * Note this function will set the position to the appropriate places. |
| * Characters are read with getNextNormalizedChar and compared against the |
| * contraction table; the ones that do not take part in a match are collected |
| * in a local buffer and handed to setDiscontiguosAttribute when a match is |
| * returned. |
| * @param coll current collator used |
| * @param source data string source |
| * @param constart index to the start character in the contraction table |
| * @return the collation element of the matched discontiguous contraction; if |
| * nothing matched, the iterator state saved on entry is restored and the |
| * element stored for the (possibly updated) contraction start is returned |
| */ |
| static |
| uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, |
| const UChar *constart) |
| { |
| /* source->pos currently points to the second combining character after |
| the start character */ |
| const UChar *temppos = source->pos; |
| UnicodeString buffer; /* collects the characters that were skipped over */ |
| const UChar *tempconstart = constart; /* contraction table entry currently being matched */ |
| uint8_t tempflags = source->flags; /* restored on the no-match exit path */ |
| UBool multicontraction = FALSE; |
| collIterateState discState; |
| |
| backupState(source, &discState); |
| |
| buffer.setTo(peekCharacter(source, -1)); /* buffer starts with the character peeked at offset -1 */ |
| for (;;) { |
| UChar *UCharOffset; |
| UChar schar, |
| tchar; |
| uint32_t result; |
| |
| if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp) |
| || (peekCharacter(source, 0) == 0 && |
| //|| (*source->pos == 0 && |
| ((source->flags & UCOL_ITER_INNORMBUF) == 0 || |
| source->fcdPosition == NULL || |
| source->fcdPosition == source->endp || |
| *(source->fcdPosition) == 0 || |
| u_getCombiningClass(*(source->fcdPosition)) == 0)) || |
| /* end of string in null terminated string or stopped by a |
| null character, note fcd does not always point to a base |
| character after the discontiguous change */ |
| u_getCombiningClass(peekCharacter(source, 0)) == 0) { |
| //u_getCombiningClass(*(source->pos)) == 0) { |
| //constart = (UChar *)coll->image + getContractOffset(CE); |
| if (multicontraction) { /* a shorter match was remembered earlier: return it */ |
| source->pos = temppos - 1; |
| setDiscontiguosAttribute(source, buffer); |
| return *(coll->contractionCEs + |
| (tempconstart - coll->contractionIndex)); |
| } |
| constart = tempconstart; |
| break; |
| } |
| |
| UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/ |
| schar = getNextNormalizedChar(source); |
| |
| while (schar > (tchar = *UCharOffset)) { /* skip the table entries that are smaller than schar */ |
| UCharOffset++; |
| } |
| |
| if (schar != tchar) { |
| /* not the correct codepoint. we stuff the current codepoint into |
| the discontiguous buffer and try the next character */ |
| buffer.append(schar); |
| continue; |
| } |
| else { |
| if (u_getCombiningClass(schar) == |
| u_getCombiningClass(peekCharacter(source, -2))) { /* same combining class as the character peeked at offset -2: handled like a mismatch */ |
| //u_getCombiningClass(*(source->pos - 2))) { |
| buffer.append(schar); |
| continue; |
| } |
| result = *(coll->contractionCEs + |
| (UCharOffset - coll->contractionIndex)); |
| } |
| |
| if (result == UCOL_NOT_FOUND) { |
| break; |
| } else if (isContraction(result)) { |
| /* this is a multi-contraction*/ |
| tempconstart = (UChar *)coll->image + getContractOffset(result); |
| if (*(coll->contractionCEs + (constart - coll->contractionIndex)) |
| != UCOL_NOT_FOUND) { |
| multicontraction = TRUE; |
| temppos = source->pos + 1; |
| } |
| } else { |
| setDiscontiguosAttribute(source, buffer); |
| return result; |
| } |
| } |
| |
| /* no problems simply reverting just like that, |
| if we are in string before getting into this function, points back to |
| string hence no problem. |
| if we are in normalization buffer before getting into this function, |
| since we'll never use another normalization within this function, we |
| know that fcdposition points to a base character. the normalization buffer |
| never change, hence this revert works. */ |
| loadState(source, &discState, TRUE); |
| goBackOne(source); |
| |
| //source->pos = temppos - 1; |
| source->flags = tempflags; |
| return *(coll->contractionCEs + (constart - coll->contractionIndex)); |
| } |
| |
| static |
| inline UBool isNonChar(UChar32 cp) { |
| return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)); |
| } |
| |
| /* now uses Mark's getImplicitPrimary code */ |
| static |
| inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { |
| if(isNonChar(cp)) { |
| return 0; |
| } |
| uint32_t r = uprv_uca_getImplicitPrimary(cp); |
| *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; |
| collationSource->offsetRepeatCount += 1; |
| return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' |
| } |
| |
| /** |
| * Inserts the argument character into the front of the buffer replacing the |
| * front null terminator. |
| * @param data collation element iterator data |
| * @param ch character to be appended |
| */ |
| static |
| inline void insertBufferFront(collIterate *data, UChar ch) |
| { |
| data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2; |
| } |
| |
| /** |
| * Special normalization function for contraction in the previous iterator. |
| * This normalization sequence will place the current character at source->pos |
| * and its following normalized sequence into the buffer. |
| * The fcd position, pos will be changed. |
| * pos will now point to positions in the buffer. |
| * Flags will be changed accordingly. |
| * @param data collation iterator data |
| */ |
| static |
| inline void normalizePrevContraction(collIterate *data, UErrorCode *status) |
| { |
| const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ |
| const UChar *pStart; |
| |
| UnicodeString endOfBuffer; |
| if (data->flags & UCOL_ITER_HASLEN) { |
| /* |
| normalization buffer not used yet, we'll pull down the next |
| character into the end of the buffer |
| */ |
| endOfBuffer.setTo(*pEnd); |
| } |
| else { |
| endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL |
| } |
| |
| if (data->fcdPosition == NULL) { |
| pStart = data->string; |
| } |
| else { |
| pStart = data->fcdPosition + 1; |
| } |
| int32_t normLen = |
| data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), |
| data->writableBuffer, |
| *status). |
| length(); |
| if(U_FAILURE(*status)) { |
| return; |
| } |
| /* |
| this puts the null termination infront of the normalized string instead |
| of the end |
| */ |
| data->pos = |
| data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + |
| 1 + normLen; |
| data->origFlags = data->flags; |
| data->flags |= UCOL_ITER_INNORMBUF; |
| data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); |
| } |
| |
| /** |
| * Contraction character management function that returns the previous character |
| * for the backwards iterator. |
| * Does nothing if the previous character is in buffer and not the first |
| * character in it. |
| * Else it checks previous character in data string to see if it is |
| * normalizable. |
| * If it is not, the character is simply copied into the buffer, else |
| * the whole normalized substring is copied into the buffer, including the |
| * current character. |
| * @param data collation element iterator data |
| * @return previous character |
| */ |
| static |
| inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) |
| { |
| UChar prevch; |
| UChar ch; |
| const UChar *start; |
| UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); |
| if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || |
| (innormbuf && *(data->pos - 1) != 0)) { |
| /* |
| if no normalization. |
| if previous character is in normalized buffer, no further normalization |
| is required |
| */ |
| if(data->flags & UCOL_USE_ITERATOR) { |
| data->iterator->move(data->iterator, -1, UITER_CURRENT); |
| return (UChar)data->iterator->next(data->iterator); |
| } else { |
| return *(data->pos - 1); |
| } |
| } |
| |
| start = data->pos; |
| if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { |
| /* in data string */ |
| if ((start - 1) == data->string) { |
| return *(start - 1); |
| } |
| start --; |
| ch = *start; |
| prevch = *(start - 1); |
| } |
| else { |
| /* |
| in writable buffer, at this point fcdPosition can not be NULL. |
| see contracting tag. |
| */ |
| if (data->fcdPosition == data->string) { |
| /* at the start of the string, just dump it into the normalizer */ |
| insertBufferFront(data, *(data->fcdPosition)); |
| data->fcdPosition = NULL; |
| return *(data->pos - 1); |
| } |
| start = data->fcdPosition; |
| ch = *start; |
| prevch = *(start - 1); |
| } |
| /* |
| * if the current character is not fcd. |
| * Trailing combining class == 0. |
| */ |
| if (data->fcdPosition > start && |
| (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) |
| { |
| /* |
| Need a more complete FCD check and possible normalization. |
| normalize substring will be appended to buffer |
| */ |
| const UChar *backuppos = data->pos; |
| data->pos = start; |
| if (collPrevIterFCD(data)) { |
| normalizePrevContraction(data, status); |
| return *(data->pos - 1); |
| } |
| data->pos = backuppos; |
| data->fcdPosition ++; |
| } |
| |
| if (innormbuf) { |
| /* |
| no normalization is to be done hence only one character will be |
| appended to the buffer. |
| */ |
| insertBufferFront(data, ch); |
| data->fcdPosition --; |
| } |
| |
| return ch; |
| } |
| |
| /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ |
| /* It is called by getNextCE */ |
| |
| /* The following should be even */ |
| #define UCOL_MAX_DIGITS_FOR_NUMBER 254 |
| |
| uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { |
| collIterateState entryState; |
| backupState(source, &entryState); |
| UChar32 cp = ch; |
| |
| for (;;) { |
| // This loop will repeat only in the case of contractions, and only when a contraction |
| // is found and the first CE resulting from that contraction is itself a special |
| // (an expansion, for example.) All other special CE types are fully handled the |
| // first time through, and the loop exits. |
| |
| const uint32_t *CEOffset = NULL; |
| switch(getCETag(CE)) { |
| case NOT_FOUND_TAG: |
| /* This one is not found, and we'll let somebody else bother about it... no more games */ |
| return CE; |
| case SPEC_PROC_TAG: |
| { |
| // Special processing is getting a CE that is preceded by a certain prefix |
| // Currently this is only needed for optimizing Japanese length and iteration marks. |
| // When we encouter a special processing tag, we go backwards and try to see if |
| // we have a match. |
| // Contraction tables are used - so the whole process is not unlike contraction. |
| // prefix data is stored backwards in the table. |
| const UChar *UCharOffset; |
| UChar schar, tchar; |
| collIterateState prefixState; |
| backupState(source, &prefixState); |
| loadState(source, &entryState, TRUE); |
| goBackOne(source); // We want to look at the point where we entered - actually one |
| // before that... |
| |
| for(;;) { |
| // This loop will run once per source string character, for as long as we |
| // are matching a potential contraction sequence |
| |
| // First we position ourselves at the begining of contraction sequence |
| const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
| if (collIter_bos(source)) { |
| CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); |
| break; |
| } |
| schar = getPrevNormalizedChar(source, status); |
| goBackOne(source); |
| |
| while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
| UCharOffset++; |
| } |
| |
| if (schar == tchar) { |
| // Found the source string char in the table. |
| // Pick up the corresponding CE from the table. |
| CE = *(coll->contractionCEs + |
| (UCharOffset - coll->contractionIndex)); |
| } |
| else |
| { |
| // Source string char was not in the table. |
| // We have not found the prefix. |
| CE = *(coll->contractionCEs + |
| (ContractionStart - coll->contractionIndex)); |
| } |
| |
| if(!isPrefix(CE)) { |
| // The source string char was in the contraction table, and the corresponding |
| // CE is not a prefix CE. We found the prefix, break |
| // out of loop, this CE will end up being returned. This is the normal |
| // way out of prefix handling when the source actually contained |
| // the prefix. |
| break; |
| } |
| } |
| if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue |
| loadState(source, &prefixState, TRUE); |
| if(source->origFlags & UCOL_USE_ITERATOR) { |
| source->flags = source->origFlags; |
| } |
| } else { // prefix search was a failure, we have to backup all the way to the start |
| loadState(source, &entryState, TRUE); |
| } |
| break; |
| } |
| case CONTRACTION_TAG: |
| { |
| /* This should handle contractions */ |
| collIterateState state; |
| backupState(source, &state); |
| uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; |
| const UChar *UCharOffset; |
| UChar schar, tchar; |
| |
| for (;;) { |
| /* This loop will run once per source string character, for as long as we */ |
| /* are matching a potential contraction sequence */ |
| |
| /* First we position ourselves at the begining of contraction sequence */ |
| const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); |
| |
| if (collIter_eos(source)) { |
| // Ran off the end of the source string. |
| CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); |
| // So we'll pick whatever we have at the point... |
| if (CE == UCOL_NOT_FOUND) { |
| // back up the source over all the chars we scanned going into this contraction. |
| CE = firstCE; |
| loadState(source, &state, TRUE); |
| if(source->origFlags & UCOL_USE_ITERATOR) { |
| source->flags = source->origFlags; |
| } |
| } |
| break; |
| } |
| |
| uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ |
| uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); |
| |
| schar = getNextNormalizedChar(source); |
| while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
| UCharOffset++; |
| } |
| |
| if (schar == tchar) { |
| // Found the source string char in the contraction table. |
| // Pick up the corresponding CE from the table. |
| CE = *(coll->contractionCEs + |
| (UCharOffset - coll->contractionIndex)); |
| } |
| else |
| { |
| // Source string char was not in contraction table. |
| // Unless we have a discontiguous contraction, we have finished |
| // with this contraction. |
| // in order to do the proper detection, we |
| // need to see if we're dealing with a supplementary |
| /* We test whether the next two char are surrogate pairs. |
| * This test is done if the iterator is not NULL. |
| * If there is no surrogate pair, the iterator |
| * goes back one if needed. */ |
| UChar32 miss = schar; |
| if (source->iterator) { |
| UChar32 surrNextChar; /* the next char in the iteration to test */ |
| int32_t prevPos; /* holds the previous position before move forward of the source iterator */ |
| if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { |
| prevPos = source->iterator->index; |
| surrNextChar = getNextNormalizedChar(source); |
| if (U16_IS_TRAIL(surrNextChar)) { |
| miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); |
| } else if (prevPos < source->iterator->index){ |
| goBackOne(source); |
| } |
| } |
| } else if (U16_IS_LEAD(schar)) { |
| miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); |
| } |
| |
| uint8_t sCC; |
| if (miss < 0x300 || |
| maxCC == 0 || |
| (sCC = i_getCombiningClass(miss, coll)) == 0 || |
| sCC>maxCC || |
| (allSame != 0 && sCC == maxCC) || |
| collIter_eos(source)) |
| { |
| // Contraction can not be discontiguous. |
| goBackOne(source); // back up the source string by one, |
| // because the character we just looked at was |
| // not part of the contraction. */ |
| if(U_IS_SUPPLEMENTARY(miss)) { |
| goBackOne(source); |
| } |
| CE = *(coll->contractionCEs + |
| (ContractionStart - coll->contractionIndex)); |
| } else { |
| // |
| // Contraction is possibly discontiguous. |
| // Scan more of source string looking for a match |
| // |
| UChar tempchar; |
| /* find the next character if schar is not a base character |
| and we are not yet at the end of the string */ |
| tempchar = getNextNormalizedChar(source); |
| // probably need another supplementary thingie here |
| goBackOne(source); |
| if (i_getCombiningClass(tempchar, coll) == 0) { |
| goBackOne(source); |
| if(U_IS_SUPPLEMENTARY(miss)) { |
| goBackOne(source); |
| } |
| /* Spit out the last char of the string, wasn't tasty enough */ |
| CE = *(coll->contractionCEs + |
| (ContractionStart - coll->contractionIndex)); |
| } else { |
| CE = getDiscontiguous(coll, source, ContractionStart); |
| } |
| } |
| } // else after if(schar == tchar) |
| |
| if(CE == UCOL_NOT_FOUND) { |
| /* The Source string did not match the contraction that we were checking. */ |
| /* Back up the source position to undo the effects of having partially */ |
| /* scanned through what ultimately proved to not be a contraction. */ |
| loadState(source, &state, TRUE); |
| CE = firstCE; |
| break; |
| } |
| |
| if(!isContraction(CE)) { |
| // The source string char was in the contraction table, and the corresponding |
| // CE is not a contraction CE. We completed the contraction, break |
| // out of loop, this CE will end up being returned. This is the normal |
| // way out of contraction handling when the source actually contained |
| // the contraction. |
| break; |
| } |
| |
| |
| // The source string char was in the contraction table, and the corresponding |
| // CE is IS a contraction CE. We will continue looping to check the source |
| // string for the remaining chars in the contraction. |
| uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); |
| if(tempCE != UCOL_NOT_FOUND) { |
| // We have scanned a a section of source string for which there is a |
| // CE from the contraction table. Remember the CE and scan position, so |
| // that we can return to this point if further scanning fails to |
| // match a longer contraction sequence. |
| firstCE = tempCE; |
| |
| goBackOne(source); |
| backupState(source, &state); |
| getNextNormalizedChar(source); |
| |
| // Another way to do this is: |
| //collIterateState tempState; |
| //backupState(source, &tempState); |
| //goBackOne(source); |
| //backupState(source, &state); |
| //loadState(source, &tempState, TRUE); |
| |
| // The problem is that for incomplete contractions we have to remember the previous |
| // position. Before, the only thing I needed to do was state.pos--; |
| // After iterator introduction and especially after introduction of normalizing |
| // iterators, it became much more difficult to decrease the saved state. |
| // I'm not yet sure which of the two methods above is faster. |
| } |
| } // for(;;) |
| break; |
| } // case CONTRACTION_TAG: |
| case LONG_PRIMARY_TAG: |
| { |
| *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; |
| CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; |
| source->offsetRepeatCount += 1; |
| return CE; |
| } |
| case EXPANSION_TAG: |
| { |
| /* This should handle expansion. */ |
| /* NOTE: we can encounter both continuations and expansions in an expansion! */ |
| /* I have to decide where continuations are going to be dealt with */ |
| uint32_t size; |
| uint32_t i; /* general counter */ |
| |
| CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ |
| size = getExpansionCount(CE); |
| CE = *CEOffset++; |
| //source->offsetRepeatCount = -1; |
| |
| if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ |
| for(i = 1; i<size; i++) { |
| *(source->CEpos++) = *CEOffset++; |
| source->offsetRepeatCount += 1; |
| } |
| } else { /* else, we do */ |
| while(*CEOffset != 0) { |
| *(source->CEpos++) = *CEOffset++; |
| source->offsetRepeatCount += 1; |
| } |
| } |
| |
| return CE; |
| } |
| case DIGIT_TAG: |
| { |
| /* |
| We do a check to see if we want to collate digits as numbers; if so we generate |
| a custom collation key. Otherwise we pull out the value stored in the expansion table. |
| */ |
| //uint32_t size; |
| uint32_t i; /* general counter */ |
| |
| if (source->coll->numericCollation == UCOL_ON){ |
| collIterateState digitState = {0,0,0,0,0,0,0,0,0}; |
| UChar32 char32 = 0; |
| int32_t digVal = 0; |
| |
| uint32_t digIndx = 0; |
| uint32_t endIndex = 0; |
| uint32_t trailingZeroIndex = 0; |
| |
| uint8_t collateVal = 0; |
| |
| UBool nonZeroValReached = FALSE; |
| |
| uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. |
| /* |
| We parse the source string until we hit a char that's NOT a digit. |
| Use this u_charDigitValue. This might be slow because we have to |
| handle surrogates... |
| */ |
| /* |
| if (U16_IS_LEAD(ch)){ |
| if (!collIter_eos(source)) { |
| backupState(source, &digitState); |
| UChar trail = getNextNormalizedChar(source); |
| if(U16_IS_TRAIL(trail)) { |
| char32 = U16_GET_SUPPLEMENTARY(ch, trail); |
| } else { |
| loadState(source, &digitState, TRUE); |
| char32 = ch; |
| } |
| } else { |
| char32 = ch; |
| } |
| } else { |
| char32 = ch; |
| } |
| digVal = u_charDigitValue(char32); |
| */ |
| digVal = u_charDigitValue(cp); // if we have arrived here, we have |
| // already processed possible supplementaries that triggered the digit tag - |
| // all supplementaries are marked in the UCA. |
| /* |
| We pad a zero in front of the first element anyways. This takes |
| care of the (probably) most common case where people are sorting things followed |
| by a single digit |
| */ |
| digIndx++; |
| for(;;){ |
| // Make sure we have enough space. No longer needed; |
| // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER |
| // (it has been pre-incremented) so we just ensure that numTempBuf is big enough |
| // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). |
| |
| // Skipping over leading zeroes. |
| if (digVal != 0) { |
| nonZeroValReached = TRUE; |
| } |
| if (nonZeroValReached) { |
| /* |
| We parse the digit string into base 100 numbers (this fits into a byte). |
| We only add to the buffer in twos, thus if we are parsing an odd character, |
| that serves as the 'tens' digit, while if we are parsing an even one, that |
| is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into |
| a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid |
| overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less |
| than all the other bytes. |
| */ |
| |
| if (digIndx % 2 == 1){ |
| collateVal += (uint8_t)digVal; |
| |
| // We don't enter the low-order-digit case unless we've already seen |
| // the high order, or for the first digit, which is always non-zero. |
| if (collateVal != 0) |
| trailingZeroIndex = 0; |
| |
| numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; |
| collateVal = 0; |
| } |
| else{ |
| // We drop the collation value into the buffer so if we need to do |
| // a "front patch" we don't have to check to see if we're hitting the |
| // last element. |
| collateVal = (uint8_t)(digVal * 10); |
| |
| // Check for trailing zeroes. |
| if (collateVal == 0) |
| { |
| if (!trailingZeroIndex) |
| trailingZeroIndex = (digIndx/2) + 2; |
| } |
| else |
| trailingZeroIndex = 0; |
| |
| numTempBuf[(digIndx/2) + 2] = collateVal* |