blob: cbb6684f6de637f93f392fd5761928a1fce79e7e [file] [log] [blame] [edit]
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucol.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* Modification history
* Date Name Comments
* 1996-1999 various members of ICU team maintained C API for collation framework
* 02/16/2001 synwee Added internal method getPrevSpecialCE
* 03/01/2001 synwee Added maxexpansion functionality.
* 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/coleitr.h"
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unicode/ustring.h"
#include "ucol_imp.h"
#include "bocsu.h"
#include "normalizer2impl.h"
#include "unorm_it.h"
#include "umutex.h"
#include "cmemory.h"
#include "ucln_in.h"
#include "cstring.h"
#include "utracimp.h"
#include "putilimp.h"
#include "uassert.h"
#ifdef UCOL_DEBUG
#include <stdio.h>
#endif
U_NAMESPACE_USE
// Element count of a true array (not a pointer), cast to int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// Byte mask, byte shift and limit constants; their users are outside this part of the file.
// NOTE(review): presumably used when decoding FCD / combining-class values -- confirm at the call sites.
#define LAST_BYTE_MASK_ 0xFF
#define SECOND_LAST_BYTE_SHIFT_ 8
#define ZERO_CC_LIMIT_ 0xC0
// Cached pointer to the normalizer's FCD trie index.
// It is fetched lazily in ucol_initCollator() and stays the same between
// calls to u_cleanup(), which is why writing to it is not synchronized.
// It is reset to NULL in ucol_cleanup().
static const uint16_t *fcdTrieIndex=NULL;
// Code points at fcdHighStart and above have a zero FCD value.
// Filled in by the same unorm_getFCDTrieIndex() call that sets fcdTrieIndex.
static UChar32 fcdHighStart = 0;
// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should normally come from the UCA data, but if one
// is running without the UCA, that could be a problem.
static const int32_t maxRegularPrimary = 0xA0;
static const int32_t minImplicitPrimary = 0xE0;
static const int32_t maxImplicitPrimary = 0xE4;
U_CDECL_BEGIN
/**
 * Library cleanup hook (registered from ucol_initCollator()).
 * Drops the cached FCD trie index so that the next collator initialization
 * fetches it again.
 * @return always TRUE
 */
static UBool U_CALLCONV
ucol_cleanup(void)
{
    // Only the cached pointer is forgotten; nothing is freed here.
    fcdTrieIndex = NULL;

    return TRUE;
}
/**
 * Folding-offset callback installed into the collator's UTrie
 * (see ucol_initCollator()).
 * @param data trie data value
 * @return the low 24 bits of data
 */
static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data) {
    const uint32_t low24Bits = data & 0xFFFFFF;
    return (int32_t)low24Bits;
}
U_CDECL_END
/**
 * Initializes a collIterate for iterating over a UChar string.
 *
 * @param collator     collator whose normalization, hiragana and strength
 *                     settings determine the iterator flags
 * @param sourceString text to iterate over
 * @param sourceLen    length of sourceString, or negative if the string is
 *                     NUL-terminated
 * @param s            iterator state to fill in
 * @param status       passed to Normalizer2Factory::getNFDInstance(); it is
 *                     not otherwise examined here
 */
static
inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
                              int32_t sourceLen, collIterate *s,
                              UErrorCode *status)
{
    // Text position: start and current position both at the first code unit.
    s->pos = sourceString;
    s->string = sourceString;
    s->origFlags = 0;
    s->flags = 0;
    if (sourceLen < 0) {
        /* NUL-terminated input: no end pointer, which makes the end-of-string
           check for fcdPosition easier */
        s->endp = NULL;
    } else {
        s->flags |= UCOL_ITER_HASLEN;
        s->endp = (UChar *)sourceString + sourceLen;
    }

    // CE buffer: start with the built-in array, no heap extension yet.
    s->extendCEs = NULL;
    s->extendCEsSize = 0;
    s->toReturn = s->CEs;
    s->CEpos = s->CEs;

    // Offset bookkeeping starts out empty.
    s->offsetBuffer = NULL;
    s->offsetBufferSize = 0;
    s->offsetStore = NULL;
    s->offsetReturn = NULL;
    s->offsetRepeatValue = 0;
    s->offsetRepeatCount = 0;

    s->coll = collator;
    s->nfd = Normalizer2Factory::getNFDInstance(*status);
    s->fcdPosition = 0;

    // Flags derived from collator attributes.
    if (collator->normalizationMode == UCOL_ON) {
        s->flags |= UCOL_ITER_NORM;
    }
    if (collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
        s->flags |= UCOL_HIRAGANA_Q;
    }

    // String-based iteration: no character iterator attached.
    s->iterator = NULL;
}
/**
 * Out-of-line entry point to IInit_collIterate(), for use from other files.
 * All arguments are forwarded unchanged.
 */
U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator *collator,
                      const UChar *sourceString, int32_t sourceLen,
                      collIterate *s,
                      UErrorCode *status)
{
    IInit_collIterate(collator, sourceString, sourceLen, s, status);
}
/**
 * Allocates a collIterate with operator new.
 * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR if the
 *               allocation yields NULL
 * @return the new object, or NULL if status already indicated a failure or
 *         the allocation failed
 */
U_CAPI collIterate * U_EXPORT2
uprv_new_collIterate(UErrorCode *status) {
    collIterate *created = NULL;
    if (U_SUCCESS(*status)) {
        created = new collIterate;
        if (created == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    return created;
}
/**
 * Destroys a collIterate with operator delete.
 * @param s object to delete
 */
U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate *s)
{
    delete s;
}
/**
 * Reports whether an iterator has nothing left to read.
 * @param s iterator to test; NULL counts as "at end"
 * @return TRUE if s is NULL or its current position equals its end pointer
 */
U_CAPI UBool U_EXPORT2
uprv_collIterateAtEnd(collIterate *s) {
    if (s == NULL) {
        return TRUE;
    }
    return s->pos == s->endp;
}
/**
 * Takes a snapshot of a collIterate so that loadState() can return to it.
 *
 * Saved are the FCD position, both flag words, the current position, and
 * the address and length of the normalization buffer. If a character
 * iterator is attached, its state is saved as well.
 *
 * @param data   collIterate to snapshot
 * @param backup storage that receives the snapshot
 */
static
inline void backupState(const collIterate *data, collIterateState *backup)
{
    backup->pos = data->pos;
    backup->fcdPosition = data->fcdPosition;
    backup->flags = data->flags;
    backup->origFlags = data->origFlags;
    backup->bufferaddress = data->writableBuffer.getBuffer();
    backup->buffersize = data->writableBuffer.length();
    backup->iteratorIndex = 0;
    backup->iteratorMove = 0;

    if (data->iterator == NULL) {
        return;
    }

    backup->iteratorIndex = data->iterator->getState(data->iterator);
    if (backup->iteratorIndex != UITER_NO_STATE) {
        return;
    }

    // The iterator cannot report a state at its current position. Step
    // backwards, counting the steps, until it can.
    for (;;) {
        backup->iteratorIndex = data->iterator->getState(data->iterator);
        if (backup->iteratorIndex != UITER_NO_STATE) {
            break;
        }
        backup->iteratorMove++;
        data->iterator->move(data->iterator, -1, UITER_CURRENT);
    }
    // Walk forward again by the same number of steps so that the iterator
    // ends up where it started.
    data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
}
/**
 * Restores a collIterate from a snapshot taken by backupState().
 *
 * Flags and position are always restored. If a character iterator is
 * attached, its saved state is set and the recorded extra move is replayed.
 * If the iterator is inside the normalization buffer and that buffer has
 * been reallocated since the snapshot, the position is re-based onto the new
 * buffer. The FCD position is restored only when the restored flags say the
 * iterator is not inside the normalization buffer.
 *
 * @param data collIterate to restore
 * @param backup snapshot to restore from
 * @param forwards boolean to indicate if forwards iteration is used,
 * false indicates backwards iteration
 */
static
inline void loadState(collIterate *data, const collIterateState *backup,
UBool forwards)
{
// Receives the error code from setState(); it is not examined afterwards.
UErrorCode status = U_ZERO_ERROR;
data->flags = backup->flags;
data->origFlags = backup->origFlags;
if(data->iterator != NULL) {
//data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
// Replay the forward move that backupState() recorded while looking for a usable state.
if(backup->iteratorMove != 0) {
data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
}
}
data->pos = backup->pos;
if ((data->flags & UCOL_ITER_INNORMBUF) &&
data->writableBuffer.getBuffer() != backup->bufferaddress) {
/*
The normalization buffer has been reallocated since the snapshot, so
the saved position points into the old buffer and must be recomputed.
Note that the new buffer has to contain the contents of the old buffer.
*/
if (forwards) {
// Forwards: keep the same offset from the start of the buffer.
data->pos = data->writableBuffer.getTerminatedBuffer() +
(data->pos - backup->bufferaddress);
}
else {
/* backwards direction: keep the same distance from the end of the buffer */
int32_t temp = backup->buffersize -
(int32_t)(data->pos - backup->bufferaddress);
data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
}
}
if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
/*
This is a little tricky.
If we are initially not in the normalization buffer, even if we
normalize at a later stage, the data in the buffer will be
ignored, since we skip back up to the data string.
However, if we are already in the normalization buffer, any
further normalization will pull data into the normalization
buffer and modify the fcdPosition.
Since we are keeping the data in the buffer for use, the
fcdPosition cannot be reverted in that case.
*/
data->fcdPosition = backup->fcdPosition;
}
}
/**
 * Moves the iterator's collation elements into a newly allocated heap array.
 *
 * The CEs currently live either in data->extendCEs or, if that is NULL, in
 * the built-in data->CEs array. Everything up to data->CEpos is copied.
 *
 * @param data        iterator whose CE storage is replaced
 * @param newCapacity capacity of the new array, in 4-byte CEs
 * @return FALSE if the allocation failed (data is left untouched),
 *         TRUE otherwise
 */
static UBool
reallocCEs(collIterate *data, int32_t newCapacity) {
    uint32_t *current = (data->extendCEs != NULL) ? data->extendCEs : data->CEs;
    int32_t used = data->CEpos - current;

    uint32_t *grown = (uint32_t *)uprv_malloc(newCapacity * 4);
    if (grown != NULL) {
        uprv_memcpy(grown, current, used * 4);
        // Releases the previous heap array; extendCEs is NULL the first time.
        uprv_free(data->extendCEs);
        data->extendCEs = grown;
        data->extendCEsSize = newCapacity;
        data->CEpos = grown + used;
        return TRUE;
    }
    return FALSE;
}
/**
 * Doubles the iterator's CE capacity.
 * @param data iterator whose CE storage should grow
 * @return the result of reallocCEs(): FALSE on allocation failure
 */
static UBool
increaseCEsCapacity(collIterate *data) {
    // Current capacity: the heap array if there is one, else the built-in array.
    const int32_t currentCapacity =
        (data->extendCEs != NULL) ? data->extendCEsSize : LENGTHOF(data->CEs);
    return reallocCEs(data, 2 * currentCapacity);
}
/**
 * Makes sure the iterator can hold at least minCapacity CEs.
 *
 * When growth is needed, the new capacity is the larger of minCapacity and
 * twice the current capacity.
 *
 * @param data        iterator whose CE storage is checked
 * @param minCapacity required capacity, in CEs
 * @return TRUE if the capacity is sufficient (possibly after growing),
 *         FALSE if the reallocation failed
 */
static UBool
ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    const int32_t currentCapacity =
        (data->extendCEs != NULL) ? data->extendCEsSize : LENGTHOF(data->CEs);
    if (currentCapacity >= minCapacity) {
        return TRUE;
    }

    const int32_t doubled = currentCapacity * 2;
    int32_t target = doubled;
    if (minCapacity > doubled) {
        target = minCapacity;
    }
    return reallocCEs(data, target);
}
/*
 * collIter_eos()
 * Checks whether a collIterate is positioned at the end of its source
 * string, taking into account that the current position may be inside the
 * NUL-terminated normalization buffer while source text is still pending.
 */
static
inline UBool collIter_eos(collIterate *s) {
    // Iterator-driven input: the iterator itself knows.
    if (s->flags & UCOL_USE_ITERATOR) {
        return !(s->iterator->hasNext(s->iterator));
    }

    // String with an explicit length. This cannot be the normalization
    // buffer, which is always NUL-terminated.
    if (s->flags & UCOL_ITER_HASLEN) {
        return (s->pos == s->endp);
    }

    // NUL-terminated text from here on (main string or normalization buffer).
    if (*s->pos != 0) {
        // Not at the terminator, so not at the end.
        return FALSE;
    }

    // At a terminator. In the main string that is the real end.
    if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
        return TRUE;
    }

    // At the end of the normalization buffer: check whether the original
    // source still has characters left.
    if (s->origFlags & UCOL_USE_ITERATOR) {
        return !(s->iterator->hasNext(s->iterator));
    }
    if (s->origFlags & UCOL_ITER_HASLEN) {
        // Main string with an end pointer.
        return s->fcdPosition == s->endp;
    }
    // NUL-terminated main string; fcdPosition is the 'return' position into it.
    return (*s->fcdPosition == 0);
}
/*
 * collIter_bos()
 * Checks whether a collIterate is positioned at the start of its source
 * string.
 */
static
inline UBool collIter_bos(collIterate *source) {
    // When going backwards we need to know whether there is more in the
    // iterator, even if we are currently in the side buffer.
    const UBool viaIterator =
        (source->flags & UCOL_USE_ITERATOR) != 0 ||
        (source->origFlags & UCOL_USE_ITERATOR) != 0;
    if (viaIterator) {
        return !source->iterator->hasPrevious(source->iterator);
    }

    // At (or before) the first code unit of the source string.
    if (source->pos <= source->string) {
        return TRUE;
    }

    // Otherwise only the normalization buffer can still be at the start:
    // the previous unit is a NUL and there is no FCD return position.
    if ((source->flags & UCOL_ITER_INNORMBUF) == 0) {
        return FALSE;
    }
    return *(source->pos - 1) == 0 && source->fcdPosition == NULL;
}
/*static
inline UBool collIter_SimpleBos(collIterate *source) {
// if we're going backwards, we need to know whether there is more in the
// iterator, even if we are in the side buffer
if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
return !source->iterator->hasPrevious(source->iterator);
}
if (source->pos == source->string) {
return TRUE;
}
return FALSE;
}*/
//return (data->pos == data->string) ||
/****************************************************************************/
/* Following are the open/close functions */
/* */
/****************************************************************************/
/**
 * Builds a collator from a binary image.
 *
 * If the image is larger than a bare header plus option set, it is treated
 * as a complete image and the collator is initialized directly from it.
 * Otherwise the image only carries options: the collator is initialized from
 * base's image and the options are taken from bin, which requires a base.
 *
 * @param bin    binary image; must not be NULL
 * @param length size of the image; it is compared as an unsigned value, so a
 *               negative length is treated as "larger than a bare header"
 * @param base   collator the image was built against; may be NULL, in which
 *               case the UCA/UCD version comparison is skipped
 * @param fillIn caller-provided storage for the collator, or NULL to have
 *               ucol_initCollator() allocate one
 * @param status in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for a
 *               NULL image, U_COLLATOR_VERSION_MISMATCH for incompatible
 *               versions, U_USELESS_COLLATOR_ERROR for an options-only image
 *               without a base.
 * @return the initialized collator, or NULL on failure
 */
static UCollator*
ucol_initFromBinary(const uint8_t *bin, int32_t length,
                    const UCollator *base,
                    UCollator *fillIn,
                    UErrorCode *status)
{
    UCollator *result = fillIn;
    if(U_FAILURE(*status)) {
        return NULL;
    }
    // The header is read below; a NULL image would be dereferenced.
    if(bin == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // We need these and we could be running without UCA
    uprv_uca_initImplicitConstants(status);
    if(U_FAILURE(*status)) {
        // Report this failure rather than letting a later check overwrite it.
        return NULL;
    }

    const UCATableHeader *colData = (const UCATableHeader *)bin;
    // do we want version check here? We're trying to figure out whether collators are compatible
    const UBool baseVersionMismatch =
        base != NULL &&
        (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0);
    if(baseVersionMismatch || colData->version[0] != UCOL_BUILDER_VERSION) {
        *status = U_COLLATOR_VERSION_MISMATCH;
        return NULL;
    }

    if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
        // Complete image.
        result = ucol_initCollator(colData, result, base, status);
        if(U_FAILURE(*status)){
            return NULL;
        }
        result->hasRealData = TRUE;
    }
    else if(base) {
        // Options-only image: borrow the base's data, take options from bin.
        result = ucol_initCollator(base->image, result, base, status);
        ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+colData->options), status);
        if(U_FAILURE(*status)){
            return NULL;
        }
        result->hasRealData = FALSE;
    }
    else {
        // Options-only image with nothing to apply the options to.
        *status = U_USELESS_COLLATOR_ERROR;
        return NULL;
    }

    // The image belongs to the caller; callers that own a copy change this.
    result->freeImageOnClose = FALSE;
    result->actualLocale = NULL;
    result->validLocale = NULL;
    result->requestedLocale = NULL;
    result->rules = NULL;
    result->rulesLength = 0;
    result->freeRulesOnClose = FALSE;
    result->ucaRules = NULL;
    return result;
}
/**
 * Opens a collator from a binary image.
 * Forwards to ucol_initFromBinary() without caller-provided storage.
 *
 * @param bin    binary image
 * @param length size of the image
 * @param base   collator the image was built against
 * @param status in/out error code
 * @return the collator produced by ucol_initFromBinary(), or NULL on failure
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status)
{
    UCollator *noFillIn = NULL;
    return ucol_initFromBinary(bin, length, base, noFillIn, status);
}
/**
 * Writes the collator's binary image into a caller-provided buffer.
 *
 * A collator with real data has its image copied verbatim. Otherwise a
 * minimal image is built: a zeroed UCATableHeader carrying version
 * information, followed by the collator's option set.
 *
 * @param coll     collator to serialize
 * @param buffer   destination buffer; not written if capacity is too small
 * @param capacity size of buffer in bytes; must not be negative
 * @param status   in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for a
 *                 negative capacity and to U_BUFFER_OVERFLOW_ERROR if the
 *                 image does not fit.
 * @return the image size in bytes, also when it does not fit (preflighting);
 *         0 if status was already a failure or capacity was negative
 */
U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator *coll,
                 uint8_t *buffer, int32_t capacity,
                 UErrorCode *status)
{
    int32_t length = 0;
    if(U_FAILURE(*status)) {
        return length;
    }
    if(capacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return length;
    }

    if(coll->hasRealData == TRUE) {
        length = coll->image->size;
        if(length <= capacity) {
            uprv_memcpy(buffer, coll->image, length);
        } else {
            *status = U_BUFFER_OVERFLOW_ERROR;
        }
        return length;
    }

    /* The options are stored at the padded end of the header. The same offset
       is recorded in the header so that a reader following header->options
       finds them even if the header size is not already padded. */
    const int32_t optionsOffset = (int32_t)paddedsize(sizeof(UCATableHeader));
    length = optionsOffset + (int32_t)paddedsize(sizeof(UColOptionSet));
    if(length > capacity) {
        *status = U_BUFFER_OVERFLOW_ERROR;
        return length;
    }

    /* build the UCATableHeader with minimal entries */
    /* do not copy the header from the UCA file because its values are wrong! */
    /* reset everything */
    uprv_memset(buffer, 0, length);

    /* set the tailoring-specific values */
    UCATableHeader *myData = (UCATableHeader *)buffer;
    myData->size = length;

    /* offset for the options, the only part of the data that is present after the header */
    myData->options = optionsOffset;

    /* need to always set the expansion value for an upper bound of the options */
    myData->expansion = myData->options + sizeof(UColOptionSet);

    myData->magic = UCOL_HEADER_MAGIC;
    myData->isBigEndian = U_IS_BIG_ENDIAN;
    myData->charSetFamily = U_CHARSET_FAMILY;

    /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    myData->jamoSpecial = coll->image->jamoSpecial;

    /* copy the collator options */
    uprv_memcpy(buffer+optionsOffset, coll->options, sizeof(UColOptionSet));
    return length;
}
/**
 * Clones a collator, preferably into caller-provided storage.
 *
 * The clone is placed in stackBuffer if that buffer (after alignment) is
 * large enough; otherwise it is heap-allocated and status is set to
 * U_SAFECLONE_ALLOCATED_WARNING. Rules owned by the source collator are
 * copied into the same storage, right after the UCollator struct. An image
 * owned by the source collator is duplicated on the heap.
 *
 * @param coll        collator to clone; must not be NULL
 * @param stackBuffer caller storage for the clone, or NULL to always allocate
 * @param pBufferSize in/out size of stackBuffer. If stackBuffer is given and
 *                    *pBufferSize <= 0 this is a preflight request: the
 *                    required size is stored and NULL is returned. The value
 *                    may be reduced when stackBuffer has to be aligned.
 * @param status      in/out error code; must not be NULL
 * @return the clone, or NULL on failure or preflighting. Everything this
 *         function allocated is released again on failure.
 */
U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
{
    UCollator * localCollator;
    int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    char *stackBufferChars = (char *)stackBuffer;
    int32_t imageSize = 0;
    int32_t rulesSize = 0;
    int32_t rulesPadding = 0;
    uint8_t *image;
    UChar *rules;
    UBool colAllocated = FALSE;
    UBool imageAllocated = FALSE;

    if (status == NULL || U_FAILURE(*status)){
        return NULL;
    }
    if ((stackBuffer && !pBufferSize) || !coll){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    if (coll->rules && coll->freeRulesOnClose) {
        // Owned rules travel with the clone, stored behind the struct.
        rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
        rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
        bufferSizeNeeded += rulesSize + rulesPadding;
    }

    if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
        *pBufferSize = bufferSizeNeeded;
        return NULL;
    }

    /* Pointers on 64-bit platforms need to be aligned
     * on a 64-bit boundary in memory.
     */
    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
        int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
        if (*pBufferSize > offsetUp) {
            *pBufferSize -= offsetUp;
            stackBufferChars += offsetUp;
        }
        else {
            /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
            *pBufferSize = 1;
        }
    }
    stackBuffer = (void *)stackBufferChars;

    if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
        /* allocate one here...*/
        stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
        if (stackBufferChars == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        colAllocated = TRUE;
        if (U_SUCCESS(*status)) {
            *status = U_SAFECLONE_ALLOCATED_WARNING;
        }
    }
    localCollator = (UCollator *)stackBufferChars;
    rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);

    {
        // Preflight the image size; the overflow error this produces is expected.
        UErrorCode tempStatus = U_ZERO_ERROR;
        imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
    }
    if (coll->freeImageOnClose) {
        // The source owns its image, so the clone needs its own copy.
        image = (uint8_t *)uprv_malloc(imageSize);
        if (image == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            // Do not leak the collator storage allocated above.
            if (colAllocated) {
                uprv_free(stackBufferChars);
            }
            return NULL;
        }
        imageAllocated = TRUE;
        ucol_cloneBinary(coll, image, imageSize, status);
    }
    else {
        image = (uint8_t *)coll->image;
    }

    localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
    if (U_FAILURE(*status)) {
        // Release whatever this function allocated before giving up.
        if (imageAllocated) {
            uprv_free(image);
        }
        if (colAllocated) {
            uprv_free(stackBufferChars);
        }
        return NULL;
    }

    if (coll->rules) {
        if (coll->freeRulesOnClose) {
            localCollator->rules = u_strcpy(rules, coll->rules);
        }
        else {
            localCollator->rules = coll->rules;
        }
        // The copy lives inside the collator's own storage; never free it separately.
        localCollator->freeRulesOnClose = FALSE;
        localCollator->rulesLength = coll->rulesLength;
    }

    // Carry over the attribute values of the source collator.
    int32_t i;
    for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
        ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
    }

    // zero copies of pointers
    localCollator->actualLocale = NULL;
    localCollator->validLocale = NULL;
    localCollator->requestedLocale = NULL;
    localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
    localCollator->freeOnClose = colAllocated;
    localCollator->freeImageOnClose = imageAllocated;
    return localCollator;
}
/**
 * Closes a collator and releases the memory it owns.
 *
 * The locale strings and the Latin-1 CE table are always released. Options,
 * rules and image are released only if the matching free...OnClose flag is
 * set, and the UCollator struct itself only if freeOnClose is set.
 *
 * @param coll collator to close; NULL is allowed and does nothing
 */
U_CAPI void U_EXPORT2
ucol_close(UCollator *coll)
{
    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    if (coll != NULL) {
        // These are always owned by each UCollator struct, so they are
        // always freed. uprv_free() is already called with a possibly-NULL
        // pointer elsewhere in this file (reallocCEs), so no guards here.
        uprv_free(coll->validLocale);
        uprv_free(coll->actualLocale);
        uprv_free(coll->requestedLocale);
        uprv_free(coll->latinOneCEs);

        // These are freed only when this collator owns them.
        if (coll->freeOptionsOnClose) {
            uprv_free(coll->options);
        }
        if (coll->freeRulesOnClose) {
            uprv_free((UChar *)coll->rules);
        }
        if (coll->freeImageOnClose) {
            uprv_free((UCATableHeader *)coll->image);
        }

        /* Here, it would be advisable to close: */
        /* - UData for UCA (unless we stuff it in the root resb */
        /* Again, do we need additional housekeeping... HMMM! */
        UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
        if (coll->freeOnClose) {
            /* for safeClone, if freeOnClose is FALSE,
               don't free the other instance data */
            uprv_free(coll);
        }
    }
    UTRACE_EXIT();
}
/**
 * Returns a newly allocated copy of the collator's binary image.
 * Currently used by genrb and tests: after constructing a collator from
 * rules (tailoring), this yields the binary chunk to write out.
 *
 * A collator with real data has its image copied verbatim. Otherwise a
 * minimal image is built: a zeroed UCATableHeader carrying version
 * information, followed by the collator's option set.
 *
 * @param coll   collator to serialize
 * @param length receives the size of the returned image in bytes
 * @param status in/out error code; set to U_MEMORY_ALLOCATION_ERROR if the
 *               allocation fails
 * @return the image, allocated with uprv_malloc(), or NULL on failure
 */
U_CFUNC uint8_t* U_EXPORT2
ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
{
    uint8_t *result = NULL;
    if(U_FAILURE(*status)) {
        return NULL;
    }

    const UBool copyWholeImage = (coll->hasRealData == TRUE);
    /* The options are stored at the padded end of the header. The same offset
       is recorded in the header so that a reader following header->options
       finds them even if the header size is not already padded. */
    const int32_t optionsOffset = (int32_t)paddedsize(sizeof(UCATableHeader));

    if(copyWholeImage) {
        *length = coll->image->size;
    } else {
        *length = optionsOffset + (int32_t)paddedsize(sizeof(UColOptionSet));
    }

    result = (uint8_t *)uprv_malloc(*length);
    /* test for NULL */
    if (result == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    if(copyWholeImage) {
        uprv_memcpy(result, coll->image, *length);
        return result;
    }

    /* build the UCATableHeader with minimal entries */
    /* do not copy the header from the UCA file because its values are wrong! */
    /* reset everything */
    uprv_memset(result, 0, *length);

    /* set the tailoring-specific values */
    UCATableHeader *myData = (UCATableHeader *)result;
    myData->size = *length;

    /* offset for the options, the only part of the data that is present after the header */
    myData->options = optionsOffset;

    /* need to always set the expansion value for an upper bound of the options */
    myData->expansion = myData->options + sizeof(UColOptionSet);

    myData->magic = UCOL_HEADER_MAGIC;
    myData->isBigEndian = U_IS_BIG_ENDIAN;
    myData->charSetFamily = U_CHARSET_FAMILY;

    /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    myData->jamoSpecial = coll->image->jamoSpecial;

    /* copy the collator options */
    uprv_memcpy(result+optionsOffset, coll->options, sizeof(UColOptionSet));
    return result;
}
/**
 * Resets a collator's attributes to the values stored in an option set and
 * marks every one of them as being at its default.
 *
 * @param result collator to update; afterwards its options pointer refers
 *               to opts
 * @param opts   option set to read the attribute values from
 * @param status in/out error code; nothing is done if it already indicates
 *               a failure. It is also passed to ucol_updateInternalState().
 */
void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    result->caseFirst = (UColAttributeValue)opts->caseFirst;
    result->caseLevel = (UColAttributeValue)opts->caseLevel;
    result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    result->strength = (UColAttributeValue)opts->strength;
    result->variableTopValue = opts->variableTopValue;
    result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    result->numericCollation = (UColAttributeValue)opts->numericCollation;

    result->caseFirstisDefault = TRUE;
    result->caseLevelisDefault = TRUE;
    result->frenchCollationisDefault = TRUE;
    result->normalizationModeisDefault = TRUE;
    result->strengthisDefault = TRUE;
    result->variableTopValueisDefault = TRUE;
    // alternateHandling is reset above as well, so its flag has to follow,
    // as it does in ucol_initCollator().
    result->alternateHandlingisDefault = TRUE;
    result->hiraganaQisDefault = TRUE;
    result->numericCollationisDefault = TRUE;

    ucol_updateInternalState(result, status);

    result->options = opts;
}
/**
 * Approximate test for whether a character ends a contraction.
 * Guaranteed to be TRUE if the character is at the end of a contraction;
 * for other characters the answer is not deterministic.
 *
 * Characters below coll->minContrEndCP are rejected at once. Code units
 * beyond the directly indexed part of the table are folded into it with
 * UCOL_UNSAFECP_TABLE_MASK, except trail surrogates, which always yield TRUE.
 *
 * @param c    character to test
 * @param coll collator providing the contrEndCP bit table
 * @return TRUE if c may end a contraction
 */
static
inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
    if (c < coll->minContrEndCP) {
        return FALSE;
    }

    int32_t bitIndex = c;
    if (bitIndex >= UCOL_UNSAFECP_TABLE_SIZE*8) {
        if (U16_IS_TRAIL(c)) {
            return TRUE;
        }
        bitIndex = (bitIndex & UCOL_UNSAFECP_TABLE_MASK) + 256;
    }

    // One bit per entry: select the byte, then the bit within it.
    const uint8_t tableByte = coll->contrEndCP[bitIndex >> 3];
    const int32_t bitInByte = bitIndex & 7;
    return (tableByte & (1 << bitInByte)) != 0;
}
/*
 * i_getCombiningClass()
 * A fast, at least partly inline version of u_getCombiningClass().
 * Returns 0 without a property lookup unless c is above U+FFFF, or is at
 * least U+0300 and flagged by ucol_unsafeCP().
 * This is a candidate for further optimization. Used heavily
 * in contraction processing.
 */
static
inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    const UBool needsLookup = (c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF;
    if (!needsLookup) {
        return 0;
    }
    return u_getCombiningClass(c);
}
/**
 * Initializes a UCollator so that it reads its data from a binary image.
 *
 * The image is not copied: the collator's table pointers are computed as
 * offsets into it. Attributes are set from the image's option set and all
 * marked as default.
 *
 * @param image binary collation data; NULL makes the function return NULL
 * (status is left unchanged in that case)
 * @param fillIn storage to initialize, or NULL to allocate a UCollator with
 * uprv_malloc(); freeOnClose records which of the two happened
 * @param UCA value stored in the collator's UCA field
 * @param status in/out error code
 * @return the initialized collator. If unserializing the trie fails, an
 * allocated collator is freed and NULL is returned, while a caller-provided
 * fillIn is returned as is with status set to the failure.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
UChar c;
UCollator *result = fillIn;
if(U_FAILURE(*status) || image == NULL) {
return NULL;
}
if(result == NULL) {
result = (UCollator *)uprv_malloc(sizeof(UCollator));
if(result == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return result;
}
result->freeOnClose = TRUE;
} else {
result->freeOnClose = FALSE;
}
// init FCD data
if (fcdTrieIndex == NULL) {
// The result is constant, until the library is reloaded.
fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
// ucol_cleanup() resets the cached pointer.
ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
}
result->image = image;
result->mapping.getFoldingOffset = _getFoldingOffset;
// The serialized trie spans from mappingPosition up to endExpansionCE.
const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
if(U_FAILURE(*status)) {
// Only storage allocated above is released; a caller's fillIn is handed back.
if(result->freeOnClose == TRUE) {
uprv_free(result);
result = NULL;
}
return result;
}
/*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
// The tables below are located through byte offsets stored in the image header.
result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
// The options live inside the image, so they must not be freed separately.
result->freeOptionsOnClose = FALSE;
/* set attributes */
result->caseFirst = (UColAttributeValue)result->options->caseFirst;
result->caseLevel = (UColAttributeValue)result->options->caseLevel;
result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
result->strength = (UColAttributeValue)result->options->strength;
result->variableTopValue = result->options->variableTopValue;
result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
result->numericCollation = (UColAttributeValue)result->options->numericCollation;
// Every attribute starts out flagged as being at its default.
result->caseFirstisDefault = TRUE;
result->caseLevelisDefault = TRUE;
result->frenchCollationisDefault = TRUE;
result->normalizationModeisDefault = TRUE;
result->strengthisDefault = TRUE;
result->variableTopValueisDefault = TRUE;
result->alternateHandlingisDefault = TRUE;
result->hiraganaQisDefault = TRUE;
result->numericCollationisDefault = TRUE;
/*result->scriptOrder = NULL;*/
result->rules = NULL;
result->rulesLength = 0;
result->freeRulesOnClose = FALSE;
/* get the version info from UCATableHeader and populate the Collator struct*/
result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
result->dataVersion[2] = 0;
result->dataVersion[3] = 0;
result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
// Set to 0 before the scan, which calls ucol_unsafeCP() on this collator.
// NOTE(review): presumably ucol_unsafeCP() compares against minUnsafeCP the way
// ucol_contractionEndCP() compares against minContrEndCP -- confirm.
result->minUnsafeCP = 0;
for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
if (ucol_unsafeCP(c, result)) break;
}
result->minUnsafeCP = c;
result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
// Must be 0 during the scan: ucol_contractionEndCP() rejects everything below minContrEndCP.
result->minContrEndCP = 0;
for (c=0; c<0x300; c++) { // Find the smallest contraction-ending char.
if (ucol_contractionEndCP(c, result)) break;
}
result->minContrEndCP = c;
/* max expansion tables */
result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
result->image->endExpansionCE);
result->lastEndExpansionCE = result->endExpansionCE +
result->image->endExpansionCECount - 1;
result->expansionCESize = (uint8_t*)result->image +
result->image->expansionCESize;
//result->errorCode = *status;
// No Latin-1 CE table yet.
result->latinOneCEs = NULL;
result->latinOneRegenTable = FALSE;
result->latinOneFailed = FALSE;
result->UCA = UCA;
ucol_updateInternalState(result, status);
/* Normally these will be set correctly later. This is the default if you use UCA or the default. */
result->ucaRules = NULL;
result->actualLocale = NULL;
result->validLocale = NULL;
result->requestedLocale = NULL;
result->hasRealData = FALSE; // real data lives in .dat file...
result->freeImageOnClose = FALSE;
return result;
}
/* new Mark's code */
/**
* For generation of Implicit CEs
* @author Davis
*
* Cleaned up so that changes can be made more easily.
* Old values:
# First Implicit: E26A792D
# Last Implicit: E3DC70C0
# First CJK: E0030300
# Last CJK: E0A9DD00
# First CJK_A: E0A9DF00
# Last CJK_A: E0DE3100
*/
/* Following is a port of Mark's code for new treatment of implicits.
* It is positioned here, since ucol_initUCA need to initialize the
* variables below according to the data in the fractional UCA.
*/
/**
* Function used to:
* a) collapse the 2 different Han ranges from UCA into one (in the right order), and
* b) bump any non-CJK characters by 10FFFF.
* The relevant blocks are:
* A: 4E00..9FFF; CJK Unified Ideographs
* F900..FAFF; CJK Compatibility Ideographs
* B: 3400..4DBF; CJK Unified Ideographs Extension A
* 20000..XX; CJK Unified Ideographs Extension B (and others later on)
* As long as
* no new B characters are allocated between 4E00 and FAFF, and
* no new A characters are outside of this range,
* (very high probability) this simple code will work.
* The reordered blocks are:
* Block1 is CJK
* Block2 is CJK_COMPAT_USED
* Block3 is CJK_A
* (all contiguous)
* Any other CJK gets its normal code point
* Any non-CJK gets +10FFFF
* When we reorder Block1, we make sure that it is at the very start,
* so that it will use a 3-byte form.
* Warning: we only pick up the compatibility characters that are
* NOT decomposed, so that block is smaller!
*/
// CONSTANTS
// NON_CJK_OFFSET is added by swapCJK() to every code point that is not in one
// of the reordered CJK blocks, which moves it above the whole Unicode range.
// UCOL_MAX_INPUT is the largest raw value accepted by uprv_uca_getRawFromImplicit().
// NOTE(review): 0x220001 equals 2 * 0x110000 + 1; the "+ 2" in the trailing
// comment below presumably counts from 0x10FFFF -- confirm.
static const UChar32
NON_CJK_OFFSET = 0x110000,
UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
/**
 * Parameters of the implicit-weight byte layout.
 * Precomputed by initImplicitConstants(); all are zero until then.
 * They are read by uprv_uca_getImplicitFromRaw() and
 * uprv_uca_getRawFromImplicit().
 */
static int32_t
final3Multiplier = 0,
final4Multiplier = 0,
final3Count = 0,
final4Count = 0,
medialCount = 0,
min3Primary = 0,
min4Primary = 0,
max4Primary = 0,
minTrail = 0,
maxTrail = 0,
max3Trail = 0,
max4Trail = 0,
min4Boundary = 0;
// Block boundaries used by swapCJK(); every *_LIMIT is one past the last
// code point of its block.
static const UChar32
CJK_BASE = 0x4E00,
CJK_LIMIT = 0x9FFF+1,
CJK_COMPAT_USED_BASE = 0xFA0E,
CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
CJK_A_BASE = 0x3400,
CJK_A_LIMIT = 0x4DBF+1,
CJK_B_BASE = 0x20000,
CJK_B_LIMIT = 0x2A6DF+1;
/**
 * Maps a code point to its position in the implicit-weight ordering.
 *
 * The CJK block, the used part of the CJK compatibility block and CJK
 * Extension A are packed contiguously, in that order, starting at 0.
 * CJK Extension B keeps its code point values. Everything else is shifted
 * up by NON_CJK_OFFSET.
 *
 * @param i code point
 * @return the reordered value
 */
static UChar32 swapCJK(UChar32 i) {
    // Sizes of the two blocks that precede Extension A in the packed order.
    const UChar32 cjkSize = CJK_LIMIT - CJK_BASE;
    const UChar32 compatSize = CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE;

    // The ranges are tested in ascending code point order.
    if (i < CJK_A_BASE) {
        return i + NON_CJK_OFFSET;               // non-CJK
    }
    if (i < CJK_A_LIMIT) {
        return i - CJK_A_BASE + cjkSize + compatSize;
    }
    if (i < CJK_BASE) {
        return i + NON_CJK_OFFSET;               // non-CJK
    }
    if (i < CJK_LIMIT) {
        return i - CJK_BASE;
    }
    if (i < CJK_COMPAT_USED_BASE) {
        return i + NON_CJK_OFFSET;
    }
    if (i < CJK_COMPAT_USED_LIMIT) {
        return i - CJK_COMPAT_USED_BASE + cjkSize;
    }
    if (i < CJK_B_BASE) {
        return i + NON_CJK_OFFSET;
    }
    if (i < CJK_B_LIMIT) {
        return i;                                // non-BMP-CJK
    }
    return i + NON_CJK_OFFSET;                   // non-CJK
}
/**
 * Converts a code point into the raw value used for implicit weights.
 * @param i code point
 * @return swapCJK(i) plus one
 */
U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromCodePoint(UChar32 i) {
    const UChar32 reordered = swapCJK(i);
    return reordered + 1;
}
/**
 * Converts a raw implicit-weight value back into a code point; the inverse
 * of uprv_uca_getRawFromCodePoint().
 *
 * @param i raw value
 * @return the code point, or -1 if i (after removing the +1 bias) is
 *         negative or lies between the end of the packed CJK ranges and
 *         CJK_B_BASE, where swapCJK() never produces a value
 */
U_CAPI UChar32 U_EXPORT2
uprv_uca_getCodePointFromRaw(UChar32 i) {
    // Sizes of the three blocks that swapCJK() packs together from 0 upwards.
    const UChar32 cjkSize = CJK_LIMIT - CJK_BASE;
    const UChar32 compatSize = CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE;
    const UChar32 cjkASize = CJK_A_LIMIT - CJK_A_BASE;

    i--;    // undo the +1 of uprv_uca_getRawFromCodePoint()
    if(i < 0) {
        // Raw values start at 1; anything lower has no code point.
        return -1;
    }
    if(i >= NON_CJK_OFFSET) {
        return i - NON_CJK_OFFSET;
    }
    if(i >= CJK_B_BASE) {
        return i;
    }
    // rest of CJKs, compacted
    if(i < cjkSize) {
        return i + CJK_BASE;
    }
    if(i < cjkSize + compatSize) {
        return i + CJK_COMPAT_USED_BASE - cjkSize;
    }
    // The packed Extension A range ends after cjkASize entries. Bounding it
    // by CJK_A_LIMIT instead would map gap values to code points past the block.
    if(i < cjkSize + compatSize + cjkASize) {
        return i + CJK_A_BASE - cjkSize - compatSize;
    }
    return -1;
}
// GET IMPLICIT PRIMARY WEIGHTS
/**
 * Converts a raw value (see uprv_uca_getRawFromCodePoint()) into an implicit
 * primary weight.
 *
 * Raw values below min4Boundary use the three-byte form, the others the
 * four-byte form. Relies on the constants set up by initImplicitConstants().
 * No range check is made on cp.
 *
 * @param cp raw value
 * @return left-justified primary weight
 */
U_CAPI uint32_t U_EXPORT2
uprv_uca_getImplicitFromRaw(UChar32 cp) {
    int32_t last0 = cp - min4Boundary;
    if (last0 < 0) {
        // Three-byte form: lead byte, medial byte, final byte.
        int32_t last1 = cp / final3Count;
        last0 = cp % final3Count;

        int32_t last2 = last1 / medialCount;
        last1 %= medialCount;

        last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
        last1 = minTrail + last1; // offset
        last2 = min3Primary + last2; // offset

        // Compose in unsigned arithmetic: the lead byte reaches bit 31, which
        // a signed shift must not do.
        return ((uint32_t)last2 << 24) + ((uint32_t)last1 << 16) + ((uint32_t)last0 << 8);
    } else {
        // Four-byte form: lead byte, two medial bytes, final byte.
        int32_t last1 = last0 / final4Count;
        last0 %= final4Count;

        int32_t last2 = last1 / medialCount;
        last1 %= medialCount;

        int32_t last3 = last2 / medialCount;
        last2 %= medialCount;

        last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
        last1 = minTrail + last1; // offset
        last2 = minTrail + last2; // offset
        last3 = min4Primary + last3; // offset

        return ((uint32_t)last3 << 24) + ((uint32_t)last2 << 16) + ((uint32_t)last1 << 8) + (uint32_t)last0;
    }
}
/**
 * Computes the implicit primary weight of a code point.
 * @param cp code point
 * @return uprv_uca_getImplicitFromRaw() applied to swapCJK(cp) + 1
 */
static uint32_t U_EXPORT2
uprv_uca_getImplicitPrimary(UChar32 cp) {
    // Reorder the CJK blocks, then bias by one to obtain the raw value.
    const UChar32 raw = swapCJK(cp) + 1;
    return uprv_uca_getImplicitFromRaw(raw);
}
/**
 * Converts implicit CE into raw integer ("code point").
 * This undoes the byte packing done by uprv_uca_getImplicitFromRaw():
 * lead bytes below min4Primary select the 3-byte form, all others the
 * 4-byte form.
 * @param implicit the implicit weight; lead byte in the top 8 bits
 * @return the raw value, or -1 if illegal format
 */
U_CAPI UChar32 U_EXPORT2
uprv_uca_getRawFromImplicit(uint32_t implicit) {
    // Unpack the four bytes, most significant first.
    UChar32 lead   = (implicit >> 24) & 0xFF;
    UChar32 second = (implicit >> 16) & 0xFF;
    UChar32 third  = (implicit >> 8) & 0xFF;
    UChar32 fourth = implicit & 0xFF;

    // The lead byte and the first trail byte have the same limits in both forms.
    if (lead < min3Primary || lead > max4Primary) {
        return -1;
    }
    if (second < minTrail || second > maxTrail) {
        return -1;
    }
    second -= minTrail;

    UChar32 raw;
    if (lead < min4Primary) {
        // 3-byte form: the fourth byte must be empty and the final byte
        // must sit exactly on a multiple of the 3-byte gap.
        if (third < minTrail || third > max3Trail || fourth != 0) {
            return -1;
        }
        third -= minTrail;
        if (third % final3Multiplier != 0) {
            return -1;
        }
        third /= final3Multiplier;
        lead -= min3Primary;
        raw = ((lead * medialCount) + second) * final3Count + third;
    } else {
        // 4-byte form: the final byte must sit on a multiple of the 4-byte gap.
        if (third < minTrail || third > maxTrail
            || fourth < minTrail || fourth > max4Trail) {
            return -1;
        }
        third -= minTrail;
        fourth -= minTrail;
        if (fourth % final4Multiplier != 0) {
            return -1;
        }
        fourth /= final4Multiplier;
        lead -= min4Primary;
        raw = (((lead * medialCount) + second) * medialCount + third) * final4Count + fourth + min4Boundary;
    }

    // final check
    if (raw < 0 || raw > UCOL_MAX_INPUT) {
        return -1;
    }
    return raw;
}
/*
 * Divides a by b and rounds the quotient up, for positive a and b.
 * Computed as (a-1)/b + 1, so a == 0 yields 1 (integer division truncates
 * toward zero).
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeSteps = (a - 1) / b;
    return wholeSteps + 1;
}
/* this function is either called from initUCA or from genUCA before
* doing canonical closure for the UCA.
*/
/**
* Set up to generate implicits.
* Maintenance Note: this function may end up being called more than once, due
* to threading races during initialization. Make sure that
* none of the Constants is ever transiently assigned an
* incorrect value.
* @param minPrimary
* @param maxPrimary
* @param minTrail final byte
* @param maxTrail final byte
* @param gap3 the gap we leave for tailoring for 3-byte forms
* @param gap4 the gap we leave for tailoring for 4-byte forms
*/
static void initImplicitConstants(int minPrimary, int maxPrimary,
int minTrailIn, int maxTrailIn,
int gap3, int primaries3count,
UErrorCode *status) {
// some simple parameter checks
if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
|| (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
|| (primaries3count < 1))
{
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
};
minTrail = minTrailIn;
maxTrail = maxTrailIn;
min3Primary = minPrimary;
max4Primary = maxPrimary;
// compute constants for use later.
// number of values we can use in trailing bytes
// leave room for empty values between AND above, e.g. if gap = 2
// range 3..7 => +3 -4 -5 -6 -7: so 1 value
// range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
// range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
final3Multiplier = gap3 + 1;
final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
// medials can use full range
medialCount = (maxTrail - minTrail + 1);
// find out how many values fit in each form
int32_t threeByteCount = medialCount * final3Count;
// now determine where the 3/4 boundary is.
// we use 3 bytes below the boundary, and 4 above
int32_t primariesAvailable = maxPrimary - minPrimary + 1;
int32_t primaries4count = primariesAvailable - primaries3count;
int32_t min3ByteCoverage = primaries3count * threeByteCount;
min4Primary = minPrimary + primaries3count;
min4Boundary = min3ByteCoverage;
// Now expand out the multiplier for the 4 bytes, and redo.
int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
if (gap4 < 1) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
final4Multiplier = gap4 + 1;
final4Count = neededPerFinalByte;
max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
}
/**
 * Supply parameters for generating implicit CEs.
 * Uses the implicit lead-byte range minImplicitPrimary..maxImplicitPrimary,
 * trail bytes 0x04..0xFE, a 3-byte tailoring gap of 1 and a single lead
 * byte for the 3-byte forms.
 * @param status receives the error code reported by initImplicitConstants()
 */
U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(UErrorCode *status) {
    // Historical note kept from the original:
    // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
    const int trailLow = 0x04;
    const int trailHigh = 0xFE;
    const int threeByteGap = 1;
    const int threeByteLeadCount = 1;
    initImplicitConstants(minImplicitPrimary, maxImplicitPrimary,
                          trailLow, trailHigh,
                          threeByteGap, threeByteLeadCount,
                          status);
}
/* collIterNormalize: incremental normalization happens here.                 */
/*    Takes the range of chars identified by FCD (from the character just     */
/*    read, at pos - 1, up to but not including fcdPosition),                 */
/*    normalizes it into the collIterate's writable buffer, and               */
/*    switches the collIterate's state to use the writable buffer.            */
/*    If normalization fails the collIterate is left untouched.               */
static
void collIterNormalize(collIterate *collationSource)
{
    const UChar *regionStart = collationSource->pos - 1;      /* Start of chars to normalize    */
    const UChar *regionLimit = collationSource->fcdPosition;  /* End of region to normalize+1   */
    const int32_t regionLength = (int32_t)(regionLimit - regionStart);

    UErrorCode errorCode = U_ZERO_ERROR;
    /* read-only alias of the source text; nothing is copied here */
    UnicodeString region(FALSE, regionStart, regionLength);
    collationSource->nfd->normalize(region, collationSource->writableBuffer, errorCode);
    if (U_FAILURE(errorCode)) {
#ifdef UCOL_DEBUG
        fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(errorCode));
#endif
        return;
    }

    /* Continue reading from the start of the normalized text. */
    collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer();

    /* Remember the flags of the main string, then mark "in side buffer" and  */
    /* drop the flags that only apply to the main string.                     */
    collationSource->origFlags = collationSource->flags;
    collationSource->flags |= UCOL_ITER_INNORMBUF;
    collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
}
// This function takes the iterator and extracts normalized stuff up to the next boundary
// It is similar in the end results to the collIterNormalize, but for the cases when we
// use an iterator
/*static
inline void normalizeIterator(collIterate *collationSource) {
UErrorCode status = U_ZERO_ERROR;
UBool wasNormalized = FALSE;
//int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
(int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
// reallocate and terminate
if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
&collationSource->writableBuffer,
(int32_t *)&collationSource->writableBufSize, normLen + 1,
0)
) {
#ifdef UCOL_DEBUG
fprintf(stderr, "normalizeIterator(), out of memory\n");
#endif
return;
}
status = U_ZERO_ERROR;
//collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
(int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
}
// Terminate the buffer - we already checked that it is big enough
collationSource->writableBuffer[normLen] = 0;
if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
collationSource->flags |= UCOL_ITER_ALLOCATED;
}
collationSource->pos = collationSource->writableBuffer;
collationSource->origFlags = collationSource->flags;
collationSource->flags |= UCOL_ITER_INNORMBUF;
collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
}*/
/* Incremental FCD check and normalize                                                    */
/*   Called from getNextCE when normalization state is suspect.                           */
/*   When entering, the state is known to be this:                                        */
/*      o   We are working in the main buffer of the collIterate, not the side            */
/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
/*          so we won't get here.                                                         */
/*      o   The leading combining class from the current character is 0 or                */
/*          the trailing combining class of the previous char was zero.                   */
/*          True because the previous call to this function will have always exited       */
/*          that way, and we get called for every char where cc might be non-zero.        */
/*   On return, fcdPosition marks the end of the scanned run.                             */
/*   Returns TRUE when a char in the run has a leading cc lower than the trailing cc      */
/*   of the char before it, i.e. the run needs to be normalized.                          */
static
inline UBool collIterFCD(collIterate *collationSource) {
    /* Start at the character that was just read. */
    const UChar *cursor = collationSource->pos - 1;
    /* A NULL limit means the text is NUL-terminated. */
    const UChar *limit =
        (collationSource->flags & UCOL_ITER_HASLEN) ? collationSource->endp : NULL;
    UBool outOfOrder = FALSE;

    // Get the trailing combining class of the current character.  If it's zero,
    //   we are OK.
    /* trie access */
    uint16_t fcd16 = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, cursor, limit);
    uint8_t lastTrailCC = (uint8_t)(fcd16 & LAST_BYTE_MASK_);

    if (lastTrailCC != 0) {
        // The current char has a non-zero trailing CC.  Scan forward until we find
        //   a char with a leading cc of zero.
        while (limit == NULL || cursor != limit) {
            const UChar *beforeNext = cursor;

            /* trie access */
            fcd16 = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, cursor, limit);
            const uint8_t leadCC = (uint8_t)(fcd16 >> SECOND_LAST_BYTE_SHIFT_);
            if (leadCC == 0) {
                // Hit char that is not part of combining sequence.
                //   back up over it.  (Could be surrogate pair!)
                cursor = beforeNext;
                break;
            }

            if (leadCC < lastTrailCC) {
                outOfOrder = TRUE;
            }
            lastTrailCC = (uint8_t)(fcd16 & LAST_BYTE_MASK_);
        }
    }

    collationSource->fcdPosition = (UChar *)cursor;
    return outOfOrder;
}
/****************************************************************************/
/* Following are the CE retrieval functions */
/* */
/****************************************************************************/
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
/* there should be a macro version of this function in the header file */
/**
* This is the first function that tries to fetch a collation element (CE)
* when iterating forwards.
* If it is not successful, or it encounters a more difficult situation,
* more sophisticated and slower functions are invoked
* (ucol_prv_getSpecialCE for special CEs, getImplicit when no CE is found).
*
* Order of work:
*  1. return CEs still pending from a previous expansion, if any;
*  2. read the next code unit from the main string, the character iterator or
*     the normalization side buffer, doing the incremental FCD check and
*     normalization when the UCOL_ITER_NORM flag asks for it;
*  3. look the code unit up in the tailoring (latinOneMapping for ch <= 0xFF,
*     the mapping trie otherwise), then in the UCA, then fall back to an
*     implicit CE.
*
* @param coll collator whose tables are used for the lookup
* @param collationSource iteration state; its position, flags and CE buffer are updated
* @param status error code, passed on to ucol_prv_getSpecialCE
* @return the next CE, or UCOL_NO_MORE_CES at the end of the text
*/
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
uint32_t order = 0;
if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
order = *(collationSource->toReturn++); /* if so, return them */
if(collationSource->CEpos == collationSource->toReturn) {
/* That was the last pending CE: rewind both pointers to the start of the CE buffer in use. */
collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
}
return order;
}
UChar ch = 0;
collationSource->offsetReturn = NULL;
for (;;) /* Loop handles case when incremental normalize switches */
{ /* to or from the side buffer / original string, and we */
/* need to start again to get the next character. */
if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
{
// The source string is null terminated and we're not working from the side buffer,
// and we're not normalizing. This is the fast path.
// (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
ch = *collationSource->pos++;
if (ch != 0) {
break;
}
else {
return UCOL_NO_MORE_CES;
}
}
if (collationSource->flags & UCOL_ITER_HASLEN) {
// Normal path for strings when length is specified.
// (We can't be in side buffer because it is always null terminated.)
if (collationSource->pos >= collationSource->endp) {
// Ran off of the end of the main source string. We're done.
return UCOL_NO_MORE_CES;
}
ch = *collationSource->pos++;
}
else if(collationSource->flags & UCOL_USE_ITERATOR) {
// Text comes from a character iterator; U_SENTINEL marks its end.
UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
if(iterCh == U_SENTINEL) {
return UCOL_NO_MORE_CES;
}
ch = (UChar)iterCh;
}
else
{
// Null terminated string.
ch = *collationSource->pos++;
if (ch == 0) {
// Ran off end of buffer.
if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
// Ran off end of main string. backing up one character.
collationSource->pos--;
return UCOL_NO_MORE_CES;
}
else
{
// Hit null in the normalize side buffer.
// Usually this means the end of the normalized data,
// except for one odd case: a null followed by combining chars,
// which is the case if we are at the start of the buffer.
if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
break;
}
// Null marked end of side buffer.
// Revert to the main string and
// loop back to top to try again to get a character.
collationSource->pos = collationSource->fcdPosition;
collationSource->flags = collationSource->origFlags;
continue;
}
}
}
if(collationSource->flags&UCOL_HIRAGANA_Q) {
/* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
* based on whether the previous codepoint was Hiragana or Katakana.
*/
if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
collationSource->flags |= UCOL_WAS_HIRAGANA;
} else {
collationSource->flags &= ~UCOL_WAS_HIRAGANA;
}
}
// We've got a character. See if there's any fcd and/or normalization stuff to do.
// Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
break;
}
if (collationSource->fcdPosition >= collationSource->pos) {
// An earlier FCD check has already covered the current character.
// We can go ahead and process this char.
break;
}
if (ch < ZERO_CC_LIMIT_ ) {
// Fast fcd safe path. Trailing combining class == 0. This char is OK.
break;
}
if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
// We need to peek at the next character in order to tell if we are FCD
if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
// We are at the last char of source string.
// It is always OK for FCD check.
break;
}
// Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
break;
}
}
// Need a more complete FCD check and possible normalization.
if (collIterFCD(collationSource)) {
collIterNormalize(collationSource);
}
if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
// No normalization was needed. Go ahead and process the char we already had.
break;
}
// Some normalization happened. Next loop iteration will pick up a char
// from the normalization buffer.
} // end for (;;)
// Here ch holds the code unit to map to a CE.
if (ch <= 0xFF) {
/* For latin-1 characters we never need to fall back to the UCA table */
/* because all of the UCA data is replicated in the latinOneMapping array */
order = coll->latinOneMapping[ch];
if (order > UCOL_NOT_FOUND) {
// Values above UCOL_NOT_FOUND are special CEs that need further processing.
order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
}
}
else
{
// Always use UCA for Han, Hangul
// (Han extension A is before main Han block)
// **** Han compatibility chars ?? ****
if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
(ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
// between the two target ranges; do normal lookup
// **** this range is YI, Modifier tone letters, ****
// **** Latin-D, Syloti Nagari, Phagas-pa. ****
// **** Latin-D might be tailored, so we need to ****
// **** do the normal lookup for these guys. ****
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
} else {
// in one of the target ranges; use UCA
order = UCOL_NOT_FOUND;
}
} else {
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
}
if(order > UCOL_NOT_FOUND) { /* if a CE is special */
order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
}
if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
/* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
}
}
}
if(order == UCOL_NOT_FOUND) {
// Neither the tailoring nor the UCA has a CE for ch: generate an implicit one.
order = getImplicit(ch, collationSource);
}
return order; /* return the CE */
}
/* ucol_getNextCE, out-of-line version for use from other files.     */
/* Simply forwards to the inline ucol_IGetNextCE().                   */
U_CAPI uint32_t U_EXPORT2
ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    const uint32_t nextCE = ucol_IGetNextCE(coll, collationSource, status);
    return nextCE;
}
/**
* Incremental previous normalization happens here. Pick up the range of chars
* identified by FCD, normalize it into the collIterate's writable buffer,
* switch the collIterate's state to use the writable buffer.
*
* The function also records, in the offset buffer, one source offset per
* normalized code unit (plus one for the base character, if there is one).
*
* This is best effort: if normalization fails, or the offset buffer cannot be
* allocated or grown, the function returns without switching to the side
* buffer, and the caller continues with the un-normalized character.
* @param data collation iterator data
*/
static
void collPrevIterNormalize(collIterate *data)
{
    UErrorCode status  = U_ZERO_ERROR;
    const UChar *pEnd   = data->pos;  /* End normalize + 1 */
    const UChar *pStart;

    /* Start normalize */
    if (data->fcdPosition == NULL) {
        pStart = data->string;
    }
    else {
        pStart = data->fcdPosition + 1;
    }

    int32_t normLen =
        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
                             data->writableBuffer,
                             status).
        length();
    if(U_FAILURE(status)) {
        return;
    }

    /*
     * Make sure the offset buffer has room for everything stored below:
     * at most one base offset, one first-mark offset and (normLen - 1) trail
     * offsets, appended after the entries that are already in the buffer.
     * normLen + 2 is a safe upper bound for the number of new entries.
     */
    int32_t storeIX = 0;
    if (data->offsetBuffer != NULL) {
        storeIX = (int32_t)(data->offsetStore - data->offsetBuffer);
    }
    int32_t needed = storeIX + normLen + 2;

    if (data->offsetBuffer == NULL) {
        int32_t len = needed > UCOL_EXPAND_CE_BUFFER_SIZE ? needed : UCOL_EXPAND_CE_BUFFER_SIZE;
        int32_t *nob = (int32_t *) uprv_malloc(sizeof(int32_t) * len);

        if (nob == NULL) {
            /* out of memory: give up on normalizing rather than write through NULL */
            return;
        }

        data->offsetBufferSize = len;
        data->offsetBuffer = nob;
        data->offsetStore = data->offsetBuffer;
    } else if(data->offsetBufferSize < needed) {
        int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * needed);

        if (tob == NULL) {
            /* the old buffer is still valid but too small; do not overrun it */
            return;
        }

        data->offsetBuffer = tob;
        data->offsetStore = &data->offsetBuffer[storeIX];
        data->offsetBufferSize = needed;
    }

    /*
    this puts the null termination infront of the normalized string instead
    of the end
    */
    data->writableBuffer.insert(0, (UChar)0);

    /*
     * The usual case at this point is that we've got a base
     * character followed by marks that were normalized. If
     * fcdPosition is NULL, that means that we backed up to
     * the beginning of the string and there's no base character.
     *
     * Forward processing will usually normalize when it sees
     * the first mark, so that mark will get its natural offset
     * and the rest will get the offset of the character following
     * the marks. The base character will also get its natural offset.
     *
     * We write the offset of the base character, if there is one,
     * followed by the offset of the first mark and then the offsets
     * of the rest of the marks.
     */
    int32_t firstMarkOffset = 0;
    int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
    int32_t trailCount      = normLen - 1;

    if (data->fcdPosition != NULL) {
        int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
        UChar   baseChar   = *data->fcdPosition;

        firstMarkOffset = baseOffset + 1;

        /*
         * If the base character is the start of a contraction, forward processing
         * will normalize the marks while checking for the contraction, which means
         * that the offset of the first mark will the same as the other marks.
         *
         * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
         */
        if (baseChar >= 0x100) {
            uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);

            if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
                baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
            }

            if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
                firstMarkOffset = trailOffset;
            }
        }

        *(data->offsetStore++) = baseOffset;
    }

    *(data->offsetStore++) = firstMarkOffset;

    for (int32_t i = 0; i < trailCount; i += 1) {
        *(data->offsetStore++) = trailOffset;
    }

    data->offsetRepeatValue = trailOffset;

    data->offsetReturn = data->offsetStore - 1;
    if (data->offsetReturn == data->offsetBuffer) {
        data->offsetStore = data->offsetBuffer;
    }

    /* Backwards iteration starts at the end of the normalized text (after the leading NUL). */
    data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
    data->origFlags  = data->flags;
    data->flags     |= UCOL_ITER_INNORMBUF;
    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}
/**
* Incremental FCD check for previous iteration and normalize. Called from
* getPrevCE when normalization state is suspect.
* When entering, the state is known to be this:
* o  We are working in the main buffer of the collIterate, not the side
*    writable buffer. When in the side buffer, normalization mode is always
*    off, so we won't get here.
* o  The leading combining class from the current character is 0 or the
*    trailing combining class of the previous char was zero.
*    True because the previous call to this function will have always exited
*    that way, and we get called for every char where cc might be non-zero.
* On return, data->fcdPosition is the position where the backward scan
* stopped, or NULL if the scan reached the start of the string.
* @param data collation iterate struct
* @return normalization status, TRUE for normalization to be done, FALSE
*         otherwise
*/
static
inline UBool collPrevIterFCD(collIterate *data)
{
    const UChar *textStart = data->string;
    const UChar *cursor = data->pos + 1;
    UBool needsNormalization = FALSE;

    /* Get the leading combining class of the current character. */
    uint16_t fcd16 = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, textStart, cursor);
    uint8_t leadCC = (uint8_t)(fcd16 >> SECOND_LAST_BYTE_SHIFT_);

    if (leadCC == 0) {
        /* Nothing can combine across this boundary; no scan needed. */
        data->fcdPosition = (UChar *)cursor;
        return needsNormalization;
    }

    /*
    The current char has a non-zero leading combining class.
    Scan backward until we find a char with a trailing cc of zero.
    */
    while (cursor != textStart) {
        fcd16 = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, textStart, cursor);
        const uint8_t trailCC = (uint8_t)(fcd16 & LAST_BYTE_MASK_);
        if (trailCC == 0) {
            data->fcdPosition = (UChar *)cursor;
            return needsNormalization;
        }

        if (leadCC < trailCC) {
            /* combining classes are out of order */
            needsNormalization = TRUE;
        }
        leadCC = (uint8_t)(fcd16 >> SECOND_LAST_BYTE_SHIFT_);
    }

    /* Ran into the start of the string without finding such a char. */
    data->fcdPosition = NULL;
    return needsNormalization;
}
/**
 * Gets a character from the text at a given offset from the current position.
 * Handles both the pointer-based and the iterator-based case; when neither a
 * position pointer nor an iterator is set, (UChar)U_SENTINEL is returned.
 * No error checking - caller beware!
 * @param source collation iterator data
 * @param offset distance from the current position
 * @return the code unit at that offset
 */
inline static
UChar peekCharacter(collIterate *source, int32_t offset) {
    if (source->pos != NULL) {
        return *(source->pos + offset);
    }
    if (source->iterator == NULL) {
        return (UChar)U_SENTINEL;
    }
    if (offset == 0) {
        return (UChar)source->iterator->current(source->iterator);
    }

    /* Move to the requested spot, read one code unit, then move back. */
    source->iterator->move(source->iterator, offset, UITER_CURRENT);
    const UChar peeked = (UChar)source->iterator->next(source->iterator);
    source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
    return peeked;
}
/**
* Determines if we are at the start of the data string in the backwards
* collation iterator.
* With a character iterator and no position pointer, the iterator itself is
* asked. Otherwise we are at the start if the position is the start of the
* string, or if we are in the normalization buffer right after its leading
* NUL and fcdPosition is NULL.
* @param data collation iterator
* @return TRUE if we are at the start
*/
static
inline UBool isAtStartPrevIterate(collIterate *data) {
    if (data->pos == NULL && data->iterator != NULL) {
        return !data->iterator->hasPrevious(data->iterator);
    }
    if (data->pos == data->string) {
        return TRUE;
    }
    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
        return FALSE;
    }
    return (UBool)(*(data->pos - 1) == 0 && data->fcdPosition == NULL);
}
/**
 * Moves the collation iterator back by one code unit.
 * The character iterator is stepped back only when it is in use
 * (UCOL_USE_ITERATOR set); the position pointer is decremented whenever it
 * is set.
 * @param data collation iterator data
 */
static
inline void goBackOne(collIterate *data) {
    if ((data->flags & UCOL_USE_ITERATOR) && data->iterator != NULL) {
        data->iterator->previous(data->iterator);
    }
    if (data->pos != NULL) {
        --data->pos;
    }
}
/**
* Inline function that gets a simple CE when iterating backwards.
* So what it does is that it will first check the expansion buffer. If the
* expansion buffer is not empty, ie the end pointer to the expansion buffer
* is different from the string pointer, we return the collation element at the
* return pointer and decrement it.
* Otherwise the previous code unit is read (from the main string, the
* character iterator or the normalization side buffer), the incremental FCD
* check / normalization is done if needed, and the code unit is looked up.
* For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE, and to
* getPrevImplicit when no CE is found.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the previous CE, or UCOL_NO_MORE_CES at the start of the text
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
UErrorCode *status)
{
uint32_t result = (uint32_t)UCOL_NULLORDER;
/* Step the offset bookkeeping back by one entry (or consume one repeat). */
if (data->offsetReturn != NULL) {
if (data->offsetRepeatCount > 0) {
data->offsetRepeatCount -= 1;
} else {
if (data->offsetReturn == data->offsetBuffer) {
/* All stored offsets are used up: reset the offset buffer. */
data->offsetReturn = NULL;
data->offsetStore = data->offsetBuffer;
} else {
data->offsetReturn -= 1;
}
}
}
/* Are there CEs left in the CE buffer (extended or built-in)? If so, return them last to first. */
if ((data->extendCEs && data->toReturn > data->extendCEs) ||
(!data->extendCEs && data->toReturn > data->CEs))
{
data->toReturn -= 1;
result = *(data->toReturn);
if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
/* Reached the start of the buffer: it is empty now. */
data->CEpos = data->toReturn;
}
}
else {
UChar ch = 0;
/*
Loop handles case when incremental normalize switches to or from the
side buffer / original string, and we need to start again to get the
next character.
*/
for (;;) {
if (data->flags & UCOL_ITER_HASLEN) {
/*
Normal path for strings when length is specified.
Not in side buffer because it is always null terminated.
*/
if (data->pos <= data->string) {
/* End of the main source string */
return UCOL_NO_MORE_CES;
}
data->pos --;
ch = *data->pos;
}
// we are using an iterator to go back. Pray for us!
else if (data->flags & UCOL_USE_ITERATOR) {
UChar32 iterCh = data->iterator->previous(data->iterator);
if(iterCh == U_SENTINEL) {
return UCOL_NO_MORE_CES;
} else {
ch = (UChar)iterCh;
}
}
else {
data->pos --;
ch = *data->pos;
/* we are in the side buffer. */
if (ch == 0) {
/*
At the start of the normalize side buffer.
Go back to string.
Because pointer points to the last accessed character,
hence we have to increment it by one here.
*/
data->flags = data->origFlags;
data->offsetRepeatValue = 0;
if (data->fcdPosition == NULL) {
/* The normalized run started at the beginning of the string: nothing is left. */
data->pos = data->string;
return UCOL_NO_MORE_CES;
}
else {
data->pos = data->fcdPosition + 1;
}
continue;
}
}
if(data->flags&UCOL_HIRAGANA_Q) {
/* Track whether the character just read is in the range 0x3040..0x309f. */
if(ch>=0x3040 && ch<=0x309f) {
data->flags |= UCOL_WAS_HIRAGANA;
} else {
data->flags &= ~UCOL_WAS_HIRAGANA;
}
}
/*
* got a character to determine if there's fcd and/or normalization
* stuff to do.
* if the current character is not fcd.
* if current character is at the start of the string
* Trailing combining class == 0.
* Note if pos is in the writablebuffer, norm is always 0
*/
if (ch < ZERO_CC_LIMIT_ ||
// this should propel us out of the loop in the iterator case
(data->flags & UCOL_ITER_NORM) == 0 ||
(data->fcdPosition != NULL && data->fcdPosition <= data->pos)
|| data->string == data->pos) {
break;
}
if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
/* if next character is FCD */
if (data->pos == data->string) {
/* First char of string is always OK for FCD check */
break;
}
/* Not first char of string, do the FCD fast test */
if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
break;
}
}
/* Need a more complete FCD check and possible normalization. */
if (collPrevIterFCD(data)) {
collPrevIterNormalize(data);
}
if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
/* No normalization. Go ahead and process the char. */
break;
}
/*
Some normalization happened.
Next loop picks up a char from the normalization buffer.
*/
}
/* attempt to handle contractions, after removal of the backwards
contraction
*/
if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
} else {
if (ch <= 0xFF) {
/* Code units up to 0xFF are looked up in the latinOneMapping array. */
result = coll->latinOneMapping[ch];
}
else {
// Always use UCA for [3400..9FFF], [AC00..D7AF]
// **** [FA0E..FA2F] ?? ****
if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
(ch >= 0x3400 && ch <= 0xD7AF)) {
if (ch > 0x9FFF && ch < 0xAC00) {
// between the two target ranges; do normal lookup
// **** this range is YI, Modifier tone letters, ****
// **** Latin-D, Syloti Nagari, Phagas-pa. ****
// **** Latin-D might be tailored, so we need to ****
// **** do the normal lookup for these guys. ****
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
} else {
// in one of the target ranges; skip the tailoring so that the UCA is used
result = UCOL_NOT_FOUND;
}
} else {
result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
}
}
if (result > UCOL_NOT_FOUND) {
/* Values above UCOL_NOT_FOUND are special CEs that need further processing. */
result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
}
if (result == UCOL_NOT_FOUND) { // Not found in master list
if (!isAtStartPrevIterate(data) &&
ucol_contractionEndCP(ch, data->coll))
{
result = UCOL_CONTRACTION;
} else {
if(coll->UCA) {
result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
}
}
if (result > UCOL_NOT_FOUND) {
if(coll->UCA) {
result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
}
}
}
}
if(result == UCOL_NOT_FOUND) {
/* Still no CE: generate an implicit one. */
result = getPrevImplicit(ch, data);
}
}
return result;
}
/* ucol_getPrevCE, out-of-line version for use from other files.  */
/* Simply forwards to the inline ucol_IGetPrevCE().                */
U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator *coll, collIterate *data,
               UErrorCode *status) {
    const uint32_t prevCE = ucol_IGetPrevCE(coll, data, status);
    return prevCE;
}
/* this should be connected to special Jamo handling */
/**
 * Returns the first CE for a single code unit, using a temporary
 * collIterate set up over that one code unit.
 * @param coll collator to use
 * @param u the code unit
 * @param status error code; 0 is returned if setting up the iterate fails
 * @return the first CE produced for u
 */
U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
    collIterate singleCharIter;
    uint32_t firstCE = 0;

    IInit_collIterate(coll, &u, 1, &singleCharIter, status);
    if (!U_FAILURE(*status)) {
        firstCE = ucol_IGetNextCE(coll, &singleCharIter, status);
    }
    return firstCE;
}
/**
* Appends the argument character to the end of the writable buffer, pushing
* back the null terminator.
* @param data collIterate struct data
* @param ch character to be appended
* @return the position of the new addition inside the (terminated) buffer
*/
static
inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
{
    /* The new character lands where the buffer used to end. */
    const int32_t insertAt = data->writableBuffer.length();
    data->writableBuffer.append(ch);
    return data->writableBuffer.getTerminatedBuffer() + insertAt;
}
/**
* Appends the argument string to the end of the writable buffer, pushing
* back the null terminator.
* @param data collIterate struct data
* @param str string to be appended
* @param length of the string to be appended
* @return the position of the new addition inside the (terminated) buffer
*/
static
inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
{
    /* The new text starts where the buffer used to end. */
    const int32_t insertAt = data->writableBuffer.length();
    data->writableBuffer.append(str, length);
    return data->writableBuffer.getTerminatedBuffer() + insertAt;
}
/**
* Special normalization function for contraction in the forwards iterator.
* The text from the current character (at pos - 1) up to fcdPosition is
* normalized and appended to the writable buffer. If the iterator is not yet
* working in the buffer, the buffer is first reset to the single code unit
* that precedes the current character.
* pos will be changed and will now point to a position in the buffer.
* Flags will be changed accordingly.
* @param data collation iterator data
*/
static
inline void normalizeNextContraction(collIterate *data)
{
    /* because the pointer points to the next character */
    const UChar *segmentStart = data->pos - 1;

    /* Number of code units in the buffer that come before the new text. */
    int32_t prefixLength;
    if (data->flags & UCOL_ITER_INNORMBUF) {
        prefixLength = data->writableBuffer.length();
    }
    else {
        data->writableBuffer.setTo(*(segmentStart - 1));
        prefixLength = 1;
    }

    const UChar *segmentLimit = data->fcdPosition;
    UErrorCode errorCode = U_ZERO_ERROR;
    UnicodeString normalized =
        data->nfd->normalize(
            UnicodeString(FALSE, segmentStart, (int32_t)(segmentLimit - segmentStart)),
            errorCode);
    data->writableBuffer.append(normalized);
    if (U_FAILURE(errorCode)) {
        return;
    }

    /* Continue right after the prefix, i.e. at the start of the new text. */
    data->pos = data->writableBuffer.getTerminatedBuffer() + prefixLength;
    data->origFlags = data->flags;
    data->flags |= UCOL_ITER_INNORMBUF;
    data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}
/**
 * Contraction character management function that returns the next character
 * for the forwards iterator.
 * Does nothing special if the next character is in the buffer and is not the
 * first character in it.
 * Else it checks the next character in the data string to see if it is
 * normalizable.
 * If it is not, the character is simply copied into the buffer, else
 * the whole normalized substring is copied into the buffer, including the
 * current character.
 *
 * Note that the character returned may be the end-of-string sentinel; callers
 * are expected to check for the end of the string themselves.
 *
 * @param data collation element iterator data
 * @return next character, or (UChar)-1 when insertBufferEnd() returned NULL
 *         (in that case data->pos is set to NULL)
 */
static
inline UChar getNextNormalizedChar(collIterate *data)
{
    UChar  nextch;
    UChar  ch;

    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
        /* if no normalization and not in buffer. */
        if(data->flags & UCOL_USE_ITERATOR) {
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos ++);
        }
    }

    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((innormbuf && *data->pos != 0) ||
        (data->fcdPosition != NULL && !innormbuf &&
         data->pos < data->fcdPosition)) {
        /*
        if next character is in normalized buffer, no further normalization
        is required
        */
        return *(data->pos ++);
    }

    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        if (data->pos + 1 == data->endp) {
            return *(data->pos ++);
        }
    }
    else {
        if (innormbuf) {
            // inside the normalization buffer, but at the end
            // (since we encountered zero).
            /*
            in writable buffer, at this point fcdPosition can not be
            pointing to the end of the data string. see contracting tag.
            */
            if(data->fcdPosition) {
                /* Compare against endp BEFORE dereferencing, so that a
                   length-delimited string is never read past its end. */
                if (data->fcdPosition + 1 == data->endp ||
                    *(data->fcdPosition + 1) == 0) {
                    /* at the end of the string, dump it into the normalizer */
                    const UChar *appended = insertBufferEnd(data, *(data->fcdPosition));
                    /* Test the returned pointer itself: once 1 has been added
                       to it, a NULL result can no longer be detected. */
                    if (appended == NULL) {
                        data->pos = NULL;
                        return (UChar)-1; // Return to indicate error.
                    }
                    data->pos = appended + 1;
                    return *(data->fcdPosition ++);
                }
                data->pos = data->fcdPosition;
            } else if(data->origFlags & UCOL_USE_ITERATOR) {
                // if we are here, we're using a normalizing iterator.
                // we should just continue further.
                data->flags = data->origFlags;
                data->pos = NULL;
                return (UChar)data->iterator->next(data->iterator);
            }
        }
        else {
            if (*(data->pos + 1) == 0) {
                return *(data->pos ++);
            }
        }
    }

    ch = *data->pos ++;
    nextch = *data->pos;

    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        if (collIterFCD(data)) {
            normalizeNextContraction(data);
            return *(data->pos ++);
        }
        else if (innormbuf) {
            /* fcdposition shifted even when there's no normalization, if we
            don't input the rest into this, we'll get the wrong position when
            we reach the end of the writableBuffer */
            int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
            data->pos = insertBufferEnd(data, data->pos - 1, length);
            // Check if data->pos received a null pointer
            if (data->pos == NULL) {
                return (UChar)-1; // Return to indicate error.
            }
            return *(data->pos ++);
        }
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        const UChar *appended = insertBufferEnd(data, ch);
        /* As above: check for NULL before doing pointer arithmetic. */
        if (appended == NULL) {
            data->pos = NULL;
            return (UChar)-1; // Return to indicate error.
        }
        data->pos = appended + 1;
    }

    /* points back to the pos in string */
    return ch;
}
/**
 * Places the characters collected in buffer into the writable (normalization)
 * buffer and repositions the iterator at the start of that buffer, so the
 * skipped characters are processed later by the regular CE lookup code
 * without that code needing any changes.
 *
 * If the iterator is still reading from the data string, the current string
 * position is remembered in fcdPosition, the flags are saved in origFlags and
 * the iterator is switched to buffer mode with a copy of buffer.
 * If the iterator is already reading from the writable buffer, the part of
 * that buffer in front of pos is replaced by buffer.
 *
 * @param source data string source
 * @param buffer character buffer
 */
static
inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
{
    if ((source->flags & UCOL_ITER_INNORMBUF) == 0) {
        /* reading from the string: remember where we were and switch over */
        source->fcdPosition = source->pos;
        source->origFlags = source->flags;
        source->flags |= UCOL_ITER_INNORMBUF;
        source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
        source->writableBuffer = buffer;
    }
    else {
        /* reading from the buffer: swap everything before pos for the
           collected characters */
        int32_t consumedLength =
            (int32_t)(source->pos - source->writableBuffer.getBuffer());
        source->writableBuffer.replace(0, consumedLength, buffer);
    }

    /* in both cases reading resumes at the front of the writable buffer */
    source->pos = source->writableBuffer.getTerminatedBuffer();
}
/**
* Function to get the discontiguous collation element within the source.
* Note this function will set the position to the appropriate places.
*
* Characters that are read but do not take part in the match are collected
* in a local buffer. When a match is found, that buffer is handed to
* setDiscontiguosAttribute() so the skipped characters are processed later.
* When no match is found, the iterator state saved on entry is restored and
* the CE stored for constart itself is returned.
*
* @param coll current collator used
* @param source data string source
* @param constart index to the start character in the contraction table
* @return discontiguous collation element offset
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
const UChar *constart)
{
/* source->pos currently points to the second combining character after
the start character */
const UChar *temppos = source->pos;
/* collects the characters that were skipped over while matching */
UnicodeString buffer;
/* start of the contraction table currently being searched */
const UChar *tempconstart = constart;
/* flags on entry, restored on the no-match path at the bottom */
uint8_t tempflags = source->flags;
/* set once a nested contraction table has been entered whose starting
entry has a usable CE (see the isContraction() branch below) */
UBool multicontraction = FALSE;
collIterateState discState;
backupState(source, &discState);
/* the first skipped character is the one just before the current position */
buffer.setTo(peekCharacter(source, -1));
for (;;) {
UChar *UCharOffset;
UChar schar,
tchar;
uint32_t result;
/* Stop scanning when the end of a length-delimited string is reached,
when a terminating zero is seen (see the note inside the condition),
or when the next character has combining class 0. */
if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
|| (peekCharacter(source, 0) == 0 &&
//|| (*source->pos == 0 &&
((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
source->fcdPosition == NULL ||
source->fcdPosition == source->endp ||
*(source->fcdPosition) == 0 ||
u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
/* end of string in null terminated string or stopped by a
null character, note fcd does not always point to a base
character after the discontiguous change */
u_getCombiningClass(peekCharacter(source, 0)) == 0) {
//u_getCombiningClass(*(source->pos)) == 0) {
//constart = (UChar *)coll->image + getContractOffset(CE);
if (multicontraction) {
/* a partial (nested) match was recorded earlier: go back to the
position remembered for it and return the CE of that table's start */
source->pos = temppos - 1;
setDiscontiguosAttribute(source, buffer);
return *(coll->contractionCEs +
(tempconstart - coll->contractionIndex));
}
constart = tempconstart;
break;
}
UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
schar = getNextNormalizedChar(source);
/* advance through the table entries until one that is >= schar is found */
while (schar > (tchar = *UCharOffset)) {
UCharOffset++;
}
if (schar != tchar) {
/* not the correct codepoint. we stuff the current codepoint into
the discontiguous buffer and try the next character */
buffer.append(schar);
continue;
}
else {
/* the character is in the table, but it is treated as skipped when its
combining class equals that of the character fetched by
peekCharacter(source, -2) */
if (u_getCombiningClass(schar) ==
u_getCombiningClass(peekCharacter(source, -2))) {
//u_getCombiningClass(*(source->pos - 2))) {
buffer.append(schar);
continue;
}
result = *(coll->contractionCEs +
(UCharOffset - coll->contractionIndex));
}
if (result == UCOL_NOT_FOUND) {
break;
} else if (isContraction(result)) {
/* this is a multi-contraction*/
tempconstart = (UChar *)coll->image + getContractOffset(result);
/* only remember this point as a fallback when the CE stored for
constart is not UCOL_NOT_FOUND */
if (*(coll->contractionCEs + (constart - coll->contractionIndex))
!= UCOL_NOT_FOUND) {
multicontraction = TRUE;
temppos = source->pos + 1;
}
} else {
/* complete match: queue the skipped characters and return its CE */
setDiscontiguosAttribute(source, buffer);
return result;
}
}
/* no problems simply reverting just like that,
if we are in string before getting into this function, points back to
string hence no problem.
if we are in normalization buffer before getting into this function,
since we'll never use another normalization within this function, we
know that fcdposition points to a base character. the normalization buffer
never change, hence this revert works. */
loadState(source, &discState, TRUE);
goBackOne(source);
//source->pos = temppos - 1;
source->flags = tempflags;
return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
/*
 * Returns nonzero when cp is one of the code points that getImplicit()
 * refuses to build an implicit CE for:
 *   - any code point whose low 16 bits are FFFE or FFFF,
 *   - U+FDD0..U+FDEF,
 *   - the surrogate range U+D800..U+DFFF.
 */
static
inline UBool isNonChar(UChar32 cp) {
    const int endsInFFFEorFFFF = (cp & 0xFFFE) == 0xFFFE;
    const int inFDD0Range = cp >= 0xFDD0 && cp <= 0xFDEF;
    const int inSurrogateRange = cp >= 0xD800 && cp <= 0xDFFF;

    return (UBool)(endsInFFFEorFFFF || inFDD0Range || inSurrogateRange);
}
/*
 * Builds the implicit collation elements for cp, using
 * uprv_uca_getImplicitPrimary() (Mark's getImplicitPrimary code).
 *
 * Returns 0 without touching collationSource when isNonChar(cp) is true.
 * Otherwise one CE built from the low 16 bits of the implicit primary is
 * appended at collationSource->CEpos, offsetRepeatCount is incremented, and
 * the first CE is returned.
 */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    if (!isNonChar(cp)) {
        const uint32_t implicitPrimary = uprv_uca_getImplicitPrimary(cp);

        /* second CE: low half of the primary moved into the top 16 bits;
           0xC0 is presumably the continuation marker - TODO confirm */
        *(collationSource->CEpos++) =
            ((implicitPrimary & 0x0000FFFF) << 16) | 0x000000C0;
        collationSource->offsetRepeatCount += 1;

        /* first CE: primary bits plus 0x0505 in the low 16 bits
           (this was 'order') */
        return (implicitPrimary & UCOL_PRIMARYMASK) | 0x00000505;
    }
    return 0;
}
/**
 * Puts the argument character at the front of the writable buffer.
 * The character at index 0 of the buffer (the front null terminator) is
 * overwritten with ch, a new NUL is then inserted in front of it, and
 * data->pos is set to index 2 of the resulting terminated buffer.
 * @param data collation element iterator data
 * @param ch character to be placed at the front
 */
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    /* replace the leading NUL by ch ... */
    data->writableBuffer.setCharAt(0, ch);
    /* ... and put a fresh NUL in front of it */
    data->writableBuffer.insert(0, (UChar)0);
    data->pos = data->writableBuffer.getTerminatedBuffer() + 2;
}
/**
 * Special normalization function for contractions in the backwards iterator.
 * Normalizes the text from just after fcdPosition (or from the start of the
 * string when fcdPosition is NULL) up to and including the character at
 * data->pos, and stores it in the writable buffer as
 *     NUL + normalized text + previous tail.
 * pos then points just past the normalized text inside the buffer and the
 * flags are switched to buffer mode; the old flags are kept in origFlags.
 * If normalization fails, pos and the flags are left unchanged.
 * @param data collation iterator data
 * @param status error code, set by the normalizer on failure
 */
static
inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
{
    /* one past the last character to normalize */
    const UChar *segmentLimit = data->pos + 1;
    /* text that has to follow the normalized segment in the buffer */
    UnicodeString tail;

    if (data->flags & UCOL_ITER_HASLEN) {
        /*
        normalization buffer not used yet, we'll pull down the next
        character into the end of the buffer
        */
        tail.setTo(*segmentLimit);
    }
    else {
        /* keep the current buffer contents after the leading NUL */
        tail.setTo(data->writableBuffer, 1);
    }

    const UChar *segmentStart =
        (data->fcdPosition == NULL) ? data->string : data->fcdPosition + 1;

    const UnicodeString segment(FALSE, segmentStart,
                                (int32_t)(segmentLimit - segmentStart));
    int32_t normLen =
        data->nfd->normalize(segment, data->writableBuffer, *status).length();
    if (U_FAILURE(*status)) {
        return;
    }

    /*
    this puts the null termination in front of the normalized string instead
    of the end
    */
    data->writableBuffer.insert(0, (UChar)0);
    data->writableBuffer.append(tail);
    data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;

    data->origFlags = data->flags;
    data->flags |= UCOL_ITER_INNORMBUF;
    data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}
/**
* Contraction character management function that returns the previous character
* for the backwards iterator.
* Does nothing if the previous character is in buffer and not the first
* character in it.
* Else it checks previous character in data string to see if it is
* normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* current character.
* @param data collation element iterator data
* @param status error code; only passed on to normalizePrevContraction()
* @return previous character
*/
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
UChar prevch;
UChar ch;
const UChar *start;
/* nonzero when the iterator is currently reading from the writable buffer */
UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
(innormbuf && *(data->pos - 1) != 0)) {
/*
if no normalization.
if previous character is in normalized buffer, no further normalization
is required
*/
if(data->flags & UCOL_USE_ITERATOR) {
/* step the character iterator back by one and re-read that character,
which leaves the iterator at its original index */
data->iterator->move(data->iterator, -1, UITER_CURRENT);
return (UChar)data->iterator->next(data->iterator);
} else {
return *(data->pos - 1);
}
}
start = data->pos;
if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
/* in data string */
if ((start - 1) == data->string) {
/* the previous character is the first one of the string */
return *(start - 1);
}
start --;
ch = *start;
prevch = *(start - 1);
}
else {
/*
in writable buffer, at this point fcdPosition can not be NULL.
see contracting tag.
*/
if (data->fcdPosition == data->string) {
/* at the start of the string, just dump it into the normalizer */
insertBufferFront(data, *(data->fcdPosition));
data->fcdPosition = NULL;
return *(data->pos - 1);
}
start = data->fcdPosition;
ch = *start;
prevch = *(start - 1);
}
/*
* if the current character is not fcd.
* Trailing combining class == 0.
*/
/* NOTE(review): when the branch above took start from fcdPosition, the
comparison fcdPosition > start is false, so this block is skipped there. */
if (data->fcdPosition > start &&
(ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
{
/*
Need a more complete FCD check and possible normalization.
normalize substring will be appended to buffer
*/
const UChar *backuppos = data->pos;
data->pos = start;
if (collPrevIterFCD(data)) {
normalizePrevContraction(data, status);
return *(data->pos - 1);
}
/* no normalization was done: restore pos; fcdPosition is bumped here,
presumably to compensate for a change made by collPrevIterFCD() -
TODO confirm */
data->pos = backuppos;
data->fcdPosition ++;
}
if (innormbuf) {
/*
no normalization is to be done hence only one character will be
appended to the buffer.
*/
insertBufferFront(data, ch);
data->fcdPosition --;
}
return ch;
}
/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
/* It is called by getNextCE */
/* The following should be even */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254
uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
collIterateState entryState;
backupState(source, &entryState);
UChar32 cp = ch;
for (;;) {
// This loop will repeat only in the case of contractions, and only when a contraction
// is found and the first CE resulting from that contraction is itself a special
// (an expansion, for example.) All other special CE types are fully handled the
// first time through, and the loop exits.
const uint32_t *CEOffset = NULL;
switch(getCETag(CE)) {
case NOT_FOUND_TAG:
/* This one is not found, and we'll let somebody else bother about it... no more games */
return CE;
case SPEC_PROC_TAG:
{
// Special processing is getting a CE that is preceded by a certain prefix
// Currently this is only needed for optimizing Japanese length and iteration marks.
// When we encouter a special processing tag, we go backwards and try to see if
// we have a match.
// Contraction tables are used - so the whole process is not unlike contraction.
// prefix data is stored backwards in the table.
const UChar *UCharOffset;
UChar schar, tchar;
collIterateState prefixState;
backupState(source, &prefixState);
loadState(source, &entryState, TRUE);
goBackOne(source); // We want to look at the point where we entered - actually one
// before that...
for(;;) {
// This loop will run once per source string character, for as long as we
// are matching a potential contraction sequence
// First we position ourselves at the begining of contraction sequence
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
if (collIter_bos(source)) {
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
break;
}
schar = getPrevNormalizedChar(source, status);
goBackOne(source);
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset++;
}
if (schar == tchar) {
// Found the source string char in the table.
// Pick up the corresponding CE from the table.
CE = *(coll->contractionCEs +
(UCharOffset - coll->contractionIndex));
}
else
{
// Source string char was not in the table.
// We have not found the prefix.
CE = *(coll->contractionCEs +
(ContractionStart - coll->contractionIndex));
}
if(!isPrefix(CE)) {
// The source string char was in the contraction table, and the corresponding
// CE is not a prefix CE. We found the prefix, break
// out of loop, this CE will end up being returned. This is the normal
// way out of prefix handling when the source actually contained
// the prefix.
break;
}
}
if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
loadState(source, &prefixState, TRUE);
if(source->origFlags & UCOL_USE_ITERATOR) {
source->flags = source->origFlags;
}
} else { // prefix search was a failure, we have to backup all the way to the start
loadState(source, &entryState, TRUE);
}
break;
}
case CONTRACTION_TAG:
{
/* This should handle contractions */
collIterateState state;
backupState(source, &state);
uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
const UChar *UCharOffset;
UChar schar, tchar;
for (;;) {
/* This loop will run once per source string character, for as long as we */
/* are matching a potential contraction sequence */
/* First we position ourselves at the begining of contraction sequence */
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
if (collIter_eos(source)) {
// Ran off the end of the source string.
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
// So we'll pick whatever we have at the point...
if (CE == UCOL_NOT_FOUND) {
// back up the source over all the chars we scanned going into this contraction.
CE = firstCE;
loadState(source, &state, TRUE);
if(source->origFlags & UCOL_USE_ITERATOR) {
source->flags = source->origFlags;
}
}
break;
}
uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
schar = getNextNormalizedChar(source);
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset++;
}
if (schar == tchar) {
// Found the source string char in the contraction table.
// Pick up the corresponding CE from the table.
CE = *(coll->contractionCEs +
(UCharOffset - coll->contractionIndex));
}
else
{
// Source string char was not in contraction table.
// Unless we have a discontiguous contraction, we have finished
// with this contraction.
// in order to do the proper detection, we
// need to see if we're dealing with a supplementary
/* We test whether the next two char are surrogate pairs.
* This test is done if the iterator is not NULL.
* If there is no surrogate pair, the iterator
* goes back one if needed. */
UChar32 miss = schar;
if (source->iterator) {
UChar32 surrNextChar; /* the next char in the iteration to test */
int32_t prevPos; /* holds the previous position before move forward of the source iterator */
if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
prevPos = source->iterator->index;
surrNextChar = getNextNormalizedChar(source);
if (U16_IS_TRAIL(surrNextChar)) {
miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
} else if (prevPos < source->iterator->index){
goBackOne(source);
}
}
} else if (U16_IS_LEAD(schar)) {
miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
}
uint8_t sCC;
if (miss < 0x300 ||
maxCC == 0 ||
(sCC = i_getCombiningClass(miss, coll)) == 0 ||
sCC>maxCC ||
(allSame != 0 && sCC == maxCC) ||
collIter_eos(source))
{
// Contraction can not be discontiguous.
goBackOne(source); // back up the source string by one,
// because the character we just looked at was
// not part of the contraction. */
if(U_IS_SUPPLEMENTARY(miss)) {
goBackOne(source);
}
CE = *(coll->contractionCEs +
(ContractionStart - coll->contractionIndex));
} else {
//
// Contraction is possibly discontiguous.
// Scan more of source string looking for a match
//
UChar tempchar;
/* find the next character if schar is not a base character
and we are not yet at the end of the string */
tempchar = getNextNormalizedChar(source);
// probably need another supplementary thingie here
goBackOne(source);
if (i_getCombiningClass(tempchar, coll) == 0) {
goBackOne(source);
if(U_IS_SUPPLEMENTARY(miss)) {
goBackOne(source);
}
/* Spit out the last char of the string, wasn't tasty enough */
CE = *(coll->contractionCEs +
(ContractionStart - coll->contractionIndex));
} else {
CE = getDiscontiguous(coll, source, ContractionStart);
}
}
} // else after if(schar == tchar)
if(CE == UCOL_NOT_FOUND) {
/* The Source string did not match the contraction that we were checking. */
/* Back up the source position to undo the effects of having partially */
/* scanned through what ultimately proved to not be a contraction. */
loadState(source, &state, TRUE);
CE = firstCE;
break;
}
if(!isContraction(CE)) {
// The source string char was in the contraction table, and the corresponding
// CE is not a contraction CE. We completed the contraction, break
// out of loop, this CE will end up being returned. This is the normal
// way out of contraction handling when the source actually contained
// the contraction.
break;
}
// The source string char was in the contraction table, and the corresponding
// CE is IS a contraction CE. We will continue looping to check the source
// string for the remaining chars in the contraction.
uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
if(tempCE != UCOL_NOT_FOUND) {
// We have scanned a a section of source string for which there is a
// CE from the contraction table. Remember the CE and scan position, so
// that we can return to this point if further scanning fails to
// match a longer contraction sequence.
firstCE = tempCE;
goBackOne(source);
backupState(source, &state);
getNextNormalizedChar(source);
// Another way to do this is:
//collIterateState tempState;
//backupState(source, &tempState);
//goBackOne(source);
//backupState(source, &state);
//loadState(source, &tempState, TRUE);
// The problem is that for incomplete contractions we have to remember the previous
// position. Before, the only thing I needed to do was state.pos--;
// After iterator introduction and especially after introduction of normalizing
// iterators, it became much more difficult to decrease the saved state.
// I'm not yet sure which of the two methods above is faster.
}
} // for(;;)
break;
} // case CONTRACTION_TAG:
case LONG_PRIMARY_TAG:
{
*(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
source->offsetRepeatCount += 1;
return CE;
}
case EXPANSION_TAG:
{
/* This should handle expansion. */
/* NOTE: we can encounter both continuations and expansions in an expansion! */
/* I have to decide where continuations are going to be dealt with */
uint32_t size;
uint32_t i; /* general counter */
CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
size = getExpansionCount(CE);
CE = *CEOffset++;
//source->offsetRepeatCount = -1;
if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
for(i = 1; i<size; i++) {
*(source->CEpos++) = *CEOffset++;
source->offsetRepeatCount += 1;
}
} else { /* else, we do */
while(*CEOffset != 0) {
*(source->CEpos++) = *CEOffset++;
source->offsetRepeatCount += 1;
}
}
return CE;
}
case DIGIT_TAG:
{
/*
We do a check to see if we want to collate digits as numbers; if so we generate
a custom collation key. Otherwise we pull out the value stored in the expansion table.
*/
//uint32_t size;
uint32_t i; /* general counter */
if (source->coll->numericCollation == UCOL_ON){
collIterateState digitState = {0,0,0,0,0,0,0,0,0};
UChar32 char32 = 0;
int32_t digVal = 0;
uint32_t digIndx = 0;
uint32_t endIndex = 0;
uint32_t trailingZeroIndex = 0;
uint8_t collateVal = 0;
UBool nonZeroValReached = FALSE;
uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
/*
We parse the source string until we hit a char that's NOT a digit.
Use this u_charDigitValue. This might be slow because we have to
handle surrogates...
*/
/*
if (U16_IS_LEAD(ch)){
if (!collIter_eos(source)) {
backupState(source, &digitState);
UChar trail = getNextNormalizedChar(source);
if(U16_IS_TRAIL(trail)) {
char32 = U16_GET_SUPPLEMENTARY(ch, trail);
} else {
loadState(source, &digitState, TRUE);
char32 = ch;
}
} else {
char32 = ch;
}
} else {
char32 = ch;
}
digVal = u_charDigitValue(char32);
*/
digVal = u_charDigitValue(cp); // if we have arrived here, we have
// already processed possible supplementaries that trigered the digit tag -
// all supplementaries are marked in the UCA.
/*
We pad a zero in front of the first element anyways. This takes
care of the (probably) most common case where people are sorting things followed
by a single digit
*/
digIndx++;
for(;;){
// Make sure we have enough space. No longer needed;
// at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
// (it has been pre-incremented) so we just ensure that numTempBuf is big enough
// (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
// Skipping over leading zeroes.
if (digVal != 0) {
nonZeroValReached = TRUE;
}
if (nonZeroValReached) {
/*
We parse the digit string into base 100 numbers (this fits into a byte).
We only add to the buffer in twos, thus if we are parsing an odd character,
that serves as the 'tens' digit while the if we are parsing an even one, that
is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
than all the other bytes.
*/
if (digIndx % 2 == 1){
collateVal += (uint8_t)digVal;
// We don't enter the low-order-digit case unless we've already seen
// the high order, or for the first digit, which is always non-zero.
if (collateVal != 0)
trailingZeroIndex = 0;
numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
collateVal = 0;
}
else{
// We drop the collation value into the buffer so if we need to do
// a "front patch" we don't have to check to see if we're hitting the
// last element.
collateVal = (uint8_t)(digVal * 10);
// Check for trailing zeroes.
if (collateVal == 0)
{
if (!trailingZeroIndex)
trailingZeroIndex = (digIndx/2) + 2;
}
else
trailingZeroIndex = 0;
numTempBuf[(digIndx/2) + 2] = collateVal*