| /* |
| ********************************************************************** |
| * Copyright (C) 1999-2008, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 11/17/99 aliu Creation. |
| ********************************************************************** |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_TRANSLITERATION |
| |
| #include "unicode/unifilt.h" |
| #include "unicode/uniset.h" |
| #include "cpdtrans.h" |
| #include "uvector.h" |
| #include "tridpars.h" |
| #include "cmemory.h" |
| |
| // keep in sync with Transliterator |
| //static const UChar ID_SEP = 0x002D; /*-*/ |
| static const UChar ID_DELIM = 0x003B; /*;*/ |
| static const UChar NEWLINE = 10; |
| |
| // Empty string |
| static const UChar EMPTY[] = {0}; //"" |
| static const UChar COLON_COLON[] = {0x3A, 0x3A, 0}; //"::" |
| |
| U_NAMESPACE_BEGIN |
| |
| const UChar CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass" |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator) |
| |
| /** |
| * Constructs a new compound transliterator given an array of |
| * transliterators. The array of transliterators may be of any |
| * length, including zero or one, however, useful compound |
| * transliterators have at least two components. |
| * @param transliterators array of <code>Transliterator</code> |
| * objects |
| * @param transliteratorCount The number of |
| * <code>Transliterator</code> objects in transliterators. |
| * @param filter the filter. Any character for which |
| * <tt>filter.contains()</tt> returns <tt>false</tt> will not be |
| * altered by this transliterator. If <tt>filter</tt> is |
| * <tt>null</tt> then no filtering is applied. |
| */ |
| CompoundTransliterator::CompoundTransliterator( |
| Transliterator* const transliterators[], |
| int32_t transliteratorCount, |
| UnicodeFilter* adoptedFilter) : |
| Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter), |
| trans(0), count(0), numAnonymousRBTs(0) { |
| setTransliterators(transliterators, transliteratorCount); |
| } |
| |
| /** |
| * Splits an ID of the form "ID;ID;..." into a compound using each |
| * of the IDs. |
| * @param id of above form |
| * @param forward if false, does the list in reverse order, and |
| * takes the inverse of each ID. |
| */ |
| CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
| UTransDirection direction, |
| UnicodeFilter* adoptedFilter, |
| UParseError& /*parseError*/, |
| UErrorCode& status) : |
| Transliterator(id, adoptedFilter), |
| trans(0), numAnonymousRBTs(0) { |
| // TODO add code for parseError...currently unused, but |
| // later may be used by parsing code... |
| init(id, direction, TRUE, status); |
| } |
| |
| CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, |
| UParseError& /*parseError*/, |
| UErrorCode& status) : |
| Transliterator(id, 0), // set filter to 0 here! |
| trans(0), numAnonymousRBTs(0) { |
| // TODO add code for parseError...currently unused, but |
| // later may be used by parsing code... |
| init(id, UTRANS_FORWARD, TRUE, status); |
| } |
| |
| |
| /** |
| * Private constructor for use of TransliteratorAlias |
| */ |
| CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID, |
| UVector& list, |
| UnicodeFilter* adoptedFilter, |
| int32_t anonymousRBTs, |
| UParseError& /*parseError*/, |
| UErrorCode& status) : |
| Transliterator(newID, adoptedFilter), |
| trans(0), numAnonymousRBTs(anonymousRBTs) |
| { |
| init(list, UTRANS_FORWARD, FALSE, status); |
| } |
| |
| /** |
| * Private constructor for Transliterator from a vector of |
| * transliterators. The caller is responsible for fixing up the |
| * ID. |
| */ |
| CompoundTransliterator::CompoundTransliterator(UVector& list, |
| UParseError& /*parseError*/, |
| UErrorCode& status) : |
| Transliterator(EMPTY, NULL), |
| trans(0), numAnonymousRBTs(0) |
| { |
| // TODO add code for parseError...currently unused, but |
| // later may be used by parsing code... |
| init(list, UTRANS_FORWARD, FALSE, status); |
| // assume caller will fixup ID |
| } |
| |
| CompoundTransliterator::CompoundTransliterator(UVector& list, |
| int32_t anonymousRBTs, |
| UParseError& /*parseError*/, |
| UErrorCode& status) : |
| Transliterator(EMPTY, NULL), |
| trans(0), numAnonymousRBTs(anonymousRBTs) |
| { |
| init(list, UTRANS_FORWARD, FALSE, status); |
| } |
| |
| /** |
| * Finish constructing a transliterator: only to be called by |
| * constructors. Before calling init(), set trans and filter to NULL. |
| * @param id the id containing ';'-separated entries |
| * @param direction either FORWARD or REVERSE |
| * @param idSplitPoint the index into id at which the |
| * adoptedSplitTransliterator should be inserted, if there is one, or |
| * -1 if there is none. |
| * @param adoptedSplitTransliterator a transliterator to be inserted |
| * before the entry at offset idSplitPoint in the id string. May be |
| * NULL to insert no entry. |
| * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
| * entries by calling getID() of component entries. Some constructors |
| * do not require this because they apply a facade ID anyway. |
| * @param status the error code indicating success or failure |
| */ |
| void CompoundTransliterator::init(const UnicodeString& id, |
| UTransDirection direction, |
| UBool fixReverseID, |
| UErrorCode& status) { |
| // assert(trans == 0); |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| UVector list(status); |
| UnicodeSet* compoundFilter = NULL; |
| UnicodeString regenID; |
| if (!TransliteratorIDParser::parseCompoundID(id, direction, |
| regenID, list, compoundFilter)) { |
| status = U_INVALID_ID; |
| delete compoundFilter; |
| return; |
| } |
| |
| TransliteratorIDParser::instantiateList(list, status); |
| |
| init(list, direction, fixReverseID, status); |
| |
| if (compoundFilter != NULL) { |
| adoptFilter(compoundFilter); |
| } |
| } |
| |
| /** |
| * Finish constructing a transliterator: only to be called by |
| * constructors. Before calling init(), set trans and filter to NULL. |
| * @param list a vector of transliterator objects to be adopted. It |
| * should NOT be empty. The list should be in declared order. That |
| * is, it should be in the FORWARD order; if direction is REVERSE then |
| * the list order will be reversed. |
| * @param direction either FORWARD or REVERSE |
| * @param fixReverseID if TRUE, then reconstruct the ID of reverse |
| * entries by calling getID() of component entries. Some constructors |
| * do not require this because they apply a facade ID anyway. |
| * @param status the error code indicating success or failure |
| */ |
| void CompoundTransliterator::init(UVector& list, |
| UTransDirection direction, |
| UBool fixReverseID, |
| UErrorCode& status) { |
| // assert(trans == 0); |
| |
| // Allocate array |
| if (U_SUCCESS(status)) { |
| count = list.size(); |
| trans = (Transliterator **)uprv_malloc(count * sizeof(Transliterator *)); |
| /* test for NULL */ |
| if (trans == 0) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| } |
| |
| if (U_FAILURE(status) || trans == 0) { |
| // assert(trans == 0); |
| return; |
| } |
| |
| // Move the transliterators from the vector into an array. |
| // Reverse the order if necessary. |
| int32_t i; |
| for (i=0; i<count; ++i) { |
| int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i; |
| trans[i] = (Transliterator*) list.elementAt(j); |
| } |
| |
| // If the direction is UTRANS_REVERSE then we may need to fix the |
| // ID. |
| if (direction == UTRANS_REVERSE && fixReverseID) { |
| UnicodeString newID; |
| for (i=0; i<count; ++i) { |
| if (i > 0) { |
| newID.append(ID_DELIM); |
| } |
| newID.append(trans[i]->getID()); |
| } |
| setID(newID); |
| } |
| |
| computeMaximumContextLength(); |
| } |
| |
| /** |
| * Return the IDs of the given list of transliterators, concatenated |
| * with ID_DELIM delimiting them. Equivalent to the perlish expression |
| * join(ID_DELIM, map($_.getID(), transliterators). |
| */ |
| UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[], |
| int32_t transCount) { |
| UnicodeString id; |
| for (int32_t i=0; i<transCount; ++i) { |
| if (i > 0) { |
| id.append(ID_DELIM); |
| } |
| id.append(transliterators[i]->getID()); |
| } |
| return id; // Return temporary |
| } |
| |
| /** |
| * Copy constructor. |
| */ |
| CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) : |
| Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) { |
| *this = t; |
| } |
| |
| /** |
| * Destructor |
| */ |
| CompoundTransliterator::~CompoundTransliterator() { |
| freeTransliterators(); |
| } |
| |
| void CompoundTransliterator::freeTransliterators(void) { |
| if (trans != 0) { |
| for (int32_t i=0; i<count; ++i) { |
| delete trans[i]; |
| } |
| uprv_free(trans); |
| } |
| trans = 0; |
| count = 0; |
| } |
| |
| /** |
| * Assignment operator. |
| */ |
| CompoundTransliterator& CompoundTransliterator::operator=( |
| const CompoundTransliterator& t) |
| { |
| Transliterator::operator=(t); |
| int32_t i = 0; |
| UBool failed = FALSE; |
| if (trans != NULL) { |
| for (i=0; i<count; ++i) { |
| delete trans[i]; |
| trans[i] = 0; |
| } |
| } |
| if (t.count > count) { |
| if (trans != NULL) { |
| uprv_free(trans); |
| } |
| trans = (Transliterator **)uprv_malloc(t.count * sizeof(Transliterator *)); |
| } |
| count = t.count; |
| if (trans != NULL) { |
| for (i=0; i<count; ++i) { |
| trans[i] = t.trans[i]->clone(); |
| if (trans[i] == NULL) { |
| failed = TRUE; |
| break; |
| } |
| } |
| } |
| |
| // if memory allocation failed delete backwards trans array |
| if (failed && i > 0) { |
| int32_t n; |
| for (n = i-1; n >= 0; n--) { |
| uprv_free(trans[n]); |
| trans[n] = NULL; |
| } |
| } |
| numAnonymousRBTs = t.numAnonymousRBTs; |
| return *this; |
| } |
| |
| /** |
| * Transliterator API. |
| */ |
| Transliterator* CompoundTransliterator::clone(void) const { |
| return new CompoundTransliterator(*this); |
| } |
| |
| /** |
| * Returns the number of transliterators in this chain. |
| * @return number of transliterators in this chain. |
| */ |
| int32_t CompoundTransliterator::getCount(void) const { |
| return count; |
| } |
| |
| /** |
| * Returns the transliterator at the given index in this chain. |
| * @param index index into chain, from 0 to <code>getCount() - 1</code> |
| * @return transliterator at the given index |
| */ |
| const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const { |
| return *trans[index]; |
| } |
| |
| void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[], |
| int32_t transCount) { |
| Transliterator** a = (Transliterator **)uprv_malloc(transCount * sizeof(Transliterator *)); |
| if (a == NULL) { |
| return; |
| } |
| int32_t i = 0; |
| UBool failed = FALSE; |
| for (i=0; i<transCount; ++i) { |
| a[i] = transliterators[i]->clone(); |
| if (a[i] == NULL) { |
| failed = TRUE; |
| break; |
| } |
| } |
| if (failed && i > 0) { |
| int32_t n; |
| for (n = i-1; n >= 0; n--) { |
| uprv_free(a[n]); |
| a[n] = NULL; |
| } |
| return; |
| } |
| adoptTransliterators(a, transCount); |
| } |
| |
| void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[], |
| int32_t transCount) { |
| // First free trans[] and set count to zero. Once this is done, |
| // orphan the filter. Set up the new trans[]. |
| freeTransliterators(); |
| trans = adoptedTransliterators; |
| count = transCount; |
| computeMaximumContextLength(); |
| setID(joinIDs(trans, count)); |
| } |
| |
| /** |
| * Append c to buf, unless buf is empty or buf already ends in c. |
| */ |
| static void _smartAppend(UnicodeString& buf, UChar c) { |
| if (buf.length() != 0 && |
| buf.charAt(buf.length() - 1) != c) { |
| buf.append(c); |
| } |
| } |
| |
| UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource, |
| UBool escapeUnprintable) const { |
| // We do NOT call toRules() on our component transliterators, in |
| // general. If we have several rule-based transliterators, this |
| // yields a concatenation of the rules -- not what we want. We do |
| // handle compound RBT transliterators specially -- those for which |
| // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex, |
| // we do call toRules() recursively. |
| rulesSource.truncate(0); |
| if (numAnonymousRBTs >= 1 && getFilter() != NULL) { |
| // If we are a compound RBT and if we have a global |
| // filter, then emit it at the top. |
| UnicodeString pat; |
| rulesSource.append(COLON_COLON).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM); |
| } |
| for (int32_t i=0; i<count; ++i) { |
| UnicodeString rule; |
| |
| // Anonymous RuleBasedTransliterators (inline rules and |
| // ::BEGIN/::END blocks) are given IDs that begin with |
| // "%Pass": use toRules() to write all the rules to the output |
| // (and insert "::Null;" if we have two in a row) |
| if (trans[i]->getID().startsWith(PASS_STRING)) { |
| trans[i]->toRules(rule, escapeUnprintable); |
| if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING)) |
| rule = UNICODE_STRING_SIMPLE("::Null;") + rule; |
| |
| // we also use toRules() on CompoundTransliterators (which we |
| // check for by looking for a semicolon in the ID)-- this gets |
| // the list of their child transliterators output in the right |
| // format |
| } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) { |
| trans[i]->toRules(rule, escapeUnprintable); |
| |
| // for everything else, use Transliterator::toRules() |
| } else { |
| trans[i]->Transliterator::toRules(rule, escapeUnprintable); |
| } |
| _smartAppend(rulesSource, NEWLINE); |
| rulesSource.append(rule); |
| _smartAppend(rulesSource, ID_DELIM); |
| } |
| return rulesSource; |
| } |
| |
| /** |
| * Implement Transliterator framework |
| */ |
| void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const { |
| UnicodeSet set; |
| result.clear(); |
| for (int32_t i=0; i<count; ++i) { |
| result.addAll(trans[i]->getSourceSet(set)); |
| // Take the example of Hiragana-Latin. This is really |
| // Hiragana-Katakana; Katakana-Latin. The source set of |
| // these two is roughly [:Hiragana:] and [:Katakana:]. |
| // But the source set for the entire transliterator is |
| // actually [:Hiragana:] ONLY -- that is, the first |
| // non-empty source set. |
| |
| // This is a heuristic, and not 100% reliable. |
| if (!result.isEmpty()) { |
| break; |
| } |
| } |
| } |
| |
| /** |
| * Override Transliterator framework |
| */ |
| UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const { |
| UnicodeSet set; |
| result.clear(); |
| for (int32_t i=0; i<count; ++i) { |
| // This is a heuristic, and not 100% reliable. |
| result.addAll(trans[i]->getTargetSet(set)); |
| } |
| return result; |
| } |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| */ |
| void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, |
| UBool incremental) const { |
| /* Call each transliterator with the same contextStart and |
| * start, but with the limit as modified |
| * by preceding transliterators. The start index must be |
| * reset for each transliterator to give each a chance to |
| * transliterate the text. The initial contextStart index is known |
| * to still point to the same place after each transliterator |
| * is called because each transliterator will not change the |
| * text between contextStart and the initial start index. |
| * |
| * IMPORTANT: After the first transliterator, each subsequent |
| * transliterator only gets to transliterate text committed by |
| * preceding transliterators; that is, the start (output |
| * value) of transliterator i becomes the limit (input value) |
| * of transliterator i+1. Finally, the overall limit is fixed |
| * up before we return. |
| * |
| * Assumptions we make here: |
| * (1) contextStart <= start <= limit <= contextLimit <= text.length() |
| * (2) start <= start' <= limit' ;cursor doesn't move back |
| * (3) start <= limit' ;text before cursor unchanged |
| * - start' is the value of start after calling handleKT |
| * - limit' is the value of limit after calling handleKT |
| */ |
| |
| /** |
| * Example: 3 transliterators. This example illustrates the |
| * mechanics we need to implement. C, S, and L are the contextStart, |
| * start, and limit. gl is the globalLimit. contextLimit is |
| * equal to limit throughout. |
| * |
| * 1. h-u, changes hex to Unicode |
| * |
| * 4 7 a d 0 4 7 a |
| * abc/u0061/u => abca/u |
| * C S L C S L gl=f->a |
| * |
| * 2. upup, changes "x" to "XX" |
| * |
| * 4 7 a 4 7 a |
| * abca/u => abcAA/u |
| * C SL C S |
| * L gl=a->b |
| * 3. u-h, changes Unicode to hex |
| * |
| * 4 7 a 4 7 a d 0 3 |
| * abcAA/u => abc/u0041/u0041/u |
| * C S L C S |
| * L gl=b->15 |
| * 4. return |
| * |
| * 4 7 a d 0 3 |
| * abc/u0041/u0041/u |
| * C S L |
| */ |
| |
| if (count < 1) { |
| index.start = index.limit; |
| return; // Short circuit for empty compound transliterators |
| } |
| |
| // compoundLimit is the limit value for the entire compound |
| // operation. We overwrite index.limit with the previous |
| // index.start. After each transliteration, we update |
| // compoundLimit for insertions or deletions that have happened. |
| int32_t compoundLimit = index.limit; |
| |
| // compoundStart is the start for the entire compound |
| // operation. |
| int32_t compoundStart = index.start; |
| |
| int32_t delta = 0; // delta in length |
| |
| // Give each transliterator a crack at the run of characters. |
| // See comments at the top of the method for more detail. |
| for (int32_t i=0; i<count; ++i) { |
| index.start = compoundStart; // Reset start |
| int32_t limit = index.limit; |
| |
| if (index.start == index.limit) { |
| // Short circuit for empty range |
| break; |
| } |
| |
| trans[i]->filteredTransliterate(text, index, incremental); |
| |
| // In a properly written transliterator, start == limit after |
| // handleTransliterate() returns when incremental is false. |
| // Catch cases where the subclass doesn't do this, and throw |
| // an exception. (Just pinning start to limit is a bad idea, |
| // because what's probably happening is that the subclass |
| // isn't transliterating all the way to the end, and it should |
| // in non-incremental mode.) |
| if (!incremental && index.start != index.limit) { |
| // We can't throw an exception, so just fudge things |
| index.start = index.limit; |
| } |
| |
| // Cumulative delta for insertions/deletions |
| delta += index.limit - limit; |
| |
| if (incremental) { |
| // In the incremental case, only allow subsequent |
| // transliterators to modify what has already been |
| // completely processed by prior transliterators. In the |
| // non-incrmental case, allow each transliterator to |
| // process the entire text. |
| index.limit = index.start; |
| } |
| } |
| |
| compoundLimit += delta; |
| |
| // Start is good where it is -- where the last transliterator left |
| // it. Limit needs to be put back where it was, modulo |
| // adjustments for deletions/insertions. |
| index.limit = compoundLimit; |
| } |
| |
| /** |
| * Sets the length of the longest context required by this transliterator. |
| * This is <em>preceding</em> context. |
| */ |
| void CompoundTransliterator::computeMaximumContextLength(void) { |
| int32_t max = 0; |
| for (int32_t i=0; i<count; ++i) { |
| int32_t len = trans[i]->getMaximumContextLength(); |
| if (len > max) { |
| max = len; |
| } |
| } |
| setMaximumContextLength(max); |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
| |
| /* eof */ |