| /* |
| ********************************************************************** |
| * Copyright (C) 2008-2009, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 05/11/2008 Andy Heninger Port from Java |
| ********************************************************************** |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION |
| |
| #include "unicode/unifilt.h" |
| #include "unicode/uchar.h" |
| #include "unicode/uniset.h" |
| #include "unicode/brkiter.h" |
| #include "brktrans.h" |
| #include "unicode/uchar.h" |
| #include "cmemory.h" |
| #include "uprops.h" |
| #include "uinvchar.h" |
| #include "util.h" |
| #include "uvectr32.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) |
| |
| static const UChar SPACE = 32; // ' ' |
| |
| |
| /** |
| * Constructs a transliterator with the default delimiters '{' and |
| * '}'. |
| */ |
| BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : |
| Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), |
| fInsertion(SPACE) { |
| bi = NULL; |
| UErrorCode status = U_ZERO_ERROR; |
| boundaries = new UVector32(status); |
| } |
| |
| |
| /** |
| * Destructor. |
| */ |
| BreakTransliterator::~BreakTransliterator() { |
| delete bi; |
| bi = NULL; |
| delete boundaries; |
| boundaries = NULL; |
| } |
| |
| /** |
| * Copy constructor. |
| */ |
| BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : |
| Transliterator(o) { |
| bi = NULL; |
| if (o.bi != NULL) { |
| bi = o.bi->clone(); |
| } |
| fInsertion = o.fInsertion; |
| UErrorCode status = U_ZERO_ERROR; |
| boundaries = new UVector32(status); |
| } |
| |
| |
| /** |
| * Transliterator API. |
| */ |
| Transliterator* BreakTransliterator::clone(void) const { |
| return new BreakTransliterator(*this); |
| } |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| */ |
| void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, |
| UBool isIncremental ) const { |
| |
| UErrorCode status = U_ZERO_ERROR; |
| boundaries->removeAllElements(); |
| BreakTransliterator *nonConstThis = (BreakTransliterator *)this; |
| nonConstThis->getBreakIterator(); // Lazy-create it if necessary |
| UnicodeString sText = replaceableAsString(text); |
| bi->setText(sText); |
| bi->preceding(offsets.start); |
| |
| // To make things much easier, we will stack the boundaries, and then insert at the end. |
| // generally, we won't need too many, since we will be filtered. |
| |
| int32_t boundary; |
| for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { |
| if (boundary == 0) continue; |
| // HACK: Check to see that preceeding item was a letter |
| |
| UChar32 cp = sText.char32At(boundary-1); |
| int type = u_charType(cp); |
| //System.out.println(Integer.toString(cp,16) + " (before): " + type); |
| if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
| |
| cp = sText.char32At(boundary); |
| type = u_charType(cp); |
| //System.out.println(Integer.toString(cp,16) + " (after): " + type); |
| if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
| |
| boundaries->addElement(boundary, status); |
| // printf("Boundary at %d\n", boundary); |
| } |
| |
| int delta = 0; |
| int lastBoundary = 0; |
| |
| if (boundaries->size() != 0) { // if we found something, adjust |
| delta = boundaries->size() * fInsertion.length(); |
| lastBoundary = boundaries->lastElementi(); |
| |
| // we do this from the end backwards, so that we don't have to keep updating. |
| |
| while (boundaries->size() > 0) { |
| boundary = boundaries->popi(); |
| text.handleReplaceBetween(boundary, boundary, fInsertion); |
| } |
| } |
| |
| // Now fix up the return values |
| offsets.contextLimit += delta; |
| offsets.limit += delta; |
| offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; |
| |
| // TODO: do something with U_FAILURE(status); |
| // (need to look at transliterators overall, not just here.) |
| } |
| |
| // |
| // getInsertion() |
| // |
| const UnicodeString &BreakTransliterator::getInsertion() const { |
| return fInsertion; |
| } |
| |
| // |
| // setInsertion() |
| // |
| void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
| this->fInsertion = insertion; |
| } |
| |
| // |
| // getBreakIterator Lazily create the break iterator if it does |
| // not already exist. Copied from Java, probably |
| // better to just create it in the constructor. |
| // |
| BreakIterator *BreakTransliterator::getBreakIterator() { |
| UErrorCode status = U_ZERO_ERROR; |
| if (bi == NULL) { |
| // Note: Thai breaking behavior is universal, it is not |
| // tied to the Thai locale. |
| bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); |
| } |
| return bi; |
| } |
| |
| // |
| // replaceableAsString Hack to let break iterators work |
| // on the replaceable text from transliterators. |
| // In practice, the only real Replaceable type that we |
| // will be seeing is UnicodeString, so this function |
| // will normally be efficient. |
| // |
| UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { |
| if (r.getDynamicClassID() == UnicodeString::getStaticClassID()) { |
| return (UnicodeString &) r; |
| } |
| UnicodeString s; |
| r.extractBetween(0, r.length(), s); |
| return s; |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */ |