| /* |
| ********************************************************************** |
| * Copyright (C) 2005-2009, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_CONVERSION |
| |
| #include "unicode/ucsdet.h" |
| |
| #include "csdetect.h" |
| #include "csmatch.h" |
| #include "uenumimp.h" |
| |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "umutex.h" |
| #include "ucln_in.h" |
| #include "uarrsort.h" |
| #include "inputext.h" |
| #include "csrsbcs.h" |
| #include "csrmbcs.h" |
| #include "csrutf8.h" |
| #include "csrucode.h" |
| #include "csr2022.h" |
| |
/* Number of elements in a true array (not a pointer). The argument is
 * parenthesized so that any array-valued expression can be passed safely. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocates raw storage for `count` objects of `type` through uprv_malloc().
 * The result must be checked against NULL. The whole expansion is wrapped in
 * parentheses so that the macro behaves like a single expression. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Releases storage through uprv_free(). */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
| |
| U_CDECL_BEGIN |
| static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; |
| |
| static int32_t fCSRecognizers_size = 0; |
| |
| static UBool U_CALLCONV csdet_cleanup(void) |
| { |
| if (fCSRecognizers != NULL) { |
| for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { |
| delete fCSRecognizers[r]; |
| fCSRecognizers[r] = NULL; |
| } |
| |
| DELETE_ARRAY(fCSRecognizers); |
| fCSRecognizers = NULL; |
| fCSRecognizers_size = 0; |
| } |
| |
| return TRUE; |
| } |
| |
| static int32_t U_CALLCONV |
| charsetMatchComparator(const void * /*context*/, const void *left, const void *right) |
| { |
| U_NAMESPACE_USE |
| |
| const CharsetMatch **csm_l = (const CharsetMatch **) left; |
| const CharsetMatch **csm_r = (const CharsetMatch **) right; |
| |
| // NOTE: compare is backwards to sort from highest to lowest. |
| return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); |
| } |
| |
| U_CDECL_END |
| |
| U_NAMESPACE_BEGIN |
| |
| void CharsetDetector::setRecognizers(UErrorCode &status) |
| { |
| UBool needsInit; |
| CharsetRecognizer **recognizers; |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); |
| |
| if (needsInit) { |
| CharsetRecognizer *tempArray[] = { |
| new CharsetRecog_UTF8(), |
| |
| new CharsetRecog_UTF_16_BE(), |
| new CharsetRecog_UTF_16_LE(), |
| new CharsetRecog_UTF_32_BE(), |
| new CharsetRecog_UTF_32_LE(), |
| |
| new CharsetRecog_8859_1_en(), |
| new CharsetRecog_8859_1_da(), |
| new CharsetRecog_8859_1_de(), |
| new CharsetRecog_8859_1_es(), |
| new CharsetRecog_8859_1_fr(), |
| new CharsetRecog_8859_1_it(), |
| new CharsetRecog_8859_1_nl(), |
| new CharsetRecog_8859_1_no(), |
| new CharsetRecog_8859_1_pt(), |
| new CharsetRecog_8859_1_sv(), |
| new CharsetRecog_8859_2_cs(), |
| new CharsetRecog_8859_2_hu(), |
| new CharsetRecog_8859_2_pl(), |
| new CharsetRecog_8859_2_ro(), |
| new CharsetRecog_8859_5_ru(), |
| new CharsetRecog_8859_6_ar(), |
| new CharsetRecog_8859_7_el(), |
| new CharsetRecog_8859_8_I_he(), |
| new CharsetRecog_8859_8_he(), |
| new CharsetRecog_windows_1251(), |
| new CharsetRecog_windows_1256(), |
| new CharsetRecog_KOI8_R(), |
| new CharsetRecog_8859_9_tr(), |
| new CharsetRecog_sjis(), |
| new CharsetRecog_gb_18030(), |
| new CharsetRecog_euc_jp(), |
| new CharsetRecog_euc_kr(), |
| new CharsetRecog_big5(), |
| |
| new CharsetRecog_2022JP(), |
| new CharsetRecog_2022KR(), |
| new CharsetRecog_2022CN(), |
| |
| new CharsetRecog_IBM424_he_rtl(), |
| new CharsetRecog_IBM424_he_ltr(), |
| new CharsetRecog_IBM420_ar_rtl(), |
| new CharsetRecog_IBM420_ar_ltr() |
| }; |
| int32_t rCount = ARRAY_SIZE(tempArray); |
| int32_t r; |
| |
| recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); |
| |
| if (recognizers == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } else { |
| for (r = 0; r < rCount; r += 1) { |
| recognizers[r] = tempArray[r]; |
| |
| if (recognizers[r] == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| } |
| } |
| |
| if (U_SUCCESS(status)) { |
| umtx_lock(NULL); |
| if (fCSRecognizers == NULL) { |
| fCSRecognizers_size = rCount; |
| fCSRecognizers = recognizers; |
| } |
| umtx_unlock(NULL); |
| } |
| |
| if (fCSRecognizers != recognizers) { |
| for (r = 0; r < rCount; r += 1) { |
| delete recognizers[r]; |
| recognizers[r] = NULL; |
| } |
| |
| DELETE_ARRAY(recognizers); |
| } |
| |
| recognizers = NULL; |
| ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); |
| } |
| } |
| |
| CharsetDetector::CharsetDetector(UErrorCode &status) |
| : textIn(new InputText(status)), resultArray(NULL), |
| resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| setRecognizers(status); |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); |
| |
| if (resultArray == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { |
| resultArray[i] = new CharsetMatch(); |
| |
| if (resultArray[i] == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| } |
| } |
| |
| CharsetDetector::~CharsetDetector() |
| { |
| delete textIn; |
| |
| for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { |
| delete resultArray[i]; |
| } |
| |
| uprv_free(resultArray); |
| } |
| |
| void CharsetDetector::setText(const char *in, int32_t len) |
| { |
| textIn->setText(in, len); |
| fFreshTextSet = TRUE; |
| } |
| |
| UBool CharsetDetector::setStripTagsFlag(UBool flag) |
| { |
| UBool temp = fStripTags; |
| fStripTags = flag; |
| fFreshTextSet = TRUE; |
| return temp; |
| } |
| |
| UBool CharsetDetector::getStripTagsFlag() const |
| { |
| return fStripTags; |
| } |
| |
| void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const |
| { |
| textIn->setDeclaredEncoding(encoding,len); |
| } |
| |
| int32_t CharsetDetector::getDetectableCount() |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| setRecognizers(status); |
| |
| return fCSRecognizers_size; |
| } |
| |
| const CharsetMatch *CharsetDetector::detect(UErrorCode &status) |
| { |
| int32_t maxMatchesFound = 0; |
| |
| detectAll(maxMatchesFound, status); |
| |
| if(maxMatchesFound > 0) { |
| return resultArray[0]; |
| } else { |
| return NULL; |
| } |
| } |
| |
| const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) |
| { |
| if(!textIn->isSet()) { |
| status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set |
| |
| return NULL; |
| } else if(fFreshTextSet) { |
| CharsetRecognizer *csr; |
| int32_t detectResults; |
| int32_t confidence; |
| int32_t i; |
| |
| textIn->MungeInput(fStripTags); |
| |
| // Iterate over all possible charsets, remember all that |
| // give a match quality > 0. |
| resultCount = 0; |
| for (i = 0; i < fCSRecognizers_size; i += 1) { |
| csr = fCSRecognizers[i]; |
| detectResults = csr->match(textIn); |
| confidence = detectResults; |
| |
| if (confidence > 0) { |
| resultArray[resultCount++]->set(textIn, csr, confidence); |
| } |
| } |
| |
| for(i = resultCount; i < fCSRecognizers_size; i += 1) { |
| resultArray[i]->set(textIn, 0, 0); |
| } |
| |
| uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); |
| |
| // Remove duplicate charsets from the results. |
| // Simple minded, brute force approach - check each entry against all that follow. |
| // The first entry of any duplicated set is the one that should be kept because it will |
| // be the one with the highest confidence rating. |
| // (Duplicate matches have different languages, only the charset is the same) |
| // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually |
| // deleted, just reordered, with the unwanted duplicates placed after the good results. |
| int32_t j, k; |
| for (i=0; i<resultCount; i++) { |
| const char *charSetName = resultArray[i]->getName(); |
| for (j=i+1; j<resultCount; ) { |
| if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) { |
| // Not a duplicate. |
| j++; |
| } else { |
| // Duplicate entry at index j. |
| CharsetMatch *duplicate = resultArray[j]; |
| for (k=j; k<resultCount-1; k++) { |
| resultArray[k] = resultArray[k+1]; |
| } |
| resultCount--; |
| resultArray[resultCount] = duplicate; |
| } |
| } |
| } |
| |
| fFreshTextSet = FALSE; |
| } |
| |
| maxMatchesFound = resultCount; |
| |
| return resultArray; |
| } |
| |
| /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const |
| { |
| if( index > fCSRecognizers_size-1 || index < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| |
| return 0; |
| } else { |
| return fCSRecognizers[index]->getName(); |
| } |
| }*/ |
| |
| U_NAMESPACE_END |
| |
| U_CDECL_BEGIN |
/* Per-enumeration cursor, stored behind UEnumeration::context. */
typedef struct {
    /* Index into fCSRecognizers of the next name that enumNext() returns. */
    int32_t currIndex;
} Context;
| |
| |
| |
| static void U_CALLCONV |
| enumClose(UEnumeration *en) { |
| if(en->context != NULL) { |
| DELETE_ARRAY(en->context); |
| } |
| |
| DELETE_ARRAY(en); |
| } |
| |
| static int32_t U_CALLCONV |
| enumCount(UEnumeration *, UErrorCode *) { |
| return fCSRecognizers_size; |
| } |
| |
| static const char* U_CALLCONV |
| enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { |
| if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { |
| if(resultLength != NULL) { |
| *resultLength = 0; |
| } |
| return NULL; |
| } |
| const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); |
| if(resultLength != NULL) { |
| *resultLength = (int32_t)uprv_strlen(currName); |
| } |
| ((Context *)en->context)->currIndex++; |
| |
| return currName; |
| } |
| |
| static void U_CALLCONV |
| enumReset(UEnumeration *en, UErrorCode *) { |
| ((Context *)en->context)->currIndex = 0; |
| } |
| |
| static const UEnumeration gCSDetEnumeration = { |
| NULL, |
| NULL, |
| enumClose, |
| enumCount, |
| uenum_unextDefault, |
| enumNext, |
| enumReset |
| }; |
| |
| U_CAPI UEnumeration * U_EXPORT2 |
| ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) |
| { |
| U_NAMESPACE_USE |
| |
| if(U_FAILURE(*status)) { |
| return 0; |
| } |
| |
| /* Initialize recognized charsets. */ |
| CharsetDetector::getDetectableCount(); |
| |
| UEnumeration *en = NEW_ARRAY(UEnumeration, 1); |
| memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); |
| en->context = (void*)NEW_ARRAY(Context, 1); |
| uprv_memset(en->context, 0, sizeof(Context)); |
| return en; |
| } |
| U_CDECL_END |
| |
| #endif |
| |