2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
12 #include "unicode/ucsdet.h"
// Element count of a true array (not a pointer): total byte size divided by the size of one element.
30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// Obtains raw storage for `count` objects of `type` from uprv_malloc; the macro itself runs no constructors.
32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Hands the storage back to uprv_free; the macro itself runs no destructors.
33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
// One entry of the recognizer table: a charset recognizer plus a flag telling
// whether that recognizer is enabled by default.
37 struct CSRecognizerInfo : public UMemory {
// Stores both arguments as-is; the recognizer pointer is not checked for NULL here.
38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
// Deletes the recognizer, i.e. this entry owns the object passed to the constructor.
41 ~CSRecognizerInfo() {delete recognizer;};
43 CharsetRecognizer *recognizer;
44 UBool isDefaultEnabled;
// File-level table of recognizer entries; filled by initRecognizers() and released by csdet_cleanup().
49 static icu::CSRecognizerInfo **fCSRecognizers = NULL;
// Init-once guard handed to umtx_initOnce() in CharsetDetector::setRecognizers().
50 static icu::UInitOnce gCSRecognizersInitOnce;
// Number of entries in fCSRecognizers; 0 until the table has been built.
51 static int32_t fCSRecognizers_size = 0;
// Cleanup hook registered by initRecognizers() through ucln_i18n_registerCleanup().
// Deletes every table entry, frees the table, zeroes the size and resets the
// init-once guard (presumably so a later setRecognizers() call can rebuild the table).
// NOTE(review): the return statement is not visible in this view.
54 static UBool U_CALLCONV csdet_cleanup(void)
57 if (fCSRecognizers != NULL) {
58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
// delete on a NULL entry is harmless, so a partially filled table is handled too.
59 delete fCSRecognizers[r];
60 fCSRecognizers[r] = NULL;
63 DELETE_ARRAY(fCSRecognizers);
64 fCSRecognizers = NULL;
65 fCSRecognizers_size = 0;
67 gCSRecognizersInitOnce.reset();
// Comparator passed to uprv_sortArray() in detectAll().
// left and right point at array elements, i.e. at CharsetMatch pointers; the
// context argument is unused. Returns a negative value when the left match has
// the higher confidence, so the sorted array is in descending confidence order.
72 static int32_t U_CALLCONV
73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
77 const CharsetMatch **csm_l = (const CharsetMatch **) left;
78 const CharsetMatch **csm_r = (const CharsetMatch **) right;
80 // NOTE: compare is backwards to sort from highest to lowest.
81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
// Builds the file-level recognizer table. Called through umtx_initOnce() from
// CharsetDetector::setRecognizers().
// status is set to U_MEMORY_ALLOCATION_ERROR when the table cannot be allocated
// or when any table entry is NULL.
84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
// Register the cleanup hook before allocating anything.
86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
// The second constructor argument is isDefaultEnabled: every recognizer below is
// enabled by default except the four IBM424/IBM420 ones at the end of the list.
87 CSRecognizerInfo *tempArray[] = {
88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
113 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
114 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
116 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
118 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
121 int32_t rCount = ARRAY_SIZE(tempArray);
123 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
// NOTE(review): when this allocation fails, the objects already created in
// tempArray are not released in the lines visible here - confirm whether they leak.
125 if (fCSRecognizers == NULL) {
126 status = U_MEMORY_ALLOCATION_ERROR;
129 fCSRecognizers_size = rCount;
// Copy the entries into the table; a NULL entry marks a failed allocation above.
130 for (int32_t r = 0; r < rCount; r += 1) {
131 fCSRecognizers[r] = tempArray[r];
132 if (fCSRecognizers[r] == NULL) {
133 status = U_MEMORY_ALLOCATION_ERROR;
// Makes sure the file-level recognizer table exists by running initRecognizers()
// through the gCSRecognizersInitOnce guard; any failure is reported through status.
143 void CharsetDetector::setRecognizers(UErrorCode &status)
145 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
148 CharsetDetector::CharsetDetector(UErrorCode &status)
149 : textIn(new InputText(status)), resultArray(NULL),
150 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
151 fEnabledRecognizers(NULL)
153 if (U_FAILURE(status)) {
157 setRecognizers(status);
159 if (U_FAILURE(status)) {
163 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
165 if (resultArray == NULL) {
166 status = U_MEMORY_ALLOCATION_ERROR;
170 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
171 resultArray[i] = new CharsetMatch();
173 if (resultArray[i] == NULL) {
174 status = U_MEMORY_ALLOCATION_ERROR;
180 CharsetDetector::~CharsetDetector()
184 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
185 delete resultArray[i];
188 uprv_free(resultArray);
190 if (fEnabledRecognizers) {
191 uprv_free(fEnabledRecognizers);
// Passes the input buffer and its length on to the InputText holder and marks the
// text as fresh, which makes the next detectAll() call run detection again.
195 void CharsetDetector::setText(const char *in, int32_t len)
197 textIn->setText(in, len);
198 fFreshTextSet = TRUE;
// Remembers the current tag-stripping flag in temp and marks the text as fresh so
// the next detectAll() call runs detection again.
// NOTE(review): the assignment of `flag` and the return statement are not visible
// in this view; presumably the previous value (temp) is returned - confirm.
201 UBool CharsetDetector::setStripTagsFlag(UBool flag)
203 UBool temp = fStripTags;
205 fFreshTextSet = TRUE;
// NOTE(review): only the signature is visible in this view; presumably returns
// the current tag-stripping flag (fStripTags) - confirm against the full file.
209 UBool CharsetDetector::getStripTagsFlag() const
// Forwards the declared encoding name and its length to the InputText holder.
// The method is const because only the object textIn points at is modified.
214 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
216 textIn->setDeclaredEncoding(encoding,len);
// Returns the number of entries in the recognizer table, building the table first
// if necessary.
// The local status is not examined in the lines visible here, so a table
// initialization failure is not reported to the caller.
219 int32_t CharsetDetector::getDetectableCount()
221 UErrorCode status = U_ZERO_ERROR;
223 setRecognizers(status);
225 return fCSRecognizers_size;
// Runs detectAll() and returns the first entry of resultArray when at least one
// match was reported. Since detectAll() sorts matches by descending confidence,
// that entry is the most confident match.
// NOTE(review): the return for the no-match case is not visible in this view.
228 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
230 int32_t maxMatchesFound = 0;
232 detectAll(maxMatchesFound, status);
234 if(maxMatchesFound > 0) {
235 return resultArray[0];
// Runs every recognizer in the table against the current input text.
// maxMatchesFound receives resultCount, the number of matches.
// status is set to U_MISSING_RESOURCE_ERROR when no text has been set; it is also
// passed to uprv_sortArray().
// Detection is only redone when fFreshTextSet is TRUE; otherwise the previous
// resultCount is reported again.
// NOTE(review): the loop variable declaration, the resultCount updates and the
// return statements are not visible in this view.
// NOTE(review): the visible loop does not consult fEnabledRecognizers, so settings
// made through setDetectableCharset() do not appear to affect detection - confirm.
241 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
243 if(!textIn->isSet()) {
244 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
247 } else if (fFreshTextSet) {
248 CharsetRecognizer *csr;
// Preprocess the input, passing the tag-stripping flag along.
251 textIn->MungeInput(fStripTags);
253 // Iterate over all possible charsets, remember all that
254 // give a match quality > 0.
256 for (i = 0; i < fCSRecognizers_size; i += 1) {
257 csr = fCSRecognizers[i]->recognizer;
// The recognizer receives the next free pre-allocated CharsetMatch slot.
258 if (csr->match(textIn, resultArray[resultCount])) {
// Sorting is only needed for two or more matches; the comparator orders by descending confidence.
263 if (resultCount > 1) {
264 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
266 fFreshTextSet = FALSE;
269 maxMatchesFound = resultCount;
// Enables or disables the recognizer whose name equals `encoding` for this detector.
// status is left untouched if it already signals failure on entry. It is set to
// U_ILLEGAL_ARGUMENT_ERROR when no recognizer has that name, and to
// U_MEMORY_ALLOCATION_ERROR when the per-detector flag array cannot be allocated.
// NOTE(review): the declaration and assignment of modIdx and the early returns are
// not visible in this view.
274 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
276 if (U_FAILURE(status)) {
// isDefaultVal becomes TRUE when the requested state equals the recognizer's default.
281 UBool isDefaultVal = FALSE;
282 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
283 CSRecognizerInfo *csrinfo = fCSRecognizers[i];
284 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
286 isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
291 // No matching encoding found
292 status = U_ILLEGAL_ARGUMENT_ERROR;
// The flag array is only created on the first request that differs from the defaults.
296 if (fEnabledRecognizers == NULL && !isDefaultVal) {
297 // Create an array storing the non default setting
298 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
299 if (fEnabledRecognizers == NULL) {
300 status = U_MEMORY_ALLOCATION_ERROR;
303 // Initialize the array with default info
304 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
305 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
// Once the array exists, every request is recorded in it, including default-valued ones.
309 if (fEnabledRecognizers != NULL) {
310 fEnabledRecognizers[modIdx] = enabled;
314 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
316 if( index > fCSRecognizers_size-1 || index < 0) {
317 status = U_INDEX_OUTOFBOUNDS_ERROR;
321 return fCSRecognizers[index]->getName();
// Enabled flags for the enumeration, set from the detector's fEnabledRecognizers in
// getDetectableCharsets(); when NULL, enumCount()/enumNext() use each recognizer's
// isDefaultEnabled instead.
331 UBool *enabledRecognizers;
// Close callback of the charset-name enumeration: frees the Context stored in
// en->context when there is one.
// NOTE(review): releasing `en` itself is not visible in this view.
336 static void U_CALLCONV
337 enumClose(UEnumeration *en) {
338 if(en->context != NULL) {
339 DELETE_ARRAY(en->context);
// Count callback of the charset-name enumeration.
// For an "all" enumeration the result is the size of the recognizer table.
// Otherwise the enabled recognizers are counted, using the context's enabled array
// when it is set and each recognizer's isDefaultEnabled flag when it is NULL.
// The UErrorCode argument is unused.
// NOTE(review): the counter variable and the return statements for the second case
// are not visible in this view.
345 static int32_t U_CALLCONV
346 enumCount(UEnumeration *en, UErrorCode *) {
347 if (((Context *)en->context)->all) {
348 // ucsdet_getAllDetectableCharsets, all charset detector names
349 return fCSRecognizers_size;
352 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
354 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
355 if (enabledArray != NULL) {
357 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
358 if (enabledArray[i]) {
364 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
365 if (fCSRecognizers[i]->isDefaultEnabled) {
// Next callback of the charset-name enumeration.
// Advances the context's currIndex and picks the name of the next recognizer: every
// recognizer for an "all" enumeration, otherwise only those that are enabled (per
// the context's enabled array, or per isDefaultEnabled when that array is NULL).
// currName stays NULL once currIndex has reached the end of the table.
// When resultLength is not NULL it receives the name's length, or 0 when there is
// no name. The UErrorCode argument is unused.
// NOTE(review): the return statement is not visible in this view.
373 static const char* U_CALLCONV
374 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
375 const char *currName = NULL;
377 if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
378 if (((Context *)en->context)->all) {
379 // ucsdet_getAllDetectableCharsets, all charset detector names
380 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
381 ((Context *)en->context)->currIndex++;
383 // ucsdet_getDetectableCharsets
384 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
385 if (enabledArray != NULL) {
// Skip disabled recognizers until a name is found or the table is exhausted.
387 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
388 if (enabledArray[((Context *)en->context)->currIndex]) {
389 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
391 ((Context *)en->context)->currIndex++;
// Same scan, driven by the default-enabled flags.
395 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
396 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
397 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
399 ((Context *)en->context)->currIndex++;
405 if(resultLength != NULL) {
406 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
// Reset callback of the charset-name enumeration: sets the context's currIndex back
// to 0 so that enumNext() starts again from the first table entry. The UErrorCode
// argument is unused.
413 static void U_CALLCONV
414 enumReset(UEnumeration *en, UErrorCode *) {
415 ((Context *)en->context)->currIndex = 0;
// Template UEnumeration that getAllDetectableCharsets() and getDetectableCharsets()
// memcpy into every enumeration they create. The initializer is not visible in this view.
418 static const UEnumeration gCSDetEnumeration = {
// Creates an enumeration over the names of all recognizers in the table.
// The new UEnumeration is a copy of gCSDetEnumeration with a zeroed Context whose
// `all` flag is TRUE.
// status reports a table initialization failure, or U_MEMORY_ALLOCATION_ERROR when
// an allocation fails.
// NOTE(review): the return statements and any release of `en` on the context
// allocation failure path are not visible in this view.
432 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
435 /* Initialize recognized charsets. */
436 setRecognizers(status);
438 if(U_FAILURE(status)) {
442 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
444 status = U_MEMORY_ALLOCATION_ERROR;
447 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
448 en->context = (void*)NEW_ARRAY(Context, 1);
449 if (en->context == NULL) {
450 status = U_MEMORY_ALLOCATION_ERROR;
// Zeroing the context sets currIndex to 0 and enabledRecognizers to NULL.
454 uprv_memset(en->context, 0, sizeof(Context));
455 ((Context*)en->context)->all = TRUE;
// Creates an enumeration over the names of the recognizers enabled for this detector.
// The new UEnumeration is a copy of gCSDetEnumeration with a zeroed Context whose
// `all` flag is FALSE and whose enabledRecognizers is this detector's
// fEnabledRecognizers pointer.
// status is set to U_MEMORY_ALLOCATION_ERROR when an allocation fails; nothing is
// created when status already signals failure.
// NOTE(review): the context stores the fEnabledRecognizers pointer itself, not a
// copy, so the enumeration presumably must not be used after the detector is
// destroyed - confirm.
// NOTE(review): the return statements and the end of the function are not visible
// in this view.
459 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
461 if(U_FAILURE(status)) {
465 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
467 status = U_MEMORY_ALLOCATION_ERROR;
470 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
471 en->context = (void*)NEW_ARRAY(Context, 1);
472 if (en->context == NULL) {
473 status = U_MEMORY_ALLOCATION_ERROR;
477 uprv_memset(en->context, 0, sizeof(Context));
478 ((Context*)en->context)->all = FALSE;
479 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;