1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ****************************************************************************
5 * Copyright (c) 2005-2016, International Business Machines Corporation and *
6 * others. All Rights Reserved. *
7 ****************************************************************************
10 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/ustring.h"
/* Allocate room for `count` elements of `type` with malloc() and cast the
 * result to `type *`. The result is not checked for NULL by the macro. */
22 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
/* Release `array` with free(). */
23 #define DELETE_ARRAY(array) free(array)
/* Forward declarations of the file-local test functions. Each takes no
 * arguments and returns nothing; they are registered in addUCsdetTest(). */
25 static void TestConstruction(void);
26 static void TestUTF8(void);
27 static void TestUTF16(void);
28 static void TestC1Bytes(void);
29 static void TestInputFilter(void);
30 static void TestChaining(void);
31 static void TestBufferOverflow(void);
32 static void TestIBM424(void);
33 static void TestIBM420(void);
/*
 * Registers the charset-detection tests of this file.
 *
 * Each test function is handed to addTest() together with `root` and a path
 * name of the form "ucsdetst/<name>".
 */
35 void addUCsdetTest(TestNode** root);
37 void addUCsdetTest(TestNode** root)
39 addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
40 addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
41 addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
42 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
43 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
/* Note: TestChaining is registered under the path name "TestErrorChaining",
 * which differs from the function name. */
44 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
45 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
/* The two IBM tests below follow this guard, so they are registered only
 * when UCONFIG_NO_LEGACY_CONVERSION evaluates to false. */
46 #if !UCONFIG_NO_LEGACY_CONVERSION
47 addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
48 addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
/*
 * Counts how many bytes converting `length` UChars starting at `src` with
 * converter `cnv` produces.
 *
 * The text is converted into a scratch buffer (`buffer`) in a loop; after
 * each pass the number of bytes written in that pass is added to `result`,
 * and the loop repeats for as long as the converter reports
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): the declarations of `buffer`, `status` and `result`, the
 * per-pass reset of `dest`, and the return statement are not visible in this
 * listing; presumably `result` is what gets returned -- confirm.
 */
52 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
56 char *dest, *destLimit = buffer + sizeof(buffer);
57 const UChar *srcLimit = src + length;
/* Clear the status before every pass so a previous overflow does not stick. */
62 status = U_ZERO_ERROR;
/* `src` is advanced by the converter, so each pass continues where the
 * previous one stopped. Offsets argument is 0 (none), flush is TRUE. */
63 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
/* Bytes produced by this pass = distance `dest` moved from the buffer start. */
64 result += (int32_t) (dest - buffer);
65 } while (status == U_BUFFER_OVERFLOW_ERROR);
/*
 * Converts `length` UChars at `src` to bytes in the charset named `codepage`.
 *
 * A converter is opened for `codepage`, preflight() determines the byte
 * count, a buffer of byteCount + 1 chars is allocated with NEW_ARRAY, and the
 * text is converted into it. The byte count is stored in `*byteLength`.
 *
 * NOTE(review): in the lines visible here neither the status from ucnv_open()
 * nor the NEW_ARRAY (malloc) result is checked before use. The return
 * statement is not visible in this listing; presumably the allocated buffer
 * is returned and the caller releases it -- confirm.
 */
70 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
72 UErrorCode status = U_ZERO_ERROR;
73 UConverter *cnv = ucnv_open(codepage, &status);
/* preflight() receives `src` by value, so `src` still points at the start
 * of the text for the real conversion below. */
74 int32_t byteCount = preflight(src, length, cnv);
75 const UChar *srcLimit = src + length;
/* One char more than the preflighted count is allocated. */
76 char *bytes = NEW_ARRAY(char, byteCount + 1);
77 char *dest = bytes, *destLimit = bytes + byteCount + 1;
79 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
/* Report the preflighted count, not the distance `dest` actually moved. */
82 *byteLength = byteCount;
/*
 * Releases a byte buffer.
 *
 * NOTE(review): the body is not visible in this listing; presumably it frees
 * a buffer obtained from extractBytes() -- confirm.
 */
86 static void freeBytes(char *bytes)
/*
 * Opens a charset detector and walks the enumeration returned by
 * ucsdet_getAllDetectableCharsets().
 *
 * Every one of the `count` entries must have a non-NULL name with a positive
 * length, and one further uenum_next() call past the end must yield NULL
 * with length 0 and no failure status.
 */
91 static void TestConstruction(void)
93 UErrorCode status = U_ZERO_ERROR;
94 UCharsetDetector *csd = ucsdet_open(&status);
95 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
97 int32_t count = uenum_count(e, &status);
100 for(i = 0; i < count; i += 1) {
101 name = uenum_next(e, &length, &status);
103 if(name == NULL || length <= 0) {
104 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
107 /* one past the list of all names must return NULL */
108 name = uenum_next(e, &length, &status);
/* This check also fires on a non-zero length or a failure status, although
 * the message only mentions a non-null name. */
109 if(name != NULL || length != 0 || U_FAILURE(status)) {
110 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
/*
 * Round-trip test for UTF-8 detection.
 *
 * The escaped sample text `ss` is unescaped into UChars, converted to UTF-8
 * bytes with extractBytes(), and given to the detector. The text recovered
 * from the match via ucsdet_getUChars() must compare equal to the unescaped
 * original.
 */
117 static void TestUTF8(void)
119 UErrorCode status = U_ZERO_ERROR;
/* The backslash-u sequences are kept as literal text in the C string and are
 * turned into characters by u_unescape() below. */
120 static const char ss[] = "This is a string with some non-ascii characters that will "
121 "be converted to UTF-8, then shoved through the detection process. "
122 "\\u0391\\u0392\\u0393\\u0394\\u0395"
123 "Sure would be nice if our source could contain Unicode directly!";
124 int32_t byteLength = 0, sLength = 0, dLength = 0;
127 UCharsetDetector *csd = ucsdet_open(&status);
128 const UCharsetMatch *match;
/* sizeof(ss) UChars: one per char of the escaped source, terminator included. */
129 UChar detected[sizeof(ss)];
131 sLength = u_unescape(ss, s, sizeof(ss));
132 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
134 ucsdet_setText(csd, bytes, byteLength, &status);
135 if (U_FAILURE(status)) {
136 log_err("status is %s\n", u_errorName(status));
140 match = ucsdet_detect(csd, &status);
143 log_err("Detection failure for UTF-8: got no matches.\n");
/* The capacity passed is sLength, the length of the unescaped original. */
147 dLength = ucsdet_getUChars(match, detected, sLength, &status);
149 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
150 log_err("Round-trip test failed!\n");
153 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
/*
 * Checks detection of UTF-16BE and UTF-16LE.
 *
 * The same BOM-prefixed UChar text is converted to both byte orders with
 * extractBytes(). For each byte stream the detector's match name must equal
 * the corresponding charset name; the confidence messages indicate that a
 * confidence of 100 is expected (the comparison lines themselves are not
 * visible in this listing).
 */
160 static void TestUTF16(void)
162 UErrorCode status = U_ZERO_ERROR;
163 /* Notice the BOM on the start of this string */
164 static const UChar chars[] = {
165 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
166 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
167 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
168 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
169 0x064a, 0x062a, 0x0000};
/* cLength is the full array length, so the trailing 0x0000 is part of the
 * converted text. */
170 int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
171 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
172 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
173 UCharsetDetector *csd = ucsdet_open(&status);
174 const UCharsetMatch *match;
/* Big-endian pass. */
178 ucsdet_setText(csd, beBytes, beLength, &status);
179 match = ucsdet_detect(csd, &status);
182 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
186 name = ucsdet_getName(match, &status);
187 conf = ucsdet_getConfidence(match, &status);
189 if (strcmp(name, "UTF-16BE") != 0) {
190 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
194 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
/* Little-endian pass, reusing the same detector. */
198 ucsdet_setText(csd, leBytes, leLength, &status);
199 match = ucsdet_detect(csd, &status);
202 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
206 name = ucsdet_getName(match, &status);
207 conf = ucsdet_getConfidence(match, &status);
210 if (strcmp(name, "UTF-16LE") != 0) {
/* NOTE(review): "Enconding" in the message below looks like a typo for
 * "Encoding" (compare the UTF-16BE message above). */
211 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
215 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
/*
 * Checks that English text is detected as windows-1252 when it contains the
 * escaped U+201C/U+201D quotes (the "C1 bytes" the sample text refers to),
 * and as ISO-8859-1 when it does not.
 *
 * The visible body follows a UCONFIG_NO_LEGACY_CONVERSION guard. A
 * windows-1252 mismatch is reported with log_data_err(), an ISO-8859-1
 * mismatch with log_err().
 */
224 static void TestC1Bytes(void)
226 #if !UCONFIG_NO_LEGACY_CONVERSION
227 UErrorCode status = U_ZERO_ERROR;
228 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
229 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
230 int32_t sISOLength = 0, sWindowsLength = 0;
231 UChar sISO[sizeof(ssISO)];
232 UChar sWindows[sizeof(ssWindows)];
233 int32_t lISO = 0, lWindows = 0;
236 UCharsetDetector *csd = ucsdet_open(&status);
237 const UCharsetMatch *match;
/* Unescape both samples, then encode each in the charset it should be
 * detected as. */
240 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
241 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
242 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
243 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
/* Sample with the curly quotes: expect windows-1252. */
245 ucsdet_setText(csd, bWindows, lWindows, &status);
246 match = ucsdet_detect(csd, &status);
249 log_err("English test with C1 bytes got no matches.\n");
253 name = ucsdet_getName(match, &status);
255 if (strcmp(name, "windows-1252") != 0) {
256 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
/* Sample without them: expect ISO-8859-1. */
259 ucsdet_setText(csd, bISO, lISO, &status);
260 match = ucsdet_detect(csd, &status);
263 log_err("English text without C1 bytes got no matches.\n");
267 name = ucsdet_getName(match, &status);
269 if (strcmp(name, "ISO-8859-1") != 0) {
270 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
/*
 * Checks the effect of the detector's input filter on language detection.
 *
 * The sample is a short French sentence surrounded by English words in angle
 * brackets, encoded as ISO-8859-1. With the filter enabled the match must be
 * ISO-8859-1 with language "fr"; with the filter disabled it must be
 * ISO-8859-1 with language "en".
 */
281 static void TestInputFilter(void)
283 UErrorCode status = U_ZERO_ERROR;
284 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
287 int32_t byteLength = 0;
289 UCharsetDetector *csd = ucsdet_open(&status);
290 const UCharsetMatch *match;
291 const char *lang, *name;
293 sLength = u_unescape(ss, s, sizeof(ss));
294 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
/* First pass: filter on, and verify the setting is reported back. */
296 ucsdet_enableInputFilter(csd, TRUE);
298 if (!ucsdet_isInputFilterEnabled(csd)) {
299 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
303 ucsdet_setText(csd, bytes, byteLength, &status);
304 match = ucsdet_detect(csd, &status);
307 log_err("Turning on the input filter resulted in no matches.\n");
311 name = ucsdet_getName(match, &status);
313 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
314 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
316 lang = ucsdet_getLanguage(match, &status);
318 if (lang == NULL || strcmp(lang, "fr") != 0) {
319 log_err("Input filter did not strip markup!\n");
/* Second pass: filter off, same bytes; the English markup should now win. */
324 ucsdet_enableInputFilter(csd, FALSE);
325 ucsdet_setText(csd, bytes, byteLength, &status);
326 match = ucsdet_detect(csd, &status);
329 log_err("Turning off the input filter resulted in no matches.\n");
333 name = ucsdet_getName(match, &status);
335 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
336 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
338 lang = ucsdet_getLanguage(match, &status);
340 if (lang == NULL || strcmp(lang, "en") != 0) {
341 log_err("Unfiltered input did not detect as English!\n");
/*
 * Checks that the ucsdet_* functions leave an already-set status untouched.
 *
 * `status` starts out as U_USELESS_COLLATOR_ERROR, each API is then called
 * with NULL arguments, and the test fails if `status` holds any other value
 * afterwards.
 */
350 static void TestChaining(void) {
351 UErrorCode status = U_USELESS_COLLATOR_ERROR;
/* Return values are deliberately ignored; only `status` is inspected. */
353 ucsdet_open(&status);
354 ucsdet_setText(NULL, NULL, 0, &status);
355 ucsdet_getName(NULL, &status);
356 ucsdet_getConfidence(NULL, &status);
357 ucsdet_getLanguage(NULL, &status);
358 ucsdet_detect(NULL, &status);
359 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
360 ucsdet_detectAll(NULL, NULL, &status);
/* NOTE(review): the next two calls are identical -- confirm whether the
 * repetition is intentional. */
361 ucsdet_getUChars(NULL, NULL, 0, &status);
362 ucsdet_getUChars(NULL, NULL, 0, &status);
365 /* All of this code should have done nothing. */
366 if (status != U_USELESS_COLLATOR_ERROR) {
367 log_err("Status got changed to %s\n", u_errorName(status));
/*
 * Runs the detector over short byte strings that end in partial or complete
 * ISO-2022 escape sequences or in a lone high byte, with "ISO-2022-JP" set
 * as the declared encoding.
 *
 * testResults is indexed in parallel with testStrings and holds the expected
 * charset name for each input; a NULL entry means no match is expected. The
 * entries of testResults are not visible in this listing.
 */
371 static void TestBufferOverflow(void) {
372 UErrorCode status = U_ZERO_ERROR;
373 static const char *testStrings[] = {
374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
377 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
378 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
379 "\xa1", /* Could be a single byte shift-jis at the end */
380 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
381 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
383 static const char *testResults[] = {
394 UCharsetDetector *csd = ucsdet_open(&status);
395 const UCharsetMatch *match;
/* The declared encoding is set before `status` is examined, so the check
 * below covers both the open and this call. */
397 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
399 if (U_FAILURE(status)) {
400 log_err("Couldn't open detector. %s\n", u_errorName(status));
404 for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
/* Length -1 is passed; presumably the input is then treated as
 * NUL-terminated -- confirm against the ucsdet_setText documentation. */
405 ucsdet_setText(csd, testStrings[idx], -1, &status);
406 match = ucsdet_detect(csd, &status);
/* No-match branch: an error only if a result was expected for this index. */
409 if (testResults[idx] != NULL) {
410 log_err("Unexpectedly got no results at index %d.\n", idx);
413 log_verbose("Got no result as expected at index %d.\n", idx);
/* Match branch: an error if none was expected or the name differs. */
418 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
/* NOTE(review): when this branch is taken because testResults[idx] is NULL,
 * that NULL pointer is passed for the second %s below. */
419 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
420 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
/*
 * Checks detection of Hebrew text encoded in IBM424.
 *
 * `chars` converted to IBM424 must be detected as "IBM424_rtl" and
 * `chars_reverse` converted to IBM424 must be detected as "IBM424_ltr".
 * A missing match is reported with log_err(); a wrong name is reported with
 * log_data_err().
 */
429 static void TestIBM424(void)
431 UErrorCode status = U_ZERO_ERROR;
/* Sample text in the U+05xx range plus ASCII punctuation, ending in 0x0000. */
433 static const UChar chars[] = {
434 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
435 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
436 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
437 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
438 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
439 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
440 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
441 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
442 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
443 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
444 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
445 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
446 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
447 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
448 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
449 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
450 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
/* NOTE(review): this appears to be the same text as `chars` in reverse
 * order; the end of the table is not visible in this listing -- confirm. */
453 static const UChar chars_reverse[] = {
454 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
455 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
456 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
457 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
458 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
459 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
460 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
461 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
462 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
463 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
464 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
465 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
466 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
467 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
468 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
469 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
470 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
/* Both lengths are full array lengths (every element is converted). */
474 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
476 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
477 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
479 UCharsetDetector *csd = ucsdet_open(&status);
480 const UCharsetMatch *match;
/* Text in the order given by `chars`: expect the _rtl variant. */
483 ucsdet_setText(csd, bytes, bLength, &status);
484 match = ucsdet_detect(csd, &status);
487 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
491 name = ucsdet_getName(match, &status);
492 if (strcmp(name, "IBM424_rtl") != 0) {
493 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
/* Text in the order given by `chars_reverse`: expect the _ltr variant. */
496 ucsdet_setText(csd, bytes_r, brLength, &status);
497 match = ucsdet_detect(csd, &status);
500 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
504 name = ucsdet_getName(match, &status);
505 if (strcmp(name, "IBM424_ltr") != 0) {
506 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
/*
 * Checks detection of Arabic text encoded in IBM420.
 *
 * `chars` converted to IBM420 must be detected as "IBM420_rtl" and
 * `chars_reverse` converted to IBM420 must be detected as "IBM420_ltr".
 * A missing match is reported with log_err(); a wrong name is reported with
 * log_data_err().
 */
515 static void TestIBM420(void)
517 UErrorCode status = U_ZERO_ERROR;
/* Sample text in the U+06xx range plus ASCII punctuation. The end of the
 * table is not visible in this listing. */
519 static const UChar chars[] = {
520 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
521 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
522 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
523 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
524 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
525 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
526 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
527 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
528 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
529 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
530 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
531 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
532 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
/* NOTE(review): this appears to be the same text as `chars` in reverse
 * order; the end of the table is not visible in this listing -- confirm. */
535 static const UChar chars_reverse[] = {
536 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
537 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
538 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
539 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
540 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
541 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
542 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
543 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
544 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
545 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
546 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
547 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
548 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
/* Both lengths are full array lengths (every element is converted). */
552 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
554 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
555 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
557 UCharsetDetector *csd = ucsdet_open(&status);
558 const UCharsetMatch *match;
/* Text in the order given by `chars`: expect the _rtl variant. */
561 ucsdet_setText(csd, bytes, bLength, &status);
562 match = ucsdet_detect(csd, &status);
565 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
569 name = ucsdet_getName(match, &status);
570 if (strcmp(name, "IBM420_rtl") != 0) {
571 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
/* Text in the order given by `chars_reverse`: expect the _ltr variant. */
574 ucsdet_setText(csd, bytes_r, brLength, &status);
575 match = ucsdet_detect(csd, &status);
578 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
582 name = ucsdet_getName(match, &status);
583 if (strcmp(name, "IBM420_ltr") != 0) {
584 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);