1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
// Function-like macro yielding the smaller of x and y.
// Each argument appears twice in the expansion, so an argument with side
// effects would be evaluated more than once.
// match_mbcs() below uses it to cap the confidence value at 100.
22 #define min(x,y) (((x)<(y))?(x):(y))
// Table passed to match_mbcs() by CharsetRecog_sjis::match(). match_mbcs()
// looks each decoded charValue up in it with binarySearch().
// The entries are listed in ascending order, which the binarySearch() lookup
// presumably relies on - keep them sorted when editing.
24 static const uint16_t commonChars_sjis [] = {
25 // TODO: This set of data comes from the character frequency-
26 // of-occurrence analysis tool. The data needs to be moved
27 // into a resource and loaded from there.
28 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
// Table passed to match_mbcs() by CharsetRecog_euc_jp::match(). match_mbcs()
// looks each decoded charValue up in it with binarySearch().
// The entries are listed in ascending order, which the binarySearch() lookup
// presumably relies on - keep them sorted when editing.
35 static const uint16_t commonChars_euc_jp[] = {
36 // TODO: This set of data comes from the character frequency-
37 // of-occurrence analysis tool. The data needs to be moved
38 // into a resource and loaded from there.
39 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
// Table passed to match_mbcs() by CharsetRecog_euc_kr::match(). match_mbcs()
// looks each decoded charValue up in it with binarySearch().
// The entries are listed in ascending order, which the binarySearch() lookup
// presumably relies on - keep them sorted when editing.
50 static const uint16_t commonChars_euc_kr[] = {
51 // TODO: This set of data comes from the character frequency-
52 // of-occurrence analysis tool. The data needs to be moved
53 // into a resource and loaded from there.
54 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
// Table passed to match_mbcs() by CharsetRecog_big5::match(). match_mbcs()
// looks each decoded charValue up in it with binarySearch().
// The entries are listed in ascending order, which the binarySearch() lookup
// presumably relies on - keep them sorted when editing.
65 static const uint16_t commonChars_big5[] = {
66 // TODO: This set of data comes from the character frequency-
67 // of-occurrence analysis tool. The data needs to be moved
68 // into a resource and loaded from there.
69 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
// Table passed to match_mbcs() by CharsetRecog_gb_18030::match(). match_mbcs()
// looks each decoded charValue up in it with binarySearch().
// The entries are listed in ascending order, which the binarySearch() lookup
// presumably relies on - keep them sorted when editing.
80 static const uint16_t commonChars_gb_18030[] = {
81 // TODO: This set of data comes from the character frequency-
82 // of-occurrence analysis tool. The data needs to be moved
83 // into a resource and loaded from there.
84 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
// Looks for value in array[0..len-1] by probing the midpoint of the
// [start, end] window for as long as the window is non-empty.
// The caller in match_mbcs() treats a result >= 0 as "found".
// NOTE(review): the == / < comparisons against array[mid] only narrow the
// window correctly if array is sorted ascending - confirm for every table
// passed in.
95 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
// With len == 0, end starts at -1 and the loop below never runs.
97 int32_t start = 0, end = len-1;
98 int32_t mid = (start+end)/2;
100 while(start <= end) {
101 if(array[mid] == value) {
105 if(array[mid] < value){
// Default constructor: sets charValue and nextIndex to 0, index to -1, and
// clears the error and done flags.
117 IteratedChar::IteratedChar() :
118 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
120 // nothing else to do.
123 /*void IteratedChar::reset()
// Returns the byte at det->fRawInput[nextIndex] and advances nextIndex by one.
// The guard below handles the case where nextIndex has reached
// det->fRawLength. The nextChar() implementations in this file test the
// result with "< 0" / ">= 0" and mention -1 for running out of data, so the
// guard presumably yields a negative sentinel - confirm against the full body.
132 int32_t IteratedChar::nextByte(InputText *det)
134 if (nextIndex >= det->fRawLength) {
140 return det->fRawInput[nextIndex++];
143 CharsetRecog_mbcs::~CharsetRecog_mbcs()
// Scores how well the bytes of det fit this recognizer's multi-byte encoding.
//
// det            - input whose characters are walked with nextChar().
// commonChars    - table of frequently used double-byte values, or 0 when no
//                  such statistics exist for the encoding.
// commonCharsLen - number of entries in commonChars.
//
// The match() implementations in this file store the result as their
// confidence and report a match when it is greater than zero.
148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149 int32_t singleByteCharCount = 0;
150 int32_t doubleByteCharCount = 0;
151 int32_t commonCharCount = 0;
152 int32_t badCharCount = 0;
153 int32_t totalCharCount = 0;
154 int32_t confidence = 0;
157 while (nextChar(&iter, det)) {
// A charValue that fits in one byte counts as a single-byte character;
// anything larger counts as a double-byte character.
163 if (iter.charValue <= 0xFF) {
164 singleByteCharCount++;
166 doubleByteCharCount++;
168 if (commonChars != 0) {
// A result >= 0 from binarySearch() means the value is in the table.
169 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
170 commonCharCount += 1;
177 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
178 // Bail out early if the byte data is not matching the encoding scheme.
179 // break detectBlock;
184 if (doubleByteCharCount <= 10 && badCharCount == 0) {
185 // Not many multi-byte chars.
186 if (doubleByteCharCount == 0 && totalCharCount < 10) {
187 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188 // We don't have enough data to have any confidence.
189 // Statistical analysis of single byte non-ASCII characters would probably help here.
193 // ASCII or ISO file? It's probably not our encoding,
194 // but is not incompatible with our encoding, so don't give it a zero.
202 // No match if there are too many characters that don't fit the encoding scheme.
203 // (should we have zero tolerance for these?)
205 if (doubleByteCharCount < 20*badCharCount) {
211 if (commonChars == 0) {
212 // We have no statistics on frequently occurring characters.
213 // Assess confidence purely on having a reasonable number of
214 // multi-byte characters (the more the better)
215 confidence = 30 + doubleByteCharCount - 20*badCharCount;
217 if (confidence > 100) {
222 // Frequency of occurrence statistics exist.
// Scale log(commonCharCount + 1) against log(doubleByteCharCount / 4), add a
// base of 10, then cap the result at 100.
225 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
226 double scaleFactor = 90.0 / maxVal;
227 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
229 confidence = min(confidence, 100);
232 if (confidence < 0) {
239 CharsetRecog_sjis::~CharsetRecog_sjis()
// Reads the next character from det into it, following the byte tests below.
// it->index is set to the position the character starts at, and it->charValue
// receives the first byte or, when a second byte was read,
// (firstByte << 8) | secondByte.
244 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245 it->index = it->nextIndex;
248 int32_t firstByte = it->charValue = it->nextByte(det);
// First bytes up to 0x7F or in 0xA1..0xDF take this branch and never reach
// the second-byte read below (presumably single-byte characters - the branch
// body is not shown here).
254 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
258 int32_t secondByte = it->nextByte(det);
259 if (secondByte >= 0) {
260 it->charValue = (firstByte << 8) | secondByte;
262 // else we'll handle the error later.
// Taken together the two ranges accept any second byte in 0x40..0xFE.
264 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
265 // Illegal second byte value.
// Runs match_mbcs() over det with the commonChars_sjis table, stores the
// resulting confidence in results, and returns whether it was positive.
272 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
273 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274 results->set(det, this, confidence);
275 return (confidence > 0);
278 const char *CharsetRecog_sjis::getName() const
283 const char *CharsetRecog_sjis::getLanguage() const
288 CharsetRecog_euc::~CharsetRecog_euc()
// Reads the next character from det into it using the EUC byte layout.
// it->index is set to the position the character starts at. it->charValue
// starts as the first byte and has each further byte shifted in as it is
// read. Lead bytes 0xA1..0xFE, 0x8E and 0x8F each get their own check of the
// trailing bytes below.
293 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294 int32_t firstByte = 0;
295 int32_t secondByte = 0;
296 int32_t thirdByte = 0;
298 it->index = it->nextIndex;
300 firstByte = it->charValue = it->nextByte(det);
303 // Ran off the end of the input data
// First bytes up to 0x8D take this branch and never reach the second-byte
// read below (presumably single-byte characters - the branch body is not
// shown here).
307 if (firstByte <= 0x8D) {
312 secondByte = it->nextByte(det);
313 if (secondByte >= 0) {
314 it->charValue = (it->charValue << 8) | secondByte;
316 // else we'll handle the error later.
318 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
320 if (secondByte < 0xA1) {
327 if (firstByte == 0x8E) {
329 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331 // We don't know which we've got.
332 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
333 // bytes will look like a well formed 2 byte char.
334 if (secondByte < 0xA1) {
341 if (firstByte == 0x8F) {
343 // Three byte total char size, two bytes of actual char value.
344 thirdByte = it->nextByte(det);
// NOTE(review): unlike secondByte above, thirdByte is OR-ed into charValue
// without a ">= 0" check first; the "< 0xa1" test below then covers the
// out-of-data case.
345 it->charValue = (it->charValue << 8) | thirdByte;
347 if (thirdByte < 0xa1) {
348 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
357 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
362 const char *CharsetRecog_euc_jp::getName() const
367 const char *CharsetRecog_euc_jp::getLanguage() const
// Runs match_mbcs() over det with the commonChars_euc_jp table, stores the
// resulting confidence in results, and returns whether it was positive.
372 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
374 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375 results->set(det, this, confidence);
376 return (confidence > 0);
379 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
384 const char *CharsetRecog_euc_kr::getName() const
389 const char *CharsetRecog_euc_kr::getLanguage() const
// Runs match_mbcs() over det with the commonChars_euc_kr table, stores the
// resulting confidence in results, and returns whether it was positive.
394 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
396 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397 results->set(det, this, confidence);
398 return (confidence > 0);
401 CharsetRecog_big5::~CharsetRecog_big5()
// Reads the next character from det into it using the Big5 byte tests below.
// it->index is set to the position the character starts at. it->charValue
// starts as the first byte and, when a second byte was read, becomes
// (first byte << 8) | secondByte.
406 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
410 it->index = it->nextIndex;
412 firstByte = it->charValue = it->nextByte(det);
418 if (firstByte <= 0x7F || firstByte == 0xFF) {
419 // single byte character.
423 int32_t secondByte = it->nextByte(det);
424 if (secondByte >= 0) {
425 it->charValue = (it->charValue << 8) | secondByte;
427 // else we'll handle the error later.
// Second bytes below 0x40 (including a negative out-of-data result), 0x7F and
// 0xFF are singled out here.
429 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
436 const char *CharsetRecog_big5::getName() const
441 const char *CharsetRecog_big5::getLanguage() const
// Runs match_mbcs() over det with the commonChars_big5 table, stores the
// resulting confidence in results, and returns whether it was positive.
446 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
448 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449 results->set(det, this, confidence);
450 return (confidence > 0);
453 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
// Reads the next GB18030 character from det into it.
// it->index is set to the position the character starts at. it->charValue
// starts as the first byte and has the further bytes shifted in as they are
// validated: one more byte for the two-byte form, three more for the
// four-byte form.
458 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459 int32_t firstByte = 0;
460 int32_t secondByte = 0;
461 int32_t thirdByte = 0;
462 int32_t fourthByte = 0;
464 it->index = it->nextIndex;
466 firstByte = it->charValue = it->nextByte(det);
469 // Ran off the end of the input data
// First bytes up to 0x80 take this branch and never reach the second-byte
// read below.
473 if (firstByte <= 0x80) {
478 secondByte = it->nextByte(det);
479 if (secondByte >= 0) {
480 it->charValue = (it->charValue << 8) | secondByte;
482 // else we'll handle the error later.
484 if (firstByte >= 0x81 && firstByte <= 0xFE) {
// Two-byte form: the trail byte must lie in 0x40..0x7E or 0x80..0xFE.
// The lower bound of the second range has to be hexadecimal 0x80; a decimal
// 80 (0x50) would overlap the first range and let 0x7F through as valid.
486 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
// Four-byte form: second and fourth bytes in 0x30..0x39, third byte in
// 0x81..0xFE.
491 if (secondByte >= 0x30 && secondByte <= 0x39) {
492 thirdByte = it->nextByte(det);
494 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
495 fourthByte = it->nextByte(det);
497 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
498 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
505 // Something wasn't valid, or we ran out of data (-1).
512 const char *CharsetRecog_gb_18030::getName() const
517 const char *CharsetRecog_gb_18030::getLanguage() const
// Runs match_mbcs() over det with the commonChars_gb_18030 table, stores the
// resulting confidence in results, and returns whether it was positive.
522 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
524 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525 results->set(det, this, confidence);
526 return (confidence > 0);