source/i18n/csrmbcs.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  **********************************************************************
   5  *   Copyright (C) 2005-2016, International Business Machines
   6  *   Corporation and others.  All Rights Reserved.
   7  **********************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_CONVERSION
  13
  14 #include "cmemory.h"
  15 #include "csmatch.h"
  16 #include "csrmbcs.h"
  17
  18 #include <math.h>
  19
  20 U_NAMESPACE_BEGIN
  21
  22 #define min(x,y) (((x)<(y))?(x):(y))
  23
// Two-byte Shift_JIS character values (lead byte << 8 | trail byte) that
// CharsetRecog_sjis::match() hands to match_mbcs() as its "common characters"
// table.  match_mbcs() looks values up with binarySearch(), so the entries
// must stay in ascending order.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
  34
// Two-byte EUC-JP character values (lead byte << 8 | trail byte) that
// CharsetRecog_euc_jp::match() hands to match_mbcs() as its "common characters"
// table.  match_mbcs() looks values up with binarySearch(), so the entries
// must stay in ascending order.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
  49
// Two-byte EUC-KR character values (lead byte << 8 | trail byte) that
// CharsetRecog_euc_kr::match() hands to match_mbcs() as its "common characters"
// table.  match_mbcs() looks values up with binarySearch(), so the entries
// must stay in ascending order.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
  64
// Two-byte Big5 character values (lead byte << 8 | trail byte) that
// CharsetRecog_big5::match() hands to match_mbcs() as its "common characters"
// table.  match_mbcs() looks values up with binarySearch(), so the entries
// must stay in ascending order.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
  79
// Two-byte GB18030 character values (lead byte << 8 | trail byte) that
// CharsetRecog_gb_18030::match() hands to match_mbcs() as its "common characters"
// table.  match_mbcs() looks values up with binarySearch(), so the entries
// must stay in ascending order.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
  94
  95 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
  96 {
  97     int32_t start = 0, end = len-1;
  98     int32_t mid = (start+end)/2;
  99
 100     while(start <= end) {
 101         if(array[mid] == value) {
 102             return mid;
 103         }
 104
 105         if(array[mid] < value){
 106             start = mid+1;
 107         } else {
 108             end = mid-1;
 109         }
 110
 111         mid = (start+end)/2;
 112     }
 113
 114     return -1;
 115 }
 116
 117 IteratedChar::IteratedChar() :
 118 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
 119 {
 120     // nothing else to do.
 121 }
 122
 123 /*void IteratedChar::reset()
 124 {
 125     charValue = 0;
 126     index     = -1;
 127     nextIndex = 0;
 128     error     = FALSE;
 129     done      = FALSE;
 130 }*/
 131
 132 int32_t IteratedChar::nextByte(InputText *det)
 133 {
 134     if (nextIndex >= det->fRawLength) {
 135         done = TRUE;
 136
 137         return -1;
 138     }
 139
 140     return det->fRawInput[nextIndex++];
 141 }
 142
 143 CharsetRecog_mbcs::~CharsetRecog_mbcs()
 144 {
 145     // nothing to do.
 146 }
 147
 148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
 149     int32_t singleByteCharCount = 0;
 150     int32_t doubleByteCharCount = 0;
 151     int32_t commonCharCount     = 0;
 152     int32_t badCharCount        = 0;
 153     int32_t totalCharCount      = 0;
 154     int32_t confidence          = 0;
 155     IteratedChar iter;
 156
 157     while (nextChar(&iter, det)) {
 158         totalCharCount++;
 159
 160         if (iter.error) {
 161             badCharCount++;
 162         } else {
 163             if (iter.charValue <= 0xFF) {
 164                 singleByteCharCount++;
 165             } else {
 166                 doubleByteCharCount++;
 167
 168                 if (commonChars != 0) {
 169                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
 170                         commonCharCount += 1;
 171                     }
 172                 }
 173             }
 174         }
 175
 176
 177         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
 178             // Bail out early if the byte data is not matching the encoding scheme.
 179             // break detectBlock;
 180             return confidence;
 181         }
 182     }
 183
 184     if (doubleByteCharCount <= 10 && badCharCount == 0) {
 185         // Not many multi-byte chars.
 186         if (doubleByteCharCount == 0 && totalCharCount < 10) {
 187             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
 188             // We don't have enough data to have any confidence.
 189             // Statistical analysis of single byte non-ASCII charcters would probably help here.
 190             confidence = 0;
 191         }
 192         else {
 193             //   ASCII or ISO file?  It's probably not our encoding,
 194             //   but is not incompatible with our encoding, so don't give it a zero.
 195             confidence = 10;
 196         }
 197
 198         return confidence;
 199     }
 200
 201     //
 202     //  No match if there are too many characters that don't fit the encoding scheme.
 203     //    (should we have zero tolerance for these?)
 204     //
 205     if (doubleByteCharCount < 20*badCharCount) {
 206         confidence = 0;
 207
 208         return confidence;
 209     }
 210
 211     if (commonChars == 0) {
 212         // We have no statistics on frequently occuring characters.
 213         //  Assess confidence purely on having a reasonable number of
 214         //  multi-byte characters (the more the better)
 215         confidence = 30 + doubleByteCharCount - 20*badCharCount;
 216
 217         if (confidence > 100) {
 218             confidence = 100;
 219         }
 220     } else {
 221         //
 222         // Frequency of occurence statistics exist.
 223         //
 224
 225         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
 226         double scaleFactor = 90.0 / maxVal;
 227         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
 228
 229         confidence = min(confidence, 100);
 230     }
 231
 232     if (confidence < 0) {
 233         confidence = 0;
 234     }
 235
 236     return confidence;
 237 }
 238
 239 CharsetRecog_sjis::~CharsetRecog_sjis()
 240 {
 241     // nothing to do
 242 }
 243
 244 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
 245     it->index = it->nextIndex;
 246     it->error = FALSE;
 247
 248     int32_t firstByte = it->charValue = it->nextByte(det);
 249
 250     if (firstByte < 0) {
 251         return FALSE;
 252     }
 253
 254     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
 255         return TRUE;
 256     }
 257
 258     int32_t secondByte = it->nextByte(det);
 259     if (secondByte >= 0) {
 260         it->charValue = (firstByte << 8) | secondByte;
 261     }
 262     // else we'll handle the error later.
 263
 264     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
 265         // Illegal second byte value.
 266         it->error = TRUE;
 267     }
 268
 269     return TRUE;
 270 }
 271
 272 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
 273     int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
 274     results->set(det, this, confidence);
 275     return (confidence > 0);
 276 }
 277
 278 const char *CharsetRecog_sjis::getName() const
 279 {
 280     return "Shift_JIS";
 281 }
 282
 283 const char *CharsetRecog_sjis::getLanguage() const
 284 {
 285     return "ja";
 286 }
 287
 288 CharsetRecog_euc::~CharsetRecog_euc()
 289 {
 290     // nothing to do
 291 }
 292
 293 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
 294     int32_t firstByte  = 0;
 295     int32_t secondByte = 0;
 296     int32_t thirdByte  = 0;
 297
 298     it->index = it->nextIndex;
 299     it->error = FALSE;
 300     firstByte = it->charValue = it->nextByte(det);
 301
 302     if (firstByte < 0) {
 303         // Ran off the end of the input data
 304         return FALSE;
 305     }
 306
 307     if (firstByte <= 0x8D) {
 308         // single byte char
 309         return TRUE;
 310     }
 311
 312     secondByte = it->nextByte(det);
 313     if (secondByte >= 0) {
 314         it->charValue = (it->charValue << 8) | secondByte;
 315     }
 316     // else we'll handle the error later.
 317
 318     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
 319         // Two byte Char
 320         if (secondByte < 0xA1) {
 321             it->error = TRUE;
 322         }
 323
 324         return TRUE;
 325     }
 326
 327     if (firstByte == 0x8E) {
 328         // Code Set 2.
 329         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 330         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 331         // We don't know which we've got.
 332         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 333         //   bytes will look like a well formed 2 byte char.
 334         if (secondByte < 0xA1) {
 335             it->error = TRUE;
 336         }
 337
 338         return TRUE;
 339     }
 340
 341     if (firstByte == 0x8F) {
 342         // Code set 3.
 343         // Three byte total char size, two bytes of actual char value.
 344         thirdByte    = it->nextByte(det);
 345         it->charValue = (it->charValue << 8) | thirdByte;
 346
 347         if (thirdByte < 0xa1) {
 348             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
 349             it->error = TRUE;
 350         }
 351     }
 352
 353     return TRUE;
 354
 355 }
 356
 357 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
 358 {
 359     // nothing to do
 360 }
 361
 362 const char *CharsetRecog_euc_jp::getName() const
 363 {
 364     return "EUC-JP";
 365 }
 366
 367 const char *CharsetRecog_euc_jp::getLanguage() const
 368 {
 369     return "ja";
 370 }
 371
 372 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
 373 {
 374     int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
 375     results->set(det, this, confidence);
 376     return (confidence > 0);
 377 }
 378
 379 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
 380 {
 381     // nothing to do
 382 }
 383
 384 const char *CharsetRecog_euc_kr::getName() const
 385 {
 386     return "EUC-KR";
 387 }
 388
 389 const char *CharsetRecog_euc_kr::getLanguage() const
 390 {
 391     return "ko";
 392 }
 393
 394 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
 395 {
 396     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
 397     results->set(det, this, confidence);
 398     return (confidence > 0);
 399 }
 400
 401 CharsetRecog_big5::~CharsetRecog_big5()
 402 {
 403     // nothing to do
 404 }
 405
 406 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
 407 {
 408     int32_t firstByte;
 409
 410     it->index = it->nextIndex;
 411     it->error = FALSE;
 412     firstByte = it->charValue = it->nextByte(det);
 413
 414     if (firstByte < 0) {
 415         return FALSE;
 416     }
 417
 418     if (firstByte <= 0x7F || firstByte == 0xFF) {
 419         // single byte character.
 420         return TRUE;
 421     }
 422
 423     int32_t secondByte = it->nextByte(det);
 424     if (secondByte >= 0)  {
 425         it->charValue = (it->charValue << 8) | secondByte;
 426     }
 427     // else we'll handle the error later.
 428
 429     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
 430         it->error = TRUE;
 431     }
 432
 433     return TRUE;
 434 }
 435
 436 const char *CharsetRecog_big5::getName() const
 437 {
 438     return "Big5";
 439 }
 440
 441 const char *CharsetRecog_big5::getLanguage() const
 442 {
 443     return "zh";
 444 }
 445
 446 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
 447 {
 448     int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
 449     results->set(det, this, confidence);
 450     return (confidence > 0);
 451 }
 452
 453 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
 454 {
 455     // nothing to do
 456 }
 457
 458 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
 459     int32_t firstByte  = 0;
 460     int32_t secondByte = 0;
 461     int32_t thirdByte  = 0;
 462     int32_t fourthByte = 0;
 463
 464     it->index = it->nextIndex;
 465     it->error = FALSE;
 466     firstByte = it->charValue = it->nextByte(det);
 467
 468     if (firstByte < 0) {
 469         // Ran off the end of the input data
 470         return FALSE;
 471     }
 472
 473     if (firstByte <= 0x80) {
 474         // single byte char
 475         return TRUE;
 476     }
 477
 478     secondByte = it->nextByte(det);
 479     if (secondByte >= 0) {
 480         it->charValue = (it->charValue << 8) | secondByte;
 481     }
 482     // else we'll handle the error later.
 483
 484     if (firstByte >= 0x81 && firstByte <= 0xFE) {
 485         // Two byte Char
 486         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
 487             return TRUE;
 488         }
 489
 490         // Four byte char
 491         if (secondByte >= 0x30 && secondByte <= 0x39) {
 492             thirdByte = it->nextByte(det);
 493
 494             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 495                 fourthByte = it->nextByte(det);
 496
 497                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 498                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
 499
 500                     return TRUE;
 501                 }
 502             }
 503         }
 504
 505         // Something wasn't valid, or we ran out of data (-1).
 506         it->error = TRUE;
 507     }
 508
 509     return TRUE;
 510 }
 511
 512 const char *CharsetRecog_gb_18030::getName() const
 513 {
 514     return "GB18030";
 515 }
 516
 517 const char *CharsetRecog_gb_18030::getLanguage() const
 518 {
 519     return "zh";
 520 }
 521
 522 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
 523 {
 524     int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
 525     results->set(det, this, confidence);
 526     return (confidence > 0);
 527 }
 528
 529 U_NAMESPACE_END
 530 #endif