source/i18n/csrutf8.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  **********************************************************************
   5  *   Copyright (C) 2005-2014, International Business Machines
   6  *   Corporation and others.  All Rights Reserved.
   7  **********************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_CONVERSION
  13
  14 #include "csrutf8.h"
  15 #include "csmatch.h"
  16
  17 U_NAMESPACE_BEGIN
  18
  19 CharsetRecog_UTF8::~CharsetRecog_UTF8()
  20 {
  21     // nothing to do
  22 }
  23
  24 const char *CharsetRecog_UTF8::getName() const
  25 {
  26     return "UTF-8";
  27 }
  28
  29 UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
  30     bool hasBOM = FALSE;
  31     int32_t numValid = 0;
  32     int32_t numInvalid = 0;
  33     const uint8_t *inputBytes = input->fRawInput;
  34     int32_t i;
  35     int32_t trailBytes = 0;
  36     int32_t confidence;
  37
  38     if (input->fRawLength >= 3 &&
  39         inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
  40             hasBOM = TRUE;
  41     }
  42
  43     // Scan for multi-byte sequences
  44     for (i=0; i < input->fRawLength; i += 1) {
  45         int32_t b = inputBytes[i];
  46
  47         if ((b & 0x80) == 0) {
  48             continue;   // ASCII
  49         }
  50
  51         // Hi bit on char found.  Figure out how long the sequence should be
  52         if ((b & 0x0E0) == 0x0C0) {
  53             trailBytes = 1;
  54         } else if ((b & 0x0F0) == 0x0E0) {
  55             trailBytes = 2;
  56         } else if ((b & 0x0F8) == 0xF0) {
  57             trailBytes = 3;
  58         } else {
  59             numInvalid += 1;
  60             continue;
  61         }
  62
  63         // Verify that we've got the right number of trail bytes in the sequence
  64         for (;;) {
  65             i += 1;
  66
  67             if (i >= input->fRawLength) {
  68                 break;
  69             }
  70
  71             b = inputBytes[i];
  72
  73             if ((b & 0xC0) != 0x080) {
  74                 numInvalid += 1;
  75                 break;
  76             }
  77
  78             if (--trailBytes == 0) {
  79                 numValid += 1;
  80                 break;
  81             }
  82         }
  83
  84     }
  85
  86     // Cook up some sort of confidence score, based on presence of a BOM
  87     //    and the existence of valid and/or invalid multi-byte sequences.
  88     confidence = 0;
  89     if (hasBOM && numInvalid == 0) {
  90         confidence = 100;
  91     } else if (hasBOM && numValid > numInvalid*10) {
  92         confidence = 80;
  93     } else if (numValid > 3 && numInvalid == 0) {
  94         confidence = 100;
  95     } else if (numValid > 0 && numInvalid == 0) {
  96         confidence = 80;
  97     } else if (numValid == 0 && numInvalid == 0) {
  98         // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
  99         //              accepts ASCII with confidence = 10.
 100         confidence = 15;
 101     } else if (numValid > numInvalid*10) {
 102         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
 103         confidence = 25;
 104     }
 105
 106     results->set(input, this, confidence);
 107     return (confidence > 0);
 108 }
 109
 110 U_NAMESPACE_END
 111 #endif