src/third_party/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc

   1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
   6
   7 #include "base/basictypes.h"
   8
   9 // Return true if current Tbl pointer is within state0 range
  10 // Note that unsigned compare checks both ends of range simultaneously
  11 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
  12   const uint8* Tbl0 = &st->state_table[st->state0];
  13   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
  14 }
  15
  16
  17 // Look up property of one UTF-8 character and advance over it
  18 // Return 0 if input length is zero
  19 // Return 0 and advance one byte if input is ill-formed
  20 uint8 UTF8GenericProperty(const UTF8PropObj* st,
  21                           const uint8** src,
  22                           int* srclen) {
  23   if (*srclen <= 0) {
  24     return 0;
  25   }
  26
  27   const uint8* lsrc = *src;
  28   const uint8* Tbl_0 = &st->state_table[st->state0];
  29   const uint8* Tbl = Tbl_0;
  30   int e;
  31   int eshift = st->entry_shift;
  32
  33   // Short series of tests faster than switch, optimizes 7-bit ASCII
  34   unsigned char c = lsrc[0];
  35   if (static_cast<signed char>(c) >= 0) {           // one byte
  36     e = Tbl[c];
  37     *src += 1;
  38     *srclen -= 1;
  39   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
  40     e = Tbl[c];
  41     Tbl = &Tbl_0[e << eshift];
  42     e = Tbl[lsrc[1]];
  43     *src += 2;
  44     *srclen -= 2;
  45   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
  46     e = Tbl[c];
  47     Tbl = &Tbl_0[e << eshift];
  48     e = Tbl[lsrc[1]];
  49     Tbl = &Tbl_0[e << eshift];
  50     e = Tbl[lsrc[2]];
  51     *src += 3;
  52     *srclen -= 3;
  53   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
  54     e = Tbl[c];
  55     Tbl = &Tbl_0[e << eshift];
  56     e = Tbl[lsrc[1]];
  57     Tbl = &Tbl_0[e << eshift];
  58     e = Tbl[lsrc[2]];
  59     Tbl = &Tbl_0[e << eshift];
  60     e = Tbl[lsrc[3]];
  61     *src += 4;
  62     *srclen -= 4;
  63   } else {                                                // Ill-formed
  64     e = 0;
  65     *src += 1;
  66     *srclen -= 1;
  67   }
  68   return e;
  69 }
  70
  71 // BigOneByte versions are needed for tables > 240 states, but most
  72 // won't need the TwoByte versions.
  73 // Internally, to next-to-last offset is multiplied by 16 and the last
  74 // offset is relative instead of absolute.
  75 // Look up property of one UTF-8 character and advance over it
  76 // Return 0 if input length is zero
  77 // Return 0 and advance one byte if input is ill-formed
  78 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
  79                           const uint8** src,
  80                           int* srclen) {
  81   if (*srclen <= 0) {
  82     return 0;
  83   }
  84
  85   const uint8* lsrc = *src;
  86   const uint8* Tbl_0 = &st->state_table[st->state0];
  87   const uint8* Tbl = Tbl_0;
  88   int e;
  89   int eshift = st->entry_shift;
  90
  91   // Short series of tests faster than switch, optimizes 7-bit ASCII
  92   unsigned char c = lsrc[0];
  93   if (static_cast<signed char>(c) >= 0) {           // one byte
  94     e = Tbl[c];
  95     *src += 1;
  96     *srclen -= 1;
  97   } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes
  98     e = Tbl[c];
  99     Tbl = &Tbl_0[e << eshift];
 100     e = Tbl[lsrc[1]];
 101     *src += 2;
 102     *srclen -= 2;
 103   } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes
 104     e = Tbl[c];
 105     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
 106     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
 107     Tbl = &Tbl[e << eshift];          // Relative +/-
 108     e = Tbl[lsrc[2]];
 109     *src += 3;
 110     *srclen -= 3;
 111   }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes
 112     e = Tbl[c];
 113     Tbl = &Tbl_0[e << eshift];
 114     e = Tbl[lsrc[1]];
 115     Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
 116     e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
 117     Tbl = &Tbl[e << eshift];          // Relative +/-
 118     e = Tbl[lsrc[3]];
 119     *src += 4;
 120     *srclen -= 4;
 121   } else {                                                // Ill-formed
 122     e = 0;
 123     *src += 1;
 124     *srclen -= 1;
 125   }
 126   return e;
 127 }
 128
 129 // Scan a UTF-8 stringpiece based on a state table.
 130 // Always scan complete UTF-8 characters
 131 // Set number of bytes scanned. Return reason for exiting
 132 int UTF8GenericScan(const UTF8ScanObj* st,
 133                     const uint8* str,
 134                     const int len,
 135                     int* bytes_consumed) {
 136   int eshift = st->entry_shift;        // 6 (space optimized) or 8
 137   // int nEntries = (1 << eshift);       // 64 or 256 entries per state
 138
 139   const uint8* isrc = str;
 140     //reinterpret_cast<const uint8*>(str.data());
 141   const uint8* src = isrc;
 142   //const int len = str.length();
 143   const uint8* srclimit = isrc + len;
 144   const uint8* srclimit8 = srclimit - 7;
 145   *bytes_consumed = 0;
 146   if (len == 0) return kExitOK;
 147
 148   const uint8* Tbl_0 = &st->state_table[st->state0];
 149
 150 DoAgain:
 151   // Do state-table scan
 152   int e = 0;
 153   uint8 c;
 154
 155   // Do fast for groups of 8 identity bytes.
 156   // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
 157   // including slowing slightly on cr/lf/ht
 158   //----------------------------
 159   const uint8* Tbl2 = &st->fast_state[0];
 160   uint32 losub = st->losub;
 161   uint32 hiadd = st->hiadd;
 162   while (src < srclimit8) {
 163     uint32 s0123 = UnalignedLoad32(src);
 164     uint32 s4567 = UnalignedLoad32(src + 4);
 165     src += 8;
 166     // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
 167     uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
 168                   (s4567 - losub) | (s4567 + hiadd);
 169     if ((temp & 0x80808080) != 0) {
 170       // We typically end up here on cr/lf/ht; src was incremented
 171       int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
 172                   (Tbl2[src[-6]] | Tbl2[src[-5]]);
 173       if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange
 174       e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
 175               (Tbl2[src[-2]] | Tbl2[src[-1]]);
 176       if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange
 177       // Else OK, go around again
 178     }
 179   }
 180   //----------------------------
 181
 182   // Byte-at-a-time scan
 183   //----------------------------
 184   const uint8* Tbl = Tbl_0;
 185   while (src < srclimit) {
 186     c = *src;
 187     e = Tbl[c];
 188     src++;
 189     if (e >= kExitIllegalStructure) {break;}
 190     Tbl = &Tbl_0[e << eshift];
 191   }
 192   //----------------------------
 193
 194
 195   // Exit posibilities:
 196   //  Some exit code, !state0, back up over last char
 197   //  Some exit code, state0, back up one byte exactly
 198   //  source consumed, !state0, back up over partial char
 199   //  source consumed, state0, exit OK
 200   // For illegal byte in state0, avoid backup up over PREVIOUS char
 201   // For truncated last char, back up to beginning of it
 202
 203   if (e >= kExitIllegalStructure) {
 204     // Back up over exactly one byte of rejected/illegal UTF-8 character
 205     src--;
 206     // Back up more if needed
 207     if (!InStateZero(st, Tbl)) {
 208       do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
 209     }
 210   } else if (!InStateZero(st, Tbl)) {
 211     // Back up over truncated UTF-8 character
 212     e = kExitIllegalStructure;
 213     do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
 214   } else {
 215     // Normal termination, source fully consumed
 216     e = kExitOK;
 217   }
 218
 219   if (e == kExitDoAgain) {
 220     // Loop back up to the fast scan
 221     goto DoAgain;
 222   }
 223
 224   *bytes_consumed = src - isrc;
 225   return e;
 226 }