src/third_party/cld_2/src/internal/getonescriptspan.cc

   1 // Copyright 2013 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 //
  16 // Author: dsites@google.com (Dick Sites)
  17 //
  18
  19
  20 #include "getonescriptspan.h"
  21 #include <string.h>
  22
  23 #include "fixunicodevalue.h"
  24 #include "lang_script.h"
  25 #include "port.h"
  26 #include "utf8statetable.h"
  27
  28 #include "utf8prop_lettermarkscriptnum.h"
  29 #include "utf8repl_lettermarklower.h"
  30 #include "utf8scannot_lettermarkspecial.h"
  31
  32
  33 namespace CLD2 {
  34
// Entity-name table, defined in generated_entities.cc.
// Entries are in alphabetical order so that LookupEntity() can find a name
// with BinarySearch(); kNameToEntitySize is passed to that search as its
// upper bound.
extern const int kNameToEntitySize;
extern const CharIntPair kNameToEntity[];
  39
// Limits used when rounding the end of a span to a word boundary.
static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
                                                  // else make shorter
static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
                                                  // to round to word boundary,
                                                  // direction above
  45
// Indexed by byte value; nonzero only for the three HTML-special characters
// '&' (0x26), '<' (0x3C) and '>' (0x3E). Every other entry is 0.
static const char kSpecialSymbol[256] = {       // true for < > &
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
  57
  58
  59
// Character categories used as column indexes into the tag-parsing state
// table. The short macro names exist only to keep the table below readable
// and are #undef'd right after it.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
#define S_ 7
#define C_ 8
#define R_ 9
#define I_ 10
#define P_ 11
#define T_ 12
#define Y_ 13
#define L_ 14
#define E_ 15
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing.
// Upper- and lower-case ASCII letters share a category (the 0x40 and 0x60
// rows are identical), so the tag-name matching is case-insensitive.
// Bytes 0x80-0xBF map to NL and bytes 0xC0-0xFF map to PL.
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
 124
 125
// Names for the two lowest-numbered states, used only inside the table below
// and #undef'd right after it.
#define OK 0
#define X_ 1


// Largest state number that ends a scan, passed to ScanToPossibleLetter() as
// max_exit_state: states 0..1 when only letters/marks are wanted, states 0..2
// when any non-tag text is wanted.
static const int kMaxExitStateLettersMarksOnly = 1;
static const int kMaxExitStateAllText = 2;


// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// We start in state [0] at a non-letter and make at least one transition
// When scanning for just letters, arriving back at state [0] or [1] exits
//   the state machine.
// When scanning for any non-tag text, arriving at state [2] also exits
//
// Layout: one row of 20 entries per state, one column per kCharToSub
// category, so the next state is kTagParseTbl_0[state * 20 + category].
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
 200
// Constants describing UTF-8 encoding limits. runetochar() uses Runemax as
// its range check and Runeerror as the substitute for out-of-range values.
enum
{
  UTFmax        = 4,            // maximum bytes per rune
  Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)
  Runeself      = 0x80,         // rune and UTF sequences are the same (<)
  Runeerror     = 0xFFFD,       // decoding error in UTF
  Runemax       = 0x10FFFF,     // maximum rune value
};
 209
 210 // Debugging. Not thread safe.
 211 static char gDisplayPiece[32];
 212 const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
 213 char* DisplayPiece(const char* next_byte_, int byte_length_) {
 214   // Copy up to 8 UTF-8 chars to buffer
 215   int k = 0;    // byte count
 216   int n = 0;    // character count
 217   for (int i = 0; i < byte_length_; ++i) {
 218     char c = next_byte_[i];
 219     if ((c & 0xc0) != 0x80) {
 220       // Beginning of a UTF-8 character
 221       int charlen = gCharlen[static_cast<uint8>(c) >> 4];
 222       if (i + charlen > byte_length_) {break;} // Not enough room for full char
 223       if (k >= (32 - 7)) {break;}   // Not necessarily enough room
 224       if (n >= 8) {break;}          // Enough characters already
 225       ++n;
 226     }
 227     if (c == '<') {
 228       memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
 229     } else if (c == '>') {
 230       memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
 231     } else if (c == '&') {
 232       memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
 233     } else if (c == '\'') {
 234       memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
 235     } else if (c == '"') {
 236       memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
 237     } else {
 238       gDisplayPiece[k++] = c;
 239     }
 240   }
 241   gDisplayPiece[k++] = '\0';
 242   return gDisplayPiece;
 243 }
 244
 245
 246
 247 // runetochar copies (encodes) one rune, pointed to by r, to at most
 248 // UTFmax bytes starting at s and returns the number of bytes generated.
 249 int runetochar(char *str, const char32 *rune) {
 250   // Convert to unsigned for range check.
 251   unsigned long c;
 252
 253   // 1 char 00-7F
 254   c = *rune;
 255   if(c <= 0x7F) {
 256     str[0] = c;
 257     return 1;
 258   }
 259
 260   // 2 char 0080-07FF
 261   if(c <= 0x07FF) {
 262     str[0] = 0xC0 | (c >> 1*6);
 263     str[1] = 0x80 | (c & 0x3F);
 264     return 2;
 265   }
 266
 267   // Range check
 268   if (c > Runemax) {
 269     c = Runeerror;
 270   }
 271
 272   // 3 char 0800-FFFF
 273   if (c <= 0xFFFF) {
 274     str[0] = 0xE0 |  (c >> 2*6);
 275     str[1] = 0x80 | ((c >> 1*6) & 0x3F);
 276     str[2] = 0x80 |  (c & 0x3F);
 277     return 3;
 278   }
 279
 280   // 4 char 10000-1FFFFF
 281   str[0] = 0xF0 | (c >> 3*6);
 282   str[1] = 0x80 | ((c >> 2*6) & 0x3F);
 283   str[2] = 0x80 | ((c >> 1*6) & 0x3F);
 284   str[3] = 0x80 | (c & 0x3F);
 285   return 4;
 286 }
 287
 288
 289
 290 // Useful for converting an entity to an ascii value.
 291 // RETURNS unicode value, or -1 if entity isn't valid.  Don't include & or ;
 292 int LookupEntity(const char* entity_name, int entity_len) {
 293   // Make a C string
 294   if (entity_len >= 16) {return -1;}    // All real entities are shorter
 295   char temp[16];
 296   memcpy(temp, entity_name, entity_len);
 297   temp[entity_len] = '\0';
 298   int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
 299   if (match >= 0) {return kNameToEntity[match].i;}
 300   return -1;
 301 }
 302
 303 bool ascii_isdigit(char c) {
 304   return ('0' <= c) && (c <= '9');
 305 }
 306 bool ascii_isxdigit(char c) {
 307   if (('0' <= c) && (c <= '9')) {return true;}
 308   if (('a' <= c) && (c <= 'f')) {return true;}
 309   if (('A' <= c) && (c <= 'F')) {return true;}
 310   return false;
 311 }
 312 bool ascii_isalnum(char c) {
 313   if (('0' <= c) && (c <= '9')) {return true;}
 314   if (('a' <= c) && (c <= 'z')) {return true;}
 315   if (('A' <= c) && (c <= 'Z')) {return true;}
 316   return false;
 317 }
 318 int hex_digit_to_int(char c) {
 319   if (('0' <= c) && (c <= '9')) {return c - '0';}
 320   if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;}
 321   if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;}
 322   return 0;
 323 }
 324
 325 static int32 strto32_base10(const char* nptr, const char* limit,
 326                             const char **endptr) {
 327   *endptr = nptr;
 328   while (nptr < limit && *nptr == '0') {
 329     ++nptr;
 330   }
 331   if (nptr == limit || !ascii_isdigit(*nptr))
 332     return -1;
 333   const char* end_digits_run = nptr;
 334   while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
 335     ++end_digits_run;
 336   }
 337   *endptr = end_digits_run;
 338   const int num_digits = end_digits_run - nptr;
 339   // kint32max == 2147483647.
 340   if (num_digits < 9 ||
 341       (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
 342     int value = 0;
 343     for (; nptr < end_digits_run; ++nptr) {
 344       value *= 10;
 345       value += *nptr - '0';
 346     }
 347     // Overflow past the last valid unicode codepoint
 348     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
 349     return FixUnicodeValue(value);
 350   } else {
 351     // Overflow: can't fit in an int32;
 352     // returns the replacement character 0xFFFD.
 353     return 0xFFFD;
 354   }
 355 }
 356
 357 static int32 strto32_base16(const char* nptr, const char* limit,
 358                             const char **endptr) {
 359   *endptr = nptr;
 360   while (nptr < limit && *nptr == '0') {
 361     ++nptr;
 362   }
 363   if (nptr == limit || !ascii_isxdigit(*nptr)) {
 364     return -1;
 365   }
 366   const char* end_xdigits_run = nptr;
 367   while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
 368     ++end_xdigits_run;
 369   }
 370   *endptr = end_xdigits_run;
 371   const int num_xdigits = end_xdigits_run - nptr;
 372   // kint32max == 0x7FFFFFFF.
 373   if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
 374     int value = 0;
 375     for (; nptr < end_xdigits_run; ++nptr) {
 376       value <<= 4;
 377       value += hex_digit_to_int(*nptr);
 378     }
 379     // Overflow past the last valid unicode codepoint
 380     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
 381     return FixUnicodeValue(value);
 382   } else {
 383     // Overflow: can't fit in an int32;
 384     // returns the replacement character 0xFFFD.
 385     return 0xFFFD;
 386   }
 387 }
 388
 389 // Unescape the current character pointed to by src.  SETS the number
 390 // of chars read for the conversion (in UTF8).  If src isn't a valid entity,
 391 // just consume the & and RETURN -1.  If src doesn't point to & -- which it
 392 // should -- set src_consumed to 0 and RETURN -1.
 393 int ReadEntity(const char* src, int srcn, int* src_consumed) {
 394   const char* const srcend = src + srcn;
 395
 396   if (srcn == 0 || *src != '&') {      // input should start with an ampersand
 397     *src_consumed = 0;
 398     return -1;
 399   }
 400   *src_consumed = 1;                   // we'll get the & at least
 401
 402   // The standards are a bit unclear on when an entity ends.  Certainly a ";"
 403   // ends one, but spaces probably do too.  We follow the lead of both IE and
 404   // Netscape, which as far as we can tell end numeric entities (1st case below)
 405   // at any non-digit, and end character entities (2nd case) at any non-alnum.
 406   const char* entstart, *entend;  // where the entity starts and ends
 407   entstart = src + 1;             // read past the &
 408   int entval;                     // UCS2 value of the entity
 409   if ( *entstart == '#' ) {       // -- 1st case: numeric entity
 410     if ( entstart + 2 >= srcend ) {
 411       return -1;                  // no way a legitimate number could fit
 412     } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) {   // hex numeric
 413       entval = strto32_base16(entstart + 2, srcend, &entend);
 414     } else {                                  // decimal numeric entity
 415       entval = strto32_base10(entstart+1, srcend, &entend);
 416     }
 417     if (entval == -1 || entend > srcend) {
 418       return -1;                 // not entirely correct, but close enough
 419     }
 420   } else {                       // -- 2nd case: character entity
 421     for (entend = entstart;
 422          entend < srcend && ascii_isalnum(*entend);
 423          ++entend ) {
 424       // entity consists of alphanumeric chars
 425     }
 426     entval = LookupEntity(entstart, entend - entstart);
 427     if (entval < 0) {
 428       return -1;  // not a legal entity name
 429     }
 430     // Now we do a strange-seeming IE6-compatibility check: if entval is
 431     // >= 256, it *must* be followed by a semicolon or it's not considered
 432     // an entity.  The problem is lots of the newfangled entity names, like
 433     // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
 434     // When these links are written in HTML, it would be really bad if the
 435     // "&lang" were treated as an entity, which is what the spec says
 436     // *should* happen (even when the HTML is inside an "A HREF" tag!)
 437     // IE ignores the spec for these new, high-value entities, so we do too.
 438     if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
 439       return -1;                 // make non-;-terminated entity illegal
 440     }
 441   }
 442
 443   // Finally, figure out how much src was consumed
 444   if ( entend < srcend && *entend == ';' ) {
 445     entend++;                    // standard says ; terminator is special
 446   }
 447   *src_consumed = entend - src;
 448   return entval;
 449 }
 450
 451
 452 // Src points to '&'
 453 // Writes entity value to dst. Returns take(src), put(dst) byte counts
 454 void EntityToBuffer(const char* src, int len, char* dst,
 455                     int* tlen, int* plen) {
 456   char32 entval = ReadEntity(src, len, tlen);
 457
 458   // ReadEntity does this already: entval = FixUnicodeValue(entval);
 459
 460   // Convert UTF-32 to UTF-8
 461   if (entval > 0) {
 462     *plen = runetochar(dst, &entval);
 463   } else {
 464     // Illegal entity; ignore the '&'
 465     *tlen = 1;
 466     *plen = 0;
 467   }
 468 }
 469
 470 // Returns true if character is < > or &, none of which are letters
 471 bool inline IsSpecial(char c) {
 472   if ((c & 0xe0) == 0x20) {
 473     return kSpecialSymbol[static_cast<uint8>(c)];
 474   }
 475   return false;
 476 }
 477
 478 // Quick Skip to next letter or < > & or to end of string (eos)
 479 // Always return is_letter for eos
 480 int ScanToLetterOrSpecial(const char* src, int len) {
 481   int bytes_consumed;
 482   StringPiece str(src, len);
 483   UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
 484   return bytes_consumed;
 485 }
 486
 487
 488
 489
 490 // src points to non-letter, such as tag-opening '<'
 491 // Return length from here to next possible letter
 492 // On another < before >, return 1
 493 // advances <tag>
 494 //          |    |
 495 // advances <tag> ... </tag>  for <script> <style>
 496 //          |               |
 497 // advances <!-- ... <tag> ... -->
 498 //          |                     |
 499 // advances <tag
 500 //          |    | end of string
 501 // advances <tag <tag2>
 502 //          ||
 503 int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
 504   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 505   const uint8* srclimit = src + len;
 506   const uint8* tagParseTbl = kTagParseTbl_0;
 507   int e = 0;
 508   while (src < srclimit) {
 509     e = tagParseTbl[kCharToSub[*src++]];
 510     if (e <= max_exit_state) {
 511       // We overshot by one byte
 512       --src;
 513       break;
 514     }
 515     tagParseTbl = &kTagParseTbl_0[e * 20];
 516   }
 517
 518   if (src >= srclimit) {
 519     // We fell off the end of the text.
 520     // It looks like the most common case for this is a truncated file, not
 521     // mismatched angle brackets. So we pretend that the last char was '>'
 522     return len;
 523   }
 524
 525   // OK to be in state 0 or state 2 at exit
 526   if ((e != 0) && (e != 2)) {
 527     // Error, '<' followed by '<'
 528     // We want to back up to first <, then advance by one byte past it
 529     int offset = src - reinterpret_cast<const uint8*>(isrc);
 530
 531     // Backscan to first '<' and return enough length to just get past it
 532     --offset;   // back up over the second '<', which caused us to stop
 533     while ((0 < offset) && (isrc[offset] != '<')) {
 534       // Find the first '<', which is unmatched
 535       --offset;
 536     }
 537     // skip to just beyond first '<'
 538     return offset + 1;
 539   }
 540
 541   return src - reinterpret_cast<const uint8*>(isrc);
 542 }
 543
 544
 545 ScriptScanner::ScriptScanner(const char* buffer,
 546                              int buffer_length,
 547                              bool is_plain_text)
 548   : start_byte_(buffer),
 549   next_byte_(buffer),
 550   next_byte_limit_(buffer + buffer_length),
 551   byte_length_(buffer_length),
 552   is_plain_text_(is_plain_text),
 553   letters_marks_only_(true),
 554   one_script_only_(true),
 555   exit_state_(kMaxExitStateLettersMarksOnly) {
 556     script_buffer_ = new char[kMaxScriptBuffer];
 557     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
 558     map2original_.Clear();    // map from script_buffer_ to buffer
 559     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
 560 }
 561
 562 // Extended version to allow spans of any non-tag text and spans of mixed script
 563 ScriptScanner::ScriptScanner(const char* buffer,
 564                              int buffer_length,
 565                              bool is_plain_text,
 566                              bool any_text,
 567                              bool any_script)
 568   : start_byte_(buffer),
 569   next_byte_(buffer),
 570   next_byte_limit_(buffer + buffer_length),
 571   byte_length_(buffer_length),
 572   is_plain_text_(is_plain_text),
 573   letters_marks_only_(!any_text),
 574   one_script_only_(!any_script),
 575   exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
 576     script_buffer_ = new char[kMaxScriptBuffer];
 577     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
 578     map2original_.Clear();    // map from script_buffer_ to buffer
 579     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
 580 }
 581
 582
 583 ScriptScanner::~ScriptScanner() {
 584   delete[] script_buffer_;
 585   delete[] script_buffer_lower_;
 586 }
 587
 588
 589
 590
 591 // Get to the first real non-tag letter or entity that is a letter
 592 // Sets script of that letter
 593 // Return len if no more letters
 594 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
 595   int sc = UNKNOWN_ULSCRIPT;
 596   int skip = 0;
 597   int tlen, plen;
 598
 599   // Do run of non-letters (tag | &NL | NL)*
 600   tlen = 0;
 601   while (skip < len) {
 602     // Do fast scan to next interesting byte
 603     // int oldskip = skip;
 604     skip += ScanToLetterOrSpecial(src + skip, len - skip);
 605
 606     // Check for no more letters/specials
 607     if (skip >= len) {
 608       // All done
 609       *script = sc;
 610       return len;
 611     }
 612
 613     // We are at a letter, nonletter, tag, or entity
 614     if (IsSpecial(src[skip]) && !is_plain_text_) {
 615       if (src[skip] == '<') {
 616         // Begining of tag; skip to end and go around again
 617         tlen = ScanToPossibleLetter(src + skip, len - skip,
 618                                     exit_state_);
 619         sc = 0;
 620       } else if (src[skip] == '>') {
 621         // Unexpected end of tag; skip it and go around again
 622         tlen = 1;         // Over the >
 623         sc = 0;
 624       } else if (src[skip] == '&') {
 625         // Expand entity, no advance
 626         char temp[4];
 627         EntityToBuffer(src + skip, len - skip,
 628                        temp, &tlen, &plen);
 629         sc = GetUTF8LetterScriptNum(temp);
 630       }
 631     } else {
 632       // Update 1..4 bytes
 633       tlen = UTF8OneCharLen(src + skip);
 634       sc = GetUTF8LetterScriptNum(src + skip);
 635     }
 636     if (sc != 0) {break;}           // Letter found
 637     skip += tlen;                   // Else advance
 638   }
 639
 640   *script = sc;
 641   return skip;
 642 }
 643
 644
 645 // These are for ASCII-only tag names
 646 // Compare one letter uplow to c, ignoring case of uplowp
 647 inline bool EqCase(char uplow, char c) {
 648   return (uplow | 0x20) == c;
 649 }
 650
 651 // These are for ASCII-only tag names
 652 // Return true for space / < > etc. all less than 0x40
 653 inline bool NeqLetter(char c) {
 654   return c < 0x40;
 655 }
 656
 657 // These are for ASCII-only tag names
 658 // Return true for space \n false for \r
 659 inline bool WS(char c) {
 660   return (c == ' ') || (c == '\n');
 661 }
 662
// Canonical line break: GetOneTextSpan() turns both \r and \n in the input
// into this character, and uses it for tags that act as line breaks.
static const char LF = '\n';
 665
 666
 667 // The naive loop scans from next_byte_ to script_buffer_ until full.
 668 // But this can leave an awkward hard-to-identify short fragment at the
 669 // end of the input. We would prefer to make the next-to-last fragment
 670 // shorter and the last fragment longer.
 671
 672 // Copy next run of non-tag characters to buffer [NUL terminated]
 673 // This just replaces tags with space or \n and removes entities.
 674 // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
 675 // including \r or \n are replaced by \n. All other tags and skipped text
 676 // are replaced with ASCII space.
 677 //
 678 // Buffer ALWAYS has leading space and trailing space space space NUL
 679 bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
 680   span->text = script_buffer_;
 681   span->text_bytes = 0;
 682   span->offset = next_byte_ - start_byte_;
 683   span->ulscript = UNKNOWN_ULSCRIPT;
 684   span->lang = UNKNOWN_LANGUAGE;
 685   span->truncated = false;
 686
 687   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
 688   if ((kMaxScriptBytes <= byte_length_) &&
 689       (byte_length_ < (2 * kMaxScriptBytes))) {
 690     // Try to split the last two fragments in half
 691     put_soft_limit = byte_length_ / 2;
 692   }
 693
 694   script_buffer_[0] = ' ';  // Always a space at front of output
 695   script_buffer_[1] = '\0';
 696   int take = 0;
 697   int put = 1;              // Start after the initial space
 698   int tlen, plen;
 699
 700   if (byte_length_ <= 0) {
 701     return false;          // No more text to be found
 702   }
 703
 704   // Go over alternating spans of text and tags,
 705   // copying letters to buffer with single spaces for each run of non-letters
 706   bool last_byte_was_space = false;
 707   while (take < byte_length_) {
 708     char c = next_byte_[take];
 709     if (c == '\r') {c = LF;}      // Canonical CR or LF
 710     if (c == '\n') {c = LF;}      // Canonical CR or LF
 711
 712     if (IsSpecial(c) && !is_plain_text_) {
 713       if (c == '<') {
 714         // Replace tag with space
 715         c = ' ';                      // for almost-full test below
 716         // or if <p> <br> <tr>, replace with \n
 717         if (take < (byte_length_ - 3)) {
 718           if (EqCase(next_byte_[take + 1], 'p') &&
 719               NeqLetter(next_byte_[take + 2])) {
 720             c = LF;
 721           }
 722           if (EqCase(next_byte_[take + 1], 'b') &&
 723               EqCase(next_byte_[take + 2], 'r') &&
 724               NeqLetter(next_byte_[take + 3])) {
 725             c = LF;
 726           }
 727           if (EqCase(next_byte_[take + 1], 't') &&
 728               EqCase(next_byte_[take + 2], 'r') &&
 729               NeqLetter(next_byte_[take + 3])) {
 730             c = LF;
 731           }
 732         }
 733         // Begining of tag; skip to end and go around again
 734         tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
 735                                     exit_state_);
 736         // Copy one byte, compressing spaces
 737         if (!last_byte_was_space || !WS(c)) {
 738           script_buffer_[put++] = c;      // Advance dest
 739           last_byte_was_space = WS(c);
 740         }
 741       } else if (c == '>') {
 742         // Unexpected end of tag; copy it and go around again
 743         tlen = 1;         // Over the >
 744         script_buffer_[put++] = c;    // Advance dest
 745       } else if (c == '&') {
 746         // Expand entity, no advance
 747         EntityToBuffer(next_byte_ + take, byte_length_ - take,
 748                        script_buffer_ + put, &tlen, &plen);
 749         put += plen;                  // Advance dest
 750       }
 751       take += tlen;                   // Advance source
 752     } else {
 753       // Copy one byte, compressing spaces
 754       if (!last_byte_was_space || !WS(c)) {
 755         script_buffer_[put++] = c;      // Advance dest
 756         last_byte_was_space = WS(c);
 757       }
 758       ++take;                         // Advance source
 759     }
 760
 761     if (WS(c) &&
 762         (put >= put_soft_limit)) {
 763       // Buffer is almost full
 764       span->truncated = true;
 765       break;
 766     }
 767     if (put >= kMaxScriptBytes) {
 768       // Buffer is completely full
 769       span->truncated = true;
 770       break;
 771     }
 772   }
 773
 774   // Almost done. Back up to a character boundary if needed
 775   while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
 776     // Back up over continuation byte
 777     --take;
 778     --put;
 779   }
 780
 781   // Update input position
 782   next_byte_ += take;
 783   byte_length_ -= take;
 784
 785   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
 786   //                          kMaxScriptBytes |   | put
 787   script_buffer_[put + 0] = ' ';
 788   script_buffer_[put + 1] = ' ';
 789   script_buffer_[put + 2] = ' ';
 790   script_buffer_[put + 3] = '\0';
 791
 792   span->text_bytes = put;       // Does not include the last four chars above
 793   return true;
 794 }
 795
 796
 797 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
 798 // Buffer ALWAYS has leading space and trailing space space space NUL
     // when true is returned.
     //
     // Fills in *span (text, text_bytes, offset, ulscript, lang, truncated),
     // advances next_byte_/byte_length_ past the input that was consumed, and
     // rebuilds map2original_ for the new span.
     // Returns false when no letter is left in the input; on that path only
     // the leading space and one NUL have been written to script_buffer_ and
     // span->text_bytes is 0.
     // If letters_marks_only_ is false, the whole job is handed to
     // GetOneTextSpan instead.
 799 bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
 800   if (!letters_marks_only_) {
 801     // Return non-tag text, including punctuation and digits
 802     return GetOneTextSpan(span);
 803   }
 804
     // Start from an empty span at the current input position
 805   span->text = script_buffer_;
 806   span->text_bytes = 0;
 807   span->offset = next_byte_ - start_byte_;
 808   span->ulscript = UNKNOWN_ULSCRIPT;
 809   span->lang = UNKNOWN_LANGUAGE;
 810   span->truncated = false;
 811
 812   // struct timeval script_start, script_mid, script_end;
 813
     // put_soft_limit is checked once per letter-run/non-letter-run pair (at
     // the bottom of the outer loop); kMaxScriptBytes is the hard limit checked
     // after every letter.
 814   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
 815   if ((kMaxScriptBytes <= byte_length_) &&
 816       (byte_length_ < (2 * kMaxScriptBytes))) {
 817     // Try to split the last two fragments in half
 818     put_soft_limit = byte_length_ / 2;
 819   }
 820
 821
 822   int spanscript;           // The script of this span
 823   int sc = UNKNOWN_ULSCRIPT;  // The script of next character
     // tlen = source bytes to take, plen = buffer bytes to put, for the
     // current character or entity
 824   int tlen = 0;
 825   int plen = 0;
 826
 827   script_buffer_[0] = ' ';  // Always a space at front of output
 828   script_buffer_[1] = '\0';
 829   int take = 0;
 830   int put = 1;              // Start after the initial space
 831
 832   // Build offsets from span->text back to start_byte_ + span->offset
 833   // This mapping reflects deletion of non-letters, expansion of
 834   // entities, etc.
 835   map2original_.Clear();
 836   map2original_.Delete(span->offset);   // So that MapBack(0) gives offset
 837
 838   // Get to the first real non-tag letter or entity that is a letter
     // spanscript is filled in by SkipToFrontOfSpan
 839   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
 840   next_byte_ += skip;
 841   byte_length_ -= skip;
 842
     // Account for the skipped input and for the leading space written above:
     // exactly one skipped byte is recorded as a 1:1 copy, anything else as a
     // delete of skip bytes plus one inserted byte.
 843   if (skip != 1) {
 844     map2original_.Delete(skip);
 845     map2original_.Insert(1);
 846   } else {
 847     map2original_.Copy(1);
 848   }
 849   if (byte_length_ <= 0) {
 850     map2original_.Reset();
 851     return false;               // No more letters to be found
 852   }
 853
 854   // There is at least one letter, so we know the script for this span
 855   span->ulscript = (ULScript)spanscript;
 856
 857
 858   // Go over alternating spans of same-script letters and non-letters,
 859   // copying letters to buffer with single spaces for each run of non-letters
 860   while (take < byte_length_) {
 861     // Copy run of letters in same script (&LS | LS)*
     // NOTE(review): letter_count is incremented below but never read in this
     // function.
 862     int letter_count = 0;              // Keep track of word length
 863     bool need_break = false;
 864
 865     while (take < byte_length_) {
 866       // We are at a letter, nonletter, tag, or entity
 867       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
 868         if (next_byte_[take] == '<') {
 869           // Beginning of tag
 870           sc = 0;
 871           break;
 872         } else if (next_byte_[take] == '>') {
 873           // Unexpected end of tag
 874           sc = 0;
 875           break;
 876         } else if (next_byte_[take] == '&') {
 877           // Copy entity, no advance
     // The expansion is written at script_buffer_ + put; tlen and plen
     // come back through the out-parameters and are applied at Advance, below
 878           EntityToBuffer(next_byte_ + take, byte_length_ - take,
 879                          script_buffer_ + put, &tlen, &plen);
 880           sc = GetUTF8LetterScriptNum(script_buffer_ + put);
 881         }
     // NOTE(review): a special byte other than < > & would leave tlen,
     // plen and sc at their previous values; presumably IsSpecial is
     // true only for those three -- confirm.
 882       } else {
 883         // Real letter, safely copy up to 4 bytes, increment by 1..4
 884         // Will update by 1..4 bytes at Advance, below
 885         tlen = plen = UTF8OneCharLen(next_byte_ + take);
 886         if (take < (byte_length_ - 3)) {
 887           // X86 fast case, does unaligned load/store
     // Only plen of the stored bytes are kept, because put advances
     // by plen and the next store starts there
 888           UNALIGNED_STORE32(script_buffer_ + put,
 889                             UNALIGNED_LOAD32(next_byte_ + take));
 890
 891         } else {
 892           // Slow case, happens 1-3 times per input document
     // NOTE(review): plen is not compared with the bytes remaining
     // (byte_length_ - take) before this copy -- confirm the input
     // cannot end in a partial character.
 893           memcpy(script_buffer_ + put, next_byte_ + take, plen);
 894         }
 895         sc = GetUTF8LetterScriptNum(next_byte_ + take);
 896       }
 897
 898       // Allow continue across a single letter in a different script:
 899       // A B D = three scripts, c = common script, i = inherited script,
 900       // - = don't care, ( = take position before the += below
 901       //  AAA(A-    continue
 902       //
 903       //  AAA(BA    continue
 904       //  AAA(BB    break
 905       //  AAA(Bc    continue (breaks after B)
 906       //  AAA(BD    break
 907       //  AAA(Bi    break
 908       //
 909       //  AAA(c-    break
 910       //
 911       //  AAA(i-    continue
 912       //
     // The "break" rows for a B letter apply only when one_script_only_ is
     // set; otherwise the letter is kept in this span.
 913
 914       if ((sc != spanscript) && (sc != ULScript_Inherited)) {
 915         // Might need to break this script span
 916         if (sc == ULScript_Common) {
 917           need_break = true;
 918         } else {
 919           // Look at next following character, ignoring entity as Common
     // NOTE(review): take + tlen is not compared with byte_length_
     // before this read, so for the last character of the input it
     // looks at the byte just past the end -- confirm the input is
     // always followed by readable padding.
 920           int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
 921           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
 922             // We found a non-trivial change of script
 923             if (one_script_only_) {
 924               need_break = true;
 925             }
 926           }
 927         }
 928       }
 929       if (need_break) {break;}  // Non-letter or letter in wrong script
 930
 931       take += tlen;                   // Advance
 932       put += plen;                    // Advance
 933
 934       // Update the offset map to reflect take/put lengths
 935       if (tlen == plen) {
 936         map2original_.Copy(tlen);
 937       } else if (tlen < plen) {
 938         map2original_.Copy(tlen);
 939         map2original_.Insert(plen - tlen);
 940       } else {    // plen < tlen
 941         map2original_.Copy(plen);
 942         map2original_.Delete(tlen - plen);
 943       }
 944
 945       ++letter_count;
 946       if (put >= kMaxScriptBytes) {
 947         // Buffer is full
 948         span->truncated = true;
 949         break;
 950       }
 951     }     // End while letters
 952
 953     // Do run of non-letters (tag | &NL | NL)*
     // Nothing is copied to the buffer here; every byte passed over is
     // recorded in the offset map as deleted
 954     while (take < byte_length_) {
 955       // Do fast scan to next interesting byte
 956       tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
 957       take += tlen;
 958       map2original_.Delete(tlen);
 959       if (take >= byte_length_) {break;}    // Might have scanned to end
 960
 961       // We are at a letter, nonletter, tag, or entity
 962       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
 963         if (next_byte_[take] == '<') {
 964           // Beginning of tag; skip to end and go around again
 965           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
 966                                       exit_state_);
 967           sc = 0;
 968         } else if (next_byte_[take] == '>') {
 969           // Unexpected end of tag; skip it and go around again
 970           tlen = 1;         // Over the >
 971           sc = 0;
 972         } else if (next_byte_[take] == '&') {
 973           // Expand entity, no advance
     // If the expansion is a letter, the letter loop expands it again
     // on the next pass and does the advance
 974           EntityToBuffer(next_byte_ + take, byte_length_ - take,
 975                          script_buffer_ + put, &tlen, &plen);
 976           sc = GetUTF8LetterScriptNum(script_buffer_ + put);
 977         }
 978       } else {
 979         // Update 1..4
 980         tlen = UTF8OneCharLen(next_byte_ + take);
 981         sc = GetUTF8LetterScriptNum(next_byte_ + take);
 982       }
 983       if (sc != 0) {break;}           // Letter found
 984       take += tlen;                   // Else advance
 985       map2original_.Delete(tlen);
 986     }     // End while not-letters
 987
     // One space stands in for the whole run of non-letters
 988     script_buffer_[put++] = ' ';
 989     map2original_.Insert(1);
 990
 991     // Letter in wrong script ?
 992     if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
 993     if (put >= put_soft_limit) {
 994       // Buffer is almost full
 995       span->truncated = true;
 996       break;
 997     }
 998   }
 999
1000   // Almost done. Back up to a character boundary if needed
     // NOTE(review): put is stepped back by the same count as take without
     // looking at what was written, and map2original_ is not adjusted --
     // confirm this is intended.
1001   while ((0 < take) && (take < byte_length_) &&
1002          ((next_byte_[take] & 0xc0) == 0x80)) {
1003     // Back up over continuation byte
1004     --take;
1005     --put;
1006   }
1007
1008   // Update input position
1009   next_byte_ += take;
1010   byte_length_ -= take;
1011
1012   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
1013   //                          kMaxScriptBytes |   | put
1014   script_buffer_[put + 0] = ' ';
1015   script_buffer_[put + 1] = ' ';
1016   script_buffer_[put + 2] = ' ';
1017   script_buffer_[put + 3] = '\0';
     // The four pad bytes have no source bytes behind them
1018   map2original_.Insert(4);
1019   map2original_.Reset();
1020
1021   span->text_bytes = put;       // Does not include the last four chars above
1022   return true;
1023 }
1024
1025 // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
1026 // List changes with each version of Unicode, so just always lowercase
1027 // Unicode 6.2.0:
1028 //   ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
1029 void ScriptScanner::LowerScriptSpan(LangSpan* span) {
1030   // If needed, lowercase all the text. If we do it sooner, might miss
1031   // lowercasing an entity such as &Aacute;
1032   // We only need to do this for Latn and Cyrl scripts
1033   map2uplow_.Clear();
1034   // Full Unicode lowercase of the entire buffer, including
1035   // four pad bytes off the end.
1036   // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
1037   // bytes and put the 0x00 in explicitly.
1038   // Build an offset map from script_buffer_lower_ back to script_buffer_
1039   int consumed, filled, changed;
1040   StringPiece istr(span->text, span->text_bytes + 3);
1041   StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
1042
1043   UTF8GenericReplace(&utf8repl_lettermarklower_obj,
1044                             istr, ostr, is_plain_text_,
1045                             &consumed, &filled, &changed, &map2uplow_);
1046   script_buffer_lower_[filled] = '\0';
1047   span->text = script_buffer_lower_;
1048   span->text_bytes = filled - 3;
1049   map2uplow_.Reset();
1050 }
1051
1052 // Copy next run of same-script non-tag letters to buffer [NUL terminated],
1053 // lowercased. On success buffer has leading space and trailing space space
1054 // space NUL. A failed scan has no pad bytes to lowercase, so it is skipped.
1055 bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
1056   const bool ok = GetOneScriptSpan(span);
1057   if (ok) {LowerScriptSpan(span);}  // It reads text_bytes + 3 bytes of text
1058   return ok;                        // On false, *span is left un-lowercased
1059 }
1060
1061
1062 // Maps byte offset in most recent GetOneScriptSpan/Lower
1063 // span->text [0..text_bytes] into an additional byte offset from
1064 // span->offset, to get back to corresponding text in the original
1065 // input buffer.
1066 // text_offset must be the first byte
1067 // of a UTF-8 character, or just beyond the last character. Normally this
1068 // routine is called with the first byte of an interesting range and
1069 // again with the first byte of the following range.
     // The offset is translated in two steps: first through map2uplow_ (the
     // map built while lowercasing), then through map2original_ (the map built
     // while copying the span out of the input).
     // NOTE(review): in the code visible here map2uplow_ is rebuilt only by
     // LowerScriptSpan; after a plain GetOneScriptSpan call it is not touched --
     // confirm what MapBack should return in that case.
1070 int ScriptScanner::MapBack(int text_offset) {
1071   return map2original_.MapBack(map2uplow_.MapBack(text_offset));
1072 }
1073
1074
1075 // Looks up the lscript number of the UTF-8 character starting at src.
1076 // Non-letters always give 0 (common script).
1077 int GetUTF8LetterScriptNum(const char* src) {
1078   const uint8* cursor = reinterpret_cast<const uint8*>(src);
1079   int char_len = UTF8OneCharLen(src);   // Byte length of this one character
1080   return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
1081                                     &cursor, &char_len);
1082 }
1083
1084 }  // namespace CLD2
1085
1086