1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
20 #include "getonescriptspan.h"
23 #include "fixunicodevalue.h"
24 #include "lang_script.h"
26 #include "utf8statetable.h"
28 #include "utf8prop_lettermarkscriptnum.h"
29 #include "utf8repl_lettermarklower.h"
30 #include "utf8scannot_lettermarkspecial.h"
35 // Entity-name table used by LookupEntity(). Kept in alphabetical order so it
36 // can be binary searched; the definitions live in generated_entities.cc
37 extern const int kNameToEntitySize;
38 extern const CharIntPair kNameToEntity[];
// NOTE(review): the trailing comments on these two constants are truncated in
// this listing, and neither constant is referenced by the code visible here --
// confirm they are still needed.
40 static const int kMaxUpToWordBoundary = 50; // span < this make longer,
42 static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes
43 // to round to word boundary,
// Lookup table indexed by byte value. Nonzero only for the three characters
// that are special in HTML: '&' (0x26), '<' (0x3C) and '>' (0x3E).
static const char kSpecialSymbol[256] = {       // true for < > &
  // 0x00 - 0x1F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x20 - 0x3F: '&' at 0x26, '<' at 0x3C, '>' at 0x3E
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  // 0x40 - 0x7F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  // 0x80 - 0xFF
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
76 #define CR 16 // <cr> or <lf>
77 #define NL 17 // non-letter: ASCII whitespace, digit, punctuation
78 #define PL 18 // possible letter, incl. &
79 #define xx 19 // <unused>
81 // Map byte to one of ~20 interesting categories for cheap tag parsing
82 static const uint8 kCharToSub[256] = {
83 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
84 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
85 NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
86 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
88 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
89 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
90 PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
91 P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
93 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
94 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
95 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
96 NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
98 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
99 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
100 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
101 PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
// Highest tag-parse state that ends the scan when only letters/marks are
// wanted: arriving at state [0] or [1] exits the state machine.
130 static const int kMaxExitStateLettersMarksOnly = 1;
// Highest tag-parse state that ends the scan when any non-tag text is wanted:
// state [2] exits as well.
131 static const int kMaxExitStateAllText = 2;
134 // State machine to do cheap parse of non-letter strings incl. tags
137 // advances <tag> ... </tag> for <script> <style>
139 // advances <!-- ... <tag> ... -->
143 // advances <tag <tag2>
146 // We start in state [0] at a non-letter and make at least one transition
147 // When scanning for just letters, arriving back at state [0] or [1] exits
148 // the state machine.
149 // When scanning for any non-tag text, arriving at state [2] also exits
150 static const uint8 kTagParseTbl_0[] = {
151 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
152 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state
153 X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
154 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state]
155 X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
156 X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
157 X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
158 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
159 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
160 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
161 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
162 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
163 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
164 X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
166 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
167 X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
168 X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
169 X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
170 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
171 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
172 X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
173 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
174 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
175 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
176 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
177 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
178 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
179 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
180 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
181 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
183 // < > ! - " ' / S C R I P T Y L E CR NL PL xx
184 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
185 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
186 X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
187 X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
188 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
189 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
190 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
191 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
192 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
193 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
194 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
195 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
// Constants for the rune <-> UTF-8 conversion below.
enum
{
  UTFmax        = 4,            // maximum bytes per rune
  Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)
  Runeself      = 0x80,         // rune and UTF sequences are the same (<)
  Runeerror     = 0xFFFD,       // decoding error in UTF
  Runemax       = 0x10FFFF,     // maximum rune value
};
210 // Debugging. Not thread safe.
211 static char gDisplayPiece[32];
212 const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
213 char* DisplayPiece(const char* next_byte_, int byte_length_) {
214 // Copy up to 8 UTF-8 chars to buffer
215 int k = 0; // byte count
216 int n = 0; // character count
217 for (int i = 0; i < byte_length_; ++i) {
218 char c = next_byte_[i];
219 if ((c & 0xc0) != 0x80) {
220 // Beginning of a UTF-8 character
221 int charlen = gCharlen[static_cast<uint8>(c) >> 4];
222 if (i + charlen > byte_length_) {break;} // Not enough room for full char
223 if (k >= (32 - 7)) {break;} // Not necessarily enough room
224 if (n >= 8) {break;} // Enough characters already
228 memcpy(&gDisplayPiece[k], "<", 4); k += 4;
229 } else if (c == '>') {
230 memcpy(&gDisplayPiece[k], ">", 4); k += 4;
231 } else if (c == '&') {
232 memcpy(&gDisplayPiece[k], "&", 5); k += 5;
233 } else if (c == '\'') {
234 memcpy(&gDisplayPiece[k], "'", 6); k += 6;
235 } else if (c == '"') {
236 memcpy(&gDisplayPiece[k], """, 6); k += 6;
238 gDisplayPiece[k++] = c;
241 gDisplayPiece[k++] = '\0';
242 return gDisplayPiece;
247 // runetochar copies (encodes) one rune, pointed to by r, to at most
248 // UTFmax bytes starting at s and returns the number of bytes generated.
249 int runetochar(char *str, const char32 *rune) {
250 // Convert to unsigned for range check.
262 str[0] = 0xC0 | (c >> 1*6);
263 str[1] = 0x80 | (c & 0x3F);
274 str[0] = 0xE0 | (c >> 2*6);
275 str[1] = 0x80 | ((c >> 1*6) & 0x3F);
276 str[2] = 0x80 | (c & 0x3F);
280 // 4 char 10000-1FFFFF
281 str[0] = 0xF0 | (c >> 3*6);
282 str[1] = 0x80 | ((c >> 2*6) & 0x3F);
283 str[2] = 0x80 | ((c >> 1*6) & 0x3F);
284 str[3] = 0x80 | (c & 0x3F);
290 // Useful for converting an entity to an ascii value.
291 // RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ;
292 int LookupEntity(const char* entity_name, int entity_len) {
294 if (entity_len >= 16) {return -1;} // All real entities are shorter
296 memcpy(temp, entity_name, entity_len);
297 temp[entity_len] = '\0';
298 int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
299 if (match >= 0) {return kNameToEntity[match].i;}
// Returns true for the ASCII decimal digits '0'..'9' only.
bool ascii_isdigit(char c) {
  return ('0' <= c) && (c <= '9');
}
// Returns true for the ASCII hexadecimal digits 0-9, a-f and A-F only.
bool ascii_isxdigit(char c) {
  if (('0' <= c) && (c <= '9')) {return true;}
  if (('a' <= c) && (c <= 'f')) {return true;}
  if (('A' <= c) && (c <= 'F')) {return true;}
  return false;
}
// Returns true for the ASCII letters a-z, A-Z and digits 0-9 only.
bool ascii_isalnum(char c) {
  if (('0' <= c) && (c <= '9')) {return true;}
  if (('a' <= c) && (c <= 'z')) {return true;}
  if (('A' <= c) && (c <= 'Z')) {return true;}
  return false;
}
// Returns the value 0..15 of an ASCII hexadecimal digit, or 0 when c is not
// a hexadecimal digit.
int hex_digit_to_int(char c) {
  if (('0' <= c) && (c <= '9')) {return c - '0';}
  if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;}
  if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;}
  return 0;
}
325 static int32 strto32_base10(const char* nptr, const char* limit,
326 const char **endptr) {
328 while (nptr < limit && *nptr == '0') {
331 if (nptr == limit || !ascii_isdigit(*nptr))
333 const char* end_digits_run = nptr;
334 while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
337 *endptr = end_digits_run;
338 const int num_digits = end_digits_run - nptr;
339 // kint32max == 2147483647.
340 if (num_digits < 9 ||
341 (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
343 for (; nptr < end_digits_run; ++nptr) {
345 value += *nptr - '0';
347 // Overflow past the last valid unicode codepoint
348 // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
349 return FixUnicodeValue(value);
351 // Overflow: can't fit in an int32;
352 // returns the replacement character 0xFFFD.
357 static int32 strto32_base16(const char* nptr, const char* limit,
358 const char **endptr) {
360 while (nptr < limit && *nptr == '0') {
363 if (nptr == limit || !ascii_isxdigit(*nptr)) {
366 const char* end_xdigits_run = nptr;
367 while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
370 *endptr = end_xdigits_run;
371 const int num_xdigits = end_xdigits_run - nptr;
372 // kint32max == 0x7FFFFFFF.
373 if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
375 for (; nptr < end_xdigits_run; ++nptr) {
377 value += hex_digit_to_int(*nptr);
379 // Overflow past the last valid unicode codepoint
380 // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
381 return FixUnicodeValue(value);
383 // Overflow: can't fit in an int32;
384 // returns the replacement character 0xFFFD.
389 // Unescape the current character pointed to by src. SETS the number
390 // of chars read for the conversion (in UTF8). If src isn't a valid entity,
391 // just consume the & and RETURN -1. If src doesn't point to & -- which it
392 // should -- set src_consumed to 0 and RETURN -1.
393 int ReadEntity(const char* src, int srcn, int* src_consumed) {
394 const char* const srcend = src + srcn;
396 if (srcn == 0 || *src != '&') { // input should start with an ampersand
400 *src_consumed = 1; // we'll get the & at least
402 // The standards are a bit unclear on when an entity ends. Certainly a ";"
403 // ends one, but spaces probably do too. We follow the lead of both IE and
404 // Netscape, which as far as we can tell end numeric entities (1st case below)
405 // at any non-digit, and end character entities (2nd case) at any non-alnum.
406 const char* entstart, *entend; // where the entity starts and ends
407 entstart = src + 1; // read past the &
408 int entval; // UCS2 value of the entity
409 if ( *entstart == '#' ) { // -- 1st case: numeric entity
410 if ( entstart + 2 >= srcend ) {
411 return -1; // no way a legitimate number could fit
412 } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric
413 entval = strto32_base16(entstart + 2, srcend, &entend);
414 } else { // decimal numeric entity
415 entval = strto32_base10(entstart+1, srcend, &entend);
417 if (entval == -1 || entend > srcend) {
418 return -1; // not entirely correct, but close enough
420 } else { // -- 2nd case: character entity
421 for (entend = entstart;
422 entend < srcend && ascii_isalnum(*entend);
424 // entity consists of alphanumeric chars
426 entval = LookupEntity(entstart, entend - entstart);
428 return -1; // not a legal entity name
430 // Now we do a strange-seeming IE6-compatibility check: if entval is
431 // >= 256, it *must* be followed by a semicolon or it's not considered
432 // an entity. The problem is lots of the newfangled entity names, like
433 // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
434 // When these links are written in HTML, it would be really bad if the
435 // "&lang" were treated as an entity, which is what the spec says
436 // *should* happen (even when the HTML is inside an "A HREF" tag!)
437 // IE ignores the spec for these new, high-value entities, so we do too.
438 if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
439 return -1; // make non-;-terminated entity illegal
443 // Finally, figure out how much src was consumed
444 if ( entend < srcend && *entend == ';' ) {
445 entend++; // standard says ; terminator is special
447 *src_consumed = entend - src;
453 // Writes entity value to dst. Returns take(src), put(dst) byte counts
454 void EntityToBuffer(const char* src, int len, char* dst,
455 int* tlen, int* plen) {
456 char32 entval = ReadEntity(src, len, tlen);
458 // ReadEntity does this already: entval = FixUnicodeValue(entval);
460 // Convert UTF-32 to UTF-8
462 *plen = runetochar(dst, &entval);
464 // Illegal entity; ignore the '&'
470 // Returns true if character is < > or &, none of which are letters
471 bool inline IsSpecial(char c) {
472 if ((c & 0xe0) == 0x20) {
473 return kSpecialSymbol[static_cast<uint8>(c)];
478 // Quick Skip to next letter or < > & or to end of string (eos)
479 // Always return is_letter for eos
480 int ScanToLetterOrSpecial(const char* src, int len) {
482 StringPiece str(src, len);
483 UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
484 return bytes_consumed;
490 // src points to non-letter, such as tag-opening '<'
491 // Return length from here to next possible letter
492 // On another < before >, return 1
495 // advances <tag> ... </tag> for <script> <style>
497 // advances <!-- ... <tag> ... -->
501 // advances <tag <tag2>
503 int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
504 const uint8* src = reinterpret_cast<const uint8*>(isrc);
505 const uint8* srclimit = src + len;
506 const uint8* tagParseTbl = kTagParseTbl_0;
508 while (src < srclimit) {
509 e = tagParseTbl[kCharToSub[*src++]];
510 if (e <= max_exit_state) {
511 // We overshot by one byte
515 tagParseTbl = &kTagParseTbl_0[e * 20];
518 if (src >= srclimit) {
519 // We fell off the end of the text.
520 // It looks like the most common case for this is a truncated file, not
521 // mismatched angle brackets. So we pretend that the last char was '>'
525 // OK to be in state 0 or state 2 at exit
526 if ((e != 0) && (e != 2)) {
527 // Error, '<' followed by '<'
528 // We want to back up to first <, then advance by one byte past it
529 int offset = src - reinterpret_cast<const uint8*>(isrc);
531 // Backscan to first '<' and return enough length to just get past it
532 --offset; // back up over the second '<', which caused us to stop
533 while ((0 < offset) && (isrc[offset] != '<')) {
534 // Find the first '<', which is unmatched
537 // skip to just beyond first '<'
541 return src - reinterpret_cast<const uint8*>(isrc);
545 ScriptScanner::ScriptScanner(const char* buffer,
548 : start_byte_(buffer),
550 next_byte_limit_(buffer + buffer_length),
551 byte_length_(buffer_length),
552 is_plain_text_(is_plain_text),
553 letters_marks_only_(true),
554 one_script_only_(true),
555 exit_state_(kMaxExitStateLettersMarksOnly) {
556 script_buffer_ = new char[kMaxScriptBuffer];
557 script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
558 map2original_.Clear(); // map from script_buffer_ to buffer
559 map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
562 // Extended version to allow spans of any non-tag text and spans of mixed script
563 ScriptScanner::ScriptScanner(const char* buffer,
568 : start_byte_(buffer),
570 next_byte_limit_(buffer + buffer_length),
571 byte_length_(buffer_length),
572 is_plain_text_(is_plain_text),
573 letters_marks_only_(!any_text),
574 one_script_only_(!any_script),
575 exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
576 script_buffer_ = new char[kMaxScriptBuffer];
577 script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
578 map2original_.Clear(); // map from script_buffer_ to buffer
579 map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
583 ScriptScanner::~ScriptScanner() {
584 delete[] script_buffer_;
585 delete[] script_buffer_lower_;
591 // Get to the first real non-tag letter or entity that is a letter
592 // Sets script of that letter
593 // Return len if no more letters
594 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
595 int sc = UNKNOWN_ULSCRIPT;
599 // Do run of non-letters (tag | &NL | NL)*
602 // Do fast scan to next interesting byte
603 // int oldskip = skip;
604 skip += ScanToLetterOrSpecial(src + skip, len - skip);
606 // Check for no more letters/specials
613 // We are at a letter, nonletter, tag, or entity
614 if (IsSpecial(src[skip]) && !is_plain_text_) {
615 if (src[skip] == '<') {
616 // Begining of tag; skip to end and go around again
617 tlen = ScanToPossibleLetter(src + skip, len - skip,
620 } else if (src[skip] == '>') {
621 // Unexpected end of tag; skip it and go around again
622 tlen = 1; // Over the >
624 } else if (src[skip] == '&') {
625 // Expand entity, no advance
627 EntityToBuffer(src + skip, len - skip,
629 sc = GetUTF8LetterScriptNum(temp);
633 tlen = UTF8OneCharLen(src + skip);
634 sc = GetUTF8LetterScriptNum(src + skip);
636 if (sc != 0) {break;} // Letter found
637 skip += tlen; // Else advance
// These are for ASCII-only tag names
// Compare one letter uplow to c, ignoring case of uplow.
// c must be the lowercase letter to match.
inline bool EqCase(char uplow, char c) {
  return (uplow | 0x20) == c;
}
// These are for ASCII-only tag names
// Return true for space / < > etc. all less than 0x40
inline bool NeqLetter(char c) {
  return c < 0x40;
}
// These are for ASCII-only tag names
// Return true for space \n false for \r
inline bool WS(char c) {
  return (c == ' ') || (c == '\n');
}
663 // Canonical line break: input \r and \n are both replaced by this character
664 static const char LF = '\n';
667 // The naive loop scans from next_byte_ to script_buffer_ until full.
668 // But this can leave an awkward hard-to-identify short fragment at the
669 // end of the input. We would prefer to make the next-to-last fragment
670 // shorter and the last fragment longer.
672 // Copy next run of non-tag characters to buffer [NUL terminated]
673 // This just replaces tags with space or \n and expands entities.
674 // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
675 // including \r or \n are replaced by \n. All other tags and skipped text
676 // are replaced with ASCII space.
678 // Buffer ALWAYS has leading space and trailing space space space NUL
// Returns false when no input bytes remain. span->text points at
// script_buffer_, which the next call overwrites.
// NOTE(review): this listing of the function appears to have lost lines
// (e.g. the declarations of take/tlen/plen and several closing braces); the
// comments describe only the statements that are visible.
679 bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
680 span->text = script_buffer_;
681 span->text_bytes = 0;
682 span->offset = next_byte_ - start_byte_;
683 span->ulscript = UNKNOWN_ULSCRIPT;
684 span->lang = UNKNOWN_LANGUAGE;
685 span->truncated = false;
// Soft limit on output bytes: once reached, the span is marked truncated.
687 int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
688 if ((kMaxScriptBytes <= byte_length_) &&
689 (byte_length_ < (2 * kMaxScriptBytes))) {
690 // Try to split the last two fragments in half
691 put_soft_limit = byte_length_ / 2;
694 script_buffer_[0] = ' '; // Always a space at front of output
695 script_buffer_[1] = '\0';
697 int put = 1; // Start after the initial space
700 if (byte_length_ <= 0) {
701 return false; // No more text to be found
704 // Go over alternating spans of text and tags,
705 // copying letters to buffer with single spaces for each run of non-letters
706 bool last_byte_was_space = false;
707 while (take < byte_length_) {
708 char c = next_byte_[take];
709 if (c == '\r') {c = LF;} // Canonical CR or LF
710 if (c == '\n') {c = LF;} // Canonical CR or LF
// Tags and entities are only interpreted when the input is not plain text
712 if (IsSpecial(c) && !is_plain_text_) {
714 // Replace tag with space
715 c = ' '; // for almost-full test below
716 // or if <p> <br> <tr>, replace with \n
// The guard keeps the look-ahead reads of take + 1 .. take + 3 in range
717 if (take < (byte_length_ - 3)) {
718 if (EqCase(next_byte_[take + 1], 'p') &&
719 NeqLetter(next_byte_[take + 2])) {
722 if (EqCase(next_byte_[take + 1], 'b') &&
723 EqCase(next_byte_[take + 2], 'r') &&
724 NeqLetter(next_byte_[take + 3])) {
727 if (EqCase(next_byte_[take + 1], 't') &&
728 EqCase(next_byte_[take + 2], 'r') &&
729 NeqLetter(next_byte_[take + 3])) {
733 // Beginning of tag; skip to end and go around again
734 tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
736 // Copy one byte, compressing spaces
737 if (!last_byte_was_space || !WS(c)) {
738 script_buffer_[put++] = c; // Advance dest
739 last_byte_was_space = WS(c);
741 } else if (c == '>') {
742 // Unexpected end of tag; copy it and go around again
743 tlen = 1; // Over the >
744 script_buffer_[put++] = c; // Advance dest
745 } else if (c == '&') {
746 // Expand entity, no advance
747 EntityToBuffer(next_byte_ + take, byte_length_ - take,
748 script_buffer_ + put, &tlen, &plen);
749 put += plen; // Advance dest
751 take += tlen; // Advance source
753 // Copy one byte, compressing spaces
754 if (!last_byte_was_space || !WS(c)) {
755 script_buffer_[put++] = c; // Advance dest
756 last_byte_was_space = WS(c);
758 ++take; // Advance source
762 (put >= put_soft_limit)) {
763 // Buffer is almost full
764 span->truncated = true;
767 if (put >= kMaxScriptBytes) {
768 // Buffer is completely full
769 span->truncated = true;
774 // Almost done. Back up to a character boundary if needed
775 while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
776 // Back up over continuation byte
781 // Update input position
783 byte_length_ -= take;
785 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
786 // kMaxScriptBytes | | put
787 script_buffer_[put + 0] = ' ';
788 script_buffer_[put + 1] = ' ';
789 script_buffer_[put + 2] = ' ';
790 script_buffer_[put + 3] = '\0';
792 span->text_bytes = put; // Does not include the last four chars above
797 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
798 // Buffer ALWAYS has leading space and trailing space space space NUL
// Returns false when no more letters are found. span->text points at
// script_buffer_, which the next call overwrites; map2original_ is rebuilt on
// every call.
// NOTE(review): this listing of the function appears to have lost lines
// (e.g. the declarations of take/tlen/plen, several assignments and closing
// braces); the comments describe only the statements that are visible.
799 bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
800 if (!letters_marks_only_) {
801 // Return non-tag text, including punctuation and digits
802 return GetOneTextSpan(span);
805 span->text = script_buffer_;
806 span->text_bytes = 0;
807 span->offset = next_byte_ - start_byte_;
808 span->ulscript = UNKNOWN_ULSCRIPT;
809 span->lang = UNKNOWN_LANGUAGE;
810 span->truncated = false;
812 // struct timeval script_start, script_mid, script_end;
// Soft limit on output bytes: once reached, the span is marked truncated.
814 int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
815 if ((kMaxScriptBytes <= byte_length_) &&
816 (byte_length_ < (2 * kMaxScriptBytes))) {
817 // Try to split the last two fragments in half
818 put_soft_limit = byte_length_ / 2;
822 int spanscript; // The script of this span
823 int sc = UNKNOWN_ULSCRIPT; // The script of next character
827 script_buffer_[0] = ' '; // Always a space at front of output
828 script_buffer_[1] = '\0';
830 int put = 1; // Start after the initial space
832 // Build offsets from span->text back to start_byte_ + span->offset
833 // This mapping reflects deletion of non-letters and expansion of entities
835 map2original_.Clear();
836 map2original_.Delete(span->offset); // So that MapBack(0) gives offset
838 // Get to the first real non-tag letter or entity that is a letter
839 int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
841 byte_length_ -= skip;
844 map2original_.Delete(skip);
845 map2original_.Insert(1);
847 map2original_.Copy(1);
849 if (byte_length_ <= 0) {
850 map2original_.Reset();
851 return false; // No more letters to be found
854 // There is at least one letter, so we know the script for this span
855 span->ulscript = (ULScript)spanscript;
858 // Go over alternating spans of same-script letters and non-letters,
859 // copying letters to buffer with single spaces for each run of non-letters
860 while (take < byte_length_) {
861 // Copy run of letters in same script (&LS | LS)*
862 int letter_count = 0; // Keep track of word length
863 bool need_break = false;
865 while (take < byte_length_) {
866 // We are at a letter, nonletter, tag, or entity
867 if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
868 if (next_byte_[take] == '<') {
872 } else if (next_byte_[take] == '>') {
873 // Unexpected end of tag
876 } else if (next_byte_[take] == '&') {
877 // Copy entity, no advance
878 EntityToBuffer(next_byte_ + take, byte_length_ - take,
879 script_buffer_ + put, &tlen, &plen);
880 sc = GetUTF8LetterScriptNum(script_buffer_ + put);
883 // Real letter, safely copy up to 4 bytes, increment by 1..4
884 // Will update by 1..4 bytes at Advance, below
885 tlen = plen = UTF8OneCharLen(next_byte_ + take);
// The 4-byte load below needs at least four source bytes remaining
886 if (take < (byte_length_ - 3)) {
887 // X86 fast case, does unaligned load/store
888 UNALIGNED_STORE32(script_buffer_ + put,
889 UNALIGNED_LOAD32(next_byte_ + take));
892 // Slow case, happens 1-3 times per input document
893 memcpy(script_buffer_ + put, next_byte_ + take, plen);
895 sc = GetUTF8LetterScriptNum(next_byte_ + take);
898 // Allow continue across a single letter in a different script:
899 // A B D = three scripts, c = common script, i = inherited script,
900 // - = don't care, ( = take position before the += below
905 // AAA(Bc continue (breaks after B)
914 if ((sc != spanscript) && (sc != ULScript_Inherited)) {
915 // Might need to break this script span
916 if (sc == ULScript_Common) {
919 // Look at next following character, ignoring entity as Common
// NOTE(review): this reads at take + tlen with no visible bounds check
// against byte_length_ -- confirm the input is suitably padded.
920 int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
921 if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
922 // We found a non-trivial change of script
923 if (one_script_only_) {
929 if (need_break) {break;} // Non-letter or letter in wrong script
931 take += tlen; // Advance
932 put += plen; // Advance
934 // Update the offset map to reflect take/put lengths
936 map2original_.Copy(tlen);
937 } else if (tlen < plen) {
938 map2original_.Copy(tlen);
939 map2original_.Insert(plen - tlen);
940 } else { // plen < tlen
941 map2original_.Copy(plen);
942 map2original_.Delete(tlen - plen);
946 if (put >= kMaxScriptBytes) {
948 span->truncated = true;
951 } // End while letters
953 // Do run of non-letters (tag | &NL | NL)*
954 while (take < byte_length_) {
955 // Do fast scan to next interesting byte
956 tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
958 map2original_.Delete(tlen);
959 if (take >= byte_length_) {break;} // Might have scanned to end
961 // We are at a letter, nonletter, tag, or entity
962 if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
963 if (next_byte_[take] == '<') {
964 // Beginning of tag; skip to end and go around again
965 tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
968 } else if (next_byte_[take] == '>') {
969 // Unexpected end of tag; skip it and go around again
970 tlen = 1; // Over the >
972 } else if (next_byte_[take] == '&') {
973 // Expand entity, no advance
974 EntityToBuffer(next_byte_ + take, byte_length_ - take,
975 script_buffer_ + put, &tlen, &plen);
976 sc = GetUTF8LetterScriptNum(script_buffer_ + put);
980 tlen = UTF8OneCharLen(next_byte_ + take);
981 sc = GetUTF8LetterScriptNum(next_byte_ + take);
983 if (sc != 0) {break;} // Letter found
984 take += tlen; // Else advance
985 map2original_.Delete(tlen);
986 } // End while not-letters
// One space in the output stands for the whole run of non-letters
988 script_buffer_[put++] = ' ';
989 map2original_.Insert(1);
991 // Letter in wrong script ?
992 if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
993 if (put >= put_soft_limit) {
994 // Buffer is almost full
995 span->truncated = true;
1000 // Almost done. Back up to a character boundary if needed
1001 while ((0 < take) && (take < byte_length_) &&
1002 ((next_byte_[take] & 0xc0) == 0x80)) {
1003 // Back up over continuation byte
1008 // Update input position
1010 byte_length_ -= take;
1012 // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
1013 // kMaxScriptBytes | | put
1014 script_buffer_[put + 0] = ' ';
1015 script_buffer_[put + 1] = ' ';
1016 script_buffer_[put + 2] = ' ';
1017 script_buffer_[put + 3] = '\0';
1018 map2original_.Insert(4);
1019 map2original_.Reset();
1021 span->text_bytes = put; // Does not include the last four chars above
1025 // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
1026 // List changes with each version of Unicode, so just always lowercase
1028 // ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
1029 void ScriptScanner::LowerScriptSpan(LangSpan* span) {
1030 // If needed, lowercase all the text. If we do it sooner, might miss
1031 // lowercasing an entity such as Á
1032 // We only need to do this for Latn and Cyrl scripts
1034 // Full Unicode lowercase of the entire buffer, including
1035 // four pad bytes off the end.
1036 // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
1037 // bytes and put the 0x00 in explicitly.
1038 // Build an offset map from script_buffer_lower_ back to script_buffer_
1039 int consumed, filled, changed;
1040 StringPiece istr(span->text, span->text_bytes + 3);
1041 StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
1043 UTF8GenericReplace(&utf8repl_lettermarklower_obj,
1044 istr, ostr, is_plain_text_,
1045 &consumed, &filled, &changed, &map2uplow_);
1046 script_buffer_lower_[filled] = '\0';
1047 span->text = script_buffer_lower_;
1048 span->text_bytes = filled - 3;
1052 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
1053 // Force Latin, Cyrillic, Greek scripts to be lowercase
1054 // Buffer ALWAYS has leading space and trailing space space space NUL
1055 bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
1056 bool ok = GetOneScriptSpan(span);
1057 LowerScriptSpan(span);
1062 // Maps byte offset in most recent GetOneScriptSpan/Lower
1063 // span->text [0..text_bytes] into an additional byte offset from
1064 // span->offset, to get back to corresponding text in the original
1066 // text_offset must be the first byte
1067 // of a UTF-8 character, or just beyond the last character. Normally this
1068 // routine is called with the first byte of an interesting range and
1069 // again with the first byte of the following range.
1070 int ScriptScanner::MapBack(int text_offset) {
1071 return map2original_.MapBack(map2uplow_.MapBack(text_offset));
1075 // Gets lscript number for letters; always returns
1076 // 0 (common script) for non-letters
1077 int GetUTF8LetterScriptNum(const char* src) {
1078 int srclen = UTF8OneCharLen(src);
1079 const uint8* usrc = reinterpret_cast<const uint8*>(src);
1080 return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,