1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
17 // Updated 2014.01 for dual table lookup
28 #include "integral_types.h"
29 #include "lang_script.h"
30 #include "utf8acceptinterchange.h"
31 #include "utf8statetable.h"
33 #ifdef CLD2_DYNAMIC_MODE
34 #include "cld2_dynamic_data.h"
35 #include "cld2_dynamic_data_loader.h"
37 #include "cld2tablesummary.h"
38 #include "compact_lang_det_impl.h"
39 #include "compact_lang_det_hint_code.h"
40 #include "getonescriptspan.h"
48 // Linker supplies the right tables, from files
49 // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc
50 // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc
51 // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc
52 // cld2_generated_distinctocta*.cc
53 // cld_generated_score_quad_octa_1024_256.cc
55 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table
56 // sizes that are 1/3/5 times a power of two, instead of just powers of two.
57 // Gives more flexibility of total footprint for CLD2.
// Declarations only: nothing in this group is defined in this file. The
// comment block above lists the generated sources the linker is expected to
// take the definitions from.
59 extern const int kLanguageToPLangSize;
60 extern const int kCloseSetSize;
62 extern const UTF8PropObj cld_generated_CjkUni_obj;
63 extern const CLD2TableSummary kCjkCompat_obj;
64 extern const CLD2TableSummary kCjkDeltaBi_obj;
65 extern const CLD2TableSummary kDistinctBiTable_obj;
66 extern const CLD2TableSummary kQuad_obj;
67 extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables
68 extern const CLD2TableSummary kDeltaOcta_obj;
69 extern const CLD2TableSummary kDistinctOcta_obj;
70 extern const short kAvgDeltaOctaScore[];
72 // Returns the length in bytes of the prefix of src that is all
73 // interchange valid UTF-8
// src:         start of the text to examine.
// byte_length: number of bytes of src wrapped in the StringPiece that is
//              handed to the scanner.
// The scan itself is delegated to UTF8GenericScan() driven by the
// utf8acceptinterchange_obj state table; the count it stores through
// &bytes_consumed is returned unchanged.
// NOTE(review): the declaration of bytes_consumed is not visible in this
// listing -- confirm it sits between the signature and the first statement.
74 int SpanInterchangeValid(const char* src, int byte_length) {
76 const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
77 StringPiece str(src, byte_length);
78 UTF8GenericScan(st, str, &bytes_consumed);
79 return bytes_consumed;
82 #ifdef CLD2_DYNAMIC_MODE
83 // CLD2_DYNAMIC_MODE is defined:
84 // Data will be read from an mmap opened at runtime.
86 // Convenience for nulling things out completely at any point.
// Every visible slot is NULL. The trailing comment on each slot names the
// table object that presumably fills the same slot in the statically linked
// build (compare the !CLD2_DYNAMIC_MODE kScoringtables initializer) -- TODO
// confirm. NULL_TABLES is copied into kScoringtables when that variable is
// defined and again when the dynamic data is unloaded.
87 static ScoringTables NULL_TABLES = {
88 NULL, //&cld_generated_CjkUni_obj,
89 NULL, //&kCjkCompat_obj,
90 NULL, //&kCjkDeltaBi_obj,
91 NULL, //&kDistinctBiTable_obj,
94 NULL, //&kDeltaOcta_obj,
95 NULL, //&kDistinctOcta_obj,
96 NULL, //kAvgDeltaOctaScore,
// Mutable file-scope state describing the dynamically loaded data.
// kScoringtables starts as a copy of NULL_TABLES and is overwritten with the
// loaded table set by the loadDataFrom*() functions.
98 static ScoringTables kScoringtables = NULL_TABLES; // copy constructed
// Set to true by a successful loadDataFrom*() call and back to false by the
// unload code further down.
99 static bool dynamicDataLoaded = false;
// true after loadDataFromFile(), false after loadDataFromRawAddress(); the
// unload code uses it to pick unloadDataFile() versus unloadDataRaw().
100 static bool dataSourceIsFile = false;
// Table set returned by CLD2DynamicDataLoader; its contents are copied by
// value into kScoringtables.
101 static ScoringTables* dynamicTables = NULL;
// Passed by address to CLD2DynamicDataLoader::loadDataFile() and
// unloadDataFile(); presumably the mapping those calls manage -- TODO confirm.
102 static void* mmapAddress = NULL;
103 static uint32_t mmapLength = 0;
// Reports the current value of dynamicDataLoaded, i.e. whether a
// loadDataFrom*() call has installed tables that have not been unloaded since.
105 bool isDataLoaded() { return dynamicDataLoaded; }
// Fixed answer for this branch of the build.
106 bool isDataDynamic() { return true; } // Because CLD2_DYNAMIC_MODE is defined
// Loads the scoring tables from the data file named by fileName.
// The work is delegated to CLD2DynamicDataLoader::loadDataFile(), which also
// receives the addresses of mmapAddress and mmapLength. A NULL result prints
// a warning to stderr. Otherwise the returned table set is remembered in
// dynamicTables, copied by value into kScoringtables, the source is recorded
// as a file, and the loaded flag is set.
// NOTE(review): the body of the isDataLoaded() branch and the remainder of the
// failure branch are not visible in this listing -- confirm what happens on
// those paths.
108 void loadDataFromFile(const char* fileName) {
109 if (isDataLoaded()) {
112 ScoringTables* result = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
113 if (result == NULL) {
114 fprintf(stderr, "WARNING: Dynamic data loading failed.\n");
117 dynamicTables = result;
118 kScoringtables = *dynamicTables;
119 dataSourceIsFile = true;
120 dynamicDataLoaded = true;
// Loads the scoring tables from caller-supplied memory.
// rawAddress/length are forwarded unchanged to
// CLD2DynamicDataLoader::loadDataRaw(). A NULL result prints a warning to
// stderr. Otherwise the returned table set is remembered in dynamicTables,
// copied by value into kScoringtables, the source is recorded as not-a-file,
// and the loaded flag is set.
// NOTE(review): the body of the isDataLoaded() branch and the remainder of the
// failure branch are not visible in this listing -- confirm what happens on
// those paths.
123 void loadDataFromRawAddress(const void* rawAddress, const uint32_t length) {
124 if (isDataLoaded()) {
127 ScoringTables* result = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length);
128 if (result == NULL) {
129 fprintf(stderr, "WARNING: Dynamic data loading failed.\n");
132 dynamicTables = result;
133 kScoringtables = *dynamicTables;
134 dataSourceIsFile = false;
135 dynamicDataLoaded = true;
// NOTE(review): the signature for this body is not visible in this listing;
// presumably it is unloadData() -- confirm.
// Returns immediately when no dynamic data is loaded. Otherwise the tables are
// released through unloadDataFile() when they came from a file, and
// (presumably in the else branch, which is not visible) through
// unloadDataRaw(). Afterwards the bookkeeping flags are reset and every
// pointer in kScoringtables is nulled by copying NULL_TABLES over it.
139 if (!dynamicDataLoaded) return;
140 if (dataSourceIsFile) {
141 CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength);
143 CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables);
145 dynamicDataLoaded = false;
146 dataSourceIsFile = false; // vacuous
147 kScoringtables = NULL_TABLES; // Housekeeping: null all pointers
149 #else // !CLD2_DYNAMIC_MODE
150 // This initializes kScoringtables.quadgram_obj etc.
// Statically linked build: the table set is const and holds the addresses of
// the table objects declared extern near the top of this file.
// NOTE(review): only some of the initializer entries are visible in this
// listing; the order of the remaining ones could not be checked.
151 static const ScoringTables kScoringtables = {
152 &cld_generated_CjkUni_obj,
155 &kDistinctBiTable_obj,
158 &kQuad_obj2, // Dual lookup tables
165 // Method implementations below are provided so that callers aren't *forced*
166 // to depend upon the CLD2_DYNAMIC_MODE flag, but can use runtime checks
167 // instead. For more information, refer to CLD2 issue 16:
168 // https://code.google.com/p/cld2/issues/detail?id=16
// In this branch of the build both queries return fixed values.
169 bool isDataLoaded() { return true; } // Data is statically linked
170 bool isDataDynamic() { return false; } // Because CLD2_DYNAMIC_MODE is not defined
// Static-data build of loadDataFromFile(): the only visible effect is a
// warning on stderr; fileName is not used by the visible code.
172 void loadDataFromFile(const char* fileName) {
173 // This is a bug in the calling code.
174 fprintf(stderr, "WARNING: Dynamic mode not active, loadDataFromFile has no effect!\n");
// Static-data build of loadDataFromRawAddress(): the only visible effect is a
// warning on stderr; rawAddress and length are not used by the visible code.
176 void loadDataFromRawAddress(const void* rawAddress, const uint32_t length) {
177 // This is a bug in the calling code.
178 fprintf(stderr, "WARNING: Dynamic mode not active, loadDataFromRawAddress has no effect!\n");
// NOTE(review): the signature for this body is not visible in this listing;
// the message text indicates it belongs to the static-data unloadData().
// The only visible effect is a warning on stderr.
181 // This is a bug in the calling code.
182 fprintf(stderr, "WARNING: Dynamic mode not active, unloadData has no effect!\n");
185 #endif // #ifdef CLD2_DYNAMIC_MODE
// Fixed settings for this build. The FLAGS_ prefix suggests these were once
// command-line flags (assumption -- TODO confirm); here each one is a
// file-local constant.
188 static const bool FLAGS_cld_no_minimum_bytes = false;
189 static const bool FLAGS_cld_forcewords = true;
// When true, CheapRepWordsInplace() and CheapSqueezeInplace() write a visible
// marker at each point where they delete text.
190 static const bool FLAGS_cld_showme = false;
191 static const bool FLAGS_cld_echotext = true;
192 static const int32 FLAGS_cld_textlimit = 160;
193 static const int32 FLAGS_cld_smoothwidth = 20;
194 static const bool FLAGS_cld_2011_hints = true;
195 static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;
197 static const bool FLAGS_dbgscore = false;
// Tuning constants. Units and meaning are as recorded in the trailing
// comments; constants without a trailing comment are not explained anywhere
// in the visible part of this file.
200 static const int kLangHintInitial = 12; // Boost language by N initially
201 static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram
203 static const int kShortSpanThresh = 32; // Bytes
204 static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans
206 static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing
207 // after this many text bytes
208 static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz
209 static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces
210 static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted
// The next three drive CheapSqueezeInplace() and its Overwrite variant: the
// two percentages are scaled by the chunk size to get per-chunk thresholds.
212 static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
213 static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces
214 static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
// Upper bound applied to the limit argument of BackscanToSpace() and
// ForwardscanToSpace().
216 static const int kMaxSpaceScan = 32; // Bytes
218 static const int kGoodLang1Percent = 70;
219 static const int kGoodLang1and2Percent = 93;
220 static const int kShortTextThresh = 256; // Bytes
222 static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads
223 static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads
225 static const int kDefaultWordSpan = 256; // Scan at least this many initial
226 // bytes with word scoring
227 static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text
229 static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable
// 4096 entries match the 12-bit (& 0xfff) hash used to index the prediction
// table in the predictor functions further down.
231 static const int kPredictionTableSize = 4096; // Must be exactly 4096 for
234 static const int kNonEnBoilerplateMinPercent = 17; // <this => no second
235 static const int kNonFIGSBoilerplateMinPercent = 20; // <this => no second
236 static const int kGoodFirstMinPercent = 26; // <this => UNK
237 static const int kGoodFirstReliableMinPercent = 51; // <this => unreli
238 static const int kIgnoreMaxPercent = 20; // >this => unreli
239 static const int kKeepMinPercent = 2; // <this => unreli
243 // Statistically closest language, based on quadgram table
244 // Those that are far from other languages map to UNKNOWN_LANGUAGE
245 // Subscripted by Language
247 // From lang_correlation.txt and hand-edits
248 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
249 // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
250 // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
252 static const int kMinCorrPercent = 24; // Pick off how close you want
253 // 24 catches PERSIAN <== ARABIC
254 // but not SPANISH <== PORTUGUESE
255 static Language Unknown = UNKNOWN_LANGUAGE;
258 // Subscripted by Language
259 static const Language kClosestAltLanguage[] = {
260 (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
261 (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
262 (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
263 (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
264 (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
265 (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
266 (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
267 (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
268 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
269 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
270 (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
271 ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
272 (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
273 (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
274 (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
275 (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
276 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
277 (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
278 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
279 (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
280 ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
281 ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
282 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
283 ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
284 (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
285 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
286 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
287 (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
288 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
289 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
290 (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
291 (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
292 ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
293 (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
294 (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
295 (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
296 (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
297 (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
298 (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
299 ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
300 (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
301 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
302 ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
303 ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
304 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
305 ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
306 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
307 (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
308 (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
309 (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
310 (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
311 (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
312 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
313 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
314 (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
315 (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
316 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
317 ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
318 ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
319 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
320 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
321 (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
322 ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
323 (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
324 (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
325 ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
326 ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
327 (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
328 (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
329 // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
330 (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
331 (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
332 (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
333 (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
334 ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
335 (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
336 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
337 ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
338 (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
339 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
340 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
341 (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
342 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
343 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
344 (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
345 (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
346 ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
347 (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
348 ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
349 ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
350 ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
351 ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
352 (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
353 (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
354 (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
355 ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
356 (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
357 ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
358 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
359 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
360 ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
361 (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
362 (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
363 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
364 ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
365 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
366 (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
367 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
368 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
369 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
370 ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
371 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
372 (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
373 (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
374 ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
375 ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
376 ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
377 (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
378 (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
379 ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
380 ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
381 (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
382 (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
383 (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
384 ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
385 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
386 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
387 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
388 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
389 ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
390 ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
391 ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
392 ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
393 ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
394 (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
395 ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
396 (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
397 ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
398 ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
399 ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
400 ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
401 ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
402 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
403 ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
404 (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
405 ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
406 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
407 (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
408 (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
409 (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
410 ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
411 (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
412 (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
413 ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
414 (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
415 ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
416 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
417 ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
418 (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
419 (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
420 ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
421 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
423 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN
424 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // IGBO
425 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE
426 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN
429 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
430 // kClosestAltLanguage_has_incorrect_size);
// Bit-test helpers for the CLD flags word: each returns true exactly when the
// corresponding kCLDFlag* bit is set in flags, and false otherwise.
433 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
434 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
435 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
436 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
437 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
438 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
439 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
440 inline bool FlagBestEffort(int flags) {
441 return (flags & kCLDFlagBestEffort) != 0;
445 // Defines Top40 packed languages
447 // Google top 40 languages
449 // Tier 0/1 Language enum list (16)
450 // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
451 // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
452 // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
455 // Tier 2 Language enum list (22)
456 // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
457 // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
458 // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
459 // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
462 // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
464 // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
// NOTE(review): only the signature is visible in this listing; the body could
// not be reviewed, so nothing is asserted here about what it does with
// chunk_tote or psplus_one.
467 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
// Debug output helper. Writes one line of the form
//   PrintText[<LanguageName(cur_lang)>]<temp><br>
// to f. An empty temp produces no output at all.
471 void PrintText(FILE* f, Language cur_lang, const string& temp) {
472 if (temp.size() == 0) {return;}
473 fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());
477 //------------------------------------------------------------------------------
478 // For --cld_html debugging output. Not thread safe
479 //------------------------------------------------------------------------------
// Mutable file-scope state belonging to the debugging-output section that the
// surrounding banner marks as not thread safe.
// NOTE(review): no reads or writes of these two variables are visible in this
// listing -- confirm where they are used.
480 static Language prior_lang = UNKNOWN_LANGUAGE;
481 static bool prior_unreliable = false;
483 //------------------------------------------------------------------------------
484 // End For --cld_html debugging output
485 //------------------------------------------------------------------------------
488 // Backscan to word boundary, returning how many bytes n to go back
489 // so that src - n is non-space and src - n - 1 is space.
490 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
// src:   position to scan backward from. Bytes at negative offsets from src
//        are read, so the caller must own readable text before src.
// limit: caller-supplied bound on the scan, clamped here to kMaxSpaceScan.
// NOTE(review): the loop headers and the final return are not visible in this
// listing; presumably both scans step n upward from 0 -- confirm.
491 int BackscanToSpace(const char* src, int limit) {
493 limit = minint(limit, kMaxSpaceScan);
495 if (src[-n - 1] == ' ') {return n;} // We are at _X
// A byte whose top two bits are not 10 is not a UTF-8 continuation byte.
500 if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin
506 // Forwardscan to word boundary, returning how many bytes n to go forward
507 // so that src + n is non-space and src + n - 1 is space.
508 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
// src:   position to scan forward from.
// limit: caller-supplied bound on the scan, clamped here to kMaxSpaceScan.
// NOTE(review): the loop headers and the final return are not visible in this
// listing; presumably both scans step n upward from 0 -- confirm.
509 int ForwardscanToSpace(const char* src, int limit) {
511 limit = minint(limit, kMaxSpaceScan);
// Returning n + 1 steps past the space that was found at src[n].
513 if (src[n] == ' ') {return n + 1;} // We are at _X
// A byte whose top two bits are not 10 is not a UTF-8 continuation byte.
518 if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin
525 // This uses a cheap predictor to get a measure of compression, and
526 // hence a measure of repetitiveness. It works on complete UTF-8 characters
527 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
528 // all the time when done with a byte-based count. Sigh.
530 // To allow running prediction across multiple chunks, caller passes in current
531 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
533 // Returns the number of *bytes* correctly predicted, increments by 1..4 for
534 // each correctly-predicted character.
536 // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
539 // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
// isrc, src_len: text to examine; it is read as unsigned bytes.
// hash:          in: running 12-bit hash carried over from the previous call.
//                NOTE(review): the write-back to *hash is not visible in this
//                listing -- confirm.
// tbl:           prediction table indexed by the hash; updated in place.
541 int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
543 const uint8* src = reinterpret_cast<const uint8*>(isrc);
544 const uint8* srclimit = src + src_len;
545 int local_hash = *hash;
547 while (src < srclimit) {
551 // Pick up one char and length
553 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
555 } else if ((c & 0xe0) == 0xc0) {
// Two-byte lead (110xxxxx): pack both bytes into c.
557 c = (c << 8) | src[1];
559 } else if ((c & 0xf0) == 0xe0) {
// Three-byte lead (1110xxxx): pack all three bytes into c.
561 c = (c << 16) | (src[1] << 8) | src[2];
// Four bytes are packed into c here; src[1..3] are read without a bounds
// check, which is the overrun mentioned in the header comment.
565 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
// Fetch what the table predicted for this context, then store the character
// actually seen so that it becomes the next prediction for the same context.
570 int p = tbl[local_hash]; // Prediction
571 tbl[local_hash] = c; // Update prediction
573 p_count += incr; // Count bytes of good predictions
// Fold the character into the context; & 0xfff keeps the hash within 12 bits.
576 local_hash = ((local_hash << 4) ^ c) & 0xfff;
584 // Counts number of spaces; a little faster than one-at-a-time
585 // Doesn't count odd bytes at end
// src_len & ~3 rounds the length down to a multiple of four, so the last
// 0..3 bytes of src are never examined. Each comparison yields 0 or 1 and is
// added straight into s_count.
586 int CountSpaces4(const char* src, int src_len) {
588 for (int i = 0; i < (src_len & ~3); i += 4) {
589 s_count += (src[i] == ' ');
590 s_count += (src[i+1] == ' ');
591 s_count += (src[i+2] == ' ');
592 s_count += (src[i+3] == ' ');
598 // Remove words of text that have more than half their letters predicted
599 // correctly by our cheap predictor, moving the remaining words in-place
600 // to the front of the input buffer.
602 // To allow running prediction across multiple chunks, caller passes in current
603 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
605 // Return the new, possibly-shorter length
607 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
// isrc, src_len: buffer that is both read and rewritten.
// hash, tbl:     predictor state shared with the caller; tbl is updated in
//                place. NOTE(review): the write-back to *hash is not visible
//                in this listing -- confirm.
// NOTE(review): the declaration of dst, the copy of each character to dst and
// the end-of-word test are not visible in this listing.
610 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
611 const uint8* src = reinterpret_cast<const uint8*>(isrc);
612 const uint8* srclimit = src + src_len;
614 int local_hash = *hash;
615 char* word_dst = dst; // Start of next word
616 int good_predict_bytes = 0;
617 int word_length_bytes = 0;
619 while (src < srclimit) {
// "More than half predicted" is tested in integers: 2 * predicted > length.
625 if ((good_predict_bytes * 2) > word_length_bytes) {
626 // Word is well-predicted: backup to start of this word
628 if (FLAGS_cld_showme) {
629 // Mark the deletion point with period
630 // Don't repeat multiple periods
631 // Cannot mark with more bytes or may overwrite unseen input
632 if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
// Per-word counters restart for the next word.
638 word_dst = dst; // Start of next word
639 good_predict_bytes = 0;
640 word_length_bytes = 0;
643 // Pick up one char and length
645 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
647 } else if ((c & 0xe0) == 0xc0) {
// Two-byte lead (110xxxxx): pack both bytes into c.
650 c = (c << 8) | src[1];
652 } else if ((c & 0xf0) == 0xe0) {
// Three-byte lead (1110xxxx): pack all three bytes into c.
656 c = (c << 16) | (src[1] << 8) | src[2];
// Four bytes are packed into c here.
663 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
667 word_length_bytes += incr;
// Fetch the prediction for this context, then store the character actually
// seen so that it becomes the next prediction for the same context.
669 int p = tbl[local_hash]; // Prediction
670 tbl[local_hash] = c; // Update prediction
672 good_predict_bytes += incr; // Count good predictions
// Fold the character into the context; & 0xfff keeps the hash within 12 bits.
675 local_hash = ((local_hash << 4) ^ c) & 0xfff;
// Trailing padding depends on how much shorter the output is than the input:
// more than three bytes of slack, or at least one byte of slack.
680 if ((dst - isrc) < (src_len - 3)) {
681 // Pad and make last char clean UTF-8 by putting following spaces
686 } else if ((dst - isrc) < src_len) {
687 // Make last char clean UTF-8 by putting following space off the end
// The new length is the distance the output pointer advanced from isrc.
691 return static_cast<int>(dst - isrc);
695 // This alternate form overwrites redundant words, thus avoiding corrupting the
696 // backmap for generating a vector of original-text ranges.
// Same parameters as CheapRepWordsInplace(). Instead of backing up over a
// well-predicted word, its bytes are replaced with '.' characters.
// NOTE(review): the declaration of dst, the copy of each character to dst, the
// end-of-word test and the write-back to *hash are not visible in this
// listing.
697 int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
698 const uint8* src = reinterpret_cast<const uint8*>(isrc);
699 const uint8* srclimit = src + src_len;
701 int local_hash = *hash;
702 char* word_dst = dst; // Start of next word
703 int good_predict_bytes = 0;
704 int word_length_bytes = 0;
706 while (src < srclimit) {
// "More than half predicted" is tested in integers: 2 * predicted > length.
712 if ((good_predict_bytes * 2) > word_length_bytes) {
713 // Word [word_dst..dst-1) is well-predicted: overwrite
// The loop stops one byte short of dst, so the byte at dst - 1 is kept.
714 for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';}
// Per-word counters restart for the next word.
716 word_dst = dst; // Start of next word
717 good_predict_bytes = 0;
718 word_length_bytes = 0;
721 // Pick up one char and length
723 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
725 } else if ((c & 0xe0) == 0xc0) {
// Two-byte lead (110xxxxx): pack both bytes into c.
728 c = (c << 8) | src[1];
730 } else if ((c & 0xf0) == 0xe0) {
// Three-byte lead (1110xxxx): pack all three bytes into c.
734 c = (c << 16) | (src[1] << 8) | src[2];
// Four bytes are packed into c here.
741 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
745 word_length_bytes += incr;
// Fetch the prediction for this context, then store the character actually
// seen so that it becomes the next prediction for the same context.
747 int p = tbl[local_hash]; // Prediction
748 tbl[local_hash] = c; // Update prediction
750 good_predict_bytes += incr; // Count good predictions
// Fold the character into the context; & 0xfff keeps the hash within 12 bits.
753 local_hash = ((local_hash << 4) ^ c) & 0xfff;
758 if ((dst - isrc) < (src_len - 3)) {
759 // Pad and make last char clean UTF-8 by putting following spaces
764 } else if ((dst - isrc) < src_len) {
765 // Make last char clean UTF-8 by putting following space off the end
// The returned length is the distance the output pointer advanced from isrc.
769 return static_cast<int>(dst - isrc);
773 // Remove portions of text that have a high density of spaces, or that are
774 // overly repetitive, squeezing the remaining text in-place to the front of the
777 // Squeezing looks at density of space/predicted chars in fixed-size chunks,
778 // specified by chunksize. A chunksize of 0 uses the default size of 48 bytes.
// NOTE(review): the visible code substitutes kChunksizeDefault only when
// ichunksize == 0; a negative value is not remapped there -- confirm callers
// never pass one.
780 // Return the new, possibly-shorter length
782 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
// NOTE(review): the rest of the parameter list and the declarations of src,
// dst and hash are not visible in this listing.
785 int CheapSqueezeInplace(char* isrc,
790 char* srclimit = src + src_len;
791 bool skipping = false;
794 // Allocate local prediction table.
// The table is private to this call and starts zeroed; it is released with
// delete[] just before the return at the bottom.
795 int* predict_tbl = new int[kPredictionTableSize];
796 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
798 int chunksize = ichunksize;
799 if (chunksize == 0) {chunksize = kChunksizeDefault;}
// Per-chunk thresholds in bytes, using integer division. With the default
// 48-byte chunk these come to 12 (spaces) and 19 (predicted bytes).
800 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
801 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
803 while (src < srclimit) {
804 int remaining_bytes = srclimit - src;
805 int len = minint(chunksize, remaining_bytes);
806 // Make len land us on a UTF-8 character boundary.
807 // Ah. Also fixes mispredict because we could get out of phase
808 // Loop always terminates at trailing space in buffer
809 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
// A chunk is dropped when either its space count or its predicted-byte count
// reaches the corresponding threshold.
811 int space_n = CountSpaces4(src, len);
812 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
813 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
816 // Keeping-to-skipping transition; do it at a space
// The backward scan is bounded by the number of bytes written so far.
817 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
820 // Force a leading space if the first chunk is deleted
823 if (FLAGS_cld_showme) {
824 // Mark the deletion point with black square U+25A0
825 *dst++ = static_cast<unsigned char>(0xe2);
826 *dst++ = static_cast<unsigned char>(0x96);
827 *dst++ = static_cast<unsigned char>(0xa0);
835 // Skipping-to-keeping transition; do it at a space
836 int n = ForwardscanToSpace(src, len);
838 remaining_bytes -= n; // Shrink remaining length
842 // "len" can be negative in some cases
// memmove rather than memcpy: source and destination are in the same buffer.
844 memmove(dst, src, len);
851 if ((dst - isrc) < (src_len - 3)) {
852 // Pad and make last char clean UTF-8 by putting following spaces
857 } else if ((dst - isrc) < src_len) {
858 // Make last char clean UTF-8 by putting following space off the end
862 // Deallocate local prediction table
863 delete[] predict_tbl;
// The new length is the distance the output pointer advanced from isrc.
864 return static_cast<int>(dst - isrc);
867 // This alternate form overwrites redundant words, thus avoiding corrupting the
868 // backmap for generating a vector of original-text ranges.
// Chunks are classified the same way as in CheapSqueezeInplace(), but rejected
// text is replaced with '.' characters instead of being squeezed out.
// NOTE(review): the rest of the parameter list and the declarations of src,
// dst and hash are not visible in this listing. As in CheapSqueezeInplace(),
// only ichunksize == 0 selects kChunksizeDefault in the visible code.
869 int CheapSqueezeInplaceOverwrite(char* isrc,
874 char* srclimit = src + src_len;
875 bool skipping = false;
878 // Allocate local prediction table.
// The table is private to this call and starts zeroed; it is released with
// delete[] just before the return at the bottom.
879 int* predict_tbl = new int[kPredictionTableSize];
880 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
882 int chunksize = ichunksize;
883 if (chunksize == 0) {chunksize = kChunksizeDefault;}
// Per-chunk thresholds in bytes, using integer division. With the default
// 48-byte chunk these come to 12 (spaces) and 19 (predicted bytes).
884 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
885 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
887 // Always keep first byte (space)
890 while (src < srclimit) {
891 int remaining_bytes = srclimit - src;
892 int len = minint(chunksize, remaining_bytes);
893 // Make len land us on a UTF-8 character boundary.
894 // Ah. Also fixes mispredict because we could get out of phase
895 // Loop always terminates at trailing space in buffer
896 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
// A chunk is overwritten when either its space count or its predicted-byte
// count reaches the corresponding threshold.
898 int space_n = CountSpaces4(src, len);
899 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
900 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
901 // Overwrite the text [dst-n..dst)
903 // Keeping-to-skipping transition; do it at a space
904 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
905 // Text [dst-n..dst) is well-predicted: overwrite
906 for (char* p = dst - n; p < dst; ++p) {*p = '.';}
909 // Overwrite the text [dst..dst+len)
910 for (char* p = dst; p < dst + len; ++p) {*p = '.';}
911 dst[len - 1] = ' '; // Space at end so we can see what is happening
915 // Skipping-to-keeping transition; do it at a space
916 int n = ForwardscanToSpace(src, len);
917 // Text [dst..dst+n-1) is well-predicted: overwrite
// The loop stops one byte short of dst + n, so the byte at dst + n - 1 is kept.
918 for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
926 if ((dst - isrc) < (src_len - 3)) {
927 // Pad and make last char clean UTF-8 by putting following spaces
932 } else if ((dst - isrc) < src_len) {
933 // Make last char clean UTF-8 by putting following space off the end
937 // Deallocate local prediction table
938 delete[] predict_tbl;
// The returned length is the distance the output pointer advanced from isrc.
939 return static_cast<int>(dst - isrc);
942 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
943 // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
944 // Just CountSpaces is about 340 MB/sec
945 // Byte-only CountPredictedBytes is about 150 MB/sec
946 // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
947 // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
948 // Unjammed byte-only both = 170 MB/sec
949 // Jammed byte-only both = 120 MB/sec
950 // Back to original w/slight updates, 110 MB/sec
// Tests whether the first testsize bytes of src contain enough spaces or
// enough predicted (repeated) bytes to reach the trigger thresholds.
// src, src_len: text to examine; text shorter than testsize never triggers.
// testsize: number of leading bytes examined; the thresholds are
// kSpacesTriggerPercent and kPredictTriggerPercent of this size.
// NOTE(review): presumably returns true when either count reaches its
// threshold; the result handling is outside the lines shown -- confirm.
952 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
953 // Don't trigger at all on short text
954 if (src_len < testsize) {return false;}
955 int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
956 int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
958 // Allocate local prediction table.
// The table is zeroed so prediction starts with no history from other text.
959 int* predict_tbl = new int[kPredictionTableSize];
960 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
// Only the first testsize bytes are measured, not all src_len bytes.
963 if ((CountSpaces4(src, testsize) >= space_thresh) ||
964 (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
968 // Deallocate local prediction table
969 delete[] predict_tbl;
976 // Delete any extended languages from doc_tote
// doc_tote: the document tote to filter.
// NOTE(review): no statements of the body are visible here, so what is
// actually removed is unconfirmed -- check against the full file.
977 void RemoveExtendedLanguages(DocTote* doc_tote) {
// Minimum reliability percent (stored reliability / stored byte count) for a
// language entry to be kept by RemoveUnreliableLanguages and for a top
// language to be reported as reliable by ExtractLangEtc.
981 static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this
983 // For Tier3 languages, require a minimum number of bytes to be first-place lang
984 static const int kGoodFirstT3MinBytes = 24; // <this => no first
986 // Move bytes for unreliable langs to another lang or UNKNOWN
987 // doc_tote is sorted, so cannot Add
989 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
990 // merge both into CHINESE.
993 // we also want to remove Tier3 languages as the first lang if there is very
994 // little text like ej1 ej2 ej3 ej4
995 // maybe fold this back in earlier
//
// doc_tote: document totals; entries are merged or cleared in place.
// FLAGS_cld2_html, FLAGS_cld2_quiet: when html is set and quiet is not, the
// fate of each merged or deleted language with at least 10 bytes is printed
// to stderr.
// Works in two passes: first merge an unreliable language with its closest
// alternative (kClosestAltLanguage) when that alternative is also present,
// then delete every entry still below kMinReliableKeepPercent.
997 void RemoveUnreliableLanguages(DocTote* doc_tote,
998 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
999 // Prepass to merge some low-reliability languages
1000 // TODO: this shouldn't really reach in to the internal structure of doc_tote
1001 int total_bytes = 0;
1002 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1003 int plang = doc_tote->Key(sub);
1004 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
1006 Language lang = static_cast<Language>(plang);
1007 int bytes = doc_tote->Value(sub);
1008 int reli = doc_tote->Reliability(sub);
1009 if (bytes == 0) {continue;} // Zero bytes
1010 total_bytes += bytes;
1012 // Reliable percent = stored reliable score over stored bytecount
1013 int reliable_percent = reli / bytes;
1014 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
1016 // This language is too unreliable to keep, but we might merge it.
1017 Language altlang = UNKNOWN_LANGUAGE;
// The alternative table is only consulted for languages up to HAWAIIAN
// (presumably the extent of kClosestAltLanguage -- TODO confirm).
1018 if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
1019 if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
1021 // Look for alternative in doc_tote
1022 int altsub = doc_tote->Find(altlang);
1023 if (altsub < 0) {continue;} // No alternative text
1025 int bytes2 = doc_tote->Value(altsub);
1026 int reli2 = doc_tote->Reliability(altsub);
1027 if (bytes2 == 0) {continue;} // Zero bytes
1029 // Reliable percent is stored reliable score over stored bytecount
1030 int reliable_percent2 = reli2 / bytes2;
1032 // Merge one language into the other. Break ties toward lower lang #
// fromsub/tosub (the source and destination slots) are set up outside the
// lines shown here.
1035 bool into_lang = false;
1036 if ((reliable_percent2 < reliable_percent) ||
1037 ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
1043 // Make sure merged reliability doesn't drop and is enough to avoid delete
1044 int newpercent = maxint(reliable_percent, reliable_percent2);
1045 newpercent = maxint(newpercent, kMinReliableKeepPercent);
1046 int newbytes = bytes + bytes2;
// Reliability is stored as percent * bytes, so reli / bytes recovers percent.
1047 int newreli = newpercent * newbytes;
1049 doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
1050 doc_tote->SetScore(fromsub, 0);
1051 doc_tote->SetReliability(fromsub, 0);
// NOTE(review): byte counts are read with Value() above but the merged byte
// count is written with SetScore(); confirm whether SetValue() was intended.
1052 doc_tote->SetScore(tosub, newbytes);
1053 doc_tote->SetReliability(tosub, newreli);
1055 // Show fate of unreliable languages if at least 10 bytes
1056 if (FLAGS_cld2_html && (newbytes >= 10) &&
1057 !FLAGS_cld2_quiet) {
1059 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1060 LanguageCode(altlang), reliable_percent2, bytes2,
1061 LanguageCode(lang));
1063 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1064 LanguageCode(lang), reliable_percent, bytes,
1065 LanguageCode(altlang));
1071 // Pass to delete any remaining unreliable languages
1072 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1073 int plang = doc_tote->Key(sub);
1074 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
1076 Language lang = static_cast<Language>(plang);
1077 int bytes = doc_tote->Value(sub);
1078 int reli = doc_tote->Reliability(sub);
1079 if (bytes == 0) {continue;} // Zero bytes
1081 // Reliable percent is stored as reliable score over stored bytecount
1082 int reliable_percent = reli / bytes;
1083 if (reliable_percent >= kMinReliableKeepPercent) { // Keeper?
1087 // Delete unreliable entry
1088 doc_tote->SetKey(sub, DocTote::kUnusedKey);
1089 doc_tote->SetScore(sub, 0);
1090 doc_tote->SetReliability(sub, 0);
1092 // Show fate of unreliable languages if at least 10 bytes
1093 if (FLAGS_cld2_html && (bytes >= 10) &&
1094 !FLAGS_cld2_quiet) {
1095 fprintf(stderr, "{Unreli %s.%dR,%dB} ",
1096 LanguageCode(lang), reliable_percent, bytes);
1100 ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
1104 // Move all the text bytes from lower byte-count to higher one
//
// lang1, lang2: source and destination languages.
// lang1_sub, lang2_sub: their slot indexes in doc_tote.
// doc_tote: lang1's value, score and reliability are added into lang2's slot
// and lang1's slot is marked unused.
// resultchunkvector: may be NULL; otherwise chunks are compacted in place so
// that adjacent chunks with the same language are merged.
1105 void MoveLang1ToLang2(Language lang1, Language lang2,
1106 int lang1_sub, int lang2_sub,
1108 ResultChunkVector* resultchunkvector) {
1109 // In doc_tote, move all the bytes lang1 => lang2
1110 int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
1111 doc_tote->SetValue(lang2_sub, sum);
1112 sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
1113 doc_tote->SetScore(lang2_sub, sum);
1114 sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
1115 doc_tote->SetReliability(lang2_sub, sum);
// Retire the lang1 slot now that its totals live in lang2's slot.
1118 doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
1119 doc_tote->SetScore(lang1_sub, 0);
1120 doc_tote->SetReliability(lang1_sub, 0);
1122 // In resultchunkvector, move all the bytes lang1 => lang2
1123 if (resultchunkvector == NULL) {return;}
// i is the read index; k (declared outside the lines shown) is used as the
// write index and becomes the new size of the vector.
1126 uint16 prior_lang = UNKNOWN_LANGUAGE;
1127 for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
1128 ResultChunk* rc = &(*resultchunkvector)[i];
1129 if (rc->lang1 == lang1) {
1130 // Update entry[i] lang1 => lang2
1133 // One change may produce two merges -- entry before and entry after
1134 if ((rc->lang1 == prior_lang) && (k > 0)) {
1135 // Merge with previous, deleting entry[i]
1136 ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
1137 prior_rc->bytes += rc->bytes;
1138 // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
1141 (*resultchunkvector)[k] = (*resultchunkvector)[i];
1142 // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
1145 prior_lang = rc->lang1;
// Drop the entries left over after compaction.
1147 resultchunkvector->resize(k);
1152 // Move less likely byte count to more likely for close pairs of languages
1153 // If given, also update resultchunkvector
//
// doc_tote: document totals, updated in place through MoveLang1ToLang2.
// resultchunkvector: passed through to MoveLang1ToLang2.
// FLAGS_cld2_html, FLAGS_cld2_quiet: with html (or FLAGS_dbgscore) set and
// quiet clear, each move is reported on stderr.
// For every slot whose language is in a close set (LanguageCloseSet != 0), the
// first later slot in the same set is found and the two are combined.
1154 void RefineScoredClosePairs(DocTote* doc_tote,
1155 ResultChunkVector* resultchunkvector,
1156 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1157 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1158 int close_packedlang = doc_tote->Key(sub);
1159 int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
// A close-set number of 0 means this language is not in any close set.
1160 if (subscr == 0) {continue;}
1162 // We have a close pair language -- if the other one is also scored and the
1163 // longword score differs enough, put all our eggs into one basket
1165 // Nonzero longword score: Go look for the other of this pair
1166 for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1167 if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
1168 // We have a matching pair
1169 int close_packedlang2 = doc_tote->Key(sub2);
1171 // Move all the text bytes from lower byte-count to higher one
1172 int from_sub, to_sub;
1173 Language from_lang, to_lang;
// The slot with the smaller Value() is the "from" side.
1174 if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1177 from_lang = static_cast<Language>(close_packedlang);
1178 to_lang = static_cast<Language>(close_packedlang2);
1182 from_lang = static_cast<Language>(close_packedlang2);
1183 to_lang = static_cast<Language>(close_packedlang);
1186 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1187 // Show fate of closepair language
1188 int val = doc_tote->Value(from_sub); // byte count
1189 int reli = doc_tote->Reliability(from_sub);
1190 int reliable_percent = reli / (val ? val : 1); // avoid zdiv
1191 fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
1192 LanguageCode(from_lang),
1194 doc_tote->Value(from_sub),
1195 LanguageCode(to_lang));
1197 MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
1198 doc_tote, resultchunkvector);
1199 break; // Exit inner for sub2 loop
// chunk_tote, tote_grams, lang_hint_boost: chunk totals, gram count and hint
// boosts, judging by the parameter names only.
// NOTE(review): no statements of the body are visible here, so the effect on
// chunk_tote is unconfirmed -- check against the full file.
1206 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
1207 uint8* lang_hint_boost) {
1211 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
1212 string temp(txt, len);
1213 fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
// Debug helper: prints the current chunk language to f as "[code]", with a
// trailing "*" inside the brackets when cur_unreliable is set.
// The case where cur_lang equals prior_lang is handled by a separate branch
// whose statements are outside the lines shown here.
1216 void PrintLang(FILE* f, Tote* chunk_tote,
1217 Language cur_lang, bool cur_unreliable,
1218 Language prior_lang, bool prior_unreliable) {
1219 if (cur_lang == prior_lang) {
1222 fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
// Debug helper: prints the top language name to stderr as "[name] ", or
// "[] " when it repeats the previously printed language (and is not
// UNKNOWN_LANGUAGE).
// prior_lang is neither a parameter nor a local here; it is state kept
// between calls and is updated with top_lang.
1227 void PrintTopLang(Language top_lang) {
1228 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1229 fprintf(stderr, "[] ");
1231 fprintf(stderr, "[%s] ", LanguageName(top_lang));
1232 prior_lang = top_lang;
// Same output as PrintTopLang, but wrapped in an HTML <span> colored #A0A0A0
// and followed by a newline.
// prior_lang is neither a parameter nor a local here; it is state kept
// between calls and is updated with top_lang.
1236 void PrintTopLangSpeculative(Language top_lang) {
1237 fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1238 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1239 fprintf(stderr, "[] ");
1241 fprintf(stderr, "[%s] ", LanguageName(top_lang));
1242 prior_lang = top_lang;
1244 fprintf(stderr, "</span>\n");
// Debug helper: prints up to three languages with their percentages, then
// the text byte count, as one HTML line on f.
// language3, percent3: three-element arrays; UNKNOWN_LANGUAGE entries are
// not printed.
// text_bytes: pointer to the byte count to print.
// is_reliable: pointer to the reliability flag; when false, "*" is printed
// after the first language name.
1247 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1248 const int* text_bytes, const bool* is_reliable) {
1249 fprintf(f, "<br> Initial_Languages ");
1250 if (language3[0] != UNKNOWN_LANGUAGE) {
1251 fprintf(f, "%s%s(%d%%) ",
1252 LanguageName(language3[0]),
1253 *is_reliable ? "" : "*",
1256 if (language3[1] != UNKNOWN_LANGUAGE) {
1257 fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]);
1259 if (language3[2] != UNKNOWN_LANGUAGE) {
1260 fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]);
1262 fprintf(f, "%d bytes \n", *text_bytes);
1264 fprintf(f, "<br>\n");
1268 // Return internal probability score (sum) per 1024 bytes
1269 double GetNormalizedScore(Language lang, ULScript ulscript,
1270 int bytecount, int score) {
1271 if (bytecount <= 0) {return 0.0;}
1272 return (score << 10) / bytecount;
1275 // Extract return values before fixups
//
// Reads slots 0..2 of doc_tote and fills the three-element output arrays.
// doc_tote: document totals; slots 0, 1 and 2 are read as the top languages.
// total_text_bytes: text byte count; raised to the sum of the three
// bytecounts if it is smaller.
// reliable_percent3, language3, percent3, normalized_score3: three-element
// outputs; language3 entries stay UNKNOWN_LANGUAGE for unused or unknown slots.
// text_bytes: receives the (possibly raised) total byte count.
// is_reliable: set from slot 0's reliability percent against
// kMinReliableKeepPercent, then cleared if the share of text outside the
// three languages exceeds kIgnoreMaxPercent.
1276 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
1277 int* reliable_percent3, Language* language3, int* percent3,
1278 double* normalized_score3,
1279 int* text_bytes, bool* is_reliable) {
1280 reliable_percent3[0] = 0;
1281 reliable_percent3[1] = 0;
1282 reliable_percent3[2] = 0;
1283 language3[0] = UNKNOWN_LANGUAGE;
1284 language3[1] = UNKNOWN_LANGUAGE;
1285 language3[2] = UNKNOWN_LANGUAGE;
1289 normalized_score3[0] = 0.0;
1290 normalized_score3[1] = 0.0;
1291 normalized_score3[2] = 0.0;
1293 *text_bytes = total_text_bytes;
1294 *is_reliable = false;
// bytecount1..bytecount3 are declared outside the lines shown here.
1300 int lang1 = doc_tote->Key(0);
1301 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1302 // We have a top language
1303 language3[0] = static_cast<Language>(lang1);
1304 bytecount1 = doc_tote->Value(0);
1305 int reli1 = doc_tote->Reliability(0);
1306 reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
1307 normalized_score3[0] = GetNormalizedScore(language3[0],
1310 doc_tote->Score(0));
1313 int lang2 = doc_tote->Key(1);
1314 if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
1315 language3[1] = static_cast<Language>(lang2);
1316 bytecount2 = doc_tote->Value(1);
1317 int reli2 = doc_tote->Reliability(1);
1318 reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
1319 normalized_score3[1] = GetNormalizedScore(language3[1],
1322 doc_tote->Score(1));
1325 int lang3 = doc_tote->Key(2);
1326 if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
1327 language3[2] = static_cast<Language>(lang3);
1328 bytecount3 = doc_tote->Value(2);
1329 int reli3 = doc_tote->Reliability(2);
1330 reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
1331 normalized_score3[2] = GetNormalizedScore(language3[2],
1334 doc_tote->Score(2));
1337 // Increase total bytes to sum (top 3) if low for some reason
1338 int total_bytecount12 = bytecount1 + bytecount2;
1339 int total_bytecount123 = total_bytecount12 + bytecount3;
1340 if (total_text_bytes < total_bytecount123) {
1341 total_text_bytes = total_bytecount123;
1342 *text_bytes = total_text_bytes;
1345 // Sum minus previous % gives better roundoff behavior than bytecount/total
1346 int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv
// Cumulative percentages first; the two subtractions below turn them into
// per-language percentages.
1347 percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1348 percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1349 percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1350 percent3[2] -= percent3[1];
1351 percent3[1] -= percent3[0];
1353 // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1354 // Fix this explicitly
1355 if (percent3[1] < percent3[2]) {
1359 if (percent3[0] < percent3[1]) {
1364 *text_bytes = total_text_bytes;
1366 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1367 // We have a top language
1368 // Its reliability is overall result reliability
1369 int bytecount = doc_tote->Value(0);
1370 int reli = doc_tote->Reliability(0);
1371 int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
1372 *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
1374 // No top language at all. This can happen with zero text or 100% Klingon
1375 // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
1376 *is_reliable = false;
1379 // If ignore percent is too large, set unreliable.
// ignore_percent is the share of text not attributed to the three languages.
1380 int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1381 if ((ignore_percent > kIgnoreMaxPercent)) {
1382 *is_reliable = false;
// Returns true when lang is FRENCH, ITALIAN, GERMAN or SPANISH.
// NOTE(review): the result for any other language is outside the lines shown
// here; presumably false -- confirm.
1386 bool IsFIGS(Language lang) {
1387 if (lang == FRENCH) {return true;}
1388 if (lang == ITALIAN) {return true;}
1389 if (lang == GERMAN) {return true;}
1390 if (lang == SPANISH) {return true;}
// Returns true when lang is ENGLISH, FRENCH, ITALIAN, GERMAN or SPANISH.
// NOTE(review): the result for any other language is outside the lines shown
// here; presumably false -- confirm.
1394 bool IsEFIGS(Language lang) {
1395 if (lang == ENGLISH) {return true;}
1396 if (lang == FRENCH) {return true;}
1397 if (lang == ITALIAN) {return true;}
1398 if (lang == GERMAN) {return true;}
1399 if (lang == SPANISH) {return true;}
1403 // For Tier3 languages, require more bytes of text to override
1404 // the first-place language
// CalcSummaryLang starts from kGoodSecondT1T2MinBytes as the minimum byte
// count the second language needs before it can replace the first.
// NOTE(review): the place where kGoodSecondT3MinBytes is applied is not
// among the lines shown here -- confirm.
1405 static const int kGoodSecondT1T2MinBytes = 15; // <this => no second
1406 static const int kGoodSecondT3MinBytes = 128; // <this => no second
1408 // Calculate a single summary language for the document, and its reliability.
1409 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1410 // This is the heart of matching human-rater perception.
1411 // reliable_percent3[] is currently unused
1413 // Do not return Tier3 second language unless there are at least 128 bytes
//
// total_text_bytes: used to turn the second language's percent into bytes.
// language3, percent3: three-element inputs, left unchanged; active_slot[]
// indexes them after TG_UNKNOWN_LANGUAGE entries are logically removed.
// summary_lang, is_reliable: outputs.
// FLAGS_cld2_html, FLAGS_cld2_quiet: with html set and quiet clear, reasons
// for returning UNKNOWN_LANGUAGE are printed to stderr.
1414 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
1415 const int* reliable_percent3,
1416 const Language* language3,
1417 const int* percent3,
1418 Language* summary_lang, bool* is_reliable,
1419 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet,
1421 // Vector of active languages; changes if we delete some
1423 int active_slot[3] = {0, 1, 2};
1425 int ignore_percent = 0;
1426 int return_percent = percent3[0]; // Default to top lang
1427 *summary_lang = language3[0];
1428 *is_reliable = true;
1429 if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1431 // If any of top 3 is IGNORE, remove it and increment ignore_percent
1432 for (int i = 0; i < 3; ++i) {
1433 if (language3[i] == TG_UNKNOWN_LANGUAGE) {
1434 ignore_percent += percent3[i];
1435 // Move the rest up, leaving input vectors unchanged
1436 for (int j=i+1; j < 3; ++j) {
1437 active_slot[j - 1] = active_slot[j];
1440 // Logically remove Ignore from percentage-text calculation
1441 // (extra 1 in 101 avoids zdiv, biases slightly small)
// NOTE(review): this uses percent3[0] while the next two lines index through
// active_slot[0]; confirm whether percent3[active_slot[0]] was intended.
1442 return_percent = (percent3[0] * 100) / (101 - ignore_percent);
1443 *summary_lang = language3[active_slot[0]];
1444 if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
1449 // If English and X, where X (not UNK) is big enough,
1450 // assume the English is boilerplate and return X.
1451 // Logically remove English from percentage-text calculation
1452 int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
1453 // Require more bytes of text for Tier3 languages
1454 int minbytesneeded = kGoodSecondT1T2MinBytes;
1455 int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
1457 if ((language3[active_slot[0]] == ENGLISH) &&
1458 (language3[active_slot[1]] != ENGLISH) &&
1459 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1460 (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
1461 (second_bytes >= minbytesneeded)) {
1462 ignore_percent += percent3[active_slot[0]];
1463 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1464 *summary_lang = language3[active_slot[1]];
1465 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1467 // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
1468 // assume the FIGS is boilerplate and return X.
1469 // Logically remove FIGS from percentage-text calculation
1470 } else if (IsFIGS(language3[active_slot[0]]) &&
1471 !IsEFIGS(language3[active_slot[1]]) &&
1472 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1473 (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
1474 (second_bytes >= minbytesneeded)) {
1475 ignore_percent += percent3[active_slot[0]];
1476 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1477 *summary_lang = language3[active_slot[1]];
1478 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1480 // Else we are returning the first language, but want to improve its
1481 // return_percent if the second language should be ignored
1482 } else if ((language3[active_slot[1]] == ENGLISH) &&
1483 (language3[active_slot[0]] != ENGLISH)) {
1484 ignore_percent += percent3[active_slot[1]];
1485 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1486 } else if (IsFIGS(language3[active_slot[1]]) &&
1487 !IsEFIGS(language3[active_slot[0]])) {
1488 ignore_percent += percent3[active_slot[1]];
1489 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1492 // If return percent is too small (too many languages), return UNKNOWN
// Skipped when the best-effort flag is set in flags.
1493 if ((return_percent < kGoodFirstMinPercent) && !FlagBestEffort(flags)) {
1494 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1495 fprintf(stderr, "{Unreli %s %d%% percent too small} ",
1496 LanguageCode(*summary_lang), return_percent);
1498 *summary_lang = UNKNOWN_LANGUAGE;
1499 *is_reliable = false;
1502 // If return percent is small, return language but set unreliable.
1503 if ((return_percent < kGoodFirstReliableMinPercent)) {
1504 *is_reliable = false;
1507 // If ignore percent is too large, set unreliable.
// ignore_percent is recomputed here from the raw percent3 values, replacing
// the running total accumulated above.
1508 ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1509 if ((ignore_percent > kIgnoreMaxPercent)) {
1510 *is_reliable = false;
1513 // If we removed all the active languages, return UNKNOWN
// slot_count is maintained outside the lines shown here.
1514 if (slot_count == 0) {
1515 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1516 fprintf(stderr, "{Unreli %s no languages left} ",
1517 LanguageCode(*summary_lang));
1519 *summary_lang = UNKNOWN_LANGUAGE;
1520 *is_reliable = false;
// Records one language-hint boost in scoringcontext.
// lang: hinted language; selects which boost list(s) receive the entry.
// langprob: packed value stored into the list.
// scoringcontext: its langprior_boost.latn and/or .othr lists are updated.
// Each list stores at index n and then advances n through wrap(n + 1).
1524 void AddLangPriorBoost(Language lang, uint32 langprob,
1525 ScoringContext* scoringcontext) {
1526 // This is called 0..n times with language hints
1527 // but we don't know the script -- so boost either or both Latn, Othr.
1529 if (IsLatnLanguage(lang)) {
1530 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
1531 int n = langprior_boost->n;
1532 langprior_boost->langprob[n] = langprob;
1533 langprior_boost->n = langprior_boost->wrap(n + 1);
1536 if (IsOthrLanguage(lang)) {
1537 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
1538 int n = langprior_boost->n;
1539 langprior_boost->langprob[n] = langprob;
1540 langprior_boost->n = langprior_boost->wrap(n + 1);
// Records a "whack" (suppression entry) for whackee_lang in scoringcontext.
// whacker_lang: the hinted language doing the suppressing.
// whackee_lang: the language to suppress; stored as MakeLangProb(whackee, 1).
// scoringcontext: its langprior_whack.latn and/or .othr lists are updated,
// but only for a script group that both languages belong to.
1545 void AddOneWhack(Language whacker_lang, Language whackee_lang,
1546 ScoringContext* scoringcontext) {
1547 uint32 langprob = MakeLangProb(whackee_lang, 1);
1548 // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
1549 if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
1550 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
1551 int n = langprior_whack->n;
1552 langprior_whack->langprob[n] = langprob;
1553 langprior_whack->n = langprior_whack->wrap(n + 1);
1555 if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
1556 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
1557 int n = langprior_whack->n;
1558 langprior_whack->langprob[n] = langprob;
1559 langprior_whack->n = langprior_whack->wrap(n + 1);
// Adds whacks against the languages that are "close" to lang.
// lang: the hinted language.
// scoringcontext: receives the whacks through AddOneWhack.
// CHINESE and CHINESE_T are treated as a pair of each other; otherwise every
// other language in lang's close set (LanguageCloseSet) is whacked.
1563 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
1564 // We do not in general want zh-Hans and zh-Hant to be close pairs,
1566 if (lang == CLD2::CHINESE) {
1567 AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
1570 if (lang == CLD2::CHINESE_T) {
1571 AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
1575 int base_lang_set = LanguageCloseSet(lang);
// Set 0 means lang has no close set, so there is nothing to whack.
1576 if (base_lang_set == 0) {return;}
1577 // TODO: add an explicit list of each set to avoid this 512-times loop
1578 for (int i = 0; i < kLanguageToPLangSize; ++i) {
1579 Language lang2 = static_cast<Language>(i);
1580 if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
1581 AddOneWhack(lang, lang2, scoringcontext);
// Collects language hints and stores the resulting boosts and whacks in
// scoringcontext.
// buffer: document text; scanned for lang= tags unless is_plain_text is set.
// cld_hints: may be NULL; otherwise its content-language, TLD, encoding and
// language hints are applied when present.
// scoringcontext: receives boosts (AddLangPriorBoost) and whacks
// (AddCloseLangWhack); debug text goes to its debug_file when
// flags_cld2_html is set.
1587 void ApplyHints(const char* buffer,
1590 const CLDHints* cld_hints,
1591 ScoringContext* scoringcontext) {
1592 CLDLangPriors lang_priors;
1593 InitCLDLangPriors(&lang_priors);
1595 // We now use lang= tags.
1596 // Last look, circa 2008 found only 15% of web pages with lang= tags and
1597 // many of those were wrong. Now (July 2011), we find 44% of web pages have
1598 // lang= tags, and most of them are correct. So we now give them substantial
1599 // weight in each chunk scored.
1600 if (!is_plain_text) {
1601 // Get any contained language tags in first n KB
// The flag is in KB; << 10 converts it to bytes.
1602 int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
1603 string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
1605 SetCLDLangTagsHint(lang_tags, &lang_priors);
1606 if (scoringcontext->flags_cld2_html) {
1607 if (!lang_tags.empty()) {
1608 fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
1614 if (cld_hints != NULL) {
1615 if ((cld_hints->content_language_hint != NULL) &&
1616 (cld_hints->content_language_hint[0] != '\0')) {
1617 SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
1620 // Input is from GetTLD(), already lowercased
1621 if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
1622 SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
1625 if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
1626 Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
1627 SetCLDEncodingHint(enc, &lang_priors);
1630 if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
1631 SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
1635 // Keep no more than four different languages with hints
1636 TrimCLDLangPriors(4, &lang_priors);
1638 if (scoringcontext->flags_cld2_html) {
1639 string print_temp = DumpCLDLangPriors(&lang_priors);
1640 if (!print_temp.empty()) {
1641 fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
1642 print_temp.c_str());
1646 // Put boosts into ScoringContext
1647 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1648 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1649 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1651 uint32 langprob = MakeLangProb(lang, qprob);
1652 AddLangPriorBoost(lang, langprob, scoringcontext);
1656 // Put whacks into scoring context
1657 // We do not in general want zh-Hans and zh-Hant to be close pairs,
1658 // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
1659 std::vector<int> close_set_count(kCloseSetSize + 1, 0);
// First count how many hinted languages fall in each close set.
1661 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1662 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1663 ++close_set_count[LanguageCloseSet(lang)];
1664 if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
1665 if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
1668 // If a boost language is in a close set, force suppressing the others in
1669 // that set, if exactly one of the set is present
1670 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1671 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1672 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1674 int close_set = LanguageCloseSet(lang);
1675 if ((close_set > 0) && (close_set_count[close_set] == 1)) {
1676 AddCloseLangWhack(lang, scoringcontext);
1678 if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
1679 (close_set_count[kCloseSetSize] == 1)) {
1680 AddCloseLangWhack(lang, scoringcontext);
1687 // Extend results to fully cover the [lo..hi) range
// lo, hi: bounds of the byte range that the chunks should cover.
// vec: result chunks; a NULL or empty vector is left untouched.
// The first chunk is examined when it starts after lo, and the end of the
// last chunk (offset + bytes) is compared with hi. The statements that apply
// each diff are outside the lines shown here.
1688 void FinishResultVector(int lo, int hi, ResultChunkVector* vec) {
1689 if (vec == NULL) {return;}
1690 if (vec->size() == 0) {return;}
1691 ResultChunk* rc = &(*vec)[0];
1692 if (rc->offset > lo) {
1693 int diff = rc->offset - lo;
1697 ResultChunk* rc2 = &(*vec)[vec->size() - 1];
1698 int rc2hi = rc2->offset + rc2->bytes;
1700 int diff = hi - rc2hi;
1706 // Results language3/percent3/text_bytes must be exactly three items
1707 Language DetectLanguageSummaryV2(
1711 const CLDHints* cld_hints,
1712 bool allow_extended_lang,
1715 Language* language3,
1717 double* normalized_score3,
1718 ResultChunkVector* resultchunkvector,
1720 bool* is_reliable) {
1721 language3[0] = UNKNOWN_LANGUAGE;
1722 language3[1] = UNKNOWN_LANGUAGE;
1723 language3[2] = UNKNOWN_LANGUAGE;
1727 normalized_score3[0] = 0.0;
1728 normalized_score3[1] = 0.0;
1729 normalized_score3[2] = 0.0;
1730 if (resultchunkvector != NULL) {
1731 resultchunkvector->clear();
1734 *is_reliable = false;
1736 if ((flags & kCLDFlagEcho) != 0) {
1737 string temp(buffer, buffer_length);
1738 if ((flags & kCLDFlagHtml) != 0) {
1739 fprintf(stderr, "CLD2[%d] '%s'<br>\n",
1740 buffer_length, GetHtmlEscapedText(temp).c_str());
1742 fprintf(stderr, "CLD2[%d] '%s'\n",
1743 buffer_length, GetPlainEscapedText(temp).c_str());
1747 #ifdef CLD2_DYNAMIC_MODE
1748 // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
1749 // hasn't been loaded yet. This is the only sane thing we can do, as there
1750 // are no scoring tables to consult.
1751 bool dataLoaded = isDataLoaded();
1752 if ((flags & kCLDFlagVerbose) != 0) {
1753 fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
1756 return UNKNOWN_LANGUAGE;
1760 // Exit now if no text
1761 if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
1762 if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
1765 DocTote doc_tote; // Reliability = 0..100
1767 // ScoringContext carries state across scriptspans
1768 ScoringContext scoringcontext;
1769 scoringcontext.debug_file = stderr;
1770 scoringcontext.flags_cld2_score_as_quads =
1771 ((flags & kCLDFlagScoreAsQuads) != 0);
1772 scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
1773 scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
1774 scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
1775 scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
1776 scoringcontext.ulscript = ULScript_Common;
1777 scoringcontext.scoringtables = &kScoringtables;
1778 scoringcontext.scanner = NULL;
1779 scoringcontext.init(); // Clear the internal memory arrays
1782 bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
1783 bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
1785 ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
1787 // Four individual script totals, Latin, Han, other2, other3
1788 int next_other_tote = 2;
1791 // Four totes for up to four different scripts pending at once
1792 Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
1793 bool tote_seen[4] = {false, false, false, false};
1794 int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
1795 ULScript tote_script[4] =
1796 {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
1798 // Loop through text spans in a single script
1799 ScriptScanner ss(buffer, buffer_length, is_plain_text);
1800 LangSpan scriptspan;
1802 scoringcontext.scanner = &ss;
1804 scriptspan.text = NULL;
1805 scriptspan.text_bytes = 0;
1806 scriptspan.offset = 0;
1807 scriptspan.ulscript = ULScript_Common;
1808 scriptspan.lang = UNKNOWN_LANGUAGE;
1810 int total_text_bytes = 0;
1811 int textlimit = FLAGS_cld_textlimit << 10; // in KB
1812 if (textlimit == 0) {textlimit = 0x7fffffff;}
1814 int advance_by = 2; // Advance 2 bytes
1815 int advance_limit = textlimit >> 3; // For first 1/8 of max document
1817 int initial_word_span = kDefaultWordSpan;
1818 if (FLAGS_cld_forcewords) {
1819 initial_word_span = kReallyBigWordSpan;
1822 // Pick up chunk sizes
1823 // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
1824 // Sanity check -- force into a reasonable range
1825 int chunksizequads = FLAGS_cld_smoothwidth;
1826 chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
1827 kMaxChunkSizeQuads);
1828 int chunksizeunis = (chunksizequads * 5) >> 1;
1830 // Varying short-span limit doesn't work well -- skips too much beyond 20KB
1831 // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
1832 int spantooshortlimit = kShortSpanThresh;
1834 // For debugging only. Not thread-safe
1835 prior_lang = UNKNOWN_LANGUAGE;
1836 prior_unreliable = false;
1838 // Allocate full-document prediction table for finding repeating words
1840 int* predict_tbl = new int[kPredictionTableSize];
1841 if (FlagRepeats(flags)) {
1842 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1847 // Loop through scriptspans accumulating number of text bytes in each language
1848 while (ss.GetOneScriptSpanLower(&scriptspan)) {
1849 ULScript ulscript = scriptspan.ulscript;
1851 // Squeeze out big chunks of text span if asked to
1852 if (FlagSqueeze(flags)) {
1853 // Remove repetitive or mostly-spaces chunks
1855 int chunksize = 0; // Use the default
1856 if (resultchunkvector != NULL) {
1857 newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
1858 scriptspan.text_bytes,
1861 newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
1864 scriptspan.text_bytes = newlen;
1866 // Check now and then to see if we should be squeezing
1867 if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
1868 !FlagFinish(flags)) {
1869 // fprintf(stderr, "CheapSqueezeTriggerTest, "
1870 // "first %d bytes of %d (>%d/2)<br>\n",
1871 // kCheapSqueezeTestLen,
1872 // scriptspan.text_bytes,
1873 // kCheapSqueezeTestThresh);
1875 if (CheapSqueezeTriggerTest(scriptspan.text,
1876 scriptspan.text_bytes,
1877 kCheapSqueezeTestLen)) {
1878 // Recursive call with big-chunk squeezing set
1879 if (FLAGS_cld2_html || FLAGS_dbgscore) {
1881 "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
1884 // Deallocate full-document prediction table
1885 delete[] predict_tbl;
1887 return DetectLanguageSummaryV2(
1892 allow_extended_lang,
1893 flags | kCLDFlagSqueeze,
1905 // Remove repetitive words if asked to
1906 if (FlagRepeats(flags)) {
1907 // Remove repetitive words
1909 if (resultchunkvector != NULL) {
1910 newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
1911 scriptspan.text_bytes,
1912 &hash, predict_tbl);
1914 newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
1915 &hash, predict_tbl);
1917 scriptspan.text_bytes = newlen;
1920 // Scoring depends on scriptspan buffer ALWAYS having
1921 // leading space and off-the-end space space space NUL,
1922 // DCHECK(scriptspan.text[0] == ' ');
1923 // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
1924 // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
1925 // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
1926 // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
1929 // Accumulate directly into the document total, or accumulate in one of four
1930 // chunk totals. The purpose of the multiple chunk totals is to piece
1931 // together short choppy pieces of text in alternating scripts. One total is
1932 // dedicated to Latin text, one to Han text, and the other two are dynamically
1935 scoringcontext.ulscript = scriptspan.ulscript;
1936 // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
1938 ScoreOneScriptSpan(scriptspan,
1943 total_text_bytes += scriptspan.text_bytes;
1944 } // End while (ss.GetOneScriptSpanLower())
1946 // Deallocate full-document prediction table
1947 delete[] predict_tbl;
1949 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1950 // If no forced <cr>, put one in front of dump
1951 if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
1952 doc_tote.Dump(stderr);
1956 // If extended languages are disallowed, remove them here
1957 if (!allow_extended_lang) {
1958 RemoveExtendedLanguages(&doc_tote);
1961 // Force close pairs to one or the other
1962 // If given, also update resultchunkvector
1963 RefineScoredClosePairs(&doc_tote, resultchunkvector,
1964 FLAGS_cld2_html, FLAGS_cld2_quiet);
1967 // Calculate return results
1968 // Find top three byte counts in tote heap
1969 int reliable_percent3[3];
1971 // Cannot use Add, etc. after sorting
1974 ExtractLangEtc(&doc_tote, total_text_bytes,
1975 reliable_percent3, language3, percent3, normalized_score3,
1976 text_bytes, is_reliable);
1978 bool have_good_answer = false;
1979 if (FlagFinish(flags)) {
1981 have_good_answer = true;
1982 } else if (total_text_bytes <= kShortTextThresh) {
1983 // Don't recurse on short text -- we already did word scores
1984 have_good_answer = true;
1985 } else if (*is_reliable &&
1986 (percent3[0] >= kGoodLang1Percent)) {
1987 have_good_answer = true;
1988 } else if (*is_reliable &&
1989 ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
1990 have_good_answer = true;
1994 if (have_good_answer) {
1995 // This is the real, non-recursive return
1997 // Move bytes for unreliable langs to another lang or UNKNOWN
1998 if (!FlagBestEffort(flags)) {
1999 RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
2002 // Redo the result extraction after the removal above
2004 ExtractLangEtc(&doc_tote, total_text_bytes,
2005 reliable_percent3, language3, percent3, normalized_score3,
2006 text_bytes, is_reliable);
2008 Language summary_lang;
2009 CalcSummaryLang(&doc_tote, total_text_bytes,
2010 reliable_percent3, language3, percent3,
2011 &summary_lang, is_reliable,
2012 FLAGS_cld2_html, FLAGS_cld2_quiet, flags);
2014 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
2015 for (int i = 0; i < 3; ++i) {
2016 if (language3[i] != UNKNOWN_LANGUAGE) {
2017 fprintf(stderr, "%s.%dR(%d%%) ",
2018 LanguageCode(language3[i]),
2019 reliable_percent3[i],
2024 fprintf(stderr, "%d bytes ", total_text_bytes);
2025 fprintf(stderr, "= %s%c ",
2026 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
2027 fprintf(stderr, "<br><br>\n");
2030 // Slightly condensed if quiet
2031 if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
2032 fprintf(stderr, " ");
2033 for (int i = 0; i < 3; ++i) {
2034 if (language3[i] != UNKNOWN_LANGUAGE) {
2035 fprintf(stderr, " %s %d%% ",
2036 LanguageCode(language3[i]),
2040 fprintf(stderr, "= %s%c ",
2041 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
2042 fprintf(stderr, "<br>\n");
2045 // Extend results to fully cover the input buffer
2046 FinishResultVector(0, buffer_length, resultchunkvector);
2048 return summary_lang;
2051 // Not a good answer -- do recursive call to refine
2052 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
2053 // This is what we hope to improve on in the recursive call, if any
2054 PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
2057 // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
2058 // For this purpose, we treat "Ignore" as top40
2059 Language new_plus_one = UNKNOWN_LANGUAGE;
2061 if (total_text_bytes < kShortTextThresh) {
2062 // Short text: Recursive call with top40 and short set
2063 if (FLAGS_cld2_html || FLAGS_dbgscore) {
2064 fprintf(stderr, " ---text_bytes[%d] "
2065 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2068 return DetectLanguageSummaryV2(
2073 allow_extended_lang,
2074 flags | kCLDFlagTop40 | kCLDFlagRepeats |
2075 kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2085 // Longer text: Recursive call with top40 set
2086 if (FLAGS_cld2_html || FLAGS_dbgscore) {
2088 " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2091 return DetectLanguageSummaryV2(
2096 allow_extended_lang,
2097 flags | kCLDFlagTop40 | kCLDFlagRepeats |
2109 // For debugging and wrappers. Not thread safe.
2110 static char temp_detectlanguageversion[32];
2112 // Return version text string
2113 // String is "code_version - data_build_date"
2114 const char* DetectLanguageVersion() {
2115 if (kScoringtables.quadgram_obj == NULL) {return "";}
2116 sprintf(temp_detectlanguageversion,
2117 "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
2118 return temp_detectlanguageversion;
2122 } // End namespace CLD2