1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
17 // Updated 2014.01 for dual table lookup
27 #include "integral_types.h"
28 #include "lang_script.h"
29 #include "utf8statetable.h"
31 #ifdef CLD2_DYNAMIC_MODE
32 #include "cld2_dynamic_data.h"
33 #include "cld2_dynamic_data_loader.h"
35 #include "cld2tablesummary.h"
36 #include "compact_lang_det_impl.h"
37 #include "compact_lang_det_hint_code.h"
38 #include "getonescriptspan.h"
46 // Linker supplies the right tables, from files
47 // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc
48 // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc
49 // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc
50 // cld2_generated_distinctocta*.cc
51 // cld_generated_score_quad_octa_1024_256.cc
53 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table
54 // sizes that are 1/3/5 times a power of two, instead of just powers of two.
55 // Gives more flexibility of total footprint for CLD2.
57 extern const int kLanguageToPLangSize;
58 extern const int kCloseSetSize;
60 extern const UTF8PropObj cld_generated_CjkUni_obj;
61 extern const CLD2TableSummary kCjkCompat_obj;
62 extern const CLD2TableSummary kCjkDeltaBi_obj;
63 extern const CLD2TableSummary kDistinctBiTable_obj;
64 extern const CLD2TableSummary kQuad_obj;
65 extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables
66 extern const CLD2TableSummary kDeltaOcta_obj;
67 extern const CLD2TableSummary kDistinctOcta_obj;
68 extern const short kAvgDeltaOctaScore[];
70 #ifdef CLD2_DYNAMIC_MODE
71 // CLD2_DYNAMIC_MODE is defined:
72 // Data will be read from an mmap opened at runtime.
74 // Convenience for nulling things out completely at any point.
75 static ScoringTables NULL_TABLES = {
76 NULL, //&cld_generated_CjkUni_obj,
77 NULL, //&kCjkCompat_obj,
78 NULL, //&kCjkDeltaBi_obj,
79 NULL, //&kDistinctBiTable_obj,
82 NULL, //&kDeltaOcta_obj,
83 NULL, //&kDistinctOcta_obj,
84 NULL, //kAvgDeltaOctaScore,
86 static ScoringTables kScoringtables = NULL_TABLES; // copy constructed
87 static bool dynamicDataLoaded = false;
88 static bool dataSourceIsFile = false;
89 static ScoringTables* dynamicTables = NULL;
90 static void* mmapAddress = NULL;
91 static int mmapLength = 0;
93 bool isDataLoaded() { return dynamicDataLoaded; }
95 void loadDataFromFile(const char* fileName) {
99 dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
100 kScoringtables = *dynamicTables;
101 dataSourceIsFile = true;
102 dynamicDataLoaded = true;
105 void loadDataFromRawAddress(const void* rawAddress, const int length) {
106 if (isDataLoaded()) {
109 dynamicTables = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length);
110 kScoringtables = *dynamicTables;
111 dataSourceIsFile = false;
112 dynamicDataLoaded = true;
116 if (!dynamicDataLoaded) return;
117 if (dataSourceIsFile) {
118 CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength);
120 CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables);
122 dynamicDataLoaded = false;
123 dataSourceIsFile = false; // vacuous
124 kScoringtables = NULL_TABLES; // Housekeeping: null all pointers
127 // This initializes kScoringtables.quadgram_obj etc.
128 static const ScoringTables kScoringtables = {
129 &cld_generated_CjkUni_obj,
132 &kDistinctBiTable_obj,
135 &kQuad_obj2, // Dual lookup tables
141 #endif // #ifdef CLD2_DYNAMIC_MODE
144 static const bool FLAGS_cld_no_minimum_bytes = false;
145 static const bool FLAGS_cld_forcewords = true;
146 static const bool FLAGS_cld_showme = false;
147 static const bool FLAGS_cld_echotext = true;
148 static const int32 FLAGS_cld_textlimit = 160;
149 static const int32 FLAGS_cld_smoothwidth = 20;
150 static const bool FLAGS_cld_2011_hints = true;
151 static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;
153 static const bool FLAGS_dbgscore = false;
// ---- Language-hint boosts ----
// Boost a hinted language by N initially.
static const int kLangHintInitial = 12;
// Boost a hinted language by N/16 per quadgram.
static const int kLangHintBoost = 12;

// ---- Short spans ----
// Threshold in bytes.
static const int kShortSpanThresh = 32;
// Look at the first 1K of short spans.
static const int kMaxSecondChanceLen = 1024;

// ---- Deciding whether to squeeze at all ----
// Only look for squeezing after this many text bytes.
static const int kCheapSqueezeTestThresh = 4096;
// Number of bytes tested to trigger squeezing.
static const int kCheapSqueezeTestLen = 256;
// Trigger squeezing if >= 25% of the tested bytes are spaces.
static const int kSpacesTriggerPercent = 25;
// Trigger squeezing if >= 67% of the tested bytes are predicted.
static const int kPredictTriggerPercent = 67;

// ---- Squeezing itself ----
// Squeeze in 48-byte chunks by default.
static const int kChunksizeDefault = 48;
// Squeeze a chunk if >= 25% spaces.
static const int kSpacesThreshPercent = 25;
// Squeeze a chunk if >= 40% predicted.
static const int kPredictThreshPercent = 40;

// Upper bound, in bytes, on the scans for a space done by BackscanToSpace
// and ForwardscanToSpace.
static const int kMaxSpaceScan = 32;

// ---- Result quality ----
static const int kGoodLang1Percent = 70;
static const int kGoodLang1and2Percent = 93;
// Threshold in bytes.
static const int kShortTextThresh = 256;

// ---- Chunk sizing ----
// A chunk is at least four quads.
static const int kMinChunkSizeQuads = 4;
// A chunk is at most 1K quads.
static const int kMaxChunkSizeQuads = 1024;

// ---- Word scoring ----
// Scan at least this many initial bytes with word scoring.
static const int kDefaultWordSpan = 256;
// Forces word scoring over all text.
static const int kReallyBigWordSpan = 9999999;

// Record in seq if >= 50% reliable.
static const int kMinReliableSeq = 50;

// Must be exactly 4096: the predictor hash is masked to 12 bits (& 0xfff)
// and indexes a table of this size.
static const int kPredictionTableSize = 4096;

// ---- Percent cutoffs ----
// Below this => no second.
static const int kNonEnBoilerplateMinPercent = 17;
// Below this => no second.
static const int kNonFIGSBoilerplateMinPercent = 20;
// Below this => UNK.
static const int kGoodFirstMinPercent = 26;
// Below this => unreliable.
static const int kGoodFirstReliableMinPercent = 51;
// Above this => unreliable.
static const int kIgnoreMaxPercent = 20;
// Below this => unreliable.
static const int kKeepMinPercent = 2;
199 // Statistically closest language, based on quadgram table
200 // Those that are far from other languages map to UNKNOWN_LANGUAGE
201 // Subscripted by Language
203 // From lang_correlation.txt and hand-edits
204 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
205 // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
206 // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
208 static const int kMinCorrPercent = 24; // Pick off how close you want
209 // 24 catches PERSIAN <== ARABIC
210 // but not SPANISH <== PORTUGUESE
211 static Language Unknown = UNKNOWN_LANGUAGE;
214 // Subscripted by Language
215 static const Language kClosestAltLanguage[] = {
216 (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
217 (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
218 (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
219 (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
220 (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
221 (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
222 (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
223 (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
224 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
225 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
226 (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
227 ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
228 (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
229 (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
230 (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
231 (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
232 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
233 (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
234 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
235 (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
236 ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
237 ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
238 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
239 ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
240 (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
241 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
242 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
243 (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
244 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
245 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
246 (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
247 (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
248 ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
249 (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
250 (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
251 (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
252 (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
253 (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
254 (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
255 ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
256 (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
257 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
258 ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
259 ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
260 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
261 ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
262 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
263 (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
264 (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
265 (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
266 (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
267 (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
268 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
269 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
270 (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
271 (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
272 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
273 ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
274 ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
275 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
276 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
277 (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
278 ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
279 (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
280 (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
281 ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
282 ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
283 (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
284 (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
285 // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
286 (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
287 (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
288 (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
289 (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
290 ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
291 (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
292 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
293 ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
294 (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
295 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
296 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
297 (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
298 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
299 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
300 (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
301 (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
302 ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
303 (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
304 ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
305 ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
306 ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
307 ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
308 (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
309 (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
310 (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
311 ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
312 (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
313 ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
314 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
315 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
316 ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
317 (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
318 (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
319 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
320 ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
321 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
322 (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
323 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
324 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
325 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
326 ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
327 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
328 (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
329 (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
330 ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
331 ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
332 ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
333 (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
334 (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
335 ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
336 ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
337 (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
338 (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
339 (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
340 ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
341 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
342 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
343 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
344 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
345 ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
346 ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
347 ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
348 ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
349 ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
350 (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
351 ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
352 (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
353 ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
354 ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
355 ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
356 ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
357 ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
358 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
359 ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
360 (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
361 ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
362 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
363 (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
364 (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
365 (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
366 ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
367 (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
368 (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
369 ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
370 (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
371 ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
372 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
373 ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
374 (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
375 (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
376 ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
377 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
379 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN
380 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // IGBO
381 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE
382 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN
385 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
386 // kClosestAltLanguage_has_incorrect_size);
389 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
390 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
391 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
392 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
393 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
394 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
395 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
398 // Defines Top40 packed languages
400 // Google top 40 languages
402 // Tier 0/1 Language enum list (16)
403 // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
404 // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
405 // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
408 // Tier 2 Language enum list (22)
409 // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
410 // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
411 // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
412 // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
415 // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
417 // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
420 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
424 void PrintText(FILE* f, Language cur_lang, const string& temp) {
425 if (temp.size() == 0) {return;}
426 fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());
430 //------------------------------------------------------------------------------
431 // For --cld_html debugging output. Not thread safe
432 //------------------------------------------------------------------------------
433 static Language prior_lang = UNKNOWN_LANGUAGE;
434 static bool prior_unreliable = false;
436 //------------------------------------------------------------------------------
437 // End For --cld_html debugging output
438 //------------------------------------------------------------------------------
441 // Backscan to word boundary, returning how many bytes n to go back
442 // so that src - n is non-space and src - n - 1 is space.
443 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
444 int BackscanToSpace(const char* src, int limit) {
446 limit = minint(limit, kMaxSpaceScan);
448 if (src[-n - 1] == ' ') {return n;} // We are at _X
453 if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin
459 // Forwardscan to word boundary, returning how many bytes n to go forward
460 // so that src + n is non-space and src + n - 1 is space.
461 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
462 int ForwardscanToSpace(const char* src, int limit) {
464 limit = minint(limit, kMaxSpaceScan);
466 if (src[n] == ' ') {return n + 1;} // We are at _X
471 if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin
478 // This uses a cheap predictor to get a measure of compression, and
479 // hence a measure of repetitiveness. It works on complete UTF-8 characters
480 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
481 // all the time when done with a byte-based count. Sigh.
483 // To allow running prediction across multiple chunks, caller passes in current
484 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
486 // Returns the number of *bytes* correctly predicted, increments by 1..4 for
487 // each correctly-predicted character.
489 // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
492 // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
494 int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
496 const uint8* src = reinterpret_cast<const uint8*>(isrc);
497 const uint8* srclimit = src + src_len;
498 int local_hash = *hash;
500 while (src < srclimit) {
504 // Pick up one char and length
506 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
508 } else if ((c & 0xe0) == 0xc0) {
510 c = (c << 8) | src[1];
512 } else if ((c & 0xf0) == 0xe0) {
514 c = (c << 16) | (src[1] << 8) | src[2];
518 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
523 int p = tbl[local_hash]; // Prediction
524 tbl[local_hash] = c; // Update prediction
526 p_count += incr; // Count bytes of good predictions
529 local_hash = ((local_hash << 4) ^ c) & 0xfff;
537 // Counts number of spaces; a little faster than one-at-a-time
538 // Doesn't count odd bytes at end
539 int CountSpaces4(const char* src, int src_len) {
541 for (int i = 0; i < (src_len & ~3); i += 4) {
542 s_count += (src[i] == ' ');
543 s_count += (src[i+1] == ' ');
544 s_count += (src[i+2] == ' ');
545 s_count += (src[i+3] == ' ');
551 // Remove words of text that have more than half their letters predicted
552 // correctly by our cheap predictor, moving the remaining words in-place
553 // to the front of the input buffer.
555 // To allow running prediction across multiple chunks, caller passes in current
556 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
558 // Return the new, possibly-shorter length
560 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
563 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
564 const uint8* src = reinterpret_cast<const uint8*>(isrc);
565 const uint8* srclimit = src + src_len;
567 int local_hash = *hash;
568 char* word_dst = dst; // Start of next word
569 int good_predict_bytes = 0;
570 int word_length_bytes = 0;
572 while (src < srclimit) {
578 if ((good_predict_bytes * 2) > word_length_bytes) {
579 // Word is well-predicted: backup to start of this word
581 if (FLAGS_cld_showme) {
582 // Mark the deletion point with period
583 // Don't repeat multiple periods
584 // Cannot mark with more bytes or may overwrite unseen input
585 if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
591 word_dst = dst; // Start of next word
592 good_predict_bytes = 0;
593 word_length_bytes = 0;
596 // Pick up one char and length
598 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
600 } else if ((c & 0xe0) == 0xc0) {
603 c = (c << 8) | src[1];
605 } else if ((c & 0xf0) == 0xe0) {
609 c = (c << 16) | (src[1] << 8) | src[2];
616 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
620 word_length_bytes += incr;
622 int p = tbl[local_hash]; // Prediction
623 tbl[local_hash] = c; // Update prediction
625 good_predict_bytes += incr; // Count good predictions
628 local_hash = ((local_hash << 4) ^ c) & 0xfff;
633 if ((dst - isrc) < (src_len - 3)) {
634 // Pad and make last char clean UTF-8 by putting following spaces
639 } else if ((dst - isrc) < src_len) {
640 // Make last char clean UTF-8 by putting following space off the end
644 return static_cast<int>(dst - isrc);
648 // This alternate form overwrites redundant words, thus avoiding corrupting the
649 // backmap used to generate a vector of original-text ranges.
650 int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
651 const uint8* src = reinterpret_cast<const uint8*>(isrc);
652 const uint8* srclimit = src + src_len;
654 int local_hash = *hash;
655 char* word_dst = dst; // Start of next word
656 int good_predict_bytes = 0;
657 int word_length_bytes = 0;
659 while (src < srclimit) {
665 if ((good_predict_bytes * 2) > word_length_bytes) {
666 // Word [word_dst..dst-1) is well-predicted: overwrite
667 for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';}
669 word_dst = dst; // Start of next word
670 good_predict_bytes = 0;
671 word_length_bytes = 0;
674 // Pick up one char and length
676 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
678 } else if ((c & 0xe0) == 0xc0) {
681 c = (c << 8) | src[1];
683 } else if ((c & 0xf0) == 0xe0) {
687 c = (c << 16) | (src[1] << 8) | src[2];
694 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
698 word_length_bytes += incr;
700 int p = tbl[local_hash]; // Prediction
701 tbl[local_hash] = c; // Update prediction
703 good_predict_bytes += incr; // Count good predictions
706 local_hash = ((local_hash << 4) ^ c) & 0xfff;
711 if ((dst - isrc) < (src_len - 3)) {
712 // Pad and make last char clean UTF-8 by putting following spaces
717 } else if ((dst - isrc) < src_len) {
718 // Make last char clean UTF-8 by putting following space off the end
722 return static_cast<int>(dst - isrc);
726 // Remove portions of text that have a high density of spaces, or that are
727 // overly repetitive, squeezing the remaining text in-place to the front of the
730 // Squeezing looks at density of space/predicted chars in fixed-size chunks,
731 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
733 // Return the new, possibly-shorter length
735 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
738 int CheapSqueezeInplace(char* isrc,
743 char* srclimit = src + src_len;
744 bool skipping = false;
747 // Allocate local prediction table.
748 int* predict_tbl = new int[kPredictionTableSize];
749 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
751 int chunksize = ichunksize;
752 if (chunksize == 0) {chunksize = kChunksizeDefault;}
753 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
754 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
756 while (src < srclimit) {
757 int remaining_bytes = srclimit - src;
758 int len = minint(chunksize, remaining_bytes);
759 // Make len land us on a UTF-8 character boundary.
760 // Ah. Also fixes mispredict because we could get out of phase
761 // Loop always terminates at trailing space in buffer
762 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
764 int space_n = CountSpaces4(src, len);
765 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
766 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
769 // Keeping-to-skipping transition; do it at a space
770 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
773 // Force a leading space if the first chunk is deleted
776 if (FLAGS_cld_showme) {
777 // Mark the deletion point with black square U+25A0
778 *dst++ = static_cast<unsigned char>(0xe2);
779 *dst++ = static_cast<unsigned char>(0x96);
780 *dst++ = static_cast<unsigned char>(0xa0);
788 // Skipping-to-keeping transition; do it at a space
789 int n = ForwardscanToSpace(src, len);
791 remaining_bytes -= n; // Shrink remaining length
795 // "len" can be negative in some cases
797 memmove(dst, src, len);
804 if ((dst - isrc) < (src_len - 3)) {
805 // Pad and make last char clean UTF-8 by putting following spaces
810 } else if ((dst - isrc) < src_len) {
811 // Make last char clean UTF-8 by putting following space off the end
815 // Deallocate local prediction table
816 delete[] predict_tbl;
817 return static_cast<int>(dst - isrc);
820 // This alternate form overwrites redundant words, thus avoiding corrupting the
821 // backmap used to generate a vector of original-text ranges.
822 int CheapSqueezeInplaceOverwrite(char* isrc,
827 char* srclimit = src + src_len;
828 bool skipping = false;
831 // Allocate local prediction table.
832 int* predict_tbl = new int[kPredictionTableSize];
833 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
835 int chunksize = ichunksize;
836 if (chunksize == 0) {chunksize = kChunksizeDefault;}
837 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
838 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
840 // Always keep first byte (space)
843 while (src < srclimit) {
844 int remaining_bytes = srclimit - src;
845 int len = minint(chunksize, remaining_bytes);
846 // Make len land us on a UTF-8 character boundary.
847 // Ah. Also fixes mispredict because we could get out of phase
848 // Loop always terminates at trailing space in buffer
849 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
851 int space_n = CountSpaces4(src, len);
852 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
853 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
854 // Overwrite the text [dst-n..dst)
856 // Keeping-to-skipping transition; do it at a space
857 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
858 // Text [word_dst..dst) is well-predicted: overwrite
859 for (char* p = dst - n; p < dst; ++p) {*p = '.';}
862 // Overwrite the text [dst..dst+len)
863 for (char* p = dst; p < dst + len; ++p) {*p = '.';}
864 dst[len - 1] = ' '; // Space at end so we can see what is happening
868 // Skipping-to-keeping transition; do it at a space
869 int n = ForwardscanToSpace(src, len);
870 // Text [dst..dst+n) is well-predicted: overwrite
871 for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
879 if ((dst - isrc) < (src_len - 3)) {
880 // Pad and make last char clean UTF-8 by putting following spaces
885 } else if ((dst - isrc) < src_len) {
886 // Make last char clean UTF-8 by putting following space off the end
890 // Deallocate local prediction table
891 delete[] predict_tbl;
892 return static_cast<int>(dst - isrc);
895 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
896 // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
897 // Just CountSpaces is about 340 MB/sec
898 // Byte-only CountPredictedBytes is about 150 MB/sec
899 // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
900 // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
901 // Unjammed byte-only both = 170 MB/sec
902 // Jammed byte-only both = 120 MB/sec
903 // Back to original w/slight updates, 110 MB/sec
// Cheap test of whether a text span looks repetitive or space-heavy.
// Only the first testsize bytes of src are examined: spaces are counted with
// CountSpaces4 and well-predicted bytes with CountPredictedBytes, and each
// count is compared against a percentage-of-testsize threshold.
//   src      - text to examine
//   src_len  - number of bytes available in src
//   testsize - number of leading bytes to test
// Returns false immediately when src_len is smaller than testsize.
// NOTE(review): the outcome of the threshold comparison is presumably what the
// function returns -- confirm against the complete body.
905 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
906 // Don't trigger at all on short text
907 if (src_len < testsize) {return false;}
// Thresholds are percentages of the tested prefix, not of the whole text
908 int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
909 int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
911 // Allocate local prediction table.
912 int* predict_tbl = new int[kPredictionTableSize];
// Zero the freshly allocated table before it is used for prediction counting
913 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
// Either count reaching its threshold is enough to satisfy the condition
916 if ((CountSpaces4(src, testsize) >= space_thresh) ||
917 (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
921 // Deallocate local prediction table
922 delete[] predict_tbl;
929 // Delete any extended languages from doc_tote
930 void RemoveExtendedLanguages(DocTote* doc_tote) {
// Minimum reliable percent (stored reliability divided by stored byte count)
// that a doc_tote entry needs in order to be kept unchanged.
934 static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this
936 // For Tier3 languages, require a minimum number of bytes to be first-place lang
937 static const int kGoodFirstT3MinBytes = 24; // <this => no first
939 // Move bytes for unreliable langs to another lang or UNKNOWN
940 // doc_tote is sorted, so cannot Add
942 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
943 // merge both into CHINESE.
946 // We also want to remove Tier3 languages as the first lang if there is very
947 // little text like ej1 ej2 ej3 ej4
948 // Maybe fold this back in earlier
//
// Works in two passes over all doc_tote slots:
//   1. entries below kMinReliableKeepPercent that have a close alternative
//      language (kClosestAltLanguage) also present in doc_tote are merged
//      with that alternative;
//   2. entries still below kMinReliableKeepPercent are deleted.
// Parameters:
//   doc_tote         - per-document language totals, edited in place
//   FLAGS_cld2_html  - together with !FLAGS_cld2_quiet, enables debug output
//                      to stderr describing each merged or deleted entry
//   FLAGS_cld2_quiet - suppresses that debug output
950 void RemoveUnreliableLanguages(DocTote* doc_tote,
951 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
952 // Prepass to merge some low-reliability languages
953 // TODO: this shouldn't really reach in to the internal structure of doc_tote
955 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
956 int plang = doc_tote->Key(sub);
957 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
959 Language lang = static_cast<Language>(plang);
960 int bytes = doc_tote->Value(sub);
961 int reli = doc_tote->Reliability(sub);
962 if (bytes == 0) {continue;} // Zero bytes
963 total_bytes += bytes;
965 // Reliable percent = stored reliable score over stored bytecount
966 int reliable_percent = reli / bytes;
967 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
969 // This language is too unreliable to keep, but we might merge it.
970 Language altlang = UNKNOWN_LANGUAGE;
// Only languages numbered up to HAWAIIAN are looked up in kClosestAltLanguage
971 if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
972 if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
974 // Look for alternative in doc_tote
975 int altsub = doc_tote->Find(altlang);
976 if (altsub < 0) {continue;} // No alternative text
978 int bytes2 = doc_tote->Value(altsub);
979 int reli2 = doc_tote->Reliability(altsub);
980 if (bytes2 == 0) {continue;} // Zero bytes
982 // Reliable percent is stored reliable score over stored bytecount
983 int reliable_percent2 = reli2 / bytes2;
985 // Merge one language into the other. Break ties toward lower lang #
988 bool into_lang = false;
// Condition: lang is the more reliable of the pair, or it is equally reliable
// and has the lower language number
989 if ((reliable_percent2 < reliable_percent) ||
990 ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
996 // Make sure merged reliability doesn't drop and is enough to avoid delete
997 int newpercent = maxint(reliable_percent, reliable_percent2);
998 newpercent = maxint(newpercent, kMinReliableKeepPercent);
999 int newbytes = bytes + bytes2;
// Reliability is stored pre-multiplied by byte count (percent = reli / bytes),
// so scale the merged percent back up by the merged byte count
1000 int newreli = newpercent * newbytes;
// Retire the source slot and give the destination slot the combined totals
1002 doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
1003 doc_tote->SetScore(fromsub, 0);
1004 doc_tote->SetReliability(fromsub, 0);
// NOTE(review): byte counts are read with Value() above but the merged byte
// count is written with SetScore() here -- confirm this is intended
1005 doc_tote->SetScore(tosub, newbytes);
1006 doc_tote->SetReliability(tosub, newreli);
1008 // Show fate of unreliable languages if at least 10 bytes
1009 if (FLAGS_cld2_html && (newbytes >= 10) &&
1010 !FLAGS_cld2_quiet) {
1012 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1013 LanguageCode(altlang), reliable_percent2, bytes2,
1014 LanguageCode(lang));
1016 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1017 LanguageCode(lang), reliable_percent, bytes,
1018 LanguageCode(altlang));
1024 // Pass to delete any remaining unreliable languages
1025 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1026 int plang = doc_tote->Key(sub);
1027 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
1029 Language lang = static_cast<Language>(plang);
1030 int bytes = doc_tote->Value(sub);
1031 int reli = doc_tote->Reliability(sub);
1032 if (bytes == 0) {continue;} // Zero bytes
1034 // Reliable percent is stored as reliable score over stored bytecount
1035 int reliable_percent = reli / bytes;
1036 if (reliable_percent >= kMinReliableKeepPercent) { // Keeper?
1040 // Delete unreliable entry
1041 doc_tote->SetKey(sub, DocTote::kUnusedKey);
1042 doc_tote->SetScore(sub, 0);
1043 doc_tote->SetReliability(sub, 0);
1045 // Show fate of unreliable languages if at least 10 bytes
1046 if (FLAGS_cld2_html && (bytes >= 10) &&
1047 !FLAGS_cld2_quiet) {
1048 fprintf(stderr, "{Unreli %s.%dR,%dB} ",
1049 LanguageCode(lang), reliable_percent, bytes);
1053 ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
1057 // Move all the text bytes from lower byte-count to higher one
//
// Folds the doc_tote entry for lang1 into the entry for lang2 and, when a
// result chunk vector is supplied, relabels its lang1 chunks and merges
// neighbouring chunks that end up with the same language.
// Parameters:
//   lang1, lang2         - source and destination languages
//   lang1_sub, lang2_sub - doc_tote slot indexes of lang1 and lang2
//   doc_tote             - document totals, edited in place
//   resultchunkvector    - optional; when NULL only doc_tote is updated
1058 void MoveLang1ToLang2(Language lang1, Language lang2,
1059 int lang1_sub, int lang2_sub,
1061 ResultChunkVector* resultchunkvector) {
1062 // In doc_tote, move all the bytes lang1 => lang2
// Value, Score and Reliability are each summed into the lang2 slot
1063 int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
1064 doc_tote->SetValue(lang2_sub, sum);
1065 sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
1066 doc_tote->SetScore(lang2_sub, sum);
1067 sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
1068 doc_tote->SetReliability(lang2_sub, sum);
// Mark the lang1 slot unused and clear its score and reliability
1071 doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
1072 doc_tote->SetScore(lang1_sub, 0);
1073 doc_tote->SetReliability(lang1_sub, 0);
1075 // In resultchunkvector, move all the bytes lang1 => lang2
1076 if (resultchunkvector == NULL) {return;}
// Compact the vector in place: i reads every entry, k indexes the next slot
// to keep, and the vector is resized to k at the end
1079 uint16 prior_lang = UNKNOWN_LANGUAGE;
1080 for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
1081 ResultChunk* rc = &(*resultchunkvector)[i];
1082 if (rc->lang1 == lang1) {
1083 // Update entry[i] lang1 => lang2
1086 // One change may produce two merges -- entry before and entry after
1087 if ((rc->lang1 == prior_lang) && (k > 0)) {
1088 // Merge with previous, deleting entry[i]
1089 ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
1090 prior_rc->bytes += rc->bytes;
1091 // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
// Otherwise keep entry[i] by copying it down to slot k
1094 (*resultchunkvector)[k] = (*resultchunkvector)[i];
1095 // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
1098 prior_lang = rc->lang1;
// Drop the entries left over after compaction
1100 resultchunkvector->resize(k);
1105 // Move less likely byte count to more likely for close pairs of languages
1106 // If given, also update resultchunkvector
//
// For each doc_tote slot whose language belongs to a close set
// (LanguageCloseSet returns nonzero), searches the later slots for another
// language in the same set and moves one entry into the other with
// MoveLang1ToLang2.
// Parameters:
//   doc_tote          - document totals, edited in place
//   resultchunkvector - passed through to MoveLang1ToLang2
//   FLAGS_cld2_html, FLAGS_cld2_quiet - control the debug line on stderr
//                       (the global FLAGS_dbgscore also enables it)
1107 void RefineScoredClosePairs(DocTote* doc_tote,
1108 ResultChunkVector* resultchunkvector,
1109 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1110 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1111 int close_packedlang = doc_tote->Key(sub);
1112 int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
// A close-set number of zero means there is nothing to pair this slot with
1113 if (subscr == 0) {continue;}
1115 // We have a close pair language -- if the other one is also scored and the
1116 // longword score differs enough, put all our eggs into one basket
1118 // Nonzero longword score: Go look for the other of this pair
1119 for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1120 if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
1121 // We have a matching pair
1122 int close_packedlang2 = doc_tote->Key(sub2);
1124 // Move all the text bytes from lower byte-count to higher one
1125 int from_sub, to_sub;
1126 Language from_lang, to_lang;
// Direction is chosen by comparing the byte counts (Value) of the two slots
1127 if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1130 from_lang = static_cast<Language>(close_packedlang);
1131 to_lang = static_cast<Language>(close_packedlang2);
1135 from_lang = static_cast<Language>(close_packedlang2);
1136 to_lang = static_cast<Language>(close_packedlang);
1139 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1140 // Show fate of closepair language
1141 int val = doc_tote->Value(from_sub); // byte count
1142 int reli = doc_tote->Reliability(from_sub);
1143 int reliable_percent = reli / (val ? val : 1); // avoid zdiv
1144 fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
1145 LanguageCode(from_lang),
1147 doc_tote->Value(from_sub),
1148 LanguageCode(to_lang));
1150 MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
1151 doc_tote, resultchunkvector);
// Only the first matching partner found for this slot is merged
1152 break; // Exit inner for sub2 loop
1159 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
1160 uint8* lang_hint_boost) {
// Writes the len bytes starting at txt to f after passing them through
// GetHtmlEscapedText.
//   f   - output stream
//   txt - start of the text; need not be NUL-terminated since len is given
//   len - number of bytes of txt to print
1164 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
// Copy into a string so the escaping helper receives exactly len bytes
1165 string temp(txt, len);
1166 fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
// Prints a bracketed language marker for the current chunk to f.
// The bracketed-code form shows LanguageCode(cur_lang) followed by a star
// when cur_unreliable is true.
// NOTE(review): chunk_tote and prior_unreliable are not referenced on the
// lines visible here -- confirm whether they are used at all.
1169 void PrintLang(FILE* f, Tote* chunk_tote,
1170 Language cur_lang, bool cur_unreliable,
1171 Language prior_lang, bool prior_unreliable) {
// A repeat of the previous language is handled separately from a change
1172 if (cur_lang == prior_lang) {
1175 fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
// Debug output to stderr of the top language for a piece of text.
// Prints empty brackets when top_lang repeats the file-level prior_lang and is
// not UNKNOWN_LANGUAGE; the other visible path prints the bracketed
// LanguageName and stores top_lang in prior_lang.
// Writes the shared prior_lang variable, so it carries state between calls.
1180 void PrintTopLang(Language top_lang) {
1181 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1182 fprintf(stderr, "[] ");
1184 fprintf(stderr, "[%s] ", LanguageName(top_lang));
1185 prior_lang = top_lang;
// Same bracketed output as PrintTopLang, but wrapped in an HTML span with the
// color a0a0a0 and followed by a newline. Output goes to stderr.
// Also reads and writes the file-level prior_lang variable.
1189 void PrintTopLangSpeculative(Language top_lang) {
1190 fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1191 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1192 fprintf(stderr, "[] ");
1194 fprintf(stderr, "[%s] ", LanguageName(top_lang));
1195 prior_lang = top_lang;
1197 fprintf(stderr, "</span>\n");
// Prints a one-line HTML summary of up to three detected languages to f.
//   language3   - three languages; UNKNOWN_LANGUAGE entries are not printed
//   percent3    - percentage for each of the three languages
//   text_bytes  - pointer to the total byte count, printed after the languages
//   is_reliable - pointer to the overall reliability flag; when false the
//                 first language is marked with a star
1200 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1201 const int* text_bytes, const bool* is_reliable) {
1202 fprintf(f, "<br> Initial_Languages ");
1203 if (language3[0] != UNKNOWN_LANGUAGE) {
1204 fprintf(f, "%s%s(%d%%) ",
1205 LanguageName(language3[0]),
1206 *is_reliable ? "" : "*",
1209 if (language3[1] != UNKNOWN_LANGUAGE) {
1210 fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]);
1212 if (language3[2] != UNKNOWN_LANGUAGE) {
1213 fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]);
1215 fprintf(f, "%d bytes \n", *text_bytes);
1217 fprintf(f, "<br>\n");
1221 // Return internal probability score (sum) per 1024 bytes
//   lang, ulscript - not referenced on the visible lines of the body
//   bytecount      - number of text bytes the score was accumulated over
//   score          - accumulated score
// Returns 0.0 when bytecount is zero or negative.
1222 double GetNormalizedScore(Language lang, ULScript ulscript,
1223 int bytecount, int score) {
1224 if (bytecount <= 0) {return 0.0;}
// NOTE(review): both operands are int, so the shift and the division are done
// in integer arithmetic and the quotient is truncated before it is converted
// to double; a large score could also overflow in the shift -- confirm this
// is intended
1225 return (score << 10) / bytecount;
1228 // Extract return values before fixups
//
// Fills the caller result arrays from slots 0, 1 and 2 of doc_tote.
// Parameters:
//   doc_tote          - document totals; slots 0..2 are read
//   total_text_bytes  - total text bytes seen; raised to the sum of the three
//                       byte counts when it is smaller than that sum
//   reliable_percent3 - out, three entries: reliability / byte count per slot
//   language3         - out, three entries: language per slot, or
//                       UNKNOWN_LANGUAGE when the slot is unused or unknown
//   percent3          - out, three entries: percent of total text bytes
//   normalized_score3 - out, three entries: GetNormalizedScore per slot
//   text_bytes        - out: the (possibly raised) total text bytes
//   is_reliable       - out: overall reliability, taken from slot 0
1229 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
1230 int* reliable_percent3, Language* language3, int* percent3,
1231 double* normalized_score3,
1232 int* text_bytes, bool* is_reliable) {
// Start every output at its no-result default
1233 reliable_percent3[0] = 0;
1234 reliable_percent3[1] = 0;
1235 reliable_percent3[2] = 0;
1236 language3[0] = UNKNOWN_LANGUAGE;
1237 language3[1] = UNKNOWN_LANGUAGE;
1238 language3[2] = UNKNOWN_LANGUAGE;
1242 normalized_score3[0] = 0.0;
1243 normalized_score3[1] = 0.0;
1244 normalized_score3[2] = 0.0;
1246 *text_bytes = total_text_bytes;
1247 *is_reliable = false;
// Slot 0: first language
1253 int lang1 = doc_tote->Key(0);
1254 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1255 // We have a top language
1256 language3[0] = static_cast<Language>(lang1);
1257 bytecount1 = doc_tote->Value(0);
1258 int reli1 = doc_tote->Reliability(0);
1259 reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
1260 normalized_score3[0] = GetNormalizedScore(language3[0],
1263 doc_tote->Score(0));
// Slot 1: second language
1266 int lang2 = doc_tote->Key(1);
1267 if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
1268 language3[1] = static_cast<Language>(lang2);
1269 bytecount2 = doc_tote->Value(1);
1270 int reli2 = doc_tote->Reliability(1);
1271 reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
1272 normalized_score3[1] = GetNormalizedScore(language3[1],
1275 doc_tote->Score(1));
// Slot 2: third language
1278 int lang3 = doc_tote->Key(2);
1279 if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
1280 language3[2] = static_cast<Language>(lang3);
1281 bytecount3 = doc_tote->Value(2);
1282 int reli3 = doc_tote->Reliability(2);
1283 reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
1284 normalized_score3[2] = GetNormalizedScore(language3[2],
1287 doc_tote->Score(2));
1290 // Increase total bytes to sum (top 3) if low for some reason
1291 int total_bytecount12 = bytecount1 + bytecount2;
1292 int total_bytecount123 = total_bytecount12 + bytecount3;
1293 if (total_text_bytes < total_bytecount123) {
1294 total_text_bytes = total_bytecount123;
1295 *text_bytes = total_text_bytes;
1298 // Sum minus previous % gives better roundoff behavior than bytecount/total
1299 int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv
// First compute cumulative percentages, then subtract to get per-language ones
1300 percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1301 percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1302 percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1303 percent3[2] -= percent3[1];
1304 percent3[1] -= percent3[0];
1306 // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1307 // Fix this explicitly
1308 if (percent3[1] < percent3[2]) {
1312 if (percent3[0] < percent3[1]) {
1317 *text_bytes = total_text_bytes;
1319 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1320 // We have a top language
1321 // Its reliability is overall result reliability
1322 int bytecount = doc_tote->Value(0);
1323 int reli = doc_tote->Reliability(0);
1324 int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
1325 *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
1327 // No top language at all. This can happen with zero text or 100% Klingon
1328 // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
1329 *is_reliable = false;
1332 // If ignore percent is too large, set unreliable.
// Ignore percent is whatever share of the text the three percentages leave out
1333 int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1334 if ((ignore_percent > kIgnoreMaxPercent)) {
1335 *is_reliable = false;
// Returns true when lang is one of FRENCH, ITALIAN, GERMAN or SPANISH
// (the FIGS group used by the boilerplate rules in CalcSummaryLang).
1339 bool IsFIGS(Language lang) {
1340 if (lang == FRENCH) {return true;}
1341 if (lang == ITALIAN) {return true;}
1342 if (lang == GERMAN) {return true;}
1343 if (lang == SPANISH) {return true;}
// Returns true when lang is ENGLISH or one of FRENCH, ITALIAN, GERMAN or
// SPANISH (the FIGS group plus English).
1347 bool IsEFIGS(Language lang) {
1348 if (lang == ENGLISH) {return true;}
1349 if (lang == FRENCH) {return true;}
1350 if (lang == ITALIAN) {return true;}
1351 if (lang == GERMAN) {return true;}
1352 if (lang == SPANISH) {return true;}
1356 // For Tier3 languages, require more bytes of text to override
1357 // the first-place language
1358 static const int kGoodSecondT1T2MinBytes = 15; // <this => no second
1359 static const int kGoodSecondT3MinBytes = 128; // <this => no second
1361 // Calculate a single summary language for the document, and its reliability.
1362 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1363 // This is the heart of matching human-rater perception.
1364 // reliable_percent3[] is currently unused
1366 // Do not return Tier3 second language unless there are at least 128 bytes
//
// Parameters:
//   doc_tote          - document totals (not referenced on the visible lines)
//   total_text_bytes  - total text bytes, used to estimate second-language bytes
//   reliable_percent3 - unused, see above
//   language3         - the three candidate languages, left unchanged
//   percent3          - percentage of text for each candidate, left unchanged
//   summary_lang      - out: the chosen summary language
//   is_reliable       - out: whether the summary is considered reliable
//   FLAGS_cld2_html, FLAGS_cld2_quiet - control debug output on stderr
1367 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
1368 const int* reliable_percent3,
1369 const Language* language3,
1370 const int* percent3,
1371 Language* summary_lang, bool* is_reliable,
1372 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1373 // Vector of active languages; changes if we delete some
// active_slot maps logical rank to an index into language3 and percent3
1375 int active_slot[3] = {0, 1, 2};
1377 int ignore_percent = 0;
1378 int return_percent = percent3[0]; // Default to top lang
1379 *summary_lang = language3[0];
1380 *is_reliable = true;
1381 if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1383 // If any of top 3 is IGNORE, remove it and increment ignore_percent
1384 for (int i = 0; i < 3; ++i) {
1385 if (language3[i] == TG_UNKNOWN_LANGUAGE) {
1386 ignore_percent += percent3[i];
1387 // Move the rest up, leaving input vectors unchanged
1388 for (int j=i+1; j < 3; ++j) {
1389 active_slot[j - 1] = active_slot[j];
1392 // Logically remove Ignore from percentage-text calculation
1393 // (extra 1 in 101 avoids zdiv, biases slightly small)
// NOTE(review): this uses percent3[0] while the two lines below use
// active_slot[0] -- confirm percent3[active_slot[0]] is not what was intended
1394 return_percent = (percent3[0] * 100) / (101 - ignore_percent);
1395 *summary_lang = language3[active_slot[0]];
1396 if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
1401 // If English and X, where X (not UNK) is big enough,
1402 // assume the English is boilerplate and return X.
1403 // Logically remove English from percentage-text calculation
// Estimated byte count of the second active language
1404 int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
1405 // Require more bytes of text for Tier3 languages
1406 int minbytesneeded = kGoodSecondT1T2MinBytes;
1407 int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
1409 if ((language3[active_slot[0]] == ENGLISH) &&
1410 (language3[active_slot[1]] != ENGLISH) &&
1411 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1412 (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
1413 (second_bytes >= minbytesneeded)) {
1414 ignore_percent += percent3[active_slot[0]];
1415 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1416 *summary_lang = language3[active_slot[1]];
1417 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1419 // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
1420 // assume the FIGS is boilerplate and return X.
1421 // Logically remove FIGS from percentage-text calculation
1422 } else if (IsFIGS(language3[active_slot[0]]) &&
1423 !IsEFIGS(language3[active_slot[1]]) &&
1424 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1425 (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
1426 (second_bytes >= minbytesneeded)) {
1427 ignore_percent += percent3[active_slot[0]];
1428 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1429 *summary_lang = language3[active_slot[1]];
1430 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1432 // Else we are returning the first language, but want to improve its
1433 // return_percent if the second language should be ignored
1434 } else if ((language3[active_slot[1]] == ENGLISH) &&
1435 (language3[active_slot[0]] != ENGLISH)) {
1436 ignore_percent += percent3[active_slot[1]];
1437 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1438 } else if (IsFIGS(language3[active_slot[1]]) &&
1439 !IsEFIGS(language3[active_slot[0]])) {
1440 ignore_percent += percent3[active_slot[1]];
1441 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1444 // If return percent is too small (too many languages), return UNKNOWN
1445 if ((return_percent < kGoodFirstMinPercent)) {
1446 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1447 fprintf(stderr, "{Unreli %s %d%% percent too small} ",
1448 LanguageCode(*summary_lang), return_percent);
1450 *summary_lang = UNKNOWN_LANGUAGE;
1451 *is_reliable = false;
1454 // If return percent is small, return language but set unreliable.
1455 if ((return_percent < kGoodFirstReliableMinPercent)) {
1456 *is_reliable = false;
1459 // If ignore percent is too large, set unreliable.
// ignore_percent is recomputed here from the raw percentages, replacing the
// value accumulated above
1460 ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1461 if ((ignore_percent > kIgnoreMaxPercent)) {
1462 *is_reliable = false;
1465 // If we removed all the active languages, return UNKNOWN
1466 if (slot_count == 0) {
1467 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1468 fprintf(stderr, "{Unreli %s no languages left} ",
1469 LanguageCode(*summary_lang));
1471 *summary_lang = UNKNOWN_LANGUAGE;
1472 *is_reliable = false;
// Records one language-hint boost in the scoring context.
//   lang           - hinted language, used only to pick the Latn and/or Othr list
//   langprob       - packed language/probability value to store
//   scoringcontext - receives the boost in langprior_boost.latn and/or .othr
// Each list is written at its current index n, and n is then advanced with
// wrap(n + 1).
// NOTE(review): wrap presumably keeps n within the langprob array, so a full
// list would overwrite its oldest entry -- confirm against LangBoosts.
1476 void AddLangPriorBoost(Language lang, uint32 langprob,
1477 ScoringContext* scoringcontext) {
1478 // This is called 0..n times with language hints
1479 // but we don't know the script -- so boost either or both Latn, Othr.
1481 if (IsLatnLanguage(lang)) {
1482 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
1483 int n = langprior_boost->n;
1484 langprior_boost->langprob[n] = langprob;
1485 langprior_boost->n = langprior_boost->wrap(n + 1);
1488 if (IsOthrLanguage(lang)) {
1489 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
1490 int n = langprior_boost->n;
1491 langprior_boost->langprob[n] = langprob;
1492 langprior_boost->n = langprior_boost->wrap(n + 1);
// Records that whackee_lang should be suppressed because whacker_lang was
// hinted.
//   whacker_lang   - the hinted language doing the suppressing
//   whackee_lang   - the language to suppress; stored as MakeLangProb(lang, 1)
//   scoringcontext - receives the entry in langprior_whack.latn and/or .othr
// An entry is added to a list only when both languages belong to that list's
// script group.
1497 void AddOneWhack(Language whacker_lang, Language whackee_lang,
1498 ScoringContext* scoringcontext) {
1499 uint32 langprob = MakeLangProb(whackee_lang, 1);
1500 // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
1501 if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
1502 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
1503 int n = langprior_whack->n;
1504 langprior_whack->langprob[n] = langprob;
1505 langprior_whack->n = langprior_whack->wrap(n + 1);
1507 if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
1508 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
1509 int n = langprior_whack->n;
1510 langprior_whack->langprob[n] = langprob;
1511 langprior_whack->n = langprior_whack->wrap(n + 1);
// Adds whacks (via AddOneWhack) against the languages that are close to lang.
//   lang           - the hinted language
//   scoringcontext - receives the whack entries
// CHINESE and CHINESE_T are paired with each other explicitly; every other
// language is matched through its LanguageCloseSet number.
1515 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
1516 // We do not in general want zh-Hans and zh-Hant to be close pairs,
1518 if (lang == CLD2::CHINESE) {
1519 AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
1522 if (lang == CLD2::CHINESE_T) {
1523 AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
1527 int base_lang_set = LanguageCloseSet(lang);
// Close set zero means lang has no close languages to whack
1528 if (base_lang_set == 0) {return;}
1529 // TODO: add an explicit list of each set to avoid this 512-times loop
// Scan every language number and whack each other member of the same set
1530 for (int i = 0; i < kLanguageToPLangSize; ++i) {
1531 Language lang2 = static_cast<Language>(i);
1532 if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
1533 AddOneWhack(lang, lang2, scoringcontext);
// Gathers language priors from the available hints and stores the resulting
// boosts and whacks in scoringcontext.
// Sources of priors, in the order applied:
//   - lang= tags found in the buffer, only when the input is not plain text
//   - cld_hints (when not NULL): content language, TLD, encoding and language
// The priors are trimmed to at most four languages before use.
// Parameters visible here:
//   buffer         - the input text that is scanned for lang= tags
//   cld_hints      - optional caller-supplied hints; may be NULL
//   scoringcontext - receives boosts and whacks; its flags_cld2_html and
//                    debug_file control debug output
1539 void ApplyHints(const char* buffer,
1542 const CLDHints* cld_hints,
1543 ScoringContext* scoringcontext) {
1544 CLDLangPriors lang_priors;
1545 InitCLDLangPriors(&lang_priors);
1547 // We now use lang= tags.
1548 // Last look, circa 2008 found only 15% of web pages with lang= tags and
1549 // many of those were wrong. Now (July 2011), we find 44% of web pages have
1550 // lang= tags, and most of them are correct. So we now give them substantial
1551 // weight in each chunk scored.
1552 if (!is_plain_text) {
1553 // Get any contained language tags in first n KB
// The flag is in KB; shifting by 10 converts it to bytes
1554 int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
1555 string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
1557 SetCLDLangTagsHint(lang_tags, &lang_priors);
1558 if (scoringcontext->flags_cld2_html) {
1559 if (!lang_tags.empty()) {
1560 fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
// Caller-supplied hints; each one is applied only when it is actually set
1566 if (cld_hints != NULL) {
1567 if ((cld_hints->content_language_hint != NULL) &&
1568 (cld_hints->content_language_hint[0] != '\0')) {
1569 SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
1572 // Input is from GetTLD(), already lowercased
1573 if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
1574 SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
1577 if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
1578 Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
1579 SetCLDEncodingHint(enc, &lang_priors);
1582 if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
1583 SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
1587 // Keep no more than four different languages with hints
1588 TrimCLDLangPriors(4, &lang_priors);
1590 if (scoringcontext->flags_cld2_html) {
1591 string print_temp = DumpCLDLangPriors(&lang_priors);
1592 if (!print_temp.empty()) {
1593 fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
1594 print_temp.c_str());
1598 // Put boosts into ScoringContext
1599 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1600 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1601 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1603 uint32 langprob = MakeLangProb(lang, qprob);
1604 AddLangPriorBoost(lang, langprob, scoringcontext);
1608 // Put whacks into scoring context
1609 // We do not in general want zh-Hans and zh-Hant to be close pairs,
1610 // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
1611 std::vector<int> close_set_count(kCloseSetSize + 1, 0);
// Count how many hinted languages fall in each close set
1613 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1614 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1615 ++close_set_count[LanguageCloseSet(lang)];
1616 if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
1617 if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
1620 // If a boost language is in a close set, force suppressing the others in
1621 // that set, if exactly one of the set is present
1622 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1623 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1624 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1626 int close_set = LanguageCloseSet(lang);
1627 if ((close_set > 0) && (close_set_count[close_set] == 1)) {
1628 AddCloseLangWhack(lang, scoringcontext);
// Same rule for the Chinese pair, using the extra counter slot
1630 if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
1631 (close_set_count[kCloseSetSize] == 1)) {
1632 AddCloseLangWhack(lang, scoringcontext);
1646 // Results language3/percent3/text_bytes must be exactly three items
1647 Language DetectLanguageSummaryV2(
1651 const CLDHints* cld_hints,
1652 bool allow_extended_lang,
1655 Language* language3,
1657 double* normalized_score3,
1658 ResultChunkVector* resultchunkvector,
1660 bool* is_reliable) {
1661 language3[0] = UNKNOWN_LANGUAGE;
1662 language3[1] = UNKNOWN_LANGUAGE;
1663 language3[2] = UNKNOWN_LANGUAGE;
1667 normalized_score3[0] = 0.0;
1668 normalized_score3[1] = 0.0;
1669 normalized_score3[2] = 0.0;
1670 if (resultchunkvector != NULL) {
1671 resultchunkvector->clear();
1674 *is_reliable = false;
1676 if ((flags & kCLDFlagEcho) != 0) {
1677 string temp(buffer, buffer_length);
1678 if ((flags & kCLDFlagHtml) != 0) {
1679 fprintf(stderr, "CLD2[%d] '%s'<br>\n",
1680 buffer_length, GetHtmlEscapedText(temp).c_str());
1682 fprintf(stderr, "CLD2[%d] '%s'\n",
1683 buffer_length, GetPlainEscapedText(temp).c_str());
1687 #ifdef CLD2_DYNAMIC_MODE
1688 // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
1689 // hasn't been loaded yet. This is the only sane thing we can do, as there
1690 // are no scoring tables to consult.
1691 bool dataLoaded = isDataLoaded();
1692 if ((flags & kCLDFlagVerbose) != 0) {
1693 fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
1696 return UNKNOWN_LANGUAGE;
1700 // Exit now if no text
1701 if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
1702 if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
1705 DocTote doc_tote; // Reliability = 0..100
1707 // ScoringContext carries state across scriptspans
1708 ScoringContext scoringcontext;
1709 scoringcontext.debug_file = stderr;
1710 scoringcontext.flags_cld2_score_as_quads =
1711 ((flags & kCLDFlagScoreAsQuads) != 0);
1712 scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
1713 scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
1714 scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
1715 scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
1716 scoringcontext.ulscript = ULScript_Common;
1717 scoringcontext.scoringtables = &kScoringtables;
1718 scoringcontext.scanner = NULL;
1719 scoringcontext.init(); // Clear the internal memory arrays
1722 bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
1723 bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
1725 ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
1727 // Four individual script totals, Latin, Han, other2, other3
1728 int next_other_tote = 2;
1731 // Four totes for up to four different scripts pending at once
1732 Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
1733 bool tote_seen[4] = {false, false, false, false};
1734 int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
1735 ULScript tote_script[4] =
1736 {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
1738 // Loop through text spans in a single script
1739 ScriptScanner ss(buffer, buffer_length, is_plain_text);
1740 LangSpan scriptspan;
1742 scoringcontext.scanner = &ss;
1744 scriptspan.text = NULL;
1745 scriptspan.text_bytes = 0;
1746 scriptspan.offset = 0;
1747 scriptspan.ulscript = ULScript_Common;
1748 scriptspan.lang = UNKNOWN_LANGUAGE;
1750 int total_text_bytes = 0;
1751 int textlimit = FLAGS_cld_textlimit << 10; // in KB
1752 if (textlimit == 0) {textlimit = 0x7fffffff;}
1754 int advance_by = 2; // Advance 2 bytes
1755 int advance_limit = textlimit >> 3; // For first 1/8 of max document
1757 int initial_word_span = kDefaultWordSpan;
1758 if (FLAGS_cld_forcewords) {
1759 initial_word_span = kReallyBigWordSpan;
1762 // Pick up chunk sizes
1763 // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
1764 // Sanity check -- force into a reasonable range
1765 int chunksizequads = FLAGS_cld_smoothwidth;
1766 chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
1767 kMaxChunkSizeQuads);
1768 int chunksizeunis = (chunksizequads * 5) >> 1;
1770 // Varying short-span limit doesn't work well -- skips too much beyond 20KB
1771 // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
1772 int spantooshortlimit = kShortSpanThresh;
1774 // For debugging only. Not thread-safe
1775 prior_lang = UNKNOWN_LANGUAGE;
1776 prior_unreliable = false;
1778 // Allocate full-document prediction table for finding repeating words
1780 int* predict_tbl = new int[kPredictionTableSize];
1781 if (FlagRepeats(flags)) {
1782 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1787 // Loop through scriptspans accumulating number of text bytes in each language
1788 while (ss.GetOneScriptSpanLower(&scriptspan)) {
1789 ULScript ulscript = scriptspan.ulscript;
1791 // Squeeze out big chunks of text span if asked to
1792 if (FlagSqueeze(flags)) {
1793 // Remove repetitive or mostly-spaces chunks
1795 int chunksize = 0; // Use the default
1796 if (resultchunkvector != NULL) {
1797 newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
1798 scriptspan.text_bytes,
1801 newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
1804 scriptspan.text_bytes = newlen;
1806 // Check now and then to see if we should be squeezing
1807 if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
1808 !FlagFinish(flags)) {
1809 // fprintf(stderr, "CheapSqueezeTriggerTest, "
1810 // "first %d bytes of %d (>%d/2)<br>\n",
1811 // kCheapSqueezeTestLen,
1812 // scriptspan.text_bytes,
1813 // kCheapSqueezeTestThresh);
1815 if (CheapSqueezeTriggerTest(scriptspan.text,
1816 scriptspan.text_bytes,
1817 kCheapSqueezeTestLen)) {
1818 // Recursive call with big-chunk squeezing set
1819 if (FLAGS_cld2_html || FLAGS_dbgscore) {
1821 "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
1824 // Deallocate full-document prediction table
1825 delete[] predict_tbl;
1827 return DetectLanguageSummaryV2(
1832 allow_extended_lang,
1833 flags | kCLDFlagSqueeze,
1845 // Remove repetitive words if asked to
1846 if (FlagRepeats(flags)) {
1847 // Remove repetitive words
1849 if (resultchunkvector != NULL) {
1850 newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
1851 scriptspan.text_bytes,
1852 &hash, predict_tbl);
1854 newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
1855 &hash, predict_tbl);
1857 scriptspan.text_bytes = newlen;
1860 // Scoring depends on scriptspan buffer ALWAYS having
1861 // leading space and off-the-end space space space NUL,
1862 // DCHECK(scriptspan.text[0] == ' ');
1863 // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
1864 // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
1865 // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
1866 // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
// Accumulate directly into the document total, or accumulate in one of four
// chunk totals. The purpose of the multiple chunk totals is to piece
// together short choppy pieces of text in alternating scripts. One total is
// dedicated to Latin text, one to Han text, and the other two are dynamically
1875 scoringcontext.ulscript = scriptspan.ulscript;
1876 // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
1878 ScoreOneScriptSpan(scriptspan,
1883 total_text_bytes += scriptspan.text_bytes;
1884 } // End while (ss.GetOneScriptSpanLower())
1886 // Deallocate full-document prediction table
1887 delete[] predict_tbl;
1889 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1890 // If no forced <cr>, put one in front of dump
1891 if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
1892 doc_tote.Dump(stderr);
// If extended languages are disallowed, remove them here
1897 if (!allow_extended_lang) {
1898 RemoveExtendedLanguages(&doc_tote);
1901 // Force close pairs to one or the other
1902 // If given, also update resultchunkvector
1903 RefineScoredClosePairs(&doc_tote, resultchunkvector,
1904 FLAGS_cld2_html, FLAGS_cld2_quiet);
1907 // Calculate return results
1908 // Find top three byte counts in tote heap
1909 int reliable_percent3[3];
1911 // Cannot use Add, etc. after sorting
1914 ExtractLangEtc(&doc_tote, total_text_bytes,
1915 reliable_percent3, language3, percent3, normalized_score3,
1916 text_bytes, is_reliable);
1918 bool have_good_answer = false;
1919 if (FlagFinish(flags)) {
1921 have_good_answer = true;
1922 } else if (total_text_bytes <= kShortTextThresh) {
1923 // Don't recurse on short text -- we already did word scores
1924 have_good_answer = true;
1925 } else if (*is_reliable &&
1926 (percent3[0] >= kGoodLang1Percent)) {
1927 have_good_answer = true;
1928 } else if (*is_reliable &&
1929 ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
1930 have_good_answer = true;
1934 if (have_good_answer) {
1935 // This is the real, non-recursive return
1937 // Move bytes for unreliable langs to another lang or UNKNOWN
1938 RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
1940 // Redo the result extraction after the removal above
1942 ExtractLangEtc(&doc_tote, total_text_bytes,
1943 reliable_percent3, language3, percent3, normalized_score3,
1944 text_bytes, is_reliable);
1948 Language summary_lang;
1949 CalcSummaryLang(&doc_tote, total_text_bytes,
1950 reliable_percent3, language3, percent3,
1951 &summary_lang, is_reliable,
1952 FLAGS_cld2_html, FLAGS_cld2_quiet);
1954 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1955 for (int i = 0; i < 3; ++i) {
1956 if (language3[i] != UNKNOWN_LANGUAGE) {
1957 fprintf(stderr, "%s.%dR(%d%%) ",
1958 LanguageCode(language3[i]),
1959 reliable_percent3[i],
1964 fprintf(stderr, "%d bytes ", total_text_bytes);
1965 fprintf(stderr, "= %s%c ",
1966 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1967 fprintf(stderr, "<br><br>\n");
1970 // Slightly condensed if quiet
1971 if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
1972 fprintf(stderr, " ");
1973 for (int i = 0; i < 3; ++i) {
1974 if (language3[i] != UNKNOWN_LANGUAGE) {
1975 fprintf(stderr, " %s %d%% ",
1976 LanguageCode(language3[i]),
1980 fprintf(stderr, "= %s%c ",
1981 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1982 fprintf(stderr, "<br>\n");
1985 return summary_lang;
1988 // Not a good answer -- do recursive call to refine
1989 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1990 // This is what we hope to improve on in the recursive call, if any
1991 PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
1994 // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
// For this purpose, we treat "Ignore" as top40
1996 Language new_plus_one = UNKNOWN_LANGUAGE;
1998 if (total_text_bytes < kShortTextThresh) {
1999 // Short text: Recursive call with top40 and short set
2000 if (FLAGS_cld2_html || FLAGS_dbgscore) {
2001 fprintf(stderr, " ---text_bytes[%d] "
2002 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2005 return DetectLanguageSummaryV2(
2010 allow_extended_lang,
2011 flags | kCLDFlagTop40 | kCLDFlagRepeats |
2012 kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2022 // Longer text: Recursive call with top40 set
2023 if (FLAGS_cld2_html || FLAGS_dbgscore) {
2025 " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2028 return DetectLanguageSummaryV2(
2033 allow_extended_lang,
2034 flags | kCLDFlagTop40 | kCLDFlagRepeats |
2046 // For debugging and wrappers. Not thread safe.
2047 static char temp_detectlanguageversion[32];
2049 // Return version text string
2050 // String is "code_version - data_build_date"
2051 const char* DetectLanguageVersion() {
2052 if (kScoringtables.quadgram_obj == NULL) {return "";}
2053 sprintf(temp_detectlanguageversion,
2054 "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
2055 return temp_detectlanguageversion;
2059 } // End namespace CLD2