src/third_party/cld_2/src/internal/compact_lang_det_impl.cc

   1 // Copyright 2013 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 //
  16 // Author: dsites@google.com (Dick Sites)
  17 // Updated 2014.01 for dual table lookup
  18 //
  19
  20 #include <stdio.h>
  21 #include <string.h>
  22 #include <string>
  23 #include <vector>
  24
  25 #include "cldutil.h"
  26 #include "debug.h"
  27 #include "integral_types.h"
  28 #include "lang_script.h"
  29 #include "utf8statetable.h"
  30
  31 #ifdef CLD2_DYNAMIC_MODE
  32 #include "cld2_dynamic_data.h"
  33 #include "cld2_dynamic_data_loader.h"
  34 #endif
  35 #include "cld2tablesummary.h"
  36 #include "compact_lang_det_impl.h"
  37 #include "compact_lang_det_hint_code.h"
  38 #include "getonescriptspan.h"
  39 #include "tote.h"
  40
  41
  42 namespace CLD2 {
  43
  44 using namespace std;
  45
  46 // Linker supplies the right tables, From files
  47 // cld_generated_cjk_uni_prop_80.cc  cld2_generated_cjk_compatible.cc
  48 // cld_generated_cjk_delta_bi_32.cc  generated_distinct_bi_0.cc
  49 // cld2_generated_quad*.cc  cld2_generated_deltaocta*.cc
  50 // cld2_generated_distinctocta*.cc
  51 // cld_generated_score_quad_octa_1024_256.cc
  52
  53 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table
  54 //   sizes that are 1/3/5 times a power of two, instead of just powers of two.
  55 //   Gives more flexibility of total footprint for CLD2.
  56
   57 extern const int kLanguageToPLangSize;        // Declaration only; defined in another translation unit
   58 extern const int kCloseSetSize;               // Declaration only; defined in another translation unit
   59
      // Scoring tables, declared here and defined elsewhere (see the list of
      // generated files in the comment above). Their addresses are gathered into
      // kScoringtables in the non-dynamic build further down.
   60 extern const UTF8PropObj cld_generated_CjkUni_obj;
   61 extern const CLD2TableSummary kCjkCompat_obj;
   62 extern const CLD2TableSummary kCjkDeltaBi_obj;
   63 extern const CLD2TableSummary kDistinctBiTable_obj;
   64 extern const CLD2TableSummary kQuad_obj;
   65 extern const CLD2TableSummary kQuad_obj2;     // Dual lookup tables (second quadgram table)
   66 extern const CLD2TableSummary kDeltaOcta_obj;
   67 extern const CLD2TableSummary kDistinctOcta_obj;
   68 extern const short kAvgDeltaOctaScore[];      // Unsized array; length is known only to the defining file
  69
  70 #ifdef CLD2_DYNAMIC_MODE
  71   // CLD2_DYNAMIC_MODE is defined:
  72   // Data will be read from an mmap opened at runtime.
  73
  74   // Convenience for nulling things out completely at any point.
   75   static ScoringTables NULL_TABLES = {
        // All-NULL template. The trailing comment on each row names the table
        // that occupies the same slot in the non-dynamic initializer.
   76     NULL, //&cld_generated_CjkUni_obj,
   77     NULL, //&kCjkCompat_obj,
   78     NULL, //&kCjkDeltaBi_obj,
   79     NULL, //&kDistinctBiTable_obj,
   80     NULL, //&kQuad_obj,
   81     NULL, //&kQuad_obj2,
   82     NULL, //&kDeltaOcta_obj,
   83     NULL, //&kDistinctOcta_obj,
   84     NULL, //kAvgDeltaOctaScore,
   85   };
   86   static ScoringTables kScoringtables = NULL_TABLES; // copy constructed; overwritten by the load*/unloadData functions
   87   static bool dynamicDataLoaded = false;   // Set true by the load functions, false by unloadData()
   88   static bool dataSourceIsFile = false;    // true => loaded via loadDataFromFile; selects the unload routine
   89   static ScoringTables* dynamicTables = NULL;  // Result of the most recent CLD2DynamicDataLoader load call
   90   static void* mmapAddress = NULL;         // Passed by address to loadDataFile/unloadDataFile
   91   static int mmapLength = 0;               // Passed by address to loadDataFile/unloadDataFile
  92
  93   bool isDataLoaded() { return dynamicDataLoaded; }
  94
  95   void loadDataFromFile(const char* fileName) {
  96     if (isDataLoaded()) {
  97       unloadData();
  98     }
  99     dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
 100     kScoringtables = *dynamicTables;
 101     dataSourceIsFile = true;
 102     dynamicDataLoaded = true;
 103   };
 104
 105   void loadDataFromRawAddress(const void* rawAddress, const int length) {
 106     if (isDataLoaded()) {
 107       unloadData();
 108     }
 109     dynamicTables = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length);
 110     kScoringtables = *dynamicTables;
 111     dataSourceIsFile = false;
 112     dynamicDataLoaded = true;
 113   }
 114
 115   void unloadData() {
 116     if (!dynamicDataLoaded) return;
 117     if (dataSourceIsFile) {
 118       CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength);
 119     } else {
 120       CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables);
 121     }
 122     dynamicDataLoaded = false;
 123     dataSourceIsFile = false; // vacuous
 124     kScoringtables = NULL_TABLES; // Housekeeping: null all pointers
 125   }
 126 #else
 127   // This initializes kScoringtables.quadgram_obj etc.
  128   static const ScoringTables kScoringtables = {
        // Non-dynamic build: the tables are the extern objects declared near the
        // top of this file. Row order matches the commented names in the
        // dynamic-mode NULL_TABLES initializer.
  129     &cld_generated_CjkUni_obj,
  130     &kCjkCompat_obj,
  131     &kCjkDeltaBi_obj,
  132     &kDistinctBiTable_obj,
  133
  134     &kQuad_obj,
  135     &kQuad_obj2,                              // Dual lookup tables
  136     &kDeltaOcta_obj,
  137     &kDistinctOcta_obj,
  138
  139     kAvgDeltaOctaScore,                       // Array, so no address-of operator
  140   };
 141 #endif // #ifdef CLD2_DYNAMIC_MODE
 142
 143
  144 static const bool FLAGS_cld_no_minimum_bytes = false;  // FLAGS_* are fixed at compile time in this file
  145 static const bool FLAGS_cld_forcewords = true;
  146 static const bool FLAGS_cld_showme = false;            // true => CheapRepWordsInplace marks each deletion with ". "
  147 static const bool FLAGS_cld_echotext = true;
  148 static const int32 FLAGS_cld_textlimit = 160;
  149 static const int32 FLAGS_cld_smoothwidth = 20;
  150 static const bool FLAGS_cld_2011_hints = true;
  151 static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;
  152
  153 static const bool FLAGS_dbgscore = false;
  154
  155
  156 static const int kLangHintInitial = 12;  // Boost language by N initially
  157 static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram
  158
  159 static const int kShortSpanThresh = 32;       // Bytes
  160 static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans
  161
  162 static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
  163                                                   // after this many text bytes
  164 static const int kCheapSqueezeTestLen = 256;  // Bytes to test to trigger sqz
  165 static const int kSpacesTriggerPercent = 25;  // Trigger sqz if >=25% spaces
  166 static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted
  167
  168 static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
  169 static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
  170 static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted
  171
  172 static const int kMaxSpaceScan = 32;          // Bytes; upper bound on how far BackscanToSpace
                                                    // and ForwardscanToSpace look
  173
  174 static const int kGoodLang1Percent = 70;
  175 static const int kGoodLang1and2Percent = 93;
  176 static const int kShortTextThresh = 256;      // Bytes
  177
  178 static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
  179 static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads
  180
  181 static const int kDefaultWordSpan = 256;      // Scan at least this many initial
  182                                               // bytes with word scoring
  183 static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text
  184
  185 static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable
  186
  187 static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
  188                                                 // cheap compressor, whose hash
                                                      // is masked with 0xfff
  189
  190 static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
  191 static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
  192 static const int kGoodFirstMinPercent = 26;           // <this => UNK
  193 static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli
  194 static const int kIgnoreMaxPercent = 20;              // >this => unreli
  195 static const int kKeepMinPercent = 2;                 // <this => unreli
 196
 197
 198
 199 // Statistically closest language, based on quadgram table
  200 // Those that are far from other languages map to UNKNOWN_LANGUAGE
 201 // Subscripted by Language
 202 //
 203 // From lang_correlation.txt and hand-edits
 204 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
 205 //   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
 206 //   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
 207 //
  208 static const int kMinCorrPercent = 24;        // Pick off how close you want
  209                                               // 24 catches PERSIAN <== ARABIC
  210                                               // but not SPANISH <== PORTUGUESE
                                                    // (those rows carry 24 and 23)
  211 static Language Unknown = UNKNOWN_LANGUAGE;   // Alternate named by the 0-percent rows below
  212
  213 // Suspect idea
  214 // Subscripted by Language
      // Every row has the form (pct >= kMinCorrPercent) ? ALT : UNKNOWN_LANGUAGE,
      // so a row whose literal pct is below kMinCorrPercent evaluates to
      // UNKNOWN_LANGUAGE. The trailing comment on each row names the language
      // the row belongs to. Row order is presumably the Language enum order; the
      // size check after the table is commented out, so nothing enforces it
      // -- TODO confirm.
  215 static const Language kClosestAltLanguage[] = {
  216   (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
  217   (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
  218   (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
  219   (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
  220   (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
  221   (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
  222   (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
  223   (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
  224   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
  225   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
  226   (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
  227   ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
  228   (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
  229   (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
  230   (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
  231   (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
  232   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
  233   (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
  234   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
  235   (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
  236   ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
  237   ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
  238   ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
  239   ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
  240   (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
  241   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
  242   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
  243   (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
  244   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
  245   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
  246   (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
  247   (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
  248   ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
  249   (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
  250   (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
  251   (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
  252   (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
  253   (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
  254   (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
  255   ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
  256   (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
  257   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
  258   ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
  259   ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
  260   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
  261   ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
  262   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
  263   (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
  264   (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
  265   (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
  266   (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
  267   (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
  268   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
  269   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
  270   (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
  271   (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
  272   ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
  273   ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
  274   ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
  275   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
  276   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
  277   (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
  278   ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
  279   (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
  280   (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
  281   ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
  282   ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
  283   (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
  284   (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
  285   // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
  286   (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
  287   (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
  288   (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
  289   (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
  290   ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
  291   (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
  292   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
  293   ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
  294   (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
  295   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
  296   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
  297   (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
  298   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
  299   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
  300   (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
  301   (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
  302   ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
  303   (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
  304   ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
  305   ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
  306   ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
  307   ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
  308   (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
  309   (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
  310   (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
  311   ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
  312   (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
  313   ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
  314   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
  315   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
  316   ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
  317   (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
  318   (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
  319   (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
  320   ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
  321   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
  322   (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
  323   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
  324   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
  325   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
  326   ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
  327   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
  328   (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
  329   (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
  330   ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
  331   ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
  332   ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
  333   (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
  334   (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
  335   ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
  336   ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
  337   (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
  338   (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
  339   (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
  340   ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
  341   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
  342   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
  343   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
  344   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
  345   ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
  346   ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
  347   ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
  348   ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
  349   ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
  350   (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
  351   ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
  352   (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
  353   ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
  354   ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
  355   ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
  356   ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
  357   ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
  358   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
  359   ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
  360   (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
  361   ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
  362   (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
  363   (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
  364   (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
  365   (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
  366   ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
  367   (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
  368   (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
  369   ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
  370   (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
  371   ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
  372   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
  373   ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
  374   (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
  375   (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
  376   ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
  377   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN
  378
  379   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // AKAN
  380   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // IGBO
  381   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MAURITIAN_CREOLE
  382   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // HAWAIIAN
  383 };
 384
 385 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
 386 //                kClosestAltLanguage_has_incorrect_size);
 387
 388
 389 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
 390 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
 391 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
 392 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
 393 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
 394 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
 395 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
 396
 397
 398   // Defines Top40 packed languages
 399
 400   // Google top 40 languages
 401   //
 402   // Tier 0/1 Language enum list (16)
 403   //   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH,    // E - FIGS
 404   //   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
 405   //   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
 406   //   ARABIC,
 407   //
 408   // Tier 2 Language enum list (22)
 409   //   SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
 410   //   HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
 411   //   VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
 412   //   TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
 413   //   UKRAINIAN, HINDI,
 414   //
 415   //   use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
 416   //
 417   // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
 418
 419
 420 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
 421   // REVISIT
 422 }
 423
 424 void PrintText(FILE* f, Language cur_lang, const string& temp) {
 425   if (temp.size() == 0) {return;}
 426   fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());
 427 }
 428
 429
 430 //------------------------------------------------------------------------------
 431 // For --cld_html debugging output. Not thread safe
 432 //------------------------------------------------------------------------------
  433 static Language prior_lang = UNKNOWN_LANGUAGE;   // Mutable file-scope state for the debugging output
                                                       // named in the banner; starts as UNKNOWN_LANGUAGE
  434 static bool prior_unreliable = false;            // Companion flag; presumably describes prior_lang -- TODO confirm
 435
 436 //------------------------------------------------------------------------------
 437 // End For --cld_html debugging output
 438 //------------------------------------------------------------------------------
 439
 440
 441 // Backscan to word boundary, returning how many bytes n to go back
 442 // so that src - n is non-space ans src - n - 1 is space.
 443 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
 444 int BackscanToSpace(const char* src, int limit) {
 445   int n = 0;
 446   limit = minint(limit, kMaxSpaceScan);
 447   while (n < limit) {
 448     if (src[-n - 1] == ' ') {return n;}    // We are at _X
 449     ++n;
 450   }
 451   n = 0;
 452   while (n < limit) {
 453     if ((src[-n] & 0xc0) != 0x80) {return n;}    // We are at char begin
 454     ++n;
 455   }
 456   return 0;
 457 }
 458
 459 // Forwardscan to word boundary, returning how many bytes n to go forward
 460 // so that src + n is non-space ans src + n - 1 is space.
 461 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
 462 int ForwardscanToSpace(const char* src, int limit) {
 463   int n = 0;
 464   limit = minint(limit, kMaxSpaceScan);
 465   while (n < limit) {
 466     if (src[n] == ' ') {return n + 1;}    // We are at _X
 467     ++n;
 468   }
 469   n = 0;
 470   while (n < limit) {
 471     if ((src[n] & 0xc0) != 0x80) {return n;}    // We are at char begin
 472     ++n;
 473   }
 474   return 0;
 475 }
 476
 477
 478 // This uses a cheap predictor to get a measure of compression, and
 479 // hence a measure of repetitiveness. It works on complete UTF-8 characters
 480 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
 481 // all the time when done with a byte-based count. Sigh.
 482 //
 483 // To allow running prediction across multiple chunks, caller passes in current
 484 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
 485 //
 486 // Returns the number of *bytes* correctly predicted, increments by 1..4 for
 487 // each correctly-predicted character.
 488 //
 489 // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
 490 //
 491
 492 // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
 493
 494 int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
 495   int p_count = 0;
 496   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 497   const uint8* srclimit = src + src_len;
 498   int local_hash = *hash;
 499
 500   while (src < srclimit) {
 501     int c = src[0];
 502     int incr = 1;
 503
 504     // Pick up one char and length
 505     if (c < 0xc0) {
 506       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
 507       // Do nothing more
 508     } else if ((c & 0xe0) == 0xc0) {
 509       // Two-byte
 510       c = (c << 8) | src[1];
 511       incr = 2;
 512     } else if ((c & 0xf0) == 0xe0) {
 513       // Three-byte
 514       c = (c << 16) | (src[1] << 8) | src[2];
 515       incr = 3;
 516     } else {
 517       // Four-byte
 518       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
 519       incr = 4;
 520     }
 521     src += incr;
 522
 523     int p = tbl[local_hash];            // Prediction
 524     tbl[local_hash] = c;                // Update prediction
 525     if (c == p) {
 526       p_count += incr;                  // Count bytes of good predictions
 527     }
 528
 529     local_hash = ((local_hash << 4) ^ c) & 0xfff;
 530   }
 531   *hash = local_hash;
 532   return p_count;
 533 }
 534
 535
 536
 537 // Counts number of spaces; a little faster than one-at-a-time
 538 // Doesn't count odd bytes at end
 539 int CountSpaces4(const char* src, int src_len) {
 540   int s_count = 0;
 541   for (int i = 0; i < (src_len & ~3); i += 4) {
 542     s_count += (src[i] == ' ');
 543     s_count += (src[i+1] == ' ');
 544     s_count += (src[i+2] == ' ');
 545     s_count += (src[i+3] == ' ');
 546   }
 547   return s_count;
 548 }
 549
 550
 551 // Remove words of text that have more than half their letters predicted
 552 // correctly by our cheap predictor, moving the remaining words in-place
 553 // to the front of the input buffer.
 554 //
 555 // To allow running prediction across multiple chunks, caller passes in current
 556 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
 557 //
 558 // Return the new, possibly-shorter length
 559 //
 560 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
 561 // if input does
 562 //
 563 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
 564   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 565   const uint8* srclimit = src + src_len;
 566   char* dst = isrc;
 567   int local_hash = *hash;
 568   char* word_dst = dst;           // Start of next word
 569   int good_predict_bytes = 0;
 570   int word_length_bytes = 0;
 571
 572   while (src < srclimit) {
 573     int c = src[0];
 574     int incr = 1;
 575     *dst++ = c;
 576
 577     if (c == ' ') {
 578       if ((good_predict_bytes * 2) > word_length_bytes) {
 579         // Word is well-predicted: backup to start of this word
 580         dst = word_dst;
 581         if (FLAGS_cld_showme) {
 582           // Mark the deletion point with period
 583           // Don't repeat multiple periods
 584           // Cannot mark with more bytes or may overwrite unseen input
 585           if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
 586             *dst++ = '.';
 587             *dst++ = ' ';
 588           }
 589         }
 590       }
 591       word_dst = dst;              // Start of next word
 592       good_predict_bytes = 0;
 593       word_length_bytes = 0;
 594     }
 595
 596     // Pick up one char and length
 597     if (c < 0xc0) {
 598       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
 599       // Do nothing more
 600     } else if ((c & 0xe0) == 0xc0) {
 601       // Two-byte
 602       *dst++ = src[1];
 603       c = (c << 8) | src[1];
 604       incr = 2;
 605     } else if ((c & 0xf0) == 0xe0) {
 606       // Three-byte
 607       *dst++ = src[1];
 608       *dst++ = src[2];
 609       c = (c << 16) | (src[1] << 8) | src[2];
 610       incr = 3;
 611     } else {
 612       // Four-byte
 613       *dst++ = src[1];
 614       *dst++ = src[2];
 615       *dst++ = src[3];
 616       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
 617       incr = 4;
 618     }
 619     src += incr;
 620     word_length_bytes += incr;
 621
 622     int p = tbl[local_hash];            // Prediction
 623     tbl[local_hash] = c;                // Update prediction
 624     if (c == p) {
 625       good_predict_bytes += incr;       // Count good predictions
 626     }
 627
 628     local_hash = ((local_hash << 4) ^ c) & 0xfff;
 629   }
 630
 631   *hash = local_hash;
 632
 633   if ((dst - isrc) < (src_len - 3)) {
 634     // Pad and make last char clean UTF-8 by putting following spaces
 635     dst[0] = ' ';
 636     dst[1] = ' ';
 637     dst[2] = ' ';
 638     dst[3] = '\0';
 639   } else  if ((dst - isrc) < src_len) {
 640     // Make last char clean UTF-8 by putting following space off the end
 641     dst[0] = ' ';
 642   }
 643
 644   return static_cast<int>(dst - isrc);
 645 }
 646
 647
 648 // This alternate form overwrites redundant words, thus avoiding corrupting the
 649 // backmap for generate a vector of original-text ranges.
 650 int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
 651   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 652   const uint8* srclimit = src + src_len;
 653   char* dst = isrc;
 654   int local_hash = *hash;
 655   char* word_dst = dst;           // Start of next word
 656   int good_predict_bytes = 0;
 657   int word_length_bytes = 0;
 658
 659   while (src < srclimit) {
 660     int c = src[0];
 661     int incr = 1;
 662     *dst++ = c;
 663
 664     if (c == ' ') {
 665       if ((good_predict_bytes * 2) > word_length_bytes) {
 666         // Word [word_dst..dst-1) is well-predicted: overwrite
 667         for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';}
 668       }
 669       word_dst = dst;              // Start of next word
 670       good_predict_bytes = 0;
 671       word_length_bytes = 0;
 672     }
 673
 674     // Pick up one char and length
 675     if (c < 0xc0) {
 676       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
 677       // Do nothing more
 678     } else if ((c & 0xe0) == 0xc0) {
 679       // Two-byte
 680       *dst++ = src[1];
 681       c = (c << 8) | src[1];
 682       incr = 2;
 683     } else if ((c & 0xf0) == 0xe0) {
 684       // Three-byte
 685       *dst++ = src[1];
 686       *dst++ = src[2];
 687       c = (c << 16) | (src[1] << 8) | src[2];
 688       incr = 3;
 689     } else {
 690       // Four-byte
 691       *dst++ = src[1];
 692       *dst++ = src[2];
 693       *dst++ = src[3];
 694       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
 695       incr = 4;
 696     }
 697     src += incr;
 698     word_length_bytes += incr;
 699
 700     int p = tbl[local_hash];            // Prediction
 701     tbl[local_hash] = c;                // Update prediction
 702     if (c == p) {
 703       good_predict_bytes += incr;       // Count good predictions
 704     }
 705
 706     local_hash = ((local_hash << 4) ^ c) & 0xfff;
 707   }
 708
 709   *hash = local_hash;
 710
 711   if ((dst - isrc) < (src_len - 3)) {
 712     // Pad and make last char clean UTF-8 by putting following spaces
 713     dst[0] = ' ';
 714     dst[1] = ' ';
 715     dst[2] = ' ';
 716     dst[3] = '\0';
 717   } else  if ((dst - isrc) < src_len) {
 718     // Make last char clean UTF-8 by putting following space off the end
 719     dst[0] = ' ';
 720   }
 721
 722   return static_cast<int>(dst - isrc);
 723 }
 724
 725
 726 // Remove portions of text that have a high density of spaces, or that are
 727 // overly repetitive, squeezing the remaining text in-place to the front of the
 728 // input buffer.
 729 //
 730 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
 731 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
 732 //
 733 // Return the new, possibly-shorter length
 734 //
 735 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
 736 // if input does
 737 //
 738 int CheapSqueezeInplace(char* isrc,
 739                                             int src_len,
 740                                             int ichunksize) {
 741   char* src = isrc;
 742   char* dst = src;
 743   char* srclimit = src + src_len;
 744   bool skipping = false;
 745
 746   int hash = 0;
 747   // Allocate local prediction table.
 748   int* predict_tbl = new int[kPredictionTableSize];
 749   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
 750
 751   int chunksize = ichunksize;
 752   if (chunksize == 0) {chunksize = kChunksizeDefault;}
 753   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
 754   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
 755
 756   while (src < srclimit) {
 757     int remaining_bytes = srclimit - src;
 758     int len = minint(chunksize, remaining_bytes);
 759     // Make len land us on a UTF-8 character boundary.
 760     // Ah. Also fixes mispredict because we could get out of phase
 761     // Loop always terminates at trailing space in buffer
 762     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
 763
 764     int space_n = CountSpaces4(src, len);
 765     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
 766     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
 767       // Skip the text
 768       if (!skipping) {
 769         // Keeping-to-skipping transition; do it at a space
 770         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
 771         dst -= n;
 772         if (dst == isrc) {
 773           // Force a leading space if the first chunk is deleted
 774           *dst++ = ' ';
 775         }
 776         if (FLAGS_cld_showme) {
 777           // Mark the deletion point with black square U+25A0
 778           *dst++ = static_cast<unsigned char>(0xe2);
 779           *dst++ = static_cast<unsigned char>(0x96);
 780           *dst++ = static_cast<unsigned char>(0xa0);
 781           *dst++ = ' ';
 782         }
 783         skipping = true;
 784       }
 785     } else {
 786       // Keep the text
 787       if (skipping) {
 788         // Skipping-to-keeping transition; do it at a space
 789         int n = ForwardscanToSpace(src, len);
 790         src += n;
 791         remaining_bytes -= n;   // Shrink remaining length
 792         len -= n;
 793         skipping = false;
 794       }
 795       // "len" can be negative in some cases
 796       if (len > 0) {
 797         memmove(dst, src, len);
 798         dst += len;
 799       }
 800     }
 801     src += len;
 802   }
 803
 804   if ((dst - isrc) < (src_len - 3)) {
 805     // Pad and make last char clean UTF-8 by putting following spaces
 806     dst[0] = ' ';
 807     dst[1] = ' ';
 808     dst[2] = ' ';
 809     dst[3] = '\0';
 810   } else   if ((dst - isrc) < src_len) {
 811     // Make last char clean UTF-8 by putting following space off the end
 812     dst[0] = ' ';
 813   }
 814
 815   // Deallocate local prediction table
 816   delete[] predict_tbl;
 817   return static_cast<int>(dst - isrc);
 818 }
 819
 820 // This alternate form overwrites redundant words, thus avoiding corrupting the
 821 // backmap for generate a vector of original-text ranges.
 822 int CheapSqueezeInplaceOverwrite(char* isrc,
 823                                             int src_len,
 824                                             int ichunksize) {
 825   char* src = isrc;
 826   char* dst = src;
 827   char* srclimit = src + src_len;
 828   bool skipping = false;
 829
 830   int hash = 0;
 831   // Allocate local prediction table.
 832   int* predict_tbl = new int[kPredictionTableSize];
 833   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
 834
 835   int chunksize = ichunksize;
 836   if (chunksize == 0) {chunksize = kChunksizeDefault;}
 837   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
 838   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
 839
 840   // Always keep first byte (space)
 841   ++src;
 842   ++dst;
 843   while (src < srclimit) {
 844     int remaining_bytes = srclimit - src;
 845     int len = minint(chunksize, remaining_bytes);
 846     // Make len land us on a UTF-8 character boundary.
 847     // Ah. Also fixes mispredict because we could get out of phase
 848     // Loop always terminates at trailing space in buffer
 849     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
 850
 851     int space_n = CountSpaces4(src, len);
 852     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
 853     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
 854       // Overwrite the text [dst-n..dst)
 855       if (!skipping) {
 856         // Keeping-to-skipping transition; do it at a space
 857         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
 858         // Text [word_dst..dst) is well-predicted: overwrite
 859         for (char* p = dst - n; p < dst; ++p) {*p = '.';}
 860         skipping = true;
 861       }
 862       // Overwrite the text [dst..dst+len)
 863       for (char* p = dst; p < dst + len; ++p) {*p = '.';}
 864       dst[len - 1] = ' ';    // Space at end so we can see what is happening
 865     } else {
 866       // Keep the text
 867       if (skipping) {
 868         // Skipping-to-keeping transition; do it at a space
 869         int n = ForwardscanToSpace(src, len);
 870         // Text [dst..dst+n) is well-predicted: overwrite
 871         for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
 872         skipping = false;
 873       }
 874     }
 875     dst += len;
 876     src += len;
 877   }
 878
 879   if ((dst - isrc) < (src_len - 3)) {
 880     // Pad and make last char clean UTF-8 by putting following spaces
 881     dst[0] = ' ';
 882     dst[1] = ' ';
 883     dst[2] = ' ';
 884     dst[3] = '\0';
 885   } else   if ((dst - isrc) < src_len) {
 886     // Make last char clean UTF-8 by putting following space off the end
 887     dst[0] = ' ';
 888   }
 889
 890   // Deallocate local prediction table
 891   delete[] predict_tbl;
 892   return static_cast<int>(dst - isrc);
 893 }
 894
 895 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
 896 //  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
 897 //  Just CountSpaces is about 340 MB/sec
 898 //  Byte-only CountPredictedBytes is about 150 MB/sec
 899 //  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
 900 //  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
 901 //  Unjammed byte-only both = 170 MB/sec
 902 //  Jammed byte-only both = 120 MB/sec
 903 //  Back to original w/slight updates, 110 MB/sec
 904 //
 905 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
 906   // Don't trigger at all on short text
 907   if (src_len < testsize) {return false;}
 908   int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
 909   int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
 910   int hash = 0;
 911   // Allocate local prediction table.
 912   int* predict_tbl = new int[kPredictionTableSize];
 913   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
 914
 915   bool retval = false;
 916   if ((CountSpaces4(src, testsize) >= space_thresh) ||
 917       (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
 918        predict_thresh)) {
 919     retval = true;
 920   }
 921   // Deallocate local prediction table
 922   delete[] predict_tbl;
 923   return retval;
 924 }
 925
 926
 927
 928
 929 // Delete any extended languages from doc_tote
 930 void RemoveExtendedLanguages(DocTote* doc_tote) {
 931   // Now a nop
 932 }
 933
// Minimum reliability percent (stored reliability score divided by stored byte
// count) a language needs in order to be kept. RemoveUnreliableLanguages
// merges or deletes anything lower, and ExtractLangEtc uses the same cutoff
// for the overall is_reliable result.
static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this

// For Tier3 languages, require a minimum number of bytes to be first-place lang
// NOTE(review): no use of this constant is visible near its definition --
// confirm whether it is still referenced.
static const int kGoodFirstT3MinBytes = 24;         // <this => no first
 938
 939 // Move bytes for unreliable langs to another lang or UNKNOWN
 940 // doc_tote is sorted, so cannot Add
 941 //
 942 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
 943 // merge both into CHINESE.
 944 //
 945 //dsites 2009.03.19
 946 // we also want to remove Tier3 languages as the first lang if there is very
 947 // little text like ej1 ej2 ej3 ej4
 948 // maybe fold this back in earlier
 949 //
 950 void RemoveUnreliableLanguages(DocTote* doc_tote,
 951                                bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
 952   // Prepass to merge some low-reliablility languages
 953   // TODO: this shouldn't really reach in to the internal structure of doc_tote
 954   int total_bytes = 0;
 955   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
 956     int plang = doc_tote->Key(sub);
 957     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
 958
 959     Language lang = static_cast<Language>(plang);
 960     int bytes = doc_tote->Value(sub);
 961     int reli = doc_tote->Reliability(sub);
 962     if (bytes == 0) {continue;}                     // Zero bytes
 963     total_bytes += bytes;
 964
 965     // Reliable percent = stored reliable score over stored bytecount
 966     int reliable_percent = reli / bytes;
 967     if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper
 968
 969     // This language is too unreliable to keep, but we might merge it.
 970     Language altlang = UNKNOWN_LANGUAGE;
 971     if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
 972     if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative
 973
 974     // Look for alternative in doc_tote
 975     int altsub = doc_tote->Find(altlang);
 976     if (altsub < 0) {continue;}                     // No alternative text
 977
 978     int bytes2 = doc_tote->Value(altsub);
 979     int reli2 = doc_tote->Reliability(altsub);
 980     if (bytes2 == 0) {continue;}                    // Zero bytes
 981
 982     // Reliable percent is stored reliable score over stored bytecount
 983     int reliable_percent2 = reli2 / bytes2;
 984
 985     // Merge one language into the other. Break ties toward lower lang #
 986     int tosub = altsub;
 987     int fromsub = sub;
 988     bool into_lang = false;
 989     if ((reliable_percent2 < reliable_percent) ||
 990         ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
 991       tosub = sub;
 992       fromsub = altsub;
 993       into_lang = true;
 994     }
 995
 996     // Make sure merged reliability doesn't drop and is enough to avoid delete
 997     int newpercent = maxint(reliable_percent, reliable_percent2);
 998     newpercent = maxint(newpercent, kMinReliableKeepPercent);
 999     int newbytes = bytes + bytes2;
1000     int newreli = newpercent * newbytes;
1001
1002     doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
1003     doc_tote->SetScore(fromsub, 0);
1004     doc_tote->SetReliability(fromsub, 0);
1005     doc_tote->SetScore(tosub, newbytes);
1006     doc_tote->SetReliability(tosub, newreli);
1007
1008     // Show fate of unreliable languages if at least 10 bytes
1009     if (FLAGS_cld2_html && (newbytes >= 10) &&
1010         !FLAGS_cld2_quiet) {
1011       if (into_lang) {
1012         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1013                 LanguageCode(altlang), reliable_percent2, bytes2,
1014                 LanguageCode(lang));
1015       } else {
1016         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1017                 LanguageCode(lang), reliable_percent, bytes,
1018                 LanguageCode(altlang));
1019       }
1020     }
1021   }
1022
1023
1024   // Pass to delete any remaining unreliable languages
1025   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1026     int plang = doc_tote->Key(sub);
1027     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
1028
1029     Language lang = static_cast<Language>(plang);
1030     int bytes = doc_tote->Value(sub);
1031     int reli = doc_tote->Reliability(sub);
1032     if (bytes == 0) {continue;}                     // Zero bytes
1033
1034     // Reliable percent is stored as reliable score over stored bytecount
1035     int reliable_percent = reli / bytes;
1036     if (reliable_percent >= kMinReliableKeepPercent) {  // Keeper?
1037        continue;                                        // yes
1038     }
1039
1040     // Delete unreliable entry
1041     doc_tote->SetKey(sub, DocTote::kUnusedKey);
1042     doc_tote->SetScore(sub, 0);
1043     doc_tote->SetReliability(sub, 0);
1044
1045     // Show fate of unreliable languages if at least 10 bytes
1046     if (FLAGS_cld2_html && (bytes >= 10) &&
1047         !FLAGS_cld2_quiet) {
1048       fprintf(stderr, "{Unreli %s.%dR,%dB} ",
1049               LanguageCode(lang), reliable_percent, bytes);
1050     }
1051   }
1052
1053   ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
1054 }
1055
1056
1057 // Move all the text bytes from lower byte-count to higher one
1058 void MoveLang1ToLang2(Language lang1, Language lang2,
1059                       int lang1_sub, int lang2_sub,
1060                       DocTote* doc_tote,
1061                       ResultChunkVector* resultchunkvector) {
1062   // In doc_tote, move all the bytes lang1 => lang2
1063   int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
1064   doc_tote->SetValue(lang2_sub, sum);
1065   sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
1066   doc_tote->SetScore(lang2_sub, sum);
1067   sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
1068   doc_tote->SetReliability(lang2_sub, sum);
1069
1070   // Delete old entry
1071   doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
1072   doc_tote->SetScore(lang1_sub, 0);
1073   doc_tote->SetReliability(lang1_sub, 0);
1074
1075   // In resultchunkvector, move all the bytes lang1 => lang2
1076   if (resultchunkvector == NULL) {return;}
1077
1078   int k = 0;
1079   uint16 prior_lang = UNKNOWN_LANGUAGE;
1080   for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
1081     ResultChunk* rc = &(*resultchunkvector)[i];
1082     if (rc->lang1 == lang1) {
1083       // Update entry[i] lang1 => lang2
1084       rc->lang1 = lang2;
1085     }
1086     // One change may produce two merges -- entry before and entry after
1087     if ((rc->lang1 == prior_lang) && (k > 0)) {
1088       // Merge with previous, deleting entry[i]
1089       ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
1090       prior_rc->bytes += rc->bytes;
1091       // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
1092     } else {
1093       // Keep entry[i]
1094       (*resultchunkvector)[k] = (*resultchunkvector)[i];
1095       // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
1096       ++k;
1097     }
1098     prior_lang = rc->lang1;
1099   }
1100   resultchunkvector->resize(k);
1101 }
1102
1103
1104
1105 // Move less likely byte count to more likely for close pairs of languages
1106 // If given, also update resultchunkvector
1107 void RefineScoredClosePairs(DocTote* doc_tote,
1108                             ResultChunkVector* resultchunkvector,
1109                             bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1110   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1111     int close_packedlang = doc_tote->Key(sub);
1112     int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
1113     if (subscr == 0) {continue;}
1114
1115     // We have a close pair language -- if the other one is also scored and the
1116     // longword score differs enough, put all our eggs into one basket
1117
1118     // Nonzero longword score: Go look for the other of this pair
1119     for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1120       if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
1121         // We have a matching pair
1122         int close_packedlang2 = doc_tote->Key(sub2);
1123
1124         // Move all the text bytes from lower byte-count to higher one
1125         int from_sub, to_sub;
1126         Language from_lang, to_lang;
1127         if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1128           from_sub = sub;
1129           to_sub = sub2;
1130           from_lang = static_cast<Language>(close_packedlang);
1131           to_lang = static_cast<Language>(close_packedlang2);
1132         } else {
1133           from_sub = sub2;
1134           to_sub = sub;
1135           from_lang = static_cast<Language>(close_packedlang2);
1136           to_lang = static_cast<Language>(close_packedlang);
1137         }
1138
1139         if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1140           // Show fate of closepair language
1141           int val = doc_tote->Value(from_sub);           // byte count
1142           int reli = doc_tote->Reliability(from_sub);
1143           int reliable_percent = reli / (val ? val : 1);  // avoid zdiv
1144           fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
1145                   LanguageCode(from_lang),
1146                   reliable_percent,
1147                   doc_tote->Value(from_sub),
1148                   LanguageCode(to_lang));
1149         }
1150         MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
1151                          doc_tote, resultchunkvector);
1152         break;    // Exit inner for sub2 loop
1153       }
1154     }     // End for sub2
1155   }   // End for sub
1156 }
1157
1158
1159 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
1160                         uint8* lang_hint_boost) {
1161 }
1162
1163
1164 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
1165    string temp(txt, len);
1166    fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
1167 }
1168
1169 void PrintLang(FILE* f, Tote* chunk_tote,
1170               Language cur_lang, bool cur_unreliable,
1171               Language prior_lang, bool prior_unreliable) {
1172   if (cur_lang == prior_lang) {
1173     fprintf(f, "[]");
1174   } else {
1175     fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
1176   }
1177 }
1178
1179
1180 void PrintTopLang(Language top_lang) {
1181   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1182     fprintf(stderr, "[] ");
1183   } else {
1184     fprintf(stderr, "[%s] ", LanguageName(top_lang));
1185     prior_lang = top_lang;
1186   }
1187 }
1188
1189 void PrintTopLangSpeculative(Language top_lang) {
1190   fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1191   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1192     fprintf(stderr, "[] ");
1193   } else {
1194     fprintf(stderr, "[%s] ", LanguageName(top_lang));
1195     prior_lang = top_lang;
1196   }
1197   fprintf(stderr, "</span>\n");
1198 }
1199
1200 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1201                 const int* text_bytes, const bool* is_reliable) {
1202   fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
1203   if (language3[0] != UNKNOWN_LANGUAGE) {
1204     fprintf(f, "%s%s(%d%%)  ",
1205             LanguageName(language3[0]),
1206             *is_reliable ? "" : "*",
1207             percent3[0]);
1208   }
1209   if (language3[1] != UNKNOWN_LANGUAGE) {
1210     fprintf(f, "%s(%d%%)  ", LanguageName(language3[1]), percent3[1]);
1211   }
1212   if (language3[2] != UNKNOWN_LANGUAGE) {
1213     fprintf(f, "%s(%d%%)  ", LanguageName(language3[2]), percent3[2]);
1214   }
1215   fprintf(f, "%d bytes \n", *text_bytes);
1216
1217   fprintf(f, "<br>\n");
1218 }
1219
1220
1221 // Return internal probability score (sum) per 1024 bytes
1222 double GetNormalizedScore(Language lang, ULScript ulscript,
1223                           int bytecount, int score) {
1224   if (bytecount <= 0) {return 0.0;}
1225   return (score << 10) / bytecount;
1226 }
1227
1228 // Extract return values before fixups
1229 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
1230                     int* reliable_percent3, Language* language3, int* percent3,
1231                     double*  normalized_score3,
1232                     int* text_bytes, bool* is_reliable) {
1233   reliable_percent3[0] = 0;
1234   reliable_percent3[1] = 0;
1235   reliable_percent3[2] = 0;
1236   language3[0] = UNKNOWN_LANGUAGE;
1237   language3[1] = UNKNOWN_LANGUAGE;
1238   language3[2] = UNKNOWN_LANGUAGE;
1239   percent3[0] = 0;
1240   percent3[1] = 0;
1241   percent3[2] = 0;
1242   normalized_score3[0] = 0.0;
1243   normalized_score3[1] = 0.0;
1244   normalized_score3[2] = 0.0;
1245
1246   *text_bytes = total_text_bytes;
1247   *is_reliable = false;
1248
1249   int bytecount1 = 0;
1250   int bytecount2 = 0;
1251   int bytecount3 = 0;
1252
1253   int lang1 = doc_tote->Key(0);
1254   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1255     // We have a top language
1256     language3[0] = static_cast<Language>(lang1);
1257     bytecount1 = doc_tote->Value(0);
1258     int reli1 = doc_tote->Reliability(0);
1259     reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1);  // avoid zdiv
1260     normalized_score3[0] = GetNormalizedScore(language3[0],
1261                                                   ULScript_Common,
1262                                                   bytecount1,
1263                                                   doc_tote->Score(0));
1264   }
1265
1266   int lang2 = doc_tote->Key(1);
1267   if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
1268     language3[1] = static_cast<Language>(lang2);
1269     bytecount2 = doc_tote->Value(1);
1270     int reli2 = doc_tote->Reliability(1);
1271     reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1);  // avoid zdiv
1272     normalized_score3[1] = GetNormalizedScore(language3[1],
1273                                                   ULScript_Common,
1274                                                   bytecount2,
1275                                                   doc_tote->Score(1));
1276   }
1277
1278   int lang3 = doc_tote->Key(2);
1279   if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
1280     language3[2] = static_cast<Language>(lang3);
1281     bytecount3 = doc_tote->Value(2);
1282     int reli3 = doc_tote->Reliability(2);
1283     reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1);  // avoid zdiv
1284     normalized_score3[2] = GetNormalizedScore(language3[2],
1285                                                   ULScript_Common,
1286                                                   bytecount3,
1287                                                   doc_tote->Score(2));
1288   }
1289
1290   // Increase total bytes to sum (top 3) if low for some reason
1291   int total_bytecount12 = bytecount1 + bytecount2;
1292   int total_bytecount123 = total_bytecount12 + bytecount3;
1293   if (total_text_bytes < total_bytecount123) {
1294     total_text_bytes = total_bytecount123;
1295     *text_bytes = total_text_bytes;
1296   }
1297
1298   // Sum minus previous % gives better roundoff behavior than bytecount/total
1299   int total_text_bytes_div = maxint(1, total_text_bytes);    // Avoid zdiv
1300   percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1301   percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1302   percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1303   percent3[2] -= percent3[1];
1304   percent3[1] -= percent3[0];
1305
1306   // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1307   // Fix this explicitly
1308   if (percent3[1] < percent3[2]) {
1309     ++percent3[1];
1310     --percent3[2];
1311   }
1312   if (percent3[0] < percent3[1]) {
1313     ++percent3[0];
1314     --percent3[1];
1315   }
1316
1317   *text_bytes = total_text_bytes;
1318
1319   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1320     // We have a top language
1321     // Its reliability is overall result reliability
1322     int bytecount = doc_tote->Value(0);
1323     int reli = doc_tote->Reliability(0);
1324     int reliable_percent = reli / (bytecount ? bytecount : 1);  // avoid zdiv
1325     *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
1326   } else {
1327     // No top language at all. This can happen with zero text or 100% Klingon
1328     // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
1329     *is_reliable = false;
1330   }
1331
1332   // If ignore percent is too large, set unreliable.
1333   int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1334   if ((ignore_percent > kIgnoreMaxPercent)) {
1335     *is_reliable = false;
1336   }
1337 }
1338
1339 bool IsFIGS(Language lang) {
1340   if (lang == FRENCH) {return true;}
1341   if (lang == ITALIAN) {return true;}
1342   if (lang == GERMAN) {return true;}
1343   if (lang == SPANISH) {return true;}
1344   return false;
1345 }
1346
1347 bool IsEFIGS(Language lang) {
1348   if (lang == ENGLISH) {return true;}
1349   if (lang == FRENCH) {return true;}
1350   if (lang == ITALIAN) {return true;}
1351   if (lang == GERMAN) {return true;}
1352   if (lang == SPANISH) {return true;}
1353   return false;
1354 }
1355
// For Tier3 languages, require more bytes of text to override
// the first-place language
// kGoodSecondT1T2MinBytes is the byte floor CalcSummaryLang applies to the
// second language before returning it instead of the first.
// NOTE(review): no use of kGoodSecondT3MinBytes is visible near its
// definition -- confirm whether the Tier3 floor is still applied anywhere.
static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
static const int kGoodSecondT3MinBytes = 128;         // <this => no second
1360
1361 // Calculate a single summary language for the document, and its reliability.
1362 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1363 // This is the heart of matching human-rater perception.
1364 // reliable_percent3[] is currently unused
1365 //
1366 // Do not return Tier3 second language unless there are at least 128 bytes
1367 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
1368                      const int* reliable_percent3,
1369                      const Language* language3,
1370                      const int* percent3,
1371                      Language* summary_lang, bool* is_reliable,
1372                      bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1373   // Vector of active languages; changes if we delete some
1374   int slot_count = 3;
1375   int active_slot[3] = {0, 1, 2};
1376
1377   int ignore_percent = 0;
1378   int return_percent = percent3[0];   // Default to top lang
1379   *summary_lang = language3[0];
1380   *is_reliable = true;
1381   if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1382
1383   // If any of top 3 is IGNORE, remove it and increment ignore_percent
1384   for (int i = 0; i < 3; ++i) {
1385     if (language3[i] == TG_UNKNOWN_LANGUAGE) {
1386       ignore_percent += percent3[i];
1387       // Move the rest up, levaing input vectors unchanged
1388       for (int j=i+1; j < 3; ++j) {
1389         active_slot[j - 1] = active_slot[j];
1390       }
1391       -- slot_count;
1392       // Logically remove Ignore from percentage-text calculation
1393       // (extra 1 in 101 avoids zdiv, biases slightly small)
1394       return_percent = (percent3[0] * 100) / (101 - ignore_percent);
1395       *summary_lang = language3[active_slot[0]];
1396       if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
1397     }
1398   }
1399
1400
1401   // If English and X, where X (not UNK) is big enough,
1402   // assume the English is boilerplate and return X.
1403   // Logically remove English from percentage-text calculation
1404   int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
1405   // Require more bytes of text for Tier3 languages
1406   int minbytesneeded = kGoodSecondT1T2MinBytes;
1407   int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
1408
1409   if ((language3[active_slot[0]] == ENGLISH) &&
1410       (language3[active_slot[1]] != ENGLISH) &&
1411       (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1412       (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
1413       (second_bytes >= minbytesneeded)) {
1414     ignore_percent += percent3[active_slot[0]];
1415     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1416     *summary_lang = language3[active_slot[1]];
1417     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1418
1419   // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
1420   // assume the FIGS is boilerplate and return X.
1421   // Logically remove FIGS from percentage-text calculation
1422   } else if (IsFIGS(language3[active_slot[0]]) &&
1423              !IsEFIGS(language3[active_slot[1]]) &&
1424              (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1425              (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
1426              (second_bytes >= minbytesneeded)) {
1427     ignore_percent += percent3[active_slot[0]];
1428     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1429     *summary_lang = language3[active_slot[1]];
1430     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1431
1432   // Else we are returning the first language, but want to improve its
1433   // return_percent if the second language should be ignored
1434   } else  if ((language3[active_slot[1]] == ENGLISH) &&
1435               (language3[active_slot[0]] != ENGLISH)) {
1436     ignore_percent += percent3[active_slot[1]];
1437     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1438   } else  if (IsFIGS(language3[active_slot[1]]) &&
1439               !IsEFIGS(language3[active_slot[0]])) {
1440     ignore_percent += percent3[active_slot[1]];
1441     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1442   }
1443
1444   // If return percent is too small (too many languages), return UNKNOWN
1445   if ((return_percent < kGoodFirstMinPercent)) {
1446     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1447       fprintf(stderr, "{Unreli %s %d%% percent too small} ",
1448               LanguageCode(*summary_lang), return_percent);
1449     }
1450     *summary_lang = UNKNOWN_LANGUAGE;
1451     *is_reliable = false;
1452   }
1453
1454   // If return percent is small, return language but set unreliable.
1455   if ((return_percent < kGoodFirstReliableMinPercent)) {
1456     *is_reliable = false;
1457   }
1458
1459   // If ignore percent is too large, set unreliable.
1460   ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1461   if ((ignore_percent > kIgnoreMaxPercent)) {
1462     *is_reliable = false;
1463   }
1464
1465   // If we removed all the active languages, return UNKNOWN
1466   if (slot_count == 0) {
1467     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1468       fprintf(stderr, "{Unreli %s no languages left} ",
1469               LanguageCode(*summary_lang));
1470     }
1471     *summary_lang = UNKNOWN_LANGUAGE;
1472     *is_reliable = false;
1473   }
1474 }
1475
1476 void AddLangPriorBoost(Language lang, uint32 langprob,
1477                        ScoringContext* scoringcontext) {
1478   // This is called 0..n times with language hints
1479   // but we don't know the script -- so boost either or both Latn, Othr.
1480
1481   if (IsLatnLanguage(lang)) {
1482     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
1483     int n = langprior_boost->n;
1484     langprior_boost->langprob[n] = langprob;
1485     langprior_boost->n = langprior_boost->wrap(n + 1);
1486   }
1487
1488   if (IsOthrLanguage(lang)) {
1489     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
1490     int n = langprior_boost->n;
1491     langprior_boost->langprob[n] = langprob;
1492     langprior_boost->n = langprior_boost->wrap(n + 1);
1493   }
1494
1495 }
1496
1497 void AddOneWhack(Language whacker_lang, Language whackee_lang,
1498                  ScoringContext* scoringcontext) {
1499   uint32 langprob = MakeLangProb(whackee_lang, 1);
1500   // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
1501   if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
1502     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
1503     int n = langprior_whack->n;
1504     langprior_whack->langprob[n] = langprob;
1505     langprior_whack->n = langprior_whack->wrap(n + 1);
1506   }
1507   if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
1508     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
1509     int n = langprior_whack->n;
1510     langprior_whack->langprob[n] = langprob;
1511     langprior_whack->n = langprior_whack->wrap(n + 1);
1512  }
1513 }
1514
1515 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
1516   // We do not in general want zh-Hans and zh-Hant to be close pairs,
1517   // but we do here.
1518   if (lang == CLD2::CHINESE) {
1519     AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
1520     return;
1521   }
1522   if (lang == CLD2::CHINESE_T) {
1523     AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
1524     return;
1525   }
1526
1527   int base_lang_set = LanguageCloseSet(lang);
1528   if (base_lang_set == 0) {return;}
1529   // TODO: add an explicit list of each set to avoid this 512-times loop
1530   for (int i = 0; i < kLanguageToPLangSize; ++i) {
1531     Language lang2 = static_cast<Language>(i);
1532     if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
1533       AddOneWhack(lang, lang2, scoringcontext);
1534     }
1535   }
1536 }
1537
1538
1539 void ApplyHints(const char* buffer,
1540                 int buffer_length,
1541                 bool is_plain_text,
1542                 const CLDHints* cld_hints,
1543                 ScoringContext* scoringcontext) {
1544   CLDLangPriors lang_priors;
1545   InitCLDLangPriors(&lang_priors);
1546
1547   // We now use lang= tags.
1548   // Last look, circa 2008 found only 15% of web pages with lang= tags and
1549   // many of those were wrong. Now (July 2011), we find 44% of web pages have
1550   // lang= tags, and most of them are correct. So we now give them substantial
1551   // weight in each chunk scored.
1552   if (!is_plain_text) {
1553     // Get any contained language tags in first n KB
1554     int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
1555     string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
1556                                            max_scan_bytes);
1557     SetCLDLangTagsHint(lang_tags, &lang_priors);
1558     if (scoringcontext->flags_cld2_html) {
1559       if (!lang_tags.empty()) {
1560         fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
1561                 lang_tags.c_str());
1562       }
1563     }
1564   }
1565
1566   if (cld_hints != NULL) {
1567     if ((cld_hints->content_language_hint != NULL) &&
1568         (cld_hints->content_language_hint[0] != '\0')) {
1569       SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
1570     }
1571
1572     // Input is from GetTLD(), already lowercased
1573     if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
1574       SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
1575     }
1576
1577     if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
1578       Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
1579       SetCLDEncodingHint(enc, &lang_priors);
1580     }
1581
1582     if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
1583       SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
1584     }
1585   }
1586
1587   // Keep no more than four different languages with hints
1588   TrimCLDLangPriors(4, &lang_priors);
1589
1590   if (scoringcontext->flags_cld2_html) {
1591     string print_temp = DumpCLDLangPriors(&lang_priors);
1592     if (!print_temp.empty()) {
1593       fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
1594               print_temp.c_str());
1595     }
1596   }
1597
1598   // Put boosts into ScoringContext
1599   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1600     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1601     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1602     if (qprob > 0) {
1603       uint32 langprob = MakeLangProb(lang, qprob);
1604       AddLangPriorBoost(lang, langprob, scoringcontext);
1605     }
1606   }
1607
1608   // Put whacks into scoring context
1609   // We do not in general want zh-Hans and zh-Hant to be close pairs,
1610   // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
1611   std::vector<int> close_set_count(kCloseSetSize + 1, 0);
1612
1613   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1614     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1615     ++close_set_count[LanguageCloseSet(lang)];
1616     if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
1617     if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
1618   }
1619
1620   // If a boost language is in a close set, force suppressing the others in
1621   // that set, if exactly one of the set is present
1622   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1623     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1624     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1625     if (qprob > 0) {
1626       int close_set = LanguageCloseSet(lang);
1627       if ((close_set > 0) && (close_set_count[close_set] == 1)) {
1628         AddCloseLangWhack(lang, scoringcontext);
1629       }
1630       if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
1631           (close_set_count[kCloseSetSize] == 1)) {
1632         AddCloseLangWhack(lang, scoringcontext);
1633       }
1634     }
1635   }
1636
1637
1638
1639
1640
1641
1642 }
1643
1644
1645
1646 // Results language3/percent3/text_bytes must be exactly three items
1647 Language DetectLanguageSummaryV2(
1648                         const char* buffer,
1649                         int buffer_length,
1650                         bool is_plain_text,
1651                         const CLDHints* cld_hints,
1652                         bool allow_extended_lang,
1653                         int flags,
1654                         Language plus_one,
1655                         Language* language3,
1656                         int* percent3,
1657                         double* normalized_score3,
1658                         ResultChunkVector* resultchunkvector,
1659                         int* text_bytes,
1660                         bool* is_reliable) {
1661   language3[0] = UNKNOWN_LANGUAGE;
1662   language3[1] = UNKNOWN_LANGUAGE;
1663   language3[2] = UNKNOWN_LANGUAGE;
1664   percent3[0] = 0;
1665   percent3[1] = 0;
1666   percent3[2] = 0;
1667   normalized_score3[0] = 0.0;
1668   normalized_score3[1] = 0.0;
1669   normalized_score3[2] = 0.0;
1670   if (resultchunkvector != NULL) {
1671     resultchunkvector->clear();
1672   }
1673   *text_bytes = 0;
1674   *is_reliable = false;
1675
1676   if ((flags & kCLDFlagEcho) != 0) {
1677      string temp(buffer, buffer_length);
1678      if ((flags & kCLDFlagHtml) != 0) {
1679         fprintf(stderr, "CLD2[%d] '%s'<br>\n",
1680                 buffer_length, GetHtmlEscapedText(temp).c_str());
1681      } else {
1682         fprintf(stderr, "CLD2[%d] '%s'\n",
1683                 buffer_length, GetPlainEscapedText(temp).c_str());
1684      }
1685   }
1686
1687 #ifdef CLD2_DYNAMIC_MODE
1688   // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
1689   // hasn't been loaded yet. This is the only sane thing we can do, as there
1690   // are no scoring tables to consult.
1691   bool dataLoaded = isDataLoaded();
1692   if ((flags & kCLDFlagVerbose) != 0) {
1693     fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
1694   }
1695   if (!dataLoaded) {
1696     return UNKNOWN_LANGUAGE;
1697   }
1698 #endif
1699
1700   // Exit now if no text
1701   if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
1702   if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
1703
1704   // Document totals
1705   DocTote doc_tote;   // Reliability = 0..100
1706
1707   // ScoringContext carries state across scriptspans
1708   ScoringContext scoringcontext;
1709   scoringcontext.debug_file = stderr;
1710   scoringcontext.flags_cld2_score_as_quads =
1711     ((flags & kCLDFlagScoreAsQuads) != 0);
1712   scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
1713   scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
1714   scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
1715   scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
1716   scoringcontext.ulscript = ULScript_Common;
1717   scoringcontext.scoringtables = &kScoringtables;
1718   scoringcontext.scanner = NULL;
1719   scoringcontext.init();            // Clear the internal memory arrays
1720
1721   // Now thread safe.
1722   bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
1723   bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
1724
1725   ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
1726
1727   // Four individual script totals, Latin, Han, other2, other3
1728   int next_other_tote = 2;
1729   int tote_num = 0;
1730
1731   // Four totes for up to four different scripts pending at once
1732   Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other
1733   bool tote_seen[4] = {false, false, false, false};
1734   int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk
1735   ULScript tote_script[4] =
1736     {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
1737
1738   // Loop through text spans in a single script
1739   ScriptScanner ss(buffer, buffer_length, is_plain_text);
1740   LangSpan scriptspan;
1741
1742   scoringcontext.scanner = &ss;
1743
1744   scriptspan.text = NULL;
1745   scriptspan.text_bytes = 0;
1746   scriptspan.offset = 0;
1747   scriptspan.ulscript = ULScript_Common;
1748   scriptspan.lang = UNKNOWN_LANGUAGE;
1749
1750   int total_text_bytes = 0;
1751   int textlimit = FLAGS_cld_textlimit << 10;    // in KB
1752   if (textlimit == 0) {textlimit = 0x7fffffff;}
1753
1754   int advance_by = 2;                   // Advance 2 bytes
1755   int advance_limit = textlimit >> 3;   // For first 1/8 of max document
1756
1757   int initial_word_span = kDefaultWordSpan;
1758   if (FLAGS_cld_forcewords) {
1759     initial_word_span = kReallyBigWordSpan;
1760   }
1761
1762   // Pick up chunk sizes
1763   // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
1764   // Sanity check -- force into a reasonable range
1765   int chunksizequads = FLAGS_cld_smoothwidth;
1766   chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
1767                                kMaxChunkSizeQuads);
1768   int chunksizeunis = (chunksizequads * 5) >> 1;
1769
1770   // Varying short-span limit doesn't work well -- skips too much beyond 20KB
1771   // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
1772   int spantooshortlimit = kShortSpanThresh;
1773
1774   // For debugging only. Not thread-safe
1775   prior_lang = UNKNOWN_LANGUAGE;
1776   prior_unreliable = false;
1777
1778   // Allocate full-document prediction table for finding repeating words
1779   int hash = 0;
1780   int* predict_tbl = new int[kPredictionTableSize];
1781   if (FlagRepeats(flags)) {
1782     memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1783   }
1784
1785
1786
1787   // Loop through scriptspans accumulating number of text bytes in each language
1788   while (ss.GetOneScriptSpanLower(&scriptspan)) {
1789     ULScript ulscript = scriptspan.ulscript;
1790
1791     // Squeeze out big chunks of text span if asked to
1792     if (FlagSqueeze(flags)) {
1793       // Remove repetitive or mostly-spaces chunks
1794       int newlen;
1795       int chunksize = 0;    // Use the default
1796       if (resultchunkvector != NULL) {
1797          newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
1798                                                scriptspan.text_bytes,
1799                                                chunksize);
1800       } else {
1801          newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
1802                                       chunksize);
1803       }
1804       scriptspan.text_bytes = newlen;
1805     } else {
1806       // Check now and then to see if we should be squeezing
1807       if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
1808           !FlagFinish(flags)) {
1809         // fprintf(stderr, "CheapSqueezeTriggerTest, "
1810         //                 "first %d bytes of %d (>%d/2)<br>\n",
1811         //         kCheapSqueezeTestLen,
1812         //         scriptspan.text_bytes,
1813         //         kCheapSqueezeTestThresh);
1814
1815         if (CheapSqueezeTriggerTest(scriptspan.text,
1816                                       scriptspan.text_bytes,
1817                                       kCheapSqueezeTestLen)) {
1818           // Recursive call with big-chunk squeezing set
1819           if (FLAGS_cld2_html || FLAGS_dbgscore) {
1820             fprintf(stderr,
1821                     "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
1822                     total_text_bytes);
1823           }
1824           // Deallocate full-document prediction table
1825           delete[] predict_tbl;
1826
1827           return DetectLanguageSummaryV2(
1828                             buffer,
1829                             buffer_length,
1830                             is_plain_text,
1831                             cld_hints,
1832                             allow_extended_lang,
1833                             flags | kCLDFlagSqueeze,
1834                             plus_one,
1835                             language3,
1836                             percent3,
1837                             normalized_score3,
1838                             resultchunkvector,
1839                             text_bytes,
1840                             is_reliable);
1841         }
1842       }
1843     }
1844
1845     // Remove repetitive words if asked to
1846     if (FlagRepeats(flags)) {
1847       // Remove repetitive words
1848       int newlen;
1849       if (resultchunkvector != NULL) {
1850         newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
1851                                                scriptspan.text_bytes,
1852                                                &hash, predict_tbl);
1853       } else {
1854         newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
1855                                       &hash, predict_tbl);
1856       }
1857       scriptspan.text_bytes = newlen;
1858     }
1859
1860     // Scoring depends on scriptspan buffer ALWAYS having
1861     // leading space and off-the-end space space space NUL,
1862     // DCHECK(scriptspan.text[0] == ' ');
1863     // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
1864     // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
1865     // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
1866     // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
1867
1868     // The real scoring
1869     // Accumulate directly into the document total, or accmulate in one of four
1870     // chunk totals. The purpose of the multiple chunk totals is to piece
1871     // together short choppy pieces of text in alternating scripts. One total is
1872     // dedicated to Latin text, one to Han text, and the other two are dynamicly
1873     // assigned.
1874
1875     scoringcontext.ulscript = scriptspan.ulscript;
1876     // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
1877
1878     ScoreOneScriptSpan(scriptspan,
1879                        &scoringcontext,
1880                        &doc_tote,
1881                        resultchunkvector);
1882
1883     total_text_bytes += scriptspan.text_bytes;
1884   }     // End while (ss.GetOneScriptSpanLower())
1885
1886   // Deallocate full-document prediction table
1887   delete[] predict_tbl;
1888
1889   if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1890     // If no forced <cr>, put one in front of dump
1891     if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
1892     doc_tote.Dump(stderr);
1893   }
1894
1895
1896   // If extended langauges are disallowed, remove them here
1897   if (!allow_extended_lang) {
1898     RemoveExtendedLanguages(&doc_tote);
1899   }
1900
1901   // Force close pairs to one or the other
1902   // If given, also update resultchunkvector
1903   RefineScoredClosePairs(&doc_tote, resultchunkvector,
1904                          FLAGS_cld2_html, FLAGS_cld2_quiet);
1905
1906
1907   // Calculate return results
1908   // Find top three byte counts in tote heap
1909   int reliable_percent3[3];
1910
1911   // Cannot use Add, etc. after sorting
1912   doc_tote.Sort(3);
1913
1914   ExtractLangEtc(&doc_tote, total_text_bytes,
1915                  reliable_percent3, language3, percent3, normalized_score3,
1916                  text_bytes, is_reliable);
1917
1918   bool have_good_answer = false;
1919   if (FlagFinish(flags)) {
1920     // Force a result
1921     have_good_answer = true;
1922   } else if (total_text_bytes <= kShortTextThresh) {
1923     // Don't recurse on short text -- we already did word scores
1924     have_good_answer = true;
1925   } else if (*is_reliable &&
1926              (percent3[0] >= kGoodLang1Percent)) {
1927     have_good_answer = true;
1928   } else if (*is_reliable &&
1929              ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
1930     have_good_answer = true;
1931   }
1932
1933
1934   if (have_good_answer) {
1935     // This is the real, non-recursive return
1936
1937     // Move bytes for unreliable langs to another lang or UNKNOWN
1938     RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
1939
1940     // Redo the result extraction after the removal above
1941     doc_tote.Sort(3);
1942     ExtractLangEtc(&doc_tote, total_text_bytes,
1943                    reliable_percent3, language3, percent3, normalized_score3,
1944                    text_bytes, is_reliable);
1945
1946
1947
1948     Language summary_lang;
1949     CalcSummaryLang(&doc_tote, total_text_bytes,
1950                     reliable_percent3, language3, percent3,
1951                     &summary_lang, is_reliable,
1952                     FLAGS_cld2_html, FLAGS_cld2_quiet);
1953
1954     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1955       for (int i = 0; i < 3; ++i) {
1956         if (language3[i] != UNKNOWN_LANGUAGE) {
1957           fprintf(stderr, "%s.%dR(%d%%) ",
1958                   LanguageCode(language3[i]),
1959                   reliable_percent3[i],
1960                   percent3[i]);
1961         }
1962       }
1963
1964       fprintf(stderr, "%d bytes ", total_text_bytes);
1965       fprintf(stderr, "= %s%c ",
1966               LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1967       fprintf(stderr, "<br><br>\n");
1968     }
1969
1970     // Slightly condensed if quiet
1971     if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
1972       fprintf(stderr, "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ");
1973       for (int i = 0; i < 3; ++i) {
1974         if (language3[i] != UNKNOWN_LANGUAGE) {
1975           fprintf(stderr, "&nbsp;&nbsp;%s %d%% ",
1976                   LanguageCode(language3[i]),
1977                   percent3[i]);
1978         }
1979       }
1980       fprintf(stderr, "= %s%c ",
1981               LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1982       fprintf(stderr, "<br>\n");
1983     }
1984
1985     return summary_lang;
1986   }
1987
1988   // Not a good answer -- do recursive call to refine
1989   if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1990     // This is what we hope to improve on in the recursive call, if any
1991     PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
1992   }
1993
1994   // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
1995   // For this purpose, we treate "Ignore" as top40
1996   Language new_plus_one = UNKNOWN_LANGUAGE;
1997
1998   if (total_text_bytes < kShortTextThresh) {
1999       // Short text: Recursive call with top40 and short set
2000       if (FLAGS_cld2_html || FLAGS_dbgscore) {
2001         fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
2002                 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2003                 total_text_bytes);
2004       }
2005       return DetectLanguageSummaryV2(
2006                         buffer,
2007                         buffer_length,
2008                         is_plain_text,
2009                         cld_hints,
2010                         allow_extended_lang,
2011                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
2012                           kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2013                         new_plus_one,
2014                         language3,
2015                         percent3,
2016                         normalized_score3,
2017                         resultchunkvector,
2018                         text_bytes,
2019                         is_reliable);
2020   }
2021
2022   // Longer text: Recursive call with top40 set
2023   if (FLAGS_cld2_html || FLAGS_dbgscore) {
2024     fprintf(stderr,
2025             "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2026             total_text_bytes);
2027   }
2028   return DetectLanguageSummaryV2(
2029                         buffer,
2030                         buffer_length,
2031                         is_plain_text,
2032                         cld_hints,
2033                         allow_extended_lang,
2034                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
2035                           kCLDFlagFinish,
2036                         new_plus_one,
2037                         language3,
2038                         percent3,
2039                         normalized_score3,
2040                         resultchunkvector,
2041                         text_bytes,
2042                         is_reliable);
2043 }
2044
2045
2046 // For debugging and wrappers. Not thread safe.
2047 static char temp_detectlanguageversion[32];
2048
2049 // Return version text string
2050 // String is "code_version - data_build_date"
2051 const char* DetectLanguageVersion() {
2052   if (kScoringtables.quadgram_obj == NULL) {return "";}
2053   sprintf(temp_detectlanguageversion,
2054           "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
2055   return temp_detectlanguageversion;
2056 }
2057
2058
2059 }       // End namespace CLD2