src/third_party/cld_2/src/internal/compact_lang_det_impl.cc

   1 // Copyright 2013 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 //
  16 // Author: dsites@google.com (Dick Sites)
  17 // Updated 2014.01 for dual table lookup
  18 //
  19
  20 #include <stdint.h>
  21 #include <stdio.h>
  22 #include <string.h>
  23 #include <string>
  24 #include <vector>
  25
  26 #include "cldutil.h"
  27 #include "debug.h"
  28 #include "integral_types.h"
  29 #include "lang_script.h"
  30 #include "utf8acceptinterchange.h"
  31 #include "utf8statetable.h"
  32
  33 #ifdef CLD2_DYNAMIC_MODE
  34 #include "cld2_dynamic_data.h"
  35 #include "cld2_dynamic_data_loader.h"
  36 #endif
  37 #include "cld2tablesummary.h"
  38 #include "compact_lang_det_impl.h"
  39 #include "compact_lang_det_hint_code.h"
  40 #include "getonescriptspan.h"
  41 #include "tote.h"
  42
  43
  44 namespace CLD2 {
  45
  46 using namespace std;
  47
// Linker supplies the right tables, from files
  49 // cld_generated_cjk_uni_prop_80.cc  cld2_generated_cjk_compatible.cc
  50 // cld_generated_cjk_delta_bi_32.cc  generated_distinct_bi_0.cc
  51 // cld2_generated_quad*.cc  cld2_generated_deltaocta*.cc
  52 // cld2_generated_distinctocta*.cc
  53 // cld_generated_score_quad_octa_1024_256.cc
  54
  55 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table
  56 //   sizes that are 1/3/5 times a power of two, instead of just powers of two.
  57 //   Gives more flexibility of total footprint for CLD2.
  58
// Defined in other translation units and resolved at link time.
extern const int kLanguageToPLangSize;
extern const int kCloseSetSize;

// Table objects defined in the generated files listed above. When
// CLD2_DYNAMIC_MODE is not defined, their addresses are used to initialize
// kScoringtables below.
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkCompat_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
extern const CLD2TableSummary kDistinctBiTable_obj;
extern const CLD2TableSummary kQuad_obj;
extern const CLD2TableSummary kQuad_obj2;     // Dual lookup tables
extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const short kAvgDeltaOctaScore[];
  71
  72 // Returns the length in bytes of the prefix of src that is all
  73 //  interchange valid UTF-8
  74 int SpanInterchangeValid(const char* src, int byte_length) {
  75   int bytes_consumed;
  76   const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
  77   StringPiece str(src, byte_length);
  78   UTF8GenericScan(st, str, &bytes_consumed);
  79   return bytes_consumed;
  80 }
  81
  82 #ifdef CLD2_DYNAMIC_MODE
  83   // CLD2_DYNAMIC_MODE is defined:
  84   // Data will be read from an mmap opened at runtime.
  85
  // Convenience for nulling things out completely at any point.
  // Each NULL slot corresponds to the table named in its trailing comment
  // (the same order as the static-mode initializer in the #else branch).
  static ScoringTables NULL_TABLES = {
    NULL, //&cld_generated_CjkUni_obj,
    NULL, //&kCjkCompat_obj,
    NULL, //&kCjkDeltaBi_obj,
    NULL, //&kDistinctBiTable_obj,
    NULL, //&kQuad_obj,
    NULL, //&kQuad_obj2,
    NULL, //&kDeltaOcta_obj,
    NULL, //&kDistinctOcta_obj,
    NULL, //kAvgDeltaOctaScore,
  };
  // Active table set; all-NULL until a load succeeds, and again after
  // unloadData().
  static ScoringTables kScoringtables = NULL_TABLES; // copy constructed
  // Set to true by a successful load, back to false by unloadData().
  static bool dynamicDataLoaded = false;
  // True if the current data came from loadDataFromFile(); selects which
  // unload routine unloadData() calls.
  static bool dataSourceIsFile = false;
  // Table set returned by the loader; handed back to the loader on unload.
  static ScoringTables* dynamicTables = NULL;
  // Filled in by CLD2DynamicDataLoader::loadDataFile() and passed back to
  // unloadDataFile().
  static void* mmapAddress = NULL;
  static uint32_t mmapLength = 0;
 104
 105   bool isDataLoaded() { return dynamicDataLoaded; }
 106   bool isDataDynamic() { return true; } // Because CLD2_DYNAMIC_MODE is defined
 107
 108   void loadDataFromFile(const char* fileName) {
 109     if (isDataLoaded()) {
 110       unloadData();
 111     }
 112     ScoringTables* result = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
 113     if (result == NULL) {
 114       fprintf(stderr, "WARNING: Dynamic data loading failed.\n");
 115       return;
 116     }
 117     dynamicTables = result;
 118     kScoringtables = *dynamicTables;
 119     dataSourceIsFile = true;
 120     dynamicDataLoaded = true;
 121   };
 122
 123   void loadDataFromRawAddress(const void* rawAddress, const uint32_t length) {
 124     if (isDataLoaded()) {
 125       unloadData();
 126     }
 127     ScoringTables* result = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length);
 128     if (result == NULL) {
 129       fprintf(stderr, "WARNING: Dynamic data loading failed.\n");
 130       return;
 131     }
 132     dynamicTables = result;
 133     kScoringtables = *dynamicTables;
 134     dataSourceIsFile = false;
 135     dynamicDataLoaded = true;
 136   }
 137
 138   void unloadData() {
 139     if (!dynamicDataLoaded) return;
 140     if (dataSourceIsFile) {
 141       CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength);
 142     } else {
 143       CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables);
 144     }
 145     dynamicDataLoaded = false;
 146     dataSourceIsFile = false; // vacuous
 147     kScoringtables = NULL_TABLES; // Housekeeping: null all pointers
 148   }
 149 #else // !CLD2_DYNAMIC_MODE
  // This initializes kScoringtables.quadgram_obj etc.
  // Static build: every slot points at a table supplied by the linker, in the
  // same slot order as the dynamic-mode NULL_TABLES initializer above.
  static const ScoringTables kScoringtables = {
    &cld_generated_CjkUni_obj,
    &kCjkCompat_obj,
    &kCjkDeltaBi_obj,
    &kDistinctBiTable_obj,

    &kQuad_obj,
    &kQuad_obj2,                              // Dual lookup tables
    &kDeltaOcta_obj,
    &kDistinctOcta_obj,

    kAvgDeltaOctaScore,
  };
 164
 165   // Method implementations below are provided so that callers aren't *forced*
 166   // to depend upon the CLD2_DYNAMIC_MODE flag, but can use runtime checks
 167   // instead. For more information, refer to CLD2 issue 16:
 168   // https://code.google.com/p/cld2/issues/detail?id=16
 169   bool isDataLoaded() { return true; } // Data is statically linked
 170   bool isDataDynamic() { return false; } // Because CLD2_DYNAMIC_MODE is not defined
 171
 172   void loadDataFromFile(const char* fileName) {
 173     // This is a bug in the calling code.
 174     fprintf(stderr, "WARNING: Dynamic mode not active, loadDataFromFile has no effect!\n");
 175   }
 176   void loadDataFromRawAddress(const void* rawAddress, const uint32_t length) {
 177     // This is a bug in the calling code.
 178     fprintf(stderr, "WARNING: Dynamic mode not active, loadDataFromRawAddress has no effect!\n");
 179   }
 180   void unloadData() {
 181     // This is a bug in the calling code.
 182     fprintf(stderr, "WARNING: Dynamic mode not active, unloadData has no effect!\n");
 183   }
 184
 185 #endif // #ifdef CLD2_DYNAMIC_MODE
 186
 187
// Compile-time settings. The FLAGS_ prefix suggests these were once runtime
// flags (NOTE(review): confirm); here they are fixed constants.
static const bool FLAGS_cld_no_minimum_bytes = false;
static const bool FLAGS_cld_forcewords = true;
static const bool FLAGS_cld_showme = false;   // When true, CheapRepWordsInplace
                                              // marks deleted words with ". "
static const bool FLAGS_cld_echotext = true;
static const int32 FLAGS_cld_textlimit = 160;
static const int32 FLAGS_cld_smoothwidth = 20;
static const bool FLAGS_cld_2011_hints = true;
static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;

static const bool FLAGS_dbgscore = false;


// Language-hint boosts
static const int kLangHintInitial = 12;  // Boost language by N initially
static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram

static const int kShortSpanThresh = 32;       // Bytes
static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans

// Thresholds for the cheap test that decides whether to squeeze at all
static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
                                                  // after this many text bytes
static const int kCheapSqueezeTestLen = 256;  // Bytes to test to trigger sqz
static const int kSpacesTriggerPercent = 25;  // Trigger sqz if >=25% spaces
static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted

// Per-chunk squeeze thresholds
static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted

// Upper bound on how far BackscanToSpace/ForwardscanToSpace look
static const int kMaxSpaceScan = 32;          // Bytes

static const int kGoodLang1Percent = 70;
static const int kGoodLang1and2Percent = 93;
static const int kShortTextThresh = 256;      // Bytes

static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads

static const int kDefaultWordSpan = 256;      // Scan at least this many initial
                                              // bytes with word scoring
static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text

static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable

// Size of the int prediction table used by CountPredictedBytes and
// CheapRepWordsInplace, which index it with a 12-bit hash.
static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
                                                // cheap compressor

// Percent thresholds; the trailing comments give the effect of falling on the
// wrong side of each threshold.
static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
static const int kGoodFirstMinPercent = 26;           // <this => UNK
static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli
static const int kIgnoreMaxPercent = 20;              // >this => unreli
static const int kKeepMinPercent = 2;                 // <this => unreli
 240
 241
 242
// Statistically closest language, based on quadgram table
// Those that are far from other languages map to UNKNOWN_LANGUAGE
// Subscripted by Language
//
// From lang_correlation.txt and hand-edits
// sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
//   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
//   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
//
// Minimum correlation percent a table row needs in order to name a closest
// language; rows below it evaluate to UNKNOWN_LANGUAGE.
static const int kMinCorrPercent = 24;        // Pick off how close you want
                                              // 24 catches PERSIAN <== ARABIC
                                              // but not SPANISH <== PORTUGUESE
// Placeholder used in table rows that have no closest language (their percent
// is 0, so the row evaluates to UNKNOWN_LANGUAGE either way).
static Language Unknown = UNKNOWN_LANGUAGE;
 256
// Suspect idea
// Subscripted by Language
//
// Each row is (percent >= kMinCorrPercent) ? closest : UNKNOWN_LANGUAGE, where
// percent is the hard-coded correlation for that pair. The trailing comment on
// a row names the language the row belongs to.
// NOTE(review): rows are assumed to follow Language enum order; the size check
// after the table is commented out, so nothing here enforces it -- confirm.
static const Language kClosestAltLanguage[] = {
  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
  (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN

  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // AKAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // IGBO
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MAURITIAN_CREOLE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // HAWAIIAN
};
 428
 429 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
 430 //                kClosestAltLanguage_has_incorrect_size);
 431
 432
 433 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
 434 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
 435 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
 436 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
 437 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
 438 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
 439 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
 440 inline bool FlagBestEffort(int flags) {
 441   return (flags & kCLDFlagBestEffort) != 0;
 442 }
 443
 444
 445   // Defines Top40 packed languages
 446
 447   // Google top 40 languages
 448   //
 449   // Tier 0/1 Language enum list (16)
 450   //   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH,    // E - FIGS
 451   //   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
 452   //   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
 453   //   ARABIC,
 454   //
 455   // Tier 2 Language enum list (22)
 456   //   SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
 457   //   HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
 458   //   VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
 459   //   TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
 460   //   UKRAINIAN, HINDI,
 461   //
 462   //   use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
 463   //
 464   // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
 465
 466
 467 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
 468   // REVISIT
 469 }
 470
 471 void PrintText(FILE* f, Language cur_lang, const string& temp) {
 472   if (temp.size() == 0) {return;}
 473   fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());
 474 }
 475
 476
 477 //------------------------------------------------------------------------------
 478 // For --cld_html debugging output. Not thread safe
 479 //------------------------------------------------------------------------------
// File-scope mutable state for the debugging output; this is what makes that
// output not thread safe.
static Language prior_lang = UNKNOWN_LANGUAGE;
static bool prior_unreliable = false;
 482
 483 //------------------------------------------------------------------------------
 484 // End For --cld_html debugging output
 485 //------------------------------------------------------------------------------
 486
 487
 488 // Backscan to word boundary, returning how many bytes n to go back
 489 // so that src - n is non-space ans src - n - 1 is space.
 490 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
 491 int BackscanToSpace(const char* src, int limit) {
 492   int n = 0;
 493   limit = minint(limit, kMaxSpaceScan);
 494   while (n < limit) {
 495     if (src[-n - 1] == ' ') {return n;}    // We are at _X
 496     ++n;
 497   }
 498   n = 0;
 499   while (n < limit) {
 500     if ((src[-n] & 0xc0) != 0x80) {return n;}    // We are at char begin
 501     ++n;
 502   }
 503   return 0;
 504 }
 505
 506 // Forwardscan to word boundary, returning how many bytes n to go forward
 507 // so that src + n is non-space ans src + n - 1 is space.
 508 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
 509 int ForwardscanToSpace(const char* src, int limit) {
 510   int n = 0;
 511   limit = minint(limit, kMaxSpaceScan);
 512   while (n < limit) {
 513     if (src[n] == ' ') {return n + 1;}    // We are at _X
 514     ++n;
 515   }
 516   n = 0;
 517   while (n < limit) {
 518     if ((src[n] & 0xc0) != 0x80) {return n;}    // We are at char begin
 519     ++n;
 520   }
 521   return 0;
 522 }
 523
 524
 525 // This uses a cheap predictor to get a measure of compression, and
 526 // hence a measure of repetitiveness. It works on complete UTF-8 characters
 527 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
 528 // all the time when done with a byte-based count. Sigh.
 529 //
 530 // To allow running prediction across multiple chunks, caller passes in current
 531 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
 532 //
 533 // Returns the number of *bytes* correctly predicted, increments by 1..4 for
 534 // each correctly-predicted character.
 535 //
 536 // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
 537 //
 538
 539 // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
 540
 541 int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
 542   int p_count = 0;
 543   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 544   const uint8* srclimit = src + src_len;
 545   int local_hash = *hash;
 546
 547   while (src < srclimit) {
 548     int c = src[0];
 549     int incr = 1;
 550
 551     // Pick up one char and length
 552     if (c < 0xc0) {
 553       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
 554       // Do nothing more
 555     } else if ((c & 0xe0) == 0xc0) {
 556       // Two-byte
 557       c = (c << 8) | src[1];
 558       incr = 2;
 559     } else if ((c & 0xf0) == 0xe0) {
 560       // Three-byte
 561       c = (c << 16) | (src[1] << 8) | src[2];
 562       incr = 3;
 563     } else {
 564       // Four-byte
 565       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
 566       incr = 4;
 567     }
 568     src += incr;
 569
 570     int p = tbl[local_hash];            // Prediction
 571     tbl[local_hash] = c;                // Update prediction
 572     if (c == p) {
 573       p_count += incr;                  // Count bytes of good predictions
 574     }
 575
 576     local_hash = ((local_hash << 4) ^ c) & 0xfff;
 577   }
 578   *hash = local_hash;
 579   return p_count;
 580 }
 581
 582
 583
 584 // Counts number of spaces; a little faster than one-at-a-time
 585 // Doesn't count odd bytes at end
 586 int CountSpaces4(const char* src, int src_len) {
 587   int s_count = 0;
 588   for (int i = 0; i < (src_len & ~3); i += 4) {
 589     s_count += (src[i] == ' ');
 590     s_count += (src[i+1] == ' ');
 591     s_count += (src[i+2] == ' ');
 592     s_count += (src[i+3] == ' ');
 593   }
 594   return s_count;
 595 }
 596
 597
 598 // Remove words of text that have more than half their letters predicted
 599 // correctly by our cheap predictor, moving the remaining words in-place
 600 // to the front of the input buffer.
 601 //
 602 // To allow running prediction across multiple chunks, caller passes in current
 603 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
 604 //
 605 // Return the new, possibly-shorter length
 606 //
 607 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
 608 // if input does
 609 //
 610 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
 611   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 612   const uint8* srclimit = src + src_len;
 613   char* dst = isrc;
 614   int local_hash = *hash;
 615   char* word_dst = dst;           // Start of next word
 616   int good_predict_bytes = 0;
 617   int word_length_bytes = 0;
 618
 619   while (src < srclimit) {
 620     int c = src[0];
 621     int incr = 1;
 622     *dst++ = c;
 623
 624     if (c == ' ') {
 625       if ((good_predict_bytes * 2) > word_length_bytes) {
 626         // Word is well-predicted: backup to start of this word
 627         dst = word_dst;
 628         if (FLAGS_cld_showme) {
 629           // Mark the deletion point with period
 630           // Don't repeat multiple periods
 631           // Cannot mark with more bytes or may overwrite unseen input
 632           if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
 633             *dst++ = '.';
 634             *dst++ = ' ';
 635           }
 636         }
 637       }
 638       word_dst = dst;              // Start of next word
 639       good_predict_bytes = 0;
 640       word_length_bytes = 0;
 641     }
 642
 643     // Pick up one char and length
 644     if (c < 0xc0) {
 645       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
 646       // Do nothing more
 647     } else if ((c & 0xe0) == 0xc0) {
 648       // Two-byte
 649       *dst++ = src[1];
 650       c = (c << 8) | src[1];
 651       incr = 2;
 652     } else if ((c & 0xf0) == 0xe0) {
 653       // Three-byte
 654       *dst++ = src[1];
 655       *dst++ = src[2];
 656       c = (c << 16) | (src[1] << 8) | src[2];
 657       incr = 3;
 658     } else {
 659       // Four-byte
 660       *dst++ = src[1];
 661       *dst++ = src[2];
 662       *dst++ = src[3];
 663       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
 664       incr = 4;
 665     }
 666     src += incr;
 667     word_length_bytes += incr;
 668
 669     int p = tbl[local_hash];            // Prediction
 670     tbl[local_hash] = c;                // Update prediction
 671     if (c == p) {
 672       good_predict_bytes += incr;       // Count good predictions
 673     }
 674
 675     local_hash = ((local_hash << 4) ^ c) & 0xfff;
 676   }
 677
 678   *hash = local_hash;
 679
 680   if ((dst - isrc) < (src_len - 3)) {
 681     // Pad and make last char clean UTF-8 by putting following spaces
 682     dst[0] = ' ';
 683     dst[1] = ' ';
 684     dst[2] = ' ';
 685     dst[3] = '\0';
 686   } else  if ((dst - isrc) < src_len) {
 687     // Make last char clean UTF-8 by putting following space off the end
 688     dst[0] = ' ';
 689   }
 690
 691   return static_cast<int>(dst - isrc);
 692 }
 693
 694
 695 // This alternate form overwrites redundant words, thus avoiding corrupting the
 696 // backmap for generating a vector of original-text ranges.
 697 int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
 698   const uint8* src = reinterpret_cast<const uint8*>(isrc);
 699   const uint8* srclimit = src + src_len;
 700   char* dst = isrc;
 701   int local_hash = *hash;
 702   char* word_dst = dst;           // Start of next word
 703   int good_predict_bytes = 0;
 704   int word_length_bytes = 0;
 705
 706   while (src < srclimit) {
 707     int c = src[0];
 708     int incr = 1;
 709     *dst++ = c;
 710
 711     if (c == ' ') {
 712       if ((good_predict_bytes * 2) > word_length_bytes) {
 713         // Word [word_dst..dst-1) is well-predicted: overwrite
 714         for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';}
 715       }
 716       word_dst = dst;              // Start of next word
 717       good_predict_bytes = 0;
 718       word_length_bytes = 0;
 719     }
 720
 721     // Pick up one char and length
 722     if (c < 0xc0) {
 723       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
 724       // Do nothing more
 725     } else if ((c & 0xe0) == 0xc0) {
 726       // Two-byte
 727       *dst++ = src[1];
 728       c = (c << 8) | src[1];
 729       incr = 2;
 730     } else if ((c & 0xf0) == 0xe0) {
 731       // Three-byte
 732       *dst++ = src[1];
 733       *dst++ = src[2];
 734       c = (c << 16) | (src[1] << 8) | src[2];
 735       incr = 3;
 736     } else {
 737       // Four-byte
 738       *dst++ = src[1];
 739       *dst++ = src[2];
 740       *dst++ = src[3];
 741       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
 742       incr = 4;
 743     }
 744     src += incr;
 745     word_length_bytes += incr;
 746
 747     int p = tbl[local_hash];            // Prediction
 748     tbl[local_hash] = c;                // Update prediction
 749     if (c == p) {
 750       good_predict_bytes += incr;       // Count good predictions
 751     }
 752
 753     local_hash = ((local_hash << 4) ^ c) & 0xfff;
 754   }
 755
 756   *hash = local_hash;
 757
 758   if ((dst - isrc) < (src_len - 3)) {
 759     // Pad and make last char clean UTF-8 by putting following spaces
 760     dst[0] = ' ';
 761     dst[1] = ' ';
 762     dst[2] = ' ';
 763     dst[3] = '\0';
 764   } else  if ((dst - isrc) < src_len) {
 765     // Make last char clean UTF-8 by putting following space off the end
 766     dst[0] = ' ';
 767   }
 768
 769   return static_cast<int>(dst - isrc);
 770 }
 771
 772
 773 // Remove portions of text that have a high density of spaces, or that are
 774 // overly repetitive, squeezing the remaining text in-place to the front of the
 775 // input buffer.
 776 //
 777 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
 778 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
 779 //
 780 // Return the new, possibly-shorter length
 781 //
 782 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
 783 // if input does
 784 //
 785 int CheapSqueezeInplace(char* isrc,
 786                                             int src_len,
 787                                             int ichunksize) {
 788   char* src = isrc;
 789   char* dst = src;
 790   char* srclimit = src + src_len;
 791   bool skipping = false;
 792
 793   int hash = 0;
 794   // Allocate local prediction table.
 795   int* predict_tbl = new int[kPredictionTableSize];
 796   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
 797
 798   int chunksize = ichunksize;
 799   if (chunksize == 0) {chunksize = kChunksizeDefault;}
 800   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
 801   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
 802
 803   while (src < srclimit) {
 804     int remaining_bytes = srclimit - src;
 805     int len = minint(chunksize, remaining_bytes);
 806     // Make len land us on a UTF-8 character boundary.
 807     // Ah. Also fixes mispredict because we could get out of phase
 808     // Loop always terminates at trailing space in buffer
 809     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
 810
 811     int space_n = CountSpaces4(src, len);
 812     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
 813     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
 814       // Skip the text
 815       if (!skipping) {
 816         // Keeping-to-skipping transition; do it at a space
 817         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
 818         dst -= n;
 819         if (dst == isrc) {
 820           // Force a leading space if the first chunk is deleted
 821           *dst++ = ' ';
 822         }
 823         if (FLAGS_cld_showme) {
 824           // Mark the deletion point with black square U+25A0
 825           *dst++ = static_cast<unsigned char>(0xe2);
 826           *dst++ = static_cast<unsigned char>(0x96);
 827           *dst++ = static_cast<unsigned char>(0xa0);
 828           *dst++ = ' ';
 829         }
 830         skipping = true;
 831       }
 832     } else {
 833       // Keep the text
 834       if (skipping) {
 835         // Skipping-to-keeping transition; do it at a space
 836         int n = ForwardscanToSpace(src, len);
 837         src += n;
 838         remaining_bytes -= n;   // Shrink remaining length
 839         len -= n;
 840         skipping = false;
 841       }
 842       // "len" can be negative in some cases
 843       if (len > 0) {
 844         memmove(dst, src, len);
 845         dst += len;
 846       }
 847     }
 848     src += len;
 849   }
 850
 851   if ((dst - isrc) < (src_len - 3)) {
 852     // Pad and make last char clean UTF-8 by putting following spaces
 853     dst[0] = ' ';
 854     dst[1] = ' ';
 855     dst[2] = ' ';
 856     dst[3] = '\0';
 857   } else   if ((dst - isrc) < src_len) {
 858     // Make last char clean UTF-8 by putting following space off the end
 859     dst[0] = ' ';
 860   }
 861
 862   // Deallocate local prediction table
 863   delete[] predict_tbl;
 864   return static_cast<int>(dst - isrc);
 865 }
 866
 867 // This alternate form overwrites redundant words, thus avoiding corrupting the
 868 // backmap for generating a vector of original-text ranges.
 869 int CheapSqueezeInplaceOverwrite(char* isrc,
 870                                             int src_len,
 871                                             int ichunksize) {
 872   char* src = isrc;
 873   char* dst = src;
 874   char* srclimit = src + src_len;
 875   bool skipping = false;
 876
 877   int hash = 0;
 878   // Allocate local prediction table.
 879   int* predict_tbl = new int[kPredictionTableSize];
 880   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
 881
 882   int chunksize = ichunksize;
 883   if (chunksize == 0) {chunksize = kChunksizeDefault;}
 884   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
 885   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
 886
 887   // Always keep first byte (space)
 888   ++src;
 889   ++dst;
 890   while (src < srclimit) {
 891     int remaining_bytes = srclimit - src;
 892     int len = minint(chunksize, remaining_bytes);
 893     // Make len land us on a UTF-8 character boundary.
 894     // Ah. Also fixes mispredict because we could get out of phase
 895     // Loop always terminates at trailing space in buffer
 896     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
 897
 898     int space_n = CountSpaces4(src, len);
 899     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
 900     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
 901       // Overwrite the text [dst-n..dst)
 902       if (!skipping) {
 903         // Keeping-to-skipping transition; do it at a space
 904         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
 905         // Text [word_dst..dst) is well-predicted: overwrite
 906         for (char* p = dst - n; p < dst; ++p) {*p = '.';}
 907         skipping = true;
 908       }
 909       // Overwrite the text [dst..dst+len)
 910       for (char* p = dst; p < dst + len; ++p) {*p = '.';}
 911       dst[len - 1] = ' ';    // Space at end so we can see what is happening
 912     } else {
 913       // Keep the text
 914       if (skipping) {
 915         // Skipping-to-keeping transition; do it at a space
 916         int n = ForwardscanToSpace(src, len);
 917         // Text [dst..dst+n) is well-predicted: overwrite
 918         for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
 919         skipping = false;
 920       }
 921     }
 922     dst += len;
 923     src += len;
 924   }
 925
 926   if ((dst - isrc) < (src_len - 3)) {
 927     // Pad and make last char clean UTF-8 by putting following spaces
 928     dst[0] = ' ';
 929     dst[1] = ' ';
 930     dst[2] = ' ';
 931     dst[3] = '\0';
 932   } else   if ((dst - isrc) < src_len) {
 933     // Make last char clean UTF-8 by putting following space off the end
 934     dst[0] = ' ';
 935   }
 936
 937   // Deallocate local prediction table
 938   delete[] predict_tbl;
 939   return static_cast<int>(dst - isrc);
 940 }
 941
 942 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
 943 //  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
 944 //  Just CountSpaces is about 340 MB/sec
 945 //  Byte-only CountPredictedBytes is about 150 MB/sec
 946 //  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
 947 //  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
 948 //  Unjammed byte-only both = 170 MB/sec
 949 //  Jammed byte-only both = 120 MB/sec
 950 //  Back to original w/slight updates, 110 MB/sec
 951 //
 952 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
 953   // Don't trigger at all on short text
 954   if (src_len < testsize) {return false;}
 955   int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
 956   int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
 957   int hash = 0;
 958   // Allocate local prediction table.
 959   int* predict_tbl = new int[kPredictionTableSize];
 960   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
 961
 962   bool retval = false;
 963   if ((CountSpaces4(src, testsize) >= space_thresh) ||
 964       (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
 965        predict_thresh)) {
 966     retval = true;
 967   }
 968   // Deallocate local prediction table
 969   delete[] predict_tbl;
 970   return retval;
 971 }
 972
 973
 974
 975
 976 // Delete any extended languages from doc_tote
 977 void RemoveExtendedLanguages(DocTote* doc_tote) {
 978   // Now a nop
 979 }
 980
 981 static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this
 982
 983 // For Tier3 languages, require a minimum number of bytes to be first-place lang
 984 static const int kGoodFirstT3MinBytes = 24;         // <this => no first
 985
 986 // Move bytes for unreliable langs to another lang or UNKNOWN
 987 // doc_tote is sorted, so cannot Add
 988 //
 989 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
 990 // merge both into CHINESE.
 991 //
 992 //dsites 2009.03.19
 993 // we also want to remove Tier3 languages as the first lang if there is very
 994 // little text like ej1 ej2 ej3 ej4
 995 // maybe fold this back in earlier
 996 //
 997 void RemoveUnreliableLanguages(DocTote* doc_tote,
 998                                bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
 999   // Prepass to merge some low-reliablility languages
1000   // TODO: this shouldn't really reach in to the internal structure of doc_tote
1001   int total_bytes = 0;
1002   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1003     int plang = doc_tote->Key(sub);
1004     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
1005
1006     Language lang = static_cast<Language>(plang);
1007     int bytes = doc_tote->Value(sub);
1008     int reli = doc_tote->Reliability(sub);
1009     if (bytes == 0) {continue;}                     // Zero bytes
1010     total_bytes += bytes;
1011
1012     // Reliable percent = stored reliable score over stored bytecount
1013     int reliable_percent = reli / bytes;
1014     if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper
1015
1016     // This language is too unreliable to keep, but we might merge it.
1017     Language altlang = UNKNOWN_LANGUAGE;
1018     if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
1019     if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative
1020
1021     // Look for alternative in doc_tote
1022     int altsub = doc_tote->Find(altlang);
1023     if (altsub < 0) {continue;}                     // No alternative text
1024
1025     int bytes2 = doc_tote->Value(altsub);
1026     int reli2 = doc_tote->Reliability(altsub);
1027     if (bytes2 == 0) {continue;}                    // Zero bytes
1028
1029     // Reliable percent is stored reliable score over stored bytecount
1030     int reliable_percent2 = reli2 / bytes2;
1031
1032     // Merge one language into the other. Break ties toward lower lang #
1033     int tosub = altsub;
1034     int fromsub = sub;
1035     bool into_lang = false;
1036     if ((reliable_percent2 < reliable_percent) ||
1037         ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
1038       tosub = sub;
1039       fromsub = altsub;
1040       into_lang = true;
1041     }
1042
1043     // Make sure merged reliability doesn't drop and is enough to avoid delete
1044     int newpercent = maxint(reliable_percent, reliable_percent2);
1045     newpercent = maxint(newpercent, kMinReliableKeepPercent);
1046     int newbytes = bytes + bytes2;
1047     int newreli = newpercent * newbytes;
1048
1049     doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
1050     doc_tote->SetScore(fromsub, 0);
1051     doc_tote->SetReliability(fromsub, 0);
1052     doc_tote->SetScore(tosub, newbytes);
1053     doc_tote->SetReliability(tosub, newreli);
1054
1055     // Show fate of unreliable languages if at least 10 bytes
1056     if (FLAGS_cld2_html && (newbytes >= 10) &&
1057         !FLAGS_cld2_quiet) {
1058       if (into_lang) {
1059         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1060                 LanguageCode(altlang), reliable_percent2, bytes2,
1061                 LanguageCode(lang));
1062       } else {
1063         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
1064                 LanguageCode(lang), reliable_percent, bytes,
1065                 LanguageCode(altlang));
1066       }
1067     }
1068   }
1069
1070
1071   // Pass to delete any remaining unreliable languages
1072   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1073     int plang = doc_tote->Key(sub);
1074     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
1075
1076     Language lang = static_cast<Language>(plang);
1077     int bytes = doc_tote->Value(sub);
1078     int reli = doc_tote->Reliability(sub);
1079     if (bytes == 0) {continue;}                     // Zero bytes
1080
1081     // Reliable percent is stored as reliable score over stored bytecount
1082     int reliable_percent = reli / bytes;
1083     if (reliable_percent >= kMinReliableKeepPercent) {  // Keeper?
1084        continue;                                        // yes
1085     }
1086
1087     // Delete unreliable entry
1088     doc_tote->SetKey(sub, DocTote::kUnusedKey);
1089     doc_tote->SetScore(sub, 0);
1090     doc_tote->SetReliability(sub, 0);
1091
1092     // Show fate of unreliable languages if at least 10 bytes
1093     if (FLAGS_cld2_html && (bytes >= 10) &&
1094         !FLAGS_cld2_quiet) {
1095       fprintf(stderr, "{Unreli %s.%dR,%dB} ",
1096               LanguageCode(lang), reliable_percent, bytes);
1097     }
1098   }
1099
1100   ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
1101 }
1102
1103
1104 // Move all the text bytes from lower byte-count to higher one
1105 void MoveLang1ToLang2(Language lang1, Language lang2,
1106                       int lang1_sub, int lang2_sub,
1107                       DocTote* doc_tote,
1108                       ResultChunkVector* resultchunkvector) {
1109   // In doc_tote, move all the bytes lang1 => lang2
1110   int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
1111   doc_tote->SetValue(lang2_sub, sum);
1112   sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
1113   doc_tote->SetScore(lang2_sub, sum);
1114   sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
1115   doc_tote->SetReliability(lang2_sub, sum);
1116
1117   // Delete old entry
1118   doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
1119   doc_tote->SetScore(lang1_sub, 0);
1120   doc_tote->SetReliability(lang1_sub, 0);
1121
1122   // In resultchunkvector, move all the bytes lang1 => lang2
1123   if (resultchunkvector == NULL) {return;}
1124
1125   int k = 0;
1126   uint16 prior_lang = UNKNOWN_LANGUAGE;
1127   for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
1128     ResultChunk* rc = &(*resultchunkvector)[i];
1129     if (rc->lang1 == lang1) {
1130       // Update entry[i] lang1 => lang2
1131       rc->lang1 = lang2;
1132     }
1133     // One change may produce two merges -- entry before and entry after
1134     if ((rc->lang1 == prior_lang) && (k > 0)) {
1135       // Merge with previous, deleting entry[i]
1136       ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
1137       prior_rc->bytes += rc->bytes;
1138       // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
1139     } else {
1140       // Keep entry[i]
1141       (*resultchunkvector)[k] = (*resultchunkvector)[i];
1142       // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
1143       ++k;
1144     }
1145     prior_lang = rc->lang1;
1146   }
1147   resultchunkvector->resize(k);
1148 }
1149
1150
1151
1152 // Move less likely byte count to more likely for close pairs of languages
1153 // If given, also update resultchunkvector
1154 void RefineScoredClosePairs(DocTote* doc_tote,
1155                             ResultChunkVector* resultchunkvector,
1156                             bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1157   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1158     int close_packedlang = doc_tote->Key(sub);
1159     int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
1160     if (subscr == 0) {continue;}
1161
1162     // We have a close pair language -- if the other one is also scored and the
1163     // longword score differs enough, put all our eggs into one basket
1164
1165     // Nonzero longword score: Go look for the other of this pair
1166     for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1167       if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
1168         // We have a matching pair
1169         int close_packedlang2 = doc_tote->Key(sub2);
1170
1171         // Move all the text bytes from lower byte-count to higher one
1172         int from_sub, to_sub;
1173         Language from_lang, to_lang;
1174         if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1175           from_sub = sub;
1176           to_sub = sub2;
1177           from_lang = static_cast<Language>(close_packedlang);
1178           to_lang = static_cast<Language>(close_packedlang2);
1179         } else {
1180           from_sub = sub2;
1181           to_sub = sub;
1182           from_lang = static_cast<Language>(close_packedlang2);
1183           to_lang = static_cast<Language>(close_packedlang);
1184         }
1185
1186         if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1187           // Show fate of closepair language
1188           int val = doc_tote->Value(from_sub);           // byte count
1189           int reli = doc_tote->Reliability(from_sub);
1190           int reliable_percent = reli / (val ? val : 1);  // avoid zdiv
1191           fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
1192                   LanguageCode(from_lang),
1193                   reliable_percent,
1194                   doc_tote->Value(from_sub),
1195                   LanguageCode(to_lang));
1196         }
1197         MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
1198                          doc_tote, resultchunkvector);
1199         break;    // Exit inner for sub2 loop
1200       }
1201     }     // End for sub2
1202   }   // End for sub
1203 }
1204
1205
1206 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
1207                         uint8* lang_hint_boost) {
1208 }
1209
1210
1211 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
1212    string temp(txt, len);
1213    fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
1214 }
1215
1216 void PrintLang(FILE* f, Tote* chunk_tote,
1217               Language cur_lang, bool cur_unreliable,
1218               Language prior_lang, bool prior_unreliable) {
1219   if (cur_lang == prior_lang) {
1220     fprintf(f, "[]");
1221   } else {
1222     fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
1223   }
1224 }
1225
1226
1227 void PrintTopLang(Language top_lang) {
1228   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1229     fprintf(stderr, "[] ");
1230   } else {
1231     fprintf(stderr, "[%s] ", LanguageName(top_lang));
1232     prior_lang = top_lang;
1233   }
1234 }
1235
1236 void PrintTopLangSpeculative(Language top_lang) {
1237   fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1238   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1239     fprintf(stderr, "[] ");
1240   } else {
1241     fprintf(stderr, "[%s] ", LanguageName(top_lang));
1242     prior_lang = top_lang;
1243   }
1244   fprintf(stderr, "</span>\n");
1245 }
1246
1247 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1248                 const int* text_bytes, const bool* is_reliable) {
1249   fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
1250   if (language3[0] != UNKNOWN_LANGUAGE) {
1251     fprintf(f, "%s%s(%d%%)  ",
1252             LanguageName(language3[0]),
1253             *is_reliable ? "" : "*",
1254             percent3[0]);
1255   }
1256   if (language3[1] != UNKNOWN_LANGUAGE) {
1257     fprintf(f, "%s(%d%%)  ", LanguageName(language3[1]), percent3[1]);
1258   }
1259   if (language3[2] != UNKNOWN_LANGUAGE) {
1260     fprintf(f, "%s(%d%%)  ", LanguageName(language3[2]), percent3[2]);
1261   }
1262   fprintf(f, "%d bytes \n", *text_bytes);
1263
1264   fprintf(f, "<br>\n");
1265 }
1266
1267
1268 // Return internal probability score (sum) per 1024 bytes
1269 double GetNormalizedScore(Language lang, ULScript ulscript,
1270                           int bytecount, int score) {
1271   if (bytecount <= 0) {return 0.0;}
1272   return (score << 10) / bytecount;
1273 }
1274
1275 // Extract return values before fixups
1276 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
1277                     int* reliable_percent3, Language* language3, int* percent3,
1278                     double*  normalized_score3,
1279                     int* text_bytes, bool* is_reliable) {
1280   reliable_percent3[0] = 0;
1281   reliable_percent3[1] = 0;
1282   reliable_percent3[2] = 0;
1283   language3[0] = UNKNOWN_LANGUAGE;
1284   language3[1] = UNKNOWN_LANGUAGE;
1285   language3[2] = UNKNOWN_LANGUAGE;
1286   percent3[0] = 0;
1287   percent3[1] = 0;
1288   percent3[2] = 0;
1289   normalized_score3[0] = 0.0;
1290   normalized_score3[1] = 0.0;
1291   normalized_score3[2] = 0.0;
1292
1293   *text_bytes = total_text_bytes;
1294   *is_reliable = false;
1295
1296   int bytecount1 = 0;
1297   int bytecount2 = 0;
1298   int bytecount3 = 0;
1299
1300   int lang1 = doc_tote->Key(0);
1301   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1302     // We have a top language
1303     language3[0] = static_cast<Language>(lang1);
1304     bytecount1 = doc_tote->Value(0);
1305     int reli1 = doc_tote->Reliability(0);
1306     reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1);  // avoid zdiv
1307     normalized_score3[0] = GetNormalizedScore(language3[0],
1308                                                   ULScript_Common,
1309                                                   bytecount1,
1310                                                   doc_tote->Score(0));
1311   }
1312
1313   int lang2 = doc_tote->Key(1);
1314   if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
1315     language3[1] = static_cast<Language>(lang2);
1316     bytecount2 = doc_tote->Value(1);
1317     int reli2 = doc_tote->Reliability(1);
1318     reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1);  // avoid zdiv
1319     normalized_score3[1] = GetNormalizedScore(language3[1],
1320                                                   ULScript_Common,
1321                                                   bytecount2,
1322                                                   doc_tote->Score(1));
1323   }
1324
1325   int lang3 = doc_tote->Key(2);
1326   if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
1327     language3[2] = static_cast<Language>(lang3);
1328     bytecount3 = doc_tote->Value(2);
1329     int reli3 = doc_tote->Reliability(2);
1330     reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1);  // avoid zdiv
1331     normalized_score3[2] = GetNormalizedScore(language3[2],
1332                                                   ULScript_Common,
1333                                                   bytecount3,
1334                                                   doc_tote->Score(2));
1335   }
1336
1337   // Increase total bytes to sum (top 3) if low for some reason
1338   int total_bytecount12 = bytecount1 + bytecount2;
1339   int total_bytecount123 = total_bytecount12 + bytecount3;
1340   if (total_text_bytes < total_bytecount123) {
1341     total_text_bytes = total_bytecount123;
1342     *text_bytes = total_text_bytes;
1343   }
1344
1345   // Sum minus previous % gives better roundoff behavior than bytecount/total
1346   int total_text_bytes_div = maxint(1, total_text_bytes);    // Avoid zdiv
1347   percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1348   percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1349   percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1350   percent3[2] -= percent3[1];
1351   percent3[1] -= percent3[0];
1352
1353   // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1354   // Fix this explicitly
1355   if (percent3[1] < percent3[2]) {
1356     ++percent3[1];
1357     --percent3[2];
1358   }
1359   if (percent3[0] < percent3[1]) {
1360     ++percent3[0];
1361     --percent3[1];
1362   }
1363
1364   *text_bytes = total_text_bytes;
1365
1366   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1367     // We have a top language
1368     // Its reliability is overall result reliability
1369     int bytecount = doc_tote->Value(0);
1370     int reli = doc_tote->Reliability(0);
1371     int reliable_percent = reli / (bytecount ? bytecount : 1);  // avoid zdiv
1372     *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
1373   } else {
1374     // No top language at all. This can happen with zero text or 100% Klingon
1375     // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
1376     *is_reliable = false;
1377   }
1378
1379   // If ignore percent is too large, set unreliable.
1380   int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1381   if ((ignore_percent > kIgnoreMaxPercent)) {
1382     *is_reliable = false;
1383   }
1384 }
1385
1386 bool IsFIGS(Language lang) {
1387   if (lang == FRENCH) {return true;}
1388   if (lang == ITALIAN) {return true;}
1389   if (lang == GERMAN) {return true;}
1390   if (lang == SPANISH) {return true;}
1391   return false;
1392 }
1393
1394 bool IsEFIGS(Language lang) {
1395   if (lang == ENGLISH) {return true;}
1396   if (lang == FRENCH) {return true;}
1397   if (lang == ITALIAN) {return true;}
1398   if (lang == GERMAN) {return true;}
1399   if (lang == SPANISH) {return true;}
1400   return false;
1401 }
1402
// Minimum bytes of second-language text needed before it may override the
// first-place language; fewer bytes than this => no second language.
// Tier3 languages require more text than Tier1/Tier2 languages.
static const int kGoodSecondT1T2MinBytes = 15;
static const int kGoodSecondT3MinBytes = 128;
1407
1408 // Calculate a single summary language for the document, and its reliability.
1409 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1410 // This is the heart of matching human-rater perception.
1411 // reliable_percent3[] is currently unused
1412 //
1413 // Do not return Tier3 second language unless there are at least 128 bytes
1414 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
1415                      const int* reliable_percent3,
1416                      const Language* language3,
1417                      const int* percent3,
1418                      Language* summary_lang, bool* is_reliable,
1419                      bool FLAGS_cld2_html, bool FLAGS_cld2_quiet,
1420                      int flags) {
1421   // Vector of active languages; changes if we delete some
1422   int slot_count = 3;
1423   int active_slot[3] = {0, 1, 2};
1424
1425   int ignore_percent = 0;
1426   int return_percent = percent3[0];   // Default to top lang
1427   *summary_lang = language3[0];
1428   *is_reliable = true;
1429   if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1430
1431   // If any of top 3 is IGNORE, remove it and increment ignore_percent
1432   for (int i = 0; i < 3; ++i) {
1433     if (language3[i] == TG_UNKNOWN_LANGUAGE) {
1434       ignore_percent += percent3[i];
1435       // Move the rest up, leaving input vectors unchanged
1436       for (int j=i+1; j < 3; ++j) {
1437         active_slot[j - 1] = active_slot[j];
1438       }
1439       -- slot_count;
1440       // Logically remove Ignore from percentage-text calculation
1441       // (extra 1 in 101 avoids zdiv, biases slightly small)
1442       return_percent = (percent3[0] * 100) / (101 - ignore_percent);
1443       *summary_lang = language3[active_slot[0]];
1444       if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
1445     }
1446   }
1447
1448
1449   // If English and X, where X (not UNK) is big enough,
1450   // assume the English is boilerplate and return X.
1451   // Logically remove English from percentage-text calculation
1452   int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
1453   // Require more bytes of text for Tier3 languages
1454   int minbytesneeded = kGoodSecondT1T2MinBytes;
1455   int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
1456
1457   if ((language3[active_slot[0]] == ENGLISH) &&
1458       (language3[active_slot[1]] != ENGLISH) &&
1459       (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1460       (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
1461       (second_bytes >= minbytesneeded)) {
1462     ignore_percent += percent3[active_slot[0]];
1463     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1464     *summary_lang = language3[active_slot[1]];
1465     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1466
1467   // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
1468   // assume the FIGS is boilerplate and return X.
1469   // Logically remove FIGS from percentage-text calculation
1470   } else if (IsFIGS(language3[active_slot[0]]) &&
1471              !IsEFIGS(language3[active_slot[1]]) &&
1472              (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1473              (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
1474              (second_bytes >= minbytesneeded)) {
1475     ignore_percent += percent3[active_slot[0]];
1476     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1477     *summary_lang = language3[active_slot[1]];
1478     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1479
1480   // Else we are returning the first language, but want to improve its
1481   // return_percent if the second language should be ignored
1482   } else  if ((language3[active_slot[1]] == ENGLISH) &&
1483               (language3[active_slot[0]] != ENGLISH)) {
1484     ignore_percent += percent3[active_slot[1]];
1485     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1486   } else  if (IsFIGS(language3[active_slot[1]]) &&
1487               !IsEFIGS(language3[active_slot[0]])) {
1488     ignore_percent += percent3[active_slot[1]];
1489     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1490   }
1491
1492   // If return percent is too small (too many languages), return UNKNOWN
1493   if ((return_percent < kGoodFirstMinPercent) && !FlagBestEffort(flags)) {
1494     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1495       fprintf(stderr, "{Unreli %s %d%% percent too small} ",
1496               LanguageCode(*summary_lang), return_percent);
1497     }
1498     *summary_lang = UNKNOWN_LANGUAGE;
1499     *is_reliable = false;
1500   }
1501
1502   // If return percent is small, return language but set unreliable.
1503   if ((return_percent < kGoodFirstReliableMinPercent)) {
1504     *is_reliable = false;
1505   }
1506
1507   // If ignore percent is too large, set unreliable.
1508   ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1509   if ((ignore_percent > kIgnoreMaxPercent)) {
1510     *is_reliable = false;
1511   }
1512
1513   // If we removed all the active languages, return UNKNOWN
1514   if (slot_count == 0) {
1515     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1516       fprintf(stderr, "{Unreli %s no languages left} ",
1517               LanguageCode(*summary_lang));
1518     }
1519     *summary_lang = UNKNOWN_LANGUAGE;
1520     *is_reliable = false;
1521   }
1522 }
1523
1524 void AddLangPriorBoost(Language lang, uint32 langprob,
1525                        ScoringContext* scoringcontext) {
1526   // This is called 0..n times with language hints
1527   // but we don't know the script -- so boost either or both Latn, Othr.
1528
1529   if (IsLatnLanguage(lang)) {
1530     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
1531     int n = langprior_boost->n;
1532     langprior_boost->langprob[n] = langprob;
1533     langprior_boost->n = langprior_boost->wrap(n + 1);
1534   }
1535
1536   if (IsOthrLanguage(lang)) {
1537     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
1538     int n = langprior_boost->n;
1539     langprior_boost->langprob[n] = langprob;
1540     langprior_boost->n = langprior_boost->wrap(n + 1);
1541   }
1542
1543 }
1544
1545 void AddOneWhack(Language whacker_lang, Language whackee_lang,
1546                  ScoringContext* scoringcontext) {
1547   uint32 langprob = MakeLangProb(whackee_lang, 1);
1548   // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
1549   if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
1550     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
1551     int n = langprior_whack->n;
1552     langprior_whack->langprob[n] = langprob;
1553     langprior_whack->n = langprior_whack->wrap(n + 1);
1554   }
1555   if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
1556     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
1557     int n = langprior_whack->n;
1558     langprior_whack->langprob[n] = langprob;
1559     langprior_whack->n = langprior_whack->wrap(n + 1);
1560  }
1561 }
1562
1563 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
1564   // We do not in general want zh-Hans and zh-Hant to be close pairs,
1565   // but we do here.
1566   if (lang == CLD2::CHINESE) {
1567     AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
1568     return;
1569   }
1570   if (lang == CLD2::CHINESE_T) {
1571     AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
1572     return;
1573   }
1574
1575   int base_lang_set = LanguageCloseSet(lang);
1576   if (base_lang_set == 0) {return;}
1577   // TODO: add an explicit list of each set to avoid this 512-times loop
1578   for (int i = 0; i < kLanguageToPLangSize; ++i) {
1579     Language lang2 = static_cast<Language>(i);
1580     if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
1581       AddOneWhack(lang, lang2, scoringcontext);
1582     }
1583   }
1584 }
1585
1586
1587 void ApplyHints(const char* buffer,
1588                 int buffer_length,
1589                 bool is_plain_text,
1590                 const CLDHints* cld_hints,
1591                 ScoringContext* scoringcontext) {
1592   CLDLangPriors lang_priors;
1593   InitCLDLangPriors(&lang_priors);
1594
1595   // We now use lang= tags.
1596   // Last look, circa 2008 found only 15% of web pages with lang= tags and
1597   // many of those were wrong. Now (July 2011), we find 44% of web pages have
1598   // lang= tags, and most of them are correct. So we now give them substantial
1599   // weight in each chunk scored.
1600   if (!is_plain_text) {
1601     // Get any contained language tags in first n KB
1602     int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
1603     string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
1604                                            max_scan_bytes);
1605     SetCLDLangTagsHint(lang_tags, &lang_priors);
1606     if (scoringcontext->flags_cld2_html) {
1607       if (!lang_tags.empty()) {
1608         fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
1609                 lang_tags.c_str());
1610       }
1611     }
1612   }
1613
1614   if (cld_hints != NULL) {
1615     if ((cld_hints->content_language_hint != NULL) &&
1616         (cld_hints->content_language_hint[0] != '\0')) {
1617       SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
1618     }
1619
1620     // Input is from GetTLD(), already lowercased
1621     if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
1622       SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
1623     }
1624
1625     if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
1626       Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
1627       SetCLDEncodingHint(enc, &lang_priors);
1628     }
1629
1630     if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
1631       SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
1632     }
1633   }
1634
1635   // Keep no more than four different languages with hints
1636   TrimCLDLangPriors(4, &lang_priors);
1637
1638   if (scoringcontext->flags_cld2_html) {
1639     string print_temp = DumpCLDLangPriors(&lang_priors);
1640     if (!print_temp.empty()) {
1641       fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
1642               print_temp.c_str());
1643     }
1644   }
1645
1646   // Put boosts into ScoringContext
1647   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1648     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1649     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1650     if (qprob > 0) {
1651       uint32 langprob = MakeLangProb(lang, qprob);
1652       AddLangPriorBoost(lang, langprob, scoringcontext);
1653     }
1654   }
1655
1656   // Put whacks into scoring context
1657   // We do not in general want zh-Hans and zh-Hant to be close pairs,
1658   // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
1659   std::vector<int> close_set_count(kCloseSetSize + 1, 0);
1660
1661   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1662     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1663     ++close_set_count[LanguageCloseSet(lang)];
1664     if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
1665     if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
1666   }
1667
1668   // If a boost language is in a close set, force suppressing the others in
1669   // that set, if exactly one of the set is present
1670   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1671     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1672     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1673     if (qprob > 0) {
1674       int close_set = LanguageCloseSet(lang);
1675       if ((close_set > 0) && (close_set_count[close_set] == 1)) {
1676         AddCloseLangWhack(lang, scoringcontext);
1677       }
1678       if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
1679           (close_set_count[kCloseSetSize] == 1)) {
1680         AddCloseLangWhack(lang, scoringcontext);
1681       }
1682     }
1683   }
1684 }
1685
1686
1687 // Extend results to fully cover the [lo..hi) range
1688 void FinishResultVector(int lo, int hi, ResultChunkVector* vec) {
1689   if (vec == NULL) {return;}
1690   if (vec->size() == 0) {return;}
1691   ResultChunk* rc = &(*vec)[0];
1692   if (rc->offset > lo) {
1693     int diff = rc->offset - lo;
1694     rc->offset -= diff;
1695     rc->bytes += diff;
1696   }
1697   ResultChunk* rc2 = &(*vec)[vec->size() - 1];
1698   int rc2hi = rc2->offset + rc2->bytes;
1699   if (rc2hi < hi) {
1700     int diff = hi - rc2hi;
1701     rc2->bytes += diff;
1702   }
1703 }
1704
1705
1706 // Results language3/percent3/text_bytes must be exactly three items
1707 Language DetectLanguageSummaryV2(
1708                         const char* buffer,
1709                         int buffer_length,
1710                         bool is_plain_text,
1711                         const CLDHints* cld_hints,
1712                         bool allow_extended_lang,
1713                         int flags,
1714                         Language plus_one,
1715                         Language* language3,
1716                         int* percent3,
1717                         double* normalized_score3,
1718                         ResultChunkVector* resultchunkvector,
1719                         int* text_bytes,
1720                         bool* is_reliable) {
1721   language3[0] = UNKNOWN_LANGUAGE;
1722   language3[1] = UNKNOWN_LANGUAGE;
1723   language3[2] = UNKNOWN_LANGUAGE;
1724   percent3[0] = 0;
1725   percent3[1] = 0;
1726   percent3[2] = 0;
1727   normalized_score3[0] = 0.0;
1728   normalized_score3[1] = 0.0;
1729   normalized_score3[2] = 0.0;
1730   if (resultchunkvector != NULL) {
1731     resultchunkvector->clear();
1732   }
1733   *text_bytes = 0;
1734   *is_reliable = false;
1735
1736   if ((flags & kCLDFlagEcho) != 0) {
1737      string temp(buffer, buffer_length);
1738      if ((flags & kCLDFlagHtml) != 0) {
1739         fprintf(stderr, "CLD2[%d] '%s'<br>\n",
1740                 buffer_length, GetHtmlEscapedText(temp).c_str());
1741      } else {
1742         fprintf(stderr, "CLD2[%d] '%s'\n",
1743                 buffer_length, GetPlainEscapedText(temp).c_str());
1744      }
1745   }
1746
1747 #ifdef CLD2_DYNAMIC_MODE
1748   // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
1749   // hasn't been loaded yet. This is the only sane thing we can do, as there
1750   // are no scoring tables to consult.
1751   bool dataLoaded = isDataLoaded();
1752   if ((flags & kCLDFlagVerbose) != 0) {
1753     fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
1754   }
1755   if (!dataLoaded) {
1756     return UNKNOWN_LANGUAGE;
1757   }
1758 #endif
1759
1760   // Exit now if no text
1761   if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
1762   if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
1763
1764   // Document totals
1765   DocTote doc_tote;   // Reliability = 0..100
1766
1767   // ScoringContext carries state across scriptspans
1768   ScoringContext scoringcontext;
1769   scoringcontext.debug_file = stderr;
1770   scoringcontext.flags_cld2_score_as_quads =
1771     ((flags & kCLDFlagScoreAsQuads) != 0);
1772   scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
1773   scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
1774   scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
1775   scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
1776   scoringcontext.ulscript = ULScript_Common;
1777   scoringcontext.scoringtables = &kScoringtables;
1778   scoringcontext.scanner = NULL;
1779   scoringcontext.init();            // Clear the internal memory arrays
1780
1781   // Now thread safe.
1782   bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
1783   bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
1784
1785   ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
1786
1787   // Four individual script totals, Latin, Han, other2, other3
1788   int next_other_tote = 2;
1789   int tote_num = 0;
1790
1791   // Four totes for up to four different scripts pending at once
1792   Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other
1793   bool tote_seen[4] = {false, false, false, false};
1794   int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk
1795   ULScript tote_script[4] =
1796     {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
1797
1798   // Loop through text spans in a single script
1799   ScriptScanner ss(buffer, buffer_length, is_plain_text);
1800   LangSpan scriptspan;
1801
1802   scoringcontext.scanner = &ss;
1803
1804   scriptspan.text = NULL;
1805   scriptspan.text_bytes = 0;
1806   scriptspan.offset = 0;
1807   scriptspan.ulscript = ULScript_Common;
1808   scriptspan.lang = UNKNOWN_LANGUAGE;
1809
1810   int total_text_bytes = 0;
1811   int textlimit = FLAGS_cld_textlimit << 10;    // in KB
1812   if (textlimit == 0) {textlimit = 0x7fffffff;}
1813
1814   int advance_by = 2;                   // Advance 2 bytes
1815   int advance_limit = textlimit >> 3;   // For first 1/8 of max document
1816
1817   int initial_word_span = kDefaultWordSpan;
1818   if (FLAGS_cld_forcewords) {
1819     initial_word_span = kReallyBigWordSpan;
1820   }
1821
1822   // Pick up chunk sizes
1823   // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
1824   // Sanity check -- force into a reasonable range
1825   int chunksizequads = FLAGS_cld_smoothwidth;
1826   chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
1827                                kMaxChunkSizeQuads);
1828   int chunksizeunis = (chunksizequads * 5) >> 1;
1829
1830   // Varying short-span limit doesn't work well -- skips too much beyond 20KB
1831   // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
1832   int spantooshortlimit = kShortSpanThresh;
1833
1834   // For debugging only. Not thread-safe
1835   prior_lang = UNKNOWN_LANGUAGE;
1836   prior_unreliable = false;
1837
1838   // Allocate full-document prediction table for finding repeating words
1839   int hash = 0;
1840   int* predict_tbl = new int[kPredictionTableSize];
1841   if (FlagRepeats(flags)) {
1842     memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1843   }
1844
1845
1846
1847   // Loop through scriptspans accumulating number of text bytes in each language
1848   while (ss.GetOneScriptSpanLower(&scriptspan)) {
1849     ULScript ulscript = scriptspan.ulscript;
1850
1851     // Squeeze out big chunks of text span if asked to
1852     if (FlagSqueeze(flags)) {
1853       // Remove repetitive or mostly-spaces chunks
1854       int newlen;
1855       int chunksize = 0;    // Use the default
1856       if (resultchunkvector != NULL) {
1857          newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
1858                                                scriptspan.text_bytes,
1859                                                chunksize);
1860       } else {
1861          newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
1862                                       chunksize);
1863       }
1864       scriptspan.text_bytes = newlen;
1865     } else {
1866       // Check now and then to see if we should be squeezing
1867       if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
1868           !FlagFinish(flags)) {
1869         // fprintf(stderr, "CheapSqueezeTriggerTest, "
1870         //                 "first %d bytes of %d (>%d/2)<br>\n",
1871         //         kCheapSqueezeTestLen,
1872         //         scriptspan.text_bytes,
1873         //         kCheapSqueezeTestThresh);
1874
1875         if (CheapSqueezeTriggerTest(scriptspan.text,
1876                                       scriptspan.text_bytes,
1877                                       kCheapSqueezeTestLen)) {
1878           // Recursive call with big-chunk squeezing set
1879           if (FLAGS_cld2_html || FLAGS_dbgscore) {
1880             fprintf(stderr,
1881                     "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
1882                     total_text_bytes);
1883           }
1884           // Deallocate full-document prediction table
1885           delete[] predict_tbl;
1886
1887           return DetectLanguageSummaryV2(
1888                             buffer,
1889                             buffer_length,
1890                             is_plain_text,
1891                             cld_hints,
1892                             allow_extended_lang,
1893                             flags | kCLDFlagSqueeze,
1894                             plus_one,
1895                             language3,
1896                             percent3,
1897                             normalized_score3,
1898                             resultchunkvector,
1899                             text_bytes,
1900                             is_reliable);
1901         }
1902       }
1903     }
1904
1905     // Remove repetitive words if asked to
1906     if (FlagRepeats(flags)) {
1907       // Remove repetitive words
1908       int newlen;
1909       if (resultchunkvector != NULL) {
1910         newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
1911                                                scriptspan.text_bytes,
1912                                                &hash, predict_tbl);
1913       } else {
1914         newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
1915                                       &hash, predict_tbl);
1916       }
1917       scriptspan.text_bytes = newlen;
1918     }
1919
1920     // Scoring depends on scriptspan buffer ALWAYS having
1921     // leading space and off-the-end space space space NUL,
1922     // DCHECK(scriptspan.text[0] == ' ');
1923     // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
1924     // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
1925     // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
1926     // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
1927
1928     // The real scoring
1929     // Accumulate directly into the document total, or accmulate in one of four
1930     // chunk totals. The purpose of the multiple chunk totals is to piece
1931     // together short choppy pieces of text in alternating scripts. One total is
1932     // dedicated to Latin text, one to Han text, and the other two are dynamicly
1933     // assigned.
1934
1935     scoringcontext.ulscript = scriptspan.ulscript;
1936     // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
1937
1938     ScoreOneScriptSpan(scriptspan,
1939                        &scoringcontext,
1940                        &doc_tote,
1941                        resultchunkvector);
1942
1943     total_text_bytes += scriptspan.text_bytes;
1944   }     // End while (ss.GetOneScriptSpanLower())
1945
1946   // Deallocate full-document prediction table
1947   delete[] predict_tbl;
1948
1949   if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1950     // If no forced <cr>, put one in front of dump
1951     if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
1952     doc_tote.Dump(stderr);
1953   }
1954
1955
1956   // If extended langauges are disallowed, remove them here
1957   if (!allow_extended_lang) {
1958     RemoveExtendedLanguages(&doc_tote);
1959   }
1960
1961   // Force close pairs to one or the other
1962   // If given, also update resultchunkvector
1963   RefineScoredClosePairs(&doc_tote, resultchunkvector,
1964                          FLAGS_cld2_html, FLAGS_cld2_quiet);
1965
1966
1967   // Calculate return results
1968   // Find top three byte counts in tote heap
1969   int reliable_percent3[3];
1970
1971   // Cannot use Add, etc. after sorting
1972   doc_tote.Sort(3);
1973
1974   ExtractLangEtc(&doc_tote, total_text_bytes,
1975                  reliable_percent3, language3, percent3, normalized_score3,
1976                  text_bytes, is_reliable);
1977
1978   bool have_good_answer = false;
1979   if (FlagFinish(flags)) {
1980     // Force a result
1981     have_good_answer = true;
1982   } else if (total_text_bytes <= kShortTextThresh) {
1983     // Don't recurse on short text -- we already did word scores
1984     have_good_answer = true;
1985   } else if (*is_reliable &&
1986              (percent3[0] >= kGoodLang1Percent)) {
1987     have_good_answer = true;
1988   } else if (*is_reliable &&
1989              ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
1990     have_good_answer = true;
1991   }
1992
1993
1994   if (have_good_answer) {
1995     // This is the real, non-recursive return
1996
1997     // Move bytes for unreliable langs to another lang or UNKNOWN
1998     if (!FlagBestEffort(flags)) {
1999       RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
2000     }
2001
2002     // Redo the result extraction after the removal above
2003     doc_tote.Sort(3);
2004     ExtractLangEtc(&doc_tote, total_text_bytes,
2005                    reliable_percent3, language3, percent3, normalized_score3,
2006                    text_bytes, is_reliable);
2007
2008     Language summary_lang;
2009     CalcSummaryLang(&doc_tote, total_text_bytes,
2010                     reliable_percent3, language3, percent3,
2011                     &summary_lang, is_reliable,
2012                     FLAGS_cld2_html, FLAGS_cld2_quiet, flags);
2013
2014     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
2015       for (int i = 0; i < 3; ++i) {
2016         if (language3[i] != UNKNOWN_LANGUAGE) {
2017           fprintf(stderr, "%s.%dR(%d%%) ",
2018                   LanguageCode(language3[i]),
2019                   reliable_percent3[i],
2020                   percent3[i]);
2021         }
2022       }
2023
2024       fprintf(stderr, "%d bytes ", total_text_bytes);
2025       fprintf(stderr, "= %s%c ",
2026               LanguageName(summary_lang), *is_reliable ? ' ' : '*');
2027       fprintf(stderr, "<br><br>\n");
2028     }
2029
2030     // Slightly condensed if quiet
2031     if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
2032       fprintf(stderr, "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ");
2033       for (int i = 0; i < 3; ++i) {
2034         if (language3[i] != UNKNOWN_LANGUAGE) {
2035           fprintf(stderr, "&nbsp;&nbsp;%s %d%% ",
2036                   LanguageCode(language3[i]),
2037                   percent3[i]);
2038         }
2039       }
2040       fprintf(stderr, "= %s%c ",
2041               LanguageName(summary_lang), *is_reliable ? ' ' : '*');
2042       fprintf(stderr, "<br>\n");
2043     }
2044
2045     // Extend results to fully cover the input buffer
2046     FinishResultVector(0, buffer_length, resultchunkvector);
2047
2048     return summary_lang;
2049   }
2050
2051   // Not a good answer -- do recursive call to refine
2052   if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
2053     // This is what we hope to improve on in the recursive call, if any
2054     PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
2055   }
2056
2057   // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
2058   // For this purpose, we treate "Ignore" as top40
2059   Language new_plus_one = UNKNOWN_LANGUAGE;
2060
2061   if (total_text_bytes < kShortTextThresh) {
2062       // Short text: Recursive call with top40 and short set
2063       if (FLAGS_cld2_html || FLAGS_dbgscore) {
2064         fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
2065                 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
2066                 total_text_bytes);
2067       }
2068       return DetectLanguageSummaryV2(
2069                         buffer,
2070                         buffer_length,
2071                         is_plain_text,
2072                         cld_hints,
2073                         allow_extended_lang,
2074                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
2075                           kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
2076                         new_plus_one,
2077                         language3,
2078                         percent3,
2079                         normalized_score3,
2080                         resultchunkvector,
2081                         text_bytes,
2082                         is_reliable);
2083   }
2084
2085   // Longer text: Recursive call with top40 set
2086   if (FLAGS_cld2_html || FLAGS_dbgscore) {
2087     fprintf(stderr,
2088             "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2089             total_text_bytes);
2090   }
2091   return DetectLanguageSummaryV2(
2092                         buffer,
2093                         buffer_length,
2094                         is_plain_text,
2095                         cld_hints,
2096                         allow_extended_lang,
2097                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
2098                           kCLDFlagFinish,
2099                         new_plus_one,
2100                         language3,
2101                         percent3,
2102                         normalized_score3,
2103                         resultchunkvector,
2104                         text_bytes,
2105                         is_reliable);
2106 }
2107
2108
2109 // For debugging and wrappers. Not thread safe.
2110 static char temp_detectlanguageversion[32];
2111
2112 // Return version text string
2113 // String is "code_version - data_build_date"
2114 const char* DetectLanguageVersion() {
2115   if (kScoringtables.quadgram_obj == NULL) {return "";}
2116   sprintf(temp_detectlanguageversion,
2117           "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
2118   return temp_detectlanguageversion;
2119 }
2120
2121
2122 }       // End namespace CLD2