src/third_party/cld/encodings/compact_lang_det/cldutil.cc

   1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <string>
   6 #include "encodings/compact_lang_det/cldutil.h"
   7
   8 #include "base/basictypes.h"
   9 #include "encodings/compact_lang_det/cldutil_dbg.h"
  10 #include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
  11 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
  12 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
  13 #include "encodings/compact_lang_det/win/cld_logging.h"
  14 #include "encodings/compact_lang_det/win/cld_unilib.h"
  15 #include "encodings/compact_lang_det/win/cld_utf.h"
  16 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
  17
  18 // Runtime routines for hashing, looking up, and scoring
  19 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
  20 // Unigrams and bigrams are for CJK languages only, including simplified/
  21 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
  22 // Zhuang Han characters. Surrounding spaces are not considered.
  23 // Quadgrams and octagrams for for non-CJK and include two bits indicating
  24 // preceding and trailing spaces (word boundaries).
  25
  26
  27 // Indicator bits for leading/trailing space around quad/octagram
  28 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
  29 // 1-, 2-, or 3-bytes each.
  30 static const uint32 kPreSpaceIndicator =  0x00004444;
  31 static const uint32 kPostSpaceIndicator = 0x44440000;
  32
  33 // Little-endian masks for 0..24 bytes picked up as uint32's
  34 static const uint32 kWordMask0[4] = {
  35   0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
  36 };
  37
  38 static const int kMinCJKUTF8CharBytes = 3;
  39
  40 static const int kMinGramCount = 3;
  41 static const int kMaxGramCount = 16;
  42
  43
  44
  45
  46 // Routines to access a hash table of <key:wordhash, value:probs> pairs
  47 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
  48 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
  49 // bucket subscript.
  50 // Probs is a packed: three languages plus a subscript for probability table
  51 // Buckets have all the keys together, then all the values.Key array never
  52 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
  53 // Match case may sometimes take an additional cache miss on value access.
  54 //
  55 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
  56 // byte buckets with single cache miss.
  57 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
  58 //------------------------------------------------------------------------------
  59
  60
  61 //------------------------------------------------------------------------------
  62 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
  63 //------------------------------------------------------------------------------
  64
  65 // Design principles for these hash functions
  66 // - Few operations
  67 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
  68 //   Latin script expect 1- and 2-byte mixtures.
  69 // - Last byte of each character has about 5 bits of information
  70 // - Spread good bits around so they can interact in at least two ways
  71 //   with other characters
  72 // - Use add for additional mixing thorugh carries
  73
  74 // CJK Three-byte bigram
  75 //   ....dddd..cccccc..bbbbbb....aaaa
  76 //   ..................ffffff..eeeeee
  77 // make
  78 //   ....dddd..cccccc..bbbbbb....aaaa
  79 //   000....dddd..cccccc..bbbbbb....a
  80 //   ..................ffffff..eeeeee
  81 //   ffffff..eeeeee000000000000000000
  82 //
  83 // CJK Four-byte bigram
  84 //   ..dddddd..cccccc....bbbb....aaaa
  85 //   ..hhhhhh..gggggg....ffff....eeee
  86 // make
  87 //   ..dddddd..cccccc....bbbb....aaaa
  88 //   000..dddddd..cccccc....bbbb....a
  89 //   ..hhhhhh..gggggg....ffff....eeee
  90 //   ..ffff....eeee000000000000000000
  91
  92 // BIGRAM
  93 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
  94 // OVERSHOOTS up to 3 bytes
  95 // For runtime use of tables
  96 uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
  97   if (bytecount == 0) {
  98     return 0;
  99   }
 100   uint32 word0, word1;
 101   if (bytecount <= 4) {
 102     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
 103     word0 = word0 ^ (word0 >> 3);
 104     return word0;
 105   }
 106   // Else do 8 bytes
 107   word0 = UnalignedLoad32(word_ptr);
 108   word0 = word0 ^ (word0 >> 3);
 109   word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
 110   word1 = word1 ^ (word1 << 18);
 111   return word0 + word1;
 112 }
 113
 114 //
 115 // Ascii-7 One-byte chars
 116 //   ...ddddd...ccccc...bbbbb...aaaaa
 117 // make
 118 //   ...ddddd...ccccc...bbbbb...aaaaa
 119 //   000...ddddd...ccccc...bbbbb...aa
 120 //
 121 // Latin 1- and 2-byte chars
 122 //   ...ddddd...ccccc...bbbbb...aaaaa
 123 //   ...................fffff...eeeee
 124 // make
 125 //   ...ddddd...ccccc...bbbbb...aaaaa
 126 //   000...ddddd...ccccc...bbbbb...aa
 127 //   ...................fffff...eeeee
 128 //   ...............fffff...eeeee0000
 129 //
 130 // Non-CJK Two-byte chars
 131 //   ...ddddd...........bbbbb........
 132 //   ...hhhhh...........fffff........
 133 // make
 134 //   ...ddddd...........bbbbb........
 135 //   000...ddddd...........bbbbb.....
 136 //   ...hhhhh...........fffff........
 137 //   hhhh...........fffff........0000
 138 //
 139 // Non-CJK Three-byte chars
 140 //   ...........ccccc................
 141 //   ...................fffff........
 142 //   ...lllll...................iiiii
 143 // make
 144 //   ...........ccccc................
 145 //   000...........ccccc.............
 146 //   ...................fffff........
 147 //   ...............fffff........0000
 148 //   ...lllll...................iiiii
 149 //   .lllll...................iiiii00
 150 //
 151
 152 // QUADGRAM
 153 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
 154 // OVERSHOOTS up to 3 bytes
 155 // For runtime use of tables
 156 uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
 157   uint32 word0, word1, word2;
 158   if (bytecount <= 4) {
 159     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
 160     word0 = word0 ^ (word0 >> 3);
 161     return word0 ^ prepost;
 162   } else if (bytecount <= 8) {
 163     word0 = UnalignedLoad32(word_ptr);
 164     word0 = word0 ^ (word0 >> 3);
 165     word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
 166     word1 = word1 ^ (word1 << 4);
 167     return (word0 ^ prepost) + word1;
 168   }
 169   // else do 12 bytes
 170   word0 = UnalignedLoad32(word_ptr);
 171   word0 = word0 ^ (word0 >> 3);
 172   word1 = UnalignedLoad32(word_ptr + 4);
 173   word1 = word1 ^ (word1 << 4);
 174   word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
 175   word2 = word2 ^ (word2 << 2);
 176   return (word0 ^ prepost) + word1 + word2;
 177 }
 178
 179
 180 // QUADGRAM wrapper with surrounding spaces
 181 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
 182 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
 183 // For runtime use of tables
 184 uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
 185   if (bytecount == 0) {
 186     return 0;
 187   }
 188   uint32 prepost = 0;
 189   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
 190   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
 191   return QuadHashV25Mix(word_ptr, bytecount, prepost);
 192 }
 193
 194 // QUADGRAM wrapper with surrounding underscores (offline use)
 195 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
 196 // OVERSHOOTS up to 3 bytes
 197 // For offline construction of tables
 198 uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
 199   if (bytecount == 0) {
 200     return 0;
 201   }
 202   const char* local_word_ptr = word_ptr;
 203   int local_bytecount = bytecount;
 204   uint32 prepost = 0;
 205   if (local_word_ptr[0] == '_') {
 206     prepost |= kPreSpaceIndicator;
 207     ++local_word_ptr;
 208     --local_bytecount;
 209   }
 210   if (local_word_ptr[local_bytecount - 1] == '_') {
 211     prepost |= kPostSpaceIndicator;
 212     --local_bytecount;
 213   }
 214   return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
 215 }
 216
 217
 218 // OCTAGRAM
 219 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
 220 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
 221 //
 222 // The low 32 bits follow the pattern from above, tuned to different scripts
 223 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
 224 // For runtime use of tables V3
 225 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
 226   uint64 word0;
 227   uint64 word1;
 228   uint64 sum;
 229
 230   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
 231   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
 232   switch ((bytecount - 1) >> 2) {
 233   case 0:       // 1..4 bytes
 234     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
 235     sum = word0;
 236     word0 = word0 ^ (word0 >> 3);
 237     break;
 238   case 1:       // 5..8 bytes
 239     word0 = UnalignedLoad32(word_ptr);
 240     sum = word0;
 241     word0 = word0 ^ (word0 >> 3);
 242     word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
 243     sum += word1;
 244     word1 = word1 ^ (word1 << 4);
 245     word0 += word1;
 246     break;
 247   case 2:       // 9..12 bytes
 248     word0 = UnalignedLoad32(word_ptr);
 249     sum = word0;
 250     word0 = word0 ^ (word0 >> 3);
 251     word1 = UnalignedLoad32(word_ptr + 4);
 252     sum += word1;
 253     word1 = word1 ^ (word1 << 4);
 254     word0 += word1;
 255     word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
 256     sum += word1;
 257     word1 = word1 ^ (word1 << 2);
 258     word0 += word1;
 259     break;
 260   case 3:       // 13..16 bytes
 261     word0 = UnalignedLoad32(word_ptr);
 262     sum = word0;
 263     word0 = word0 ^ (word0 >> 3);
 264     word1 = UnalignedLoad32(word_ptr + 4);
 265     sum += word1;
 266     word1 = word1 ^ (word1 << 4);
 267     word0 += word1;
 268     word1 = UnalignedLoad32(word_ptr + 8);
 269     sum += word1;
 270     word1 = word1 ^ (word1 << 2);
 271     word0 += word1;
 272     word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3];
 273     sum += word1;
 274     word1 = word1 ^ (word1 >> 8);
 275     word0 += word1;
 276     break;
 277   case 4:       // 17..20 bytes
 278     word0 = UnalignedLoad32(word_ptr);
 279     sum = word0;
 280     word0 = word0 ^ (word0 >> 3);
 281     word1 = UnalignedLoad32(word_ptr + 4);
 282     sum += word1;
 283     word1 = word1 ^ (word1 << 4);
 284     word0 += word1;
 285     word1 = UnalignedLoad32(word_ptr + 8);
 286     sum += word1;
 287     word1 = word1 ^ (word1 << 2);
 288     word0 += word1;
 289     word1 = UnalignedLoad32(word_ptr + 12);
 290     sum += word1;
 291     word1 = word1 ^ (word1 >> 8);
 292     word0 += word1;
 293     word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3];
 294     sum += word1;
 295     word1 = word1 ^ (word1 >> 4);
 296     word0 += word1;
 297     break;
 298   default:      // 21..24 bytes and higher (ignores beyond 24)
 299     word0 = UnalignedLoad32(&word_ptr);
 300     sum = word0;
 301     word0 = word0 ^ (word0 >> 3);
 302     word1 = UnalignedLoad32(word_ptr + 4);
 303     sum += word1;
 304     word1 = word1 ^ (word1 << 4);
 305     word0 += word1;
 306     word1 = UnalignedLoad32(word_ptr + 8);
 307     sum += word1;
 308     word1 = word1 ^ (word1 << 2);
 309     word0 += word1;
 310     word1 = UnalignedLoad32(word_ptr + 12);
 311     sum += word1;
 312     word1 = word1 ^ (word1 >> 8);
 313     word0 += word1;
 314     word1 = UnalignedLoad32(word_ptr + 16);
 315     sum += word1;
 316     word1 = word1 ^ (word1 >> 4);
 317     word0 += word1;
 318     word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3];
 319     sum += word1;
 320     word1 = word1 ^ (word1 >> 6);
 321     word0 += word1;
 322     break;
 323   }
 324
 325   sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
 326   sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
 327   sum = (sum & 0xff) << 32;
 328   return (word0 ^ prepost) + sum;
 329 }
 330
 331 // OCTAGRAM wrapper with surrounding spaces
 332 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
 333 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
 334 //
 335 // The low 32 bits follow the pattern from above, tuned to different scripts
 336 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
 337 // For runtime use of tables V3
 338 uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
 339   if (bytecount == 0) {
 340     return 0;
 341   }
 342   uint64 prepost = 0;
 343   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
 344   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
 345   return OctaHash40Mix(word_ptr, bytecount, prepost);
 346 }
 347
 348
 349 // OCTAGRAM wrapper with surrounding underscores (offline use)
 350 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
 351 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
 352 //
 353 // The low 32 bits follow the pattern from above, tuned to different scripts
 354 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
 355 // For offline construction of tables
 356 uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
 357   if (bytecount == 0) {
 358     return 0;
 359   }
 360   const char* local_word_ptr = word_ptr;
 361   int local_bytecount = bytecount;
 362   uint64 prepost = 0;
 363   if (local_word_ptr[0] == '_') {
 364     prepost |= kPreSpaceIndicator;
 365     ++local_word_ptr;
 366     --local_bytecount;
 367   }
 368   if (local_word_ptr[local_bytecount - 1] == '_') {
 369     prepost |= kPostSpaceIndicator;
 370     --local_bytecount;
 371   }
 372   return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
 373 }
 374
 375
 376
 377
 378 //------------------------------------------------------------------------------
 379 // Scoring single groups of letters
 380 //------------------------------------------------------------------------------
 381
 382 // UNIGRAM score one => tote
 383 // Input: 1-byte entry of subscript into unigram probs, plus
 384 //  an accumulator tote.
 385 // Output: running sums in tote updated
 386 void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
 387   tote->AddGram();
 388   const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
 389   if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
 390   if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
 391   if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
 392   if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
 393   if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
 394   if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
 395 }
 396
 397 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
 398 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
 399 //  an accumulator tote. (language 0 means unused entry)
 400 // Output: running sums in tote updated
 401 void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
 402   tote->AddGram();
 403   uint8 prob123 = (probs >> 0) & 0xff;
 404   const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
 405
 406   uint8 top1 = (probs >> 8) & 0xff;
 407   if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
 408   uint8 top2 = (probs >> 16) & 0xff;
 409   if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
 410   uint8 top3 = (probs >> 24) & 0xff;
 411   if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
 412 }
 413
 414
 415 //------------------------------------------------------------------------------
 416 // Routines to accumulate probabilities
 417 //------------------------------------------------------------------------------
 418
 419
 420 // UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
 421 // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
 422 // Score up to n unigrams, returning number of bytes consumed
 423 // Updates tote_grams
 424 int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
 425                       const char* isrc, int srclen, int advance_by,
 426                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
 427   const char* src = isrc;
 428   if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
 429
 430   // Property-based CJK unigram lookup
 431   if (src[0] == ' ') {++src; --srclen;}
 432
 433   const uint8* usrc = reinterpret_cast<const uint8*>(src);
 434   int usrclen = srclen;
 435
 436   while (usrclen > 0) {
 437     int len = kAdvanceOneChar[usrc[0]];
 438     // Look up property of one UTF-8 character and advance over it
 439     // Return 0 if input length is zero
 440     // Return 0 and advance one byte if input is ill-formed
 441
 442     int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
 443
 444     if (FLAGS_dbglookup) {
 445       DbgUniTermToStderr(propval, usrc, len);
 446     }
 447
 448     if (propval > 0) {
 449       ProcessProbV25UniTote(propval, chunk_tote);
 450       ++(*tote_grams);
 451       if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
 452     }
 453
 454     // Advance by 1/2/4/8 characters (half of quad advance)
 455     if (advance_by == 2) {
 456       // Already advanced by 1
 457     } else if (advance_by == 4) {
 458       // Advance by 2 chars total, if not at end
 459       if (UTFmax <= usrclen) {
 460         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 461       }
 462     } else if (advance_by == 8) {
 463       // Advance by 4 chars total, if not at end
 464       if ((UTFmax * 3) <= usrclen) {
 465         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 466         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 467         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 468       }
 469     } else {
 470       // Advance by 8 chars total, if not at end
 471       if ((UTFmax * 7) <= usrclen) {
 472         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 473         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 474         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 475         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 476         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 477         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 478         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
 479       }
 480     }
 481     DCHECK(usrclen >= 0);
 482
 483     if (*tote_grams >= gram_limit) {
 484       break;
 485     }
 486   }
 487   if (FLAGS_dbgscore) {
 488     // With advance_by>2, we consume more input to get the same number of quads
 489     int len = src - isrc;
 490     DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
 491     DbgScoreFlush();
 492   }
 493
 494   int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
 495   return consumed2;
 496 }
 497
 498
 499 // BIGRAM, using hash table, always advancing by 1 char
 500 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
 501 // Score all bigrams in isrc, using languages that have bigrams (CJK)
 502 // Return number of bigrams that hit in the hash table
 503 int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
 504                          const char* isrc, int srclen, Tote* chunk_tote) {
 505   int hit_count = 0;
 506   const char* src = isrc;
 507
 508   // Hashtable-based CJK bigram lookup
 509   const uint8* usrc = reinterpret_cast<const uint8*>(src);
 510   const uint8* usrclimit1 = usrc + srclen - UTFmax;
 511   if (FLAGS_dbgscore) {
 512     fprintf(stderr, "  " );
 513   }
 514
 515   while (usrc < usrclimit1) {
 516     int len = kAdvanceOneChar[usrc[0]];
 517     int len2 = kAdvanceOneChar[usrc[len]] + len;
 518
 519     if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
 520       // Lookup and score this bigram
 521       // Always ignore pre/post spaces
 522       uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
 523       uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
 524       // Now go indirect on the subscript
 525       probs = bigram_obj->kCLDTableInd[probs &
 526         ~bigram_obj->kCLDTableKeyMask];
 527
 528       // Process the bigram
 529       if (FLAGS_dbglookup) {
 530         const char* ssrc = reinterpret_cast<const char*>(usrc);
 531         DbgBiTermToStderr(bihash, probs, ssrc, len2);
 532         DbgScoreRecord(NULL, probs, len2);
 533       } else if (FLAGS_dbgscore && (probs != 0)) {
 534         const char* ssrc = reinterpret_cast<const char*>(usrc);
 535         DbgScoreRecord(NULL, probs, len2);
 536         string temp(ssrc, len2);
 537         fprintf(stderr, "%s ", temp.c_str());
 538       }
 539
 540       if (probs != 0) {
 541         ProcessProbV25Tote(probs, chunk_tote);
 542         ++hit_count;
 543       }
 544     }
 545     usrc += len;  // Advance by one char
 546   }
 547
 548   if (FLAGS_dbgscore) {
 549     fprintf(stderr, "[%d bigrams scored]\n", hit_count);
 550     DbgScoreState();
 551   }
 552   return hit_count;
 553 }
 554
 555
 556
 557 // QUADGRAM, using hash table, advancing by 2/4/8/16 chars
 558 // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
 559 // Score up to n quadgrams, returning number of bytes consumed
 560 // Updates tote_grams
 561 int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
 562                        const char* isrc, int srclen, int advance_by,
 563                        int* tote_grams, int gram_limit, Tote* chunk_tote) {
 564   const char* src = isrc;
 565   const char* srclimit = src + srclen;
 566   // Limit is end, which has extra 20 20 20 00 past len
 567   const char* srclimit7 = src + srclen - (UTFmax * 7);
 568   const char* srclimit15 = src + srclen - (UTFmax * 15);
 569
 570   if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
 571
 572   // Run a little cache of last hits to catch overly-repetitive "text"
 573   int next_prior = 0;
 574   uint32 prior_quads[2] = {0, 0};
 575
 576   // Visit all quadgrams
 577   if (src[0] == ' ') {++src;}
 578   while (src < srclimit) {
 579     // Find one quadgram
 580     const char* src_end = src;
 581     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 582     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 583     const char* src_mid = src_end;
 584     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 585     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 586     int len = src_end - src;
 587
 588     // Lookup and score this quadgram
 589     uint32 quadhash = QuadHashV25(src, len);
 590     uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
 591     // Now go indirect on the subscript
 592     probs = quadgram_obj->kCLDTableInd[probs &
 593       ~quadgram_obj->kCLDTableKeyMask];
 594
 595     // Process the quadgram
 596     if (FLAGS_dbglookup) {
 597       DbgQuadTermToStderr(quadhash, probs, src, len);
 598     }
 599     if (probs != 0) {
 600       // Filter out recent repeats. If this works out, use in the other lookups
 601       if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
 602         prior_quads[next_prior] = quadhash;
 603         next_prior = (next_prior + 1) & 1;
 604         ProcessProbV25Tote(probs, chunk_tote);
 605         ++(*tote_grams);
 606         if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
 607       }
 608     }
 609
 610     // Advance all the way past word if at end-of-word
 611     if (src_end[0] == ' ') {
 612       src_mid = src_end;
 613     }
 614
 615     // Advance by 2/4/8/16 characters
 616     if (advance_by == 2) {
 617       src = src_mid;
 618     } else if (advance_by == 4) {
 619       src = src_end;
 620     } else if (advance_by == 8) {
 621       // Advance by 8 chars total (4 more), if not at end
 622       if (src < srclimit7) {
 623         src_end += kAdvanceOneChar[(uint8)src_end[0]];
 624         src_end += kAdvanceOneChar[(uint8)src_end[0]];
 625         src_end += kAdvanceOneChar[(uint8)src_end[0]];
 626         src_end += kAdvanceOneChar[(uint8)src_end[0]];
 627       }
 628       src = src_end;
 629     } else {
 630       // Advance by 16 chars total (12 more), if not at end
 631       if (src < srclimit15) {
 632         // Advance by ~16 chars by adding 3 * current bytelen
 633         int fourcharlen = src_end - src;
 634         src = src_end + (3 * fourcharlen);
 635         // Advance a bit more if mid-character
 636         src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
 637         src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
 638       } else {
 639         src = src_end;
 640       }
 641     }
 642     DCHECK(src < srclimit);
 643     src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
 644
 645     if (*tote_grams >= gram_limit) {
 646       break;
 647     }
 648   }
 649
 650   if (FLAGS_dbgscore) {
 651     // With advance_by>2, we consume more input to get the same number of quads
 652     int len = src - isrc;
 653     DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
 654     DbgScoreFlush();
 655   }
 656
 657   int consumed = src - isrc;
 658
 659   // If advancing by more than 2, src may have overshot srclimit
 660   if (consumed > srclen) {
 661     consumed = srclen;
 662   }
 663
 664   return consumed;
 665 }
 666
 667
 668 // OCTAGRAM, using hash table, always advancing by 1 word
 669 // Caller supplies table, such as &kLongWord8Table_obj
 670 // Score all words in isrc, using languages that have quadgrams
 671 // We don't normally use this routine except on the first quadgram run,
 672 // but it can be used to resolve unreliable pages.
 673 // This routine does not have an optimized advance_by
 674 // SOON: Uses indirect language/probability longword
 675 //
 676 // Return number of words that hit in the hash table
 677 int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
 678                        const char* isrc, int srclen, Tote* chunk_tote) {
 679   int hit_count = 0;
 680   const char* src = isrc;
 681   const char* srclimit = src + srclen + 1;
 682   // Limit is end+1, to include extra space char (0x20) off the end
 683   //
 684   // Score all words truncated to 8 characters
 685   int charcount = 0;
 686   // Skip any initial space
 687   if (src[0] == ' ') {++src;}
 688   const char* word_ptr = src;
 689   const char* word_end = word_ptr;
 690   if (FLAGS_dbgscore) {
 691     fprintf(stderr, "  " );
 692   }
 693   while (src < srclimit) {
 694     // Terminate previous word or continue current word
 695     if (src[0] == ' ') {
 696       int bytecount = word_end - word_ptr;
 697       if (bytecount == 0)
 698         break;
 699       // Lookup and score this word
 700       uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
 701       uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
 702       // Now go indirect on the subscript
 703       probs = octagram_obj->kCLDTableInd[probs &
 704         ~octagram_obj->kCLDTableKeyMask];
 705
 706       // // Lookup and score this word
 707       // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
 708       // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
 709       //                                kLongWord8TableSize);
 710       //
 711       if (FLAGS_dbglookup) {
 712         DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
 713         DbgScoreRecord(NULL, probs, bytecount);
 714       } else if (FLAGS_dbgscore && (probs != 0)) {
 715         DbgScoreRecord(NULL, probs, bytecount);
 716         string temp(word_ptr, bytecount);
 717         fprintf(stderr, "%s ", temp.c_str());
 718       }
 719
 720       if (probs != 0) {
 721         ProcessProbV25Tote(probs, chunk_tote);
 722         ++hit_count;
 723       }
 724       charcount = 0;
 725       word_ptr = src + 1;   // Over the space
 726       word_end = word_ptr;
 727     } else {
 728       ++charcount;
 729     }
 730
 731     // Advance to next char
 732     src += cld_UniLib::OneCharLen(src);
 733     if (charcount <= 8) {
 734       word_end = src;
 735     }
 736   }
 737
 738   if (FLAGS_dbgscore) {
 739     fprintf(stderr, "[%d words scored]\n", hit_count);
 740     DbgScoreState();
 741   }
 742   return hit_count;
 743 }
 744
 745
 746
 747 //------------------------------------------------------------------------------
 748 // Reliability calculations, for single language and between languages
 749 //------------------------------------------------------------------------------
 750
 751 // Return reliablity of result 0..100 for top two scores
 752 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
 753 // (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
 754 // Threshold is uni/quadgram increment count, bounded above and below.
 755 //
 756 // Requiring a factor of 3 improvement (e.g. +1 log base 3)
 757 // for each scored quadgram is too stringent, so I've backed this off to a
 758 // factor of 2 (e.g. +5/8 log base 3).
 759 //
 760 // I also somewhat lowered the Min/MaxGramCount limits above
 761 //
 762 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
 763 //
 764 int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
 765   int max_reliability_percent = 100;
 766   if (gramcount < 8) {
 767     max_reliability_percent = 12 * gramcount;
 768   }
 769   int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
 770   if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
 771     fully_reliable_thresh = kMinGramCount;
 772   } else if (fully_reliable_thresh > kMaxGramCount) {
 773     fully_reliable_thresh = kMaxGramCount;
 774   }
 775
 776   int delta = value1 - value2;
 777   if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
 778   if (delta <= 0) {return 0;}
 779   return cld::minint(max_reliability_percent,
 780                      (100 * delta) / fully_reliable_thresh);
 781 }
 782
 783 // Return reliablity of result 0..100 for top score vs. mainsteam score
 784 // Values are score per 1024 bytes of input
 785 // ratio = max(top/mainstream, mainstream/top)
 786 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
 787 // Change: short-text word scoring can give unusually good results.
 788 //  Let top exceed mainstream by 4x at 50% reliable
 789 int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
 790   if (mean_score == 0) {return 100;}    // No reliability data available yet
 791   if (topscore == 0) {return 0;}        // zero score = unreliable
 792   if (len == 0) {return 0;}             // zero len = unreliable
 793   int top_kb = (topscore << 10) / len;
 794   double ratio;
 795   double ratio_cutoff;
 796   if (top_kb > mean_score) {
 797     ratio = (1.0 * top_kb) / mean_score;
 798     ratio_cutoff = 5.0;                 // ramp down from 100% to 0%: 3.0-5.0
 799   } else {
 800     ratio = (1.0 * mean_score) / top_kb;
 801     ratio_cutoff = 4.0;                 // ramp down from 100% to 0%: 2.0-4.0
 802   }
 803   if (ratio <= ratio_cutoff - 2.0) {return 100;}
 804   if (ratio > ratio_cutoff) {return 0;}
 805
 806   int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
 807   return iratio;
 808 }
 809
 810 // Calculate ratio of score per 1KB vs. expected score per 1KB
 811 double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
 812                           int bytes, int score) {
 813   // Average training-data score for this language-script combo, per 1KB
 814   int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
 815   if (lscript == ULScript_Common) {
 816     // We don't know the script (only happens with second-chance score)
 817     // Look for first non-zero mean value
 818     for (int i = 2; i >= 0; --i) {
 819       if (kMeanScore[lang * 4 + i] > 0) {
 820         expected_score = kMeanScore[lang * 4 + i];
 821         break;
 822       }
 823     }
 824   }
 825   if (expected_score < 100) {
 826       expected_score = 1000;
 827   }
 828
 829   // Our score per 1KB
 830   double our_score = (score << 10) / (bytes ? bytes : 1);  // Avoid zdiv
 831   double ratio = our_score / expected_score;
 832
 833   // Just the raw count normalized as though each language has mean=1000;
 834   ratio = (score * 1000.0) /  expected_score;
 835   return ratio;
 836 }
 837
 838 // Calculate reliablity of len bytes of script lscript with chunk_tote
 839 int cld::GetReliability(int len, UnicodeLScript lscript,
 840                    const Tote* chunk_tote) {
 841   Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
 842   // Average score for this language-script combo
 843   int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
 844   if (lscript == ULScript_Common) {
 845     // We don't know the script (only happens with second-chance score)
 846     // Look for first non-zero mean value
 847     for (int i = 2; i >= 0; --i) {
 848       if (kMeanScore[cur_lang * 4 + i] > 0) {
 849         mean_score = kMeanScore[cur_lang * 4 + i];
 850         break;
 851       }
 852     }
 853   }
 854   int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
 855                                            chunk_tote->Value(1),
 856                                            chunk_tote->GetGramCount());
 857
 858   int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
 859                                                len,
 860                                                mean_score);
 861
 862   int reliability_min = minint(reliability_delta, reliability_main);
 863
 864
 865   if (FLAGS_dbgreli) {
 866     char temp1[4];
 867     char temp2[4];
 868     cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
 869     if (temp1[2] == ' ') {temp1[2] = '\0';}
 870     cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
 871     if (temp2[2] == ' ') {temp2[2] = '\0';}
 872     int srclen = len;
 873     fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d,  %s=%d %s=%d "
 874                    "top/KB=%d mean/KB=%d del=%d%% reli=%d%%   "
 875                    "lang/lscript %d %d\n",
 876            chunk_tote->GetGramCount(),
 877            chunk_tote->GetIncrCount(),
 878            srclen,
 879            temp1, chunk_tote->Value(0),
 880            temp2, chunk_tote->Value(1),
 881            (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
 882            mean_score,
 883            reliability_delta,
 884            reliability_main,
 885            cur_lang, lscript);
 886   }
 887
 888   return reliability_min;
 889 }
 890
 891
 892 //------------------------------------------------------------------------------
 893 // Miscellaneous
 894 //------------------------------------------------------------------------------
 895
 896 // Demote all languages except Top40 and plus_one
 897 // Do this just before sorting chunk_tote results
 898 void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
 899   for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
 900     if (chunk_tote->Key(sub) == 0) continue;
 901     if (chunk_tote->Key(sub) == packed_plus_one) continue;
 902     if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
 903     // Quarter the score of others
 904     chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
 905   }
 906 }