src/third_party/cld_2/src/internal/cldutil.cc

   1 // Copyright 2013 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 //
  16 // Author: dsites@google.com (Dick Sites)
  17 // Updated 2014.01 for dual table lookup
  18 //
  19
  20 #include "cldutil.h"
  21 #include <string>
  22
  23 #include "cld2tablesummary.h"
  24 #include "integral_types.h"
  25 #include "port.h"
  26 #include "utf8statetable.h"
  27
  28 namespace CLD2 {
  29
  30 // Caller supplies the right tables in scoringcontext
  31
  32 // Runtime routines for hashing, looking up, and scoring
  33 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
  34 // Unigrams and bigrams are for CJK languages only, including simplified/
  35 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
  36 // Zhuang Han characters. Surrounding spaces are not considered.
  37 // Quadgrams and octagrams for for non-CJK and include two bits indicating
  38 // preceding and trailing spaces (word boundaries).
  39
  40
  41 static const int kMinCJKUTF8CharBytes = 3;
  42
  43 static const int kMinGramCount = 3;
  44 static const int kMaxGramCount = 16;
  45
  46 static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character
  47
  48   // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
  49   static const uint8 kSkipSpaceVowelContinue[256] = {
  50     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  51     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  52     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
  53     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
  54
  55     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  56     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  57     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  58     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  59   };
  60
  61   // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
  62   static const uint8 kSkipSpaceContinue[256] = {
  63     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  64     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  65     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  66     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  67
  68     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  69     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  70     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  71     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  72   };
  73
  74
  75   // Always advances one UTF-8 character
  76   static const uint8 kAdvanceOneChar[256] = {
  77     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  78     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  79     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  80     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  81
  82     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  83     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  84     2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  85     3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
  86   };
  87
  88   // Advances *only* on space (or illegal byte)
  89   static const uint8 kAdvanceOneCharSpace[256] = {
  90     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  91     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  92     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  93     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  94
  95     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  96     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  97     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  98     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  99   };
 100
 101
 102 // Routines to access a hash table of <key:wordhash, value:probs> pairs
 103 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
 104 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
 105 // bucket subscript.
 106 // Probs is a packed: three languages plus a subscript for probability table
 107 // Buckets have all the keys together, then all the values.Key array never
 108 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
 109 // Match case may sometimes take an additional cache miss on value access.
 110 //
 111 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
 112 // byte buckets with single cache miss.
 113 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
 114 //------------------------------------------------------------------------------
 115
 116 //----------------------------------------------------------------------------//
 117 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
 118 //----------------------------------------------------------------------------//
 119
 120 //----------------------------------------------------------------------------//
 121 // Scoring single groups of letters                                           //
 122 //----------------------------------------------------------------------------//
 123
 124 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
 125 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
 126 //  an accumulator tote. (language 0 means unused entry)
 127 // Output: running sums in tote updated
 128 void ProcessProbV2Tote(uint32 probs, Tote* tote) {
 129   uint8 prob123 = (probs >> 0) & 0xff;
 130   const uint8* prob123_entry = LgProb2TblEntry(prob123);
 131
 132   uint8 top1 = (probs >> 8) & 0xff;
 133   if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));}
 134   uint8 top2 = (probs >> 16) & 0xff;
 135   if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));}
 136   uint8 top3 = (probs >> 24) & 0xff;
 137   if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));}
 138 }
 139
 140 // Return score for a particular per-script language, or zero
 141 int GetLangScore(uint32 probs, uint8 pslang) {
 142   uint8 prob123 = (probs >> 0) & 0xff;
 143   const uint8* prob123_entry = LgProb2TblEntry(prob123);
 144   int retval = 0;
 145   uint8 top1 = (probs >> 8) & 0xff;
 146   if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);}
 147   uint8 top2 = (probs >> 16) & 0xff;
 148   if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);}
 149   uint8 top3 = (probs >> 24) & 0xff;
 150   if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);}
 151   return retval;
 152 }
 153
 154 //----------------------------------------------------------------------------//
 155 // Routines to accumulate probabilities                                       //
 156 //----------------------------------------------------------------------------//
 157
 158
 159 // BIGRAM, using hash table, always advancing by 1 char
 160 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
 161 // Score all bigrams in isrc, using languages that have bigrams (CJK)
 162 // Return number of bigrams that hit in the hash table
 163 int DoBigramScoreV3(const CLD2TableSummary* bigram_obj,
 164                          const char* isrc, int srclen, Tote* chunk_tote) {
 165   int hit_count = 0;
 166   const char* src = isrc;
 167
 168   // Hashtable-based CJK bigram lookup
 169   const uint8* usrc = reinterpret_cast<const uint8*>(src);
 170   const uint8* usrclimit1 = usrc + srclen - UTFmax;
 171
 172   while (usrc < usrclimit1) {
 173     int len = kAdvanceOneChar[usrc[0]];
 174     int len2 = kAdvanceOneChar[usrc[len]] + len;
 175
 176     if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
 177       // Lookup and score this bigram
 178       // Always ignore pre/post spaces
 179       uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2);
 180       uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
 181       // Now go indirect on the subscript
 182       probs = bigram_obj->kCLDTableInd[probs &
 183         ~bigram_obj->kCLDTableKeyMask];
 184
 185       // Process the bigram
 186       if (probs != 0) {
 187         ProcessProbV2Tote(probs, chunk_tote);
 188         ++hit_count;
 189       }
 190     }
 191     usrc += len;  // Advance by one char
 192   }
 193
 194   return hit_count;
 195 }
 196
 197
 198 // Score up to 64KB of a single script span in one pass
 199 // Make a dummy entry off the end to calc length of last span
 200 // Return offset of first unused input byte
 201 int GetUniHits(const char* text,
 202                      int letter_offset, int letter_limit,
 203                      ScoringContext* scoringcontext,
 204                      ScoringHitBuffer* hitbuffer) {
 205   const char* isrc = &text[letter_offset];
 206   const char* src = isrc;
 207   // Limit is end, which has extra 20 20 20 00 past len
 208   const char* srclimit = &text[letter_limit];
 209
 210   // Local copies
 211   const UTF8PropObj* unigram_obj =
 212     scoringcontext->scoringtables->unigram_obj;
 213   int next_base = hitbuffer->next_base;
 214   int next_base_limit = hitbuffer->maxscoringhits;
 215
 216   // Visit all unigrams
 217   if (src[0] == ' ') {++src;}   // skip any initial space
 218   while (src < srclimit) {
 219     const uint8* usrc = reinterpret_cast<const uint8*>(src);
 220     int len = kAdvanceOneChar[usrc[0]];
 221     src += len;
 222     // Look up property of one UTF-8 character and advance over it.
 223     // Updates usrc and len (bad interface design), hence increment above
 224     int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len);
 225     if (propval > 0) {
 226       // Save indirect subscript for later scoring; 1 or 2 langprobs
 227       int indirect_subscr = propval;
 228       hitbuffer->base[next_base].offset = src - text;     // Offset in text
 229       hitbuffer->base[next_base].indirect = indirect_subscr;
 230       ++next_base;
 231     }
 232
 233     if (next_base >= next_base_limit) {break;}
 234   }
 235
 236   hitbuffer->next_base = next_base;
 237
 238   // Make a dummy entry off the end to calc length of last span
 239   int dummy_offset = src - text;
 240   hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
 241   hitbuffer->base[hitbuffer->next_base].indirect = 0;
 242
 243   return src - text;
 244 }
 245
 246 // Score up to 64KB of a single script span, doing both delta-bi and
 247 // distinct bis in one pass
 248 void GetBiHits(const char* text,
 249                      int letter_offset, int letter_limit,
 250                      ScoringContext* scoringcontext,
 251                      ScoringHitBuffer* hitbuffer) {
 252   const char* isrc = &text[letter_offset];
 253   const char* src = isrc;
 254   // Limit is end
 255   const char* srclimit1 = &text[letter_limit];
 256
 257   // Local copies
 258   const CLD2TableSummary* deltabi_obj =
 259     scoringcontext->scoringtables->deltabi_obj;
 260   const CLD2TableSummary* distinctbi_obj =
 261     scoringcontext->scoringtables->distinctbi_obj;
 262   int next_delta = hitbuffer->next_delta;
 263   int next_delta_limit = hitbuffer->maxscoringhits;
 264   int next_distinct = hitbuffer->next_distinct;
 265   // We can do 2 inserts per loop, so -1
 266   int next_distinct_limit = hitbuffer->maxscoringhits - 1;
 267
 268   while (src < srclimit1) {
 269     const uint8* usrc = reinterpret_cast<const uint8*>(src);
 270     int len = kAdvanceOneChar[usrc[0]];
 271     int len2 = kAdvanceOneChar[usrc[len]] + len;
 272
 273     if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
 274       // Lookup and this bigram and save <offset, indirect>
 275       uint32 bihash = BiHashV2(src, len2);
 276       uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash);
 277       // Now go indirect on the subscript
 278       if (probs != 0) {
 279         // Save indirect subscript for later scoring; 1 langprob
 280         int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask;
 281         hitbuffer->delta[next_delta].offset = src - text;
 282         hitbuffer->delta[next_delta].indirect = indirect_subscr;
 283         ++next_delta;
 284       }
 285       // Lookup this distinct bigram and save <offset, indirect>
 286       probs = QuadHashV3Lookup4(distinctbi_obj, bihash);
 287       if (probs != 0) {
 288         int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask;
 289         hitbuffer->distinct[next_distinct].offset = src - text;
 290         hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
 291         ++next_distinct;
 292       }
 293     }
 294     src += len;  // Advance by one char (not two)
 295
 296     // Almost always srclimit hit first
 297     if (next_delta >= next_delta_limit) {break;}
 298     if (next_distinct >= next_distinct_limit) {break;}
 299   }
 300
 301   hitbuffer->next_delta = next_delta;
 302   hitbuffer->next_distinct = next_distinct;
 303
 304   // Make a dummy entry off the end to calc length of last span
 305   int dummy_offset = src - text;
 306   hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
 307   hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
 308   hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
 309   hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
 310 }
 311
 312 // Score up to 64KB of a single script span in one pass
 313 // Make a dummy entry off the end to calc length of last span
 314 // Return offset of first unused input byte
 315 int GetQuadHits(const char* text,
 316                      int letter_offset, int letter_limit,
 317                      ScoringContext* scoringcontext,
 318                      ScoringHitBuffer* hitbuffer) {
 319   const char* isrc = &text[letter_offset];
 320   const char* src = isrc;
 321   // Limit is end, which has extra 20 20 20 00 past len
 322   const char* srclimit = &text[letter_limit];
 323
 324   // Local copies
 325   const CLD2TableSummary* quadgram_obj =
 326     scoringcontext->scoringtables->quadgram_obj;
 327   const CLD2TableSummary* quadgram_obj2 =
 328     scoringcontext->scoringtables->quadgram_obj2;
 329   int next_base = hitbuffer->next_base;
 330   int next_base_limit = hitbuffer->maxscoringhits;
 331
 332   // Run a little cache of last quad hits to catch overly-repetitive "text"
 333   // We don't care if we miss a couple repetitions at scriptspan boundaries
 334   int next_prior_quadhash = 0;
 335   uint32 prior_quadhash[2] = {0, 0};
 336
 337   // Visit all quadgrams
 338   if (src[0] == ' ') {++src;}   // skip any initial space
 339   while (src < srclimit) {
 340     // Find one quadgram
 341     const char* src_end = src;
 342     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 343     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 344     const char* src_mid = src_end;
 345     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 346     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
 347     int len = src_end - src;
 348     // Hash the quadgram
 349     uint32 quadhash = QuadHashV2(src, len);
 350
 351     // Filter out recent repeats
 352     if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) {
 353       // Look up this quadgram and save <offset, indirect>
 354       uint32 indirect_flag = 0;   // For dual tables
 355       const CLD2TableSummary* hit_obj = quadgram_obj;
 356       uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
 357       if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) {
 358         // Try lookup in dual table if not found in first one
 359         // Note: we need to know later which of two indirect tables to use.
 360         indirect_flag = 0x80000000u;
 361         hit_obj = quadgram_obj2;
 362         probs = QuadHashV3Lookup4(quadgram_obj2, quadhash);
 363       }
 364       if (probs != 0) {
 365         // Round-robin two entries of actual hits
 366         prior_quadhash[next_prior_quadhash] = quadhash;
 367         next_prior_quadhash = (next_prior_quadhash + 1) & 1;
 368
 369         // Save indirect subscript for later scoring; 1 or 2 langprobs
 370         int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask;
 371         hitbuffer->base[next_base].offset = src - text;     // Offset in text
 372         // Flip the high bit for table2
 373         hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag;
 374         ++next_base;
 375       }
 376     }
 377
 378     // Advance: all the way past word if at end-of-word, else 2 chars
 379     if (src_end[0] == ' ') {
 380       src = src_end;
 381     } else {
 382       src = src_mid;
 383     }
 384
 385     // Skip over space at end of word, or ASCII vowel in middle of word
 386     // Use kAdvanceOneCharSpace instead to get rid of vowel hack
 387     if (src < srclimit) {
 388       src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
 389     } else {
 390       // Advancing by 4/8/16 can overshoot, but we are about to exit anyway
 391       src = srclimit;
 392     }
 393
 394     if (next_base >= next_base_limit) {break;}
 395   }
 396
 397   hitbuffer->next_base = next_base;
 398
 399   // Make a dummy entry off the end to calc length of last span
 400   int dummy_offset = src - text;
 401   hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
 402   hitbuffer->base[hitbuffer->next_base].indirect = 0;
 403
 404   return src - text;
 405 }
 406
 407 // inputs:
 408 //  const tables
 409 //  const char* isrc, int srclen (in sscriptbuffer)
 410 // intermediates:
 411 //  vector of octa <offset, probs>   (which need indirect table to decode)
 412 //  vector of distinct <offset, probs>   (which need indirect table to decode)
 413
 414 // Score up to 64KB of a single script span, doing both delta-octa and
 415 // distinct words in one pass
 416 void GetOctaHits(const char* text,
 417                      int letter_offset, int letter_limit,
 418                      ScoringContext* scoringcontext,
 419                      ScoringHitBuffer* hitbuffer) {
 420   const char* isrc = &text[letter_offset];
 421   const char* src = isrc;
 422   // Limit is end+1, to include extra space char (0x20) off the end
 423   const char* srclimit = &text[letter_limit + 1];
 424
 425   // Local copies
 426   const CLD2TableSummary* deltaocta_obj =
 427     scoringcontext->scoringtables->deltaocta_obj;
 428   int next_delta = hitbuffer->next_delta;
 429   int next_delta_limit = hitbuffer->maxscoringhits;
 430
 431   const CLD2TableSummary* distinctocta_obj =
 432     scoringcontext->scoringtables->distinctocta_obj;
 433   int next_distinct = hitbuffer->next_distinct;
 434   // We can do 2 inserts per loop, so -1
 435   int next_distinct_limit = hitbuffer->maxscoringhits - 1;
 436
 437   // Run a little cache of last octa hits to catch overly-repetitive "text"
 438   // We don't care if we miss a couple repetitions at scriptspan boundaries
 439   int next_prior_octahash = 0;
 440   uint64 prior_octahash[2] = {0, 0};
 441
 442   // Score all words truncated to 8 characters
 443   int charcount = 0;
 444   // Skip any initial space
 445   if (src[0] == ' ') {++src;}
 446
 447   // Begin the first word
 448   const char* prior_word_start = src;
 449   const char* word_start = src;
 450   const char* word_end = word_start;
 451   while (src < srclimit) {
 452     // Terminate previous word or continue current word
 453     if (src[0] == ' ') {
 454       int len = word_end - word_start;
 455       // Hash the word
 456       uint64 wordhash40 = OctaHash40(word_start, len);
 457       uint32 probs;
 458
 459       // Filter out recent repeats. Unlike quads, we update even if no hit,
 460       // so we can get hits on same word if separated by non-hit words
 461       if ((wordhash40 != prior_octahash[0]) &&
 462           (wordhash40 != prior_octahash[1])) {
 463         // Round-robin two entries of words
 464         prior_octahash[next_prior_octahash] = wordhash40;
 465         next_prior_octahash = 1 - next_prior_octahash;    // Alternates 0,1,0,1
 466
 467         // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical
 468         // function of the two word hashs. For words A B C, B-A and C-B are good
 469         // enough and fast. We use the same table as distinct single words
 470         // Do not look up a pair of identical words -- all pairs hash to zero
 471         // Both 1- and 2-word distinct lookups are in distinctocta_obj now
 472         // Do this first, because it has the lowest offset
 473         uint64 tmp_prior_hash = prior_octahash[next_prior_octahash];
 474         if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) {
 475           uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40);
 476           probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash);
 477           if (probs != 0) {
 478             int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
 479             hitbuffer->distinct[next_distinct].offset = prior_word_start - text;
 480             hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
 481             ++next_distinct;
 482           }
 483         }
 484
 485         // (2) Lookup this distinct word and save <offset, indirect>
 486         probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40);
 487         if (probs != 0) {
 488           int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
 489           hitbuffer->distinct[next_distinct].offset = word_start - text;
 490           hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
 491           ++next_distinct;
 492         }
 493
 494         // (3) Lookup this word and save <offset, indirect>
 495         probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40);
 496         if (probs != 0) {
 497           // Save indirect subscript for later scoring; 1 langprob
 498           int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask;
 499           hitbuffer->delta[next_delta].offset = word_start - text;
 500           hitbuffer->delta[next_delta].indirect = indirect_subscr;
 501           ++next_delta;
 502         }
 503       }
 504
 505       // Begin the next word
 506       charcount = 0;
 507       prior_word_start = word_start;
 508       word_start = src + 1;   // Over the space
 509       word_end = word_start;
 510     } else {
 511       ++charcount;
 512     }
 513
 514     // Advance to next char
 515     src += UTF8OneCharLen(src);
 516     if (charcount <= 8) {
 517       word_end = src;
 518     }
 519     // Almost always srclimit hit first
 520     if (next_delta >= next_delta_limit) {break;}
 521     if (next_distinct >= next_distinct_limit) {break;}
 522   }
 523
 524   hitbuffer->next_delta = next_delta;
 525   hitbuffer->next_distinct = next_distinct;
 526
 527   // Make a dummy entry off the end to calc length of last span
 528   int dummy_offset = src - text;
 529   hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
 530   hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
 531   hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
 532   hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
 533 }
 534
 535
 536 //----------------------------------------------------------------------------//
 537 // Reliability calculations, for single language and between languages        //
 538 //----------------------------------------------------------------------------//
 539
 540 // Return reliablity of result 0..100 for top two scores
 541 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
 542 // (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
 543 // Threshold is uni/quadgram increment count, bounded above and below.
 544 //
 545 // Requiring a factor of 3 improvement (e.g. +1 log base 3)
 546 // for each scored quadgram is too stringent, so I've backed this off to a
 547 // factor of 2 (e.g. +5/8 log base 3).
 548 //
 549 // I also somewhat lowered the Min/MaxGramCount limits above
 550 //
 551 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
 552 //
 553 int ReliabilityDelta(int value1, int value2, int gramcount) {
 554   int max_reliability_percent = 100;
 555   if (gramcount < 8) {
 556     max_reliability_percent = 12 * gramcount;
 557   }
 558   int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
 559   if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
 560     fully_reliable_thresh = kMinGramCount;
 561   } else if (fully_reliable_thresh > kMaxGramCount) {
 562     fully_reliable_thresh = kMaxGramCount;
 563   }
 564
 565   int delta = value1 - value2;
 566   if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
 567   if (delta <= 0) {return 0;}
 568   return minint(max_reliability_percent,
 569                      (100 * delta) / fully_reliable_thresh);
 570 }
 571
 572 // Return reliablity of result 0..100 for top score vs. expected mainsteam score
 573 // Values are score per 1024 bytes of input
 574 // ratio = max(top/mainstream, mainstream/top)
 575 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
 576 // Change: short-text word scoring can give unusually good results.
 577 //  Let top exceed mainstream by 4x at 50% reliable
 578 //
 579 // dsites April 2010: These could be tightened up. It would be
 580 // reasonable with newer data and round-robin table allocation to start ramping
 581 // down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass,
 582 // but just barely.
 583 //
 584 // dsites March 2013: Tightened up a bit.
 585 static const double kRatio100 = 1.5;
 586 static const double kRatio0 = 4.0;
 587 int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) {
 588   if (expected_score_1kb == 0) {return 100;}    // No reliability data available yet
 589   if (actual_score_1kb == 0) {return 0;}        // zero score = unreliable
 590   double ratio;
 591   if (expected_score_1kb > actual_score_1kb) {
 592     ratio = (1.0 * expected_score_1kb) / actual_score_1kb;
 593   } else {
 594     ratio = (1.0 * actual_score_1kb) / expected_score_1kb;
 595   }
 596   // Ratio 1.0 .. 1.5 scores 100%
 597   // Ratio 2.0 scores 80%
 598   // Linear decline, to ratio 4.0 scores 0%
 599   if (ratio <= kRatio100) {return 100;}
 600   if (ratio > kRatio0) {return 0;}
 601
 602   int percent_good =
 603       static_cast<int>(100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100));
 604   return percent_good;
 605 }
 606
 607 // Create a langprob packed value from its parts.
 608 // qprob is quantized [0..12]
 609 // We use Latn script to represent any RTypeMany language
 610 uint32 MakeLangProb(Language lang, int qprob) {
 611   uint32 pslang = PerScriptNumber(ULScript_Latin, lang);
 612   uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob];
 613   return retval;
 614 }
 615
 616 }       // End namespace CLD2
 617
 618
 619
 620
 621