1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
17 // Updated 2014.01 for dual table lookup
23 #include "cld2tablesummary.h"
24 #include "integral_types.h"
26 #include "utf8statetable.h"
30 // Caller supplies the right tables in scoringcontext
32 // Runtime routines for hashing, looking up, and scoring
33 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
34 // Unigrams and bigrams are for CJK languages only, including simplified/
35 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
36 // Zhuang Han characters. Surrounding spaces are not considered.
37 // Quadgrams and octagrams are for non-CJK and include two bits indicating
38 // preceding and trailing spaces (word boundaries).
// Byte-length threshold used by the bigram scanners in this file: a pair of
// adjacent characters is treated as a possible CJK bigram only when their
// combined length is at least twice this value.
41 static const int kMinCJKUTF8CharBytes = 3;
// Lower and upper clamps applied to the fully-reliable threshold computed in
// ReliabilityDelta.
43 static const int kMinGramCount = 3;
44 static const int kMaxGramCount = 16;
// Safety margin subtracted from the end of the input in DoBigramScoreV3.
46 static const int UTFmax = 4; // Max number of bytes in a UTF-8 character
48 // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
49 static const uint8 kSkipSpaceVowelContinue[256] = {
50 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
51 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
52 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
53 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
55 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
56 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
57 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
58 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
61 // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
62 static const uint8 kSkipSpaceContinue[256] = {
63 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
64 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
65 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
66 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
68 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
69 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
70 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
71 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
75 // Always advances one UTF-8 character
76 static const uint8 kAdvanceOneChar[256] = {
77 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
78 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
79 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
80 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
82 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
83 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
84 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
85 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
88 // Advances *only* on space (or illegal byte)
89 static const uint8 kAdvanceOneCharSpace[256] = {
90 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
91 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
92 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
93 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
95 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
96 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
97 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
98 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
102 // Routines to access a hash table of <key:wordhash, value:probs> pairs
103 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
104 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
106 // Probs is a packed: three languages plus a subscript for probability table
107 // Buckets have all the keys together, then all the values. Key array never
108 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
109 // Match case may sometimes take an additional cache miss on value access.
111 // Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
112 // byte buckets with single cache miss.
113 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
114 //------------------------------------------------------------------------------
116 //----------------------------------------------------------------------------//
117 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
118 //----------------------------------------------------------------------------//
120 //----------------------------------------------------------------------------//
121 // Scoring single groups of letters //
122 //----------------------------------------------------------------------------//
124 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
125 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
126 // an accumulator tote. (language 0 means unused entry)
127 // Output: running sums in tote updated
128 void ProcessProbV2Tote(uint32 probs, Tote* tote) {
129 uint8 prob123 = (probs >> 0) & 0xff;
130 const uint8* prob123_entry = LgProb2TblEntry(prob123);
132 uint8 top1 = (probs >> 8) & 0xff;
133 if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));}
134 uint8 top2 = (probs >> 16) & 0xff;
135 if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));}
136 uint8 top3 = (probs >> 24) & 0xff;
137 if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));}
140 // Return score for a particular per-script language, or zero
141 int GetLangScore(uint32 probs, uint8 pslang) {
142 uint8 prob123 = (probs >> 0) & 0xff;
143 const uint8* prob123_entry = LgProb2TblEntry(prob123);
145 uint8 top1 = (probs >> 8) & 0xff;
146 if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);}
147 uint8 top2 = (probs >> 16) & 0xff;
148 if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);}
149 uint8 top3 = (probs >> 24) & 0xff;
150 if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);}
154 //----------------------------------------------------------------------------//
155 // Routines to accumulate probabilities //
156 //----------------------------------------------------------------------------//
159 // BIGRAM, using hash table, always advancing by 1 char
160 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
161 // Score all bigrams in isrc, using languages that have bigrams (CJK)
162 // Return number of bigrams that hit in the hash table
// Parameters:
//   bigram_obj -- table used for both the hash lookup and the indirect array
//   isrc, srclen -- input bytes and their length
//   chunk_tote -- accumulator passed on to ProcessProbV2Tote
// NOTE(review): this listing looks incomplete -- no hit counter, no return
// statement and no closing braces are visible below, although the function
// is declared to return int. Confirm against the upstream file before
// editing the logic.
163 int DoBigramScoreV3(const CLD2TableSummary* bigram_obj,
164 const char* isrc, int srclen, Tote* chunk_tote) {
166 const char* src = isrc;
168 // Hashtable-based CJK bigram lookup
169 const uint8* usrc = reinterpret_cast<const uint8*>(src);
// Stop UTFmax bytes short of the end of the input
170 const uint8* usrclimit1 = usrc + srclen - UTFmax;
172 while (usrc < usrclimit1) {
// len is the table-driven byte length of the current char; len2 adds the
// byte length of the char that follows it
173 int len = kAdvanceOneChar[usrc[0]];
174 int len2 = kAdvanceOneChar[usrc[len]] + len;
176 if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
177 // Lookup and score this bigram
178 // Always ignore pre/post spaces
179 uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2);
180 uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
181 // Now go indirect on the subscript
// The bits of probs outside kCLDTableKeyMask index into kCLDTableInd
182 probs = bigram_obj->kCLDTableInd[probs &
183 ~bigram_obj->kCLDTableKeyMask];
185 // Process the bigram
187 ProcessProbV2Tote(probs, chunk_tote);
191 usrc += len; // Advance by one char
198 // Score up to 64KB of a single script span in one pass
199 // Make a dummy entry off the end to calc length of last span
200 // Return offset of first unused input byte
// Parameters:
//   text -- buffer holding the span; offsets stored in hitbuffer are
//           relative to its start
//   letter_offset, letter_limit -- start and end subscripts into text
//   scoringcontext -- supplies scoringtables->unigram_obj
//   hitbuffer -- receives <offset, indirect> pairs in base[], starting at
//                next_base and bounded by maxscoringhits
// NOTE(review): this listing looks incomplete -- no visible statement
// advances src or increments next_base inside the loop, and no return is
// visible although the function returns int. Confirm against the upstream
// file before editing the logic.
201 int GetUniHits(const char* text,
202 int letter_offset, int letter_limit,
203 ScoringContext* scoringcontext,
204 ScoringHitBuffer* hitbuffer) {
205 const char* isrc = &text[letter_offset];
206 const char* src = isrc;
207 // Limit is end, which has extra 20 20 20 00 past len
208 const char* srclimit = &text[letter_limit];
211 const UTF8PropObj* unigram_obj =
212 scoringcontext->scoringtables->unigram_obj;
213 int next_base = hitbuffer->next_base;
214 int next_base_limit = hitbuffer->maxscoringhits;
216 // Visit all unigrams
217 if (src[0] == ' ') {++src;} // skip any initial space
218 while (src < srclimit) {
219 const uint8* usrc = reinterpret_cast<const uint8*>(src);
// Table-driven byte length of the character at src
220 int len = kAdvanceOneChar[usrc[0]];
222 // Look up property of one UTF-8 character and advance over it.
223 // Updates usrc and len (bad interface design), hence increment above
224 int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len);
226 // Save indirect subscript for later scoring; 1 or 2 langprobs
227 int indirect_subscr = propval;
228 hitbuffer->base[next_base].offset = src - text; // Offset in text
229 hitbuffer->base[next_base].indirect = indirect_subscr;
// Stop once the hit buffer has reached its limit
233 if (next_base >= next_base_limit) {break;}
// Publish the local write position back to the hit buffer
236 hitbuffer->next_base = next_base;
238 // Make a dummy entry off the end to calc length of last span
239 int dummy_offset = src - text;
240 hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
241 hitbuffer->base[hitbuffer->next_base].indirect = 0;
246 // Score up to 64KB of a single script span, doing both delta-bi and
247 // distinct bis in one pass
// Parameters:
//   text -- buffer holding the span; stored offsets are relative to its start
//   letter_offset, letter_limit -- start and end subscripts into text
//   scoringcontext -- supplies scoringtables->deltabi_obj and distinctbi_obj
//   hitbuffer -- receives <offset, indirect> pairs in delta[] and distinct[]
// NOTE(review): this listing looks incomplete -- indirect_subscr is declared
// twice with no visible enclosing scopes, no visible statement increments
// next_delta or next_distinct, and closing braces are absent. Confirm against
// the upstream file before editing the logic.
248 void GetBiHits(const char* text,
249 int letter_offset, int letter_limit,
250 ScoringContext* scoringcontext,
251 ScoringHitBuffer* hitbuffer) {
252 const char* isrc = &text[letter_offset];
253 const char* src = isrc;
255 const char* srclimit1 = &text[letter_limit];
258 const CLD2TableSummary* deltabi_obj =
259 scoringcontext->scoringtables->deltabi_obj;
260 const CLD2TableSummary* distinctbi_obj =
261 scoringcontext->scoringtables->distinctbi_obj;
262 int next_delta = hitbuffer->next_delta;
263 int next_delta_limit = hitbuffer->maxscoringhits;
264 int next_distinct = hitbuffer->next_distinct;
265 // We can do 2 inserts per loop, so -1
266 int next_distinct_limit = hitbuffer->maxscoringhits - 1;
268 while (src < srclimit1) {
269 const uint8* usrc = reinterpret_cast<const uint8*>(src);
// len is the table-driven byte length of the current char; len2 adds the
// byte length of the char that follows it
270 int len = kAdvanceOneChar[usrc[0]];
271 int len2 = kAdvanceOneChar[usrc[len]] + len;
273 if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
274 // Look up this bigram and save <offset, indirect>
275 uint32 bihash = BiHashV2(src, len2);
276 uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash);
277 // Now go indirect on the subscript
279 // Save indirect subscript for later scoring; 1 langprob
280 int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask;
281 hitbuffer->delta[next_delta].offset = src - text;
282 hitbuffer->delta[next_delta].indirect = indirect_subscr;
285 // Lookup this distinct bigram and save <offset, indirect>
// The same bihash is reused for the distinct-bigram table
286 probs = QuadHashV3Lookup4(distinctbi_obj, bihash);
288 int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask;
289 hitbuffer->distinct[next_distinct].offset = src - text;
290 hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
294 src += len; // Advance by one char (not two)
296 // Almost always srclimit hit first
297 if (next_delta >= next_delta_limit) {break;}
298 if (next_distinct >= next_distinct_limit) {break;}
// Publish the local write positions back to the hit buffer
301 hitbuffer->next_delta = next_delta;
302 hitbuffer->next_distinct = next_distinct;
304 // Make a dummy entry off the end to calc length of last span
305 int dummy_offset = src - text;
306 hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
307 hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
308 hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
309 hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
312 // Score up to 64KB of a single script span in one pass
313 // Make a dummy entry off the end to calc length of last span
314 // Return offset of first unused input byte
// Parameters:
//   text -- buffer holding the span; stored offsets are relative to its start
//   letter_offset, letter_limit -- start and end subscripts into text
//   scoringcontext -- supplies scoringtables->quadgram_obj and quadgram_obj2
//   hitbuffer -- receives <offset, indirect> pairs in base[], starting at
//                next_base and bounded by maxscoringhits
// NOTE(review): this listing looks incomplete -- src_mid is assigned but its
// use is not visible, no visible statement increments next_base, the body of
// the end-of-word branch is absent, and no return is visible although the
// function returns int. Confirm against the upstream file before editing.
315 int GetQuadHits(const char* text,
316 int letter_offset, int letter_limit,
317 ScoringContext* scoringcontext,
318 ScoringHitBuffer* hitbuffer) {
319 const char* isrc = &text[letter_offset];
320 const char* src = isrc;
321 // Limit is end, which has extra 20 20 20 00 past len
322 const char* srclimit = &text[letter_limit];
325 const CLD2TableSummary* quadgram_obj =
326 scoringcontext->scoringtables->quadgram_obj;
327 const CLD2TableSummary* quadgram_obj2 =
328 scoringcontext->scoringtables->quadgram_obj2;
329 int next_base = hitbuffer->next_base;
330 int next_base_limit = hitbuffer->maxscoringhits;
332 // Run a little cache of last quad hits to catch overly-repetitive "text"
333 // We don't care if we miss a couple repetitions at scriptspan boundaries
334 int next_prior_quadhash = 0;
335 uint32 prior_quadhash[2] = {0, 0};
337 // Visit all quadgrams
338 if (src[0] == ' ') {++src;} // skip any initial space
339 while (src < srclimit) {
// Find the end of the quadgram: step src_end four times using
// kAdvanceOneCharButSpace; src_mid records the position after two steps
341 const char* src_end = src;
342 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
343 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
344 const char* src_mid = src_end;
345 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
346 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
347 int len = src_end - src;
349 uint32 quadhash = QuadHashV2(src, len);
351 // Filter out recent repeats
352 if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) {
353 // Look up this quadgram and save <offset, indirect>
354 uint32 indirect_flag = 0; // For dual tables
355 const CLD2TableSummary* hit_obj = quadgram_obj;
356 uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
357 if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) {
358 // Try lookup in dual table if not found in first one
359 // Note: we need to know later which of two indirect tables to use.
// The high bit set here is OR-ed into the stored indirect value below
360 indirect_flag = 0x80000000u;
361 hit_obj = quadgram_obj2;
362 probs = QuadHashV3Lookup4(quadgram_obj2, quadhash);
365 // Round-robin two entries of actual hits
366 prior_quadhash[next_prior_quadhash] = quadhash;
// Alternates the slot index between 0 and 1
367 next_prior_quadhash = (next_prior_quadhash + 1) & 1;
369 // Save indirect subscript for later scoring; 1 or 2 langprobs
// Mask with the key mask of whichever table produced probs
370 int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask;
371 hitbuffer->base[next_base].offset = src - text; // Offset in text
372 // Flip the high bit for table2
373 hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag;
378 // Advance: all the way past word if at end-of-word, else 2 chars
379 if (src_end[0] == ' ') {
385 // Skip over space at end of word, or ASCII vowel in middle of word
386 // Use kAdvanceOneCharSpace instead to get rid of vowel hack
387 if (src < srclimit) {
388 src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
390 // Advancing by 4/8/16 can overshoot, but we are about to exit anyway
// Stop once the hit buffer has reached its limit
394 if (next_base >= next_base_limit) {break;}
// Publish the local write position back to the hit buffer
397 hitbuffer->next_base = next_base;
399 // Make a dummy entry off the end to calc length of last span
400 int dummy_offset = src - text;
401 hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
402 hitbuffer->base[hitbuffer->next_base].indirect = 0;
409 // const char* isrc, int srclen (in sscriptbuffer)
411 // vector of octa <offset, probs> (which need indirect table to decode)
412 // vector of distinct <offset, probs> (which need indirect table to decode)
414 // Score up to 64KB of a single script span, doing both delta-octa and
415 // distinct words in one pass
// Parameters:
//   text -- buffer holding the span; stored offsets are relative to its start
//   letter_offset, letter_limit -- start and end subscripts into text
//   scoringcontext -- supplies scoringtables->deltaocta_obj and
//                     distinctocta_obj
//   hitbuffer -- receives <offset, indirect> pairs in delta[] and distinct[]
// NOTE(review): this listing looks incomplete -- probs and charcount are used
// without a visible declaration, indirect_subscr is declared three times with
// no visible enclosing scopes, the conditions that detect a word boundary are
// absent, and no visible statement increments next_delta or next_distinct.
// Confirm against the upstream file before editing the logic.
416 void GetOctaHits(const char* text,
417 int letter_offset, int letter_limit,
418 ScoringContext* scoringcontext,
419 ScoringHitBuffer* hitbuffer) {
420 const char* isrc = &text[letter_offset];
421 const char* src = isrc;
422 // Limit is end+1, to include extra space char (0x20) off the end
423 const char* srclimit = &text[letter_limit + 1];
426 const CLD2TableSummary* deltaocta_obj =
427 scoringcontext->scoringtables->deltaocta_obj;
428 int next_delta = hitbuffer->next_delta;
429 int next_delta_limit = hitbuffer->maxscoringhits;
431 const CLD2TableSummary* distinctocta_obj =
432 scoringcontext->scoringtables->distinctocta_obj;
433 int next_distinct = hitbuffer->next_distinct;
434 // We can do 2 inserts per loop, so -1
435 int next_distinct_limit = hitbuffer->maxscoringhits - 1;
437 // Run a little cache of last octa hits to catch overly-repetitive "text"
438 // We don't care if we miss a couple repetitions at scriptspan boundaries
439 int next_prior_octahash = 0;
440 uint64 prior_octahash[2] = {0, 0};
442 // Score all words truncated to 8 characters
444 // Skip any initial space
445 if (src[0] == ' ') {++src;}
447 // Begin the first word
448 const char* prior_word_start = src;
449 const char* word_start = src;
450 const char* word_end = word_start;
451 while (src < srclimit) {
452 // Terminate previous word or continue current word
454 int len = word_end - word_start;
456 uint64 wordhash40 = OctaHash40(word_start, len);
459 // Filter out recent repeats. Unlike quads, we update even if no hit,
460 // so we can get hits on same word if separated by non-hit words
461 if ((wordhash40 != prior_octahash[0]) &&
462 (wordhash40 != prior_octahash[1])) {
463 // Round-robin two entries of words
464 prior_octahash[next_prior_octahash] = wordhash40;
465 next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1
467 // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical
468 // function of the two word hashes. For words A B C, B-A and C-B are good
469 // enough and fast. We use the same table as distinct single words
470 // Do not look up a pair of identical words -- all pairs hash to zero
471 // Both 1- and 2-word distinct lookups are in distinctocta_obj now
472 // Do this first, because it has the lowest offset
// After the flip above, this slot holds the hash stored before wordhash40
473 uint64 tmp_prior_hash = prior_octahash[next_prior_octahash];
474 if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) {
475 uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40);
476 probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash);
478 int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
// A pair hit is recorded at the offset of the earlier word
479 hitbuffer->distinct[next_distinct].offset = prior_word_start - text;
480 hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
485 // (2) Lookup this distinct word and save <offset, indirect>
486 probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40);
488 int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
489 hitbuffer->distinct[next_distinct].offset = word_start - text;
490 hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
494 // (3) Lookup this word and save <offset, indirect>
495 probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40);
497 // Save indirect subscript for later scoring; 1 langprob
498 int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask;
499 hitbuffer->delta[next_delta].offset = word_start - text;
500 hitbuffer->delta[next_delta].indirect = indirect_subscr;
505 // Begin the next word
507 prior_word_start = word_start;
508 word_start = src + 1; // Over the space
509 word_end = word_start;
514 // Advance to next char
515 src += UTF8OneCharLen(src);
516 if (charcount <= 8) {
519 // Almost always srclimit hit first
520 if (next_delta >= next_delta_limit) {break;}
521 if (next_distinct >= next_distinct_limit) {break;}
// Publish the local write positions back to the hit buffer
524 hitbuffer->next_delta = next_delta;
525 hitbuffer->next_distinct = next_distinct;
527 // Make a dummy entry off the end to calc length of last span
528 int dummy_offset = src - text;
529 hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
530 hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
531 hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
532 hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
536 //----------------------------------------------------------------------------//
537 // Reliability calculations, for single language and between languages //
538 //----------------------------------------------------------------------------//
540 // Return reliablity of result 0..100 for top two scores
541 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
542 // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
543 // Threshold is uni/quadgram increment count, bounded above and below.
545 // Requiring a factor of 3 improvement (e.g. +1 log base 3)
546 // for each scored quadgram is too stringent, so I've backed this off to a
547 // factor of 2 (e.g. +5/8 log base 3).
549 // I also somewhat lowered the Min/MaxGramCount limits above
551 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
553 int ReliabilityDelta(int value1, int value2, int gramcount) {
554 int max_reliability_percent = 100;
556 max_reliability_percent = 12 * gramcount;
558 int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
559 if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
560 fully_reliable_thresh = kMinGramCount;
561 } else if (fully_reliable_thresh > kMaxGramCount) {
562 fully_reliable_thresh = kMaxGramCount;
565 int delta = value1 - value2;
566 if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
567 if (delta <= 0) {return 0;}
568 return minint(max_reliability_percent,
569 (100 * delta) / fully_reliable_thresh);
// Return reliability of result 0..100 for top score vs. expected mainstream
// score
// Values are score per 1024 bytes of input
//  ratio = max(top/mainstream, mainstream/top)
//  ratio > 4.0 is 0% reliable, <= 1.5 is 100% reliable
// Change: short-text word scoring can give unusually good results.
//  Let top exceed mainstream by 4x at 50% reliable
//
// dsites April 2010: These could be tightened up. It would be
// reasonable with newer data and round-robin table allocation to start ramping
// down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass.
//
// dsites March 2013: Tightened up a bit.
static const double kRatio100 = 1.5;   // At or below this ratio: 100%
static const double kRatio0 = 4.0;     // Above this ratio: 0%

// Parameters:
//   actual_score_1kb   -- observed score per 1024 bytes
//   expected_score_1kb -- expected score per 1024 bytes; zero means no
//                         expectation is available
// Returns 100 when there is no expectation, 0 when the actual score is zero,
// otherwise a percentage that falls linearly from 100 at kRatio100 to 0 at
// kRatio0.
int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) {
  if (expected_score_1kb == 0) {return 100;}    // No reliability data available yet
  if (actual_score_1kb == 0) {return 0;}        // zero score = unreliable

  // Always divide the larger score by the smaller one
  double ratio;
  if (expected_score_1kb > actual_score_1kb) {
    ratio = (1.0 * expected_score_1kb) / actual_score_1kb;
  } else {
    ratio = (1.0 * actual_score_1kb) / expected_score_1kb;
  }

  // Ratio 1.0 .. 1.5 scores 100%
  // Ratio 2.0 scores 80%
  // Linear decline, to ratio 4.0 scores 0%
  if (ratio <= kRatio100) {return 100;}
  if (ratio > kRatio0) {return 0;}

  return
      static_cast<int>(100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100));
}
607 // Create a langprob packed value from its parts.
608 // qprob is quantized [0..12]
609 // We use Latn script to represent any RTypeMany language
610 uint32 MakeLangProb(Language lang, int qprob) {
611 uint32 pslang = PerScriptNumber(ULScript_Latin, lang);
612 uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob];
616 } // End namespace CLD2