1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
18 // Just the stuff shared between offline table builder and online detector
21 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
22 #define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
24 #include "integral_types.h"
25 #include "cld2tablesummary.h"
29 // Runtime routines for hashing, looking up, and scoring
30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
31 // Unigrams and bigrams are for CJK languages only, including simplified/
32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
33 // Zhuang Han characters. Surrounding spaces are not considered.
34 // Quadgrams and octagrams for for non-CJK and include two bits indicating
35 // preceding and trailing spaces (word boundaries).
38 //----------------------------------------------------------------------------//
39 // Main quantized probability table //
40 //----------------------------------------------------------------------------//
42 // Table has 240 eight-byte entries. Each entry has a five-byte array and
43 // a three-byte array of log base 2 probabilities in the range 1..12.
44 // The intended use is to express five or three probabilities in a single-byte
45 // subscript, then decode via this table. These probabilities are
46 // intended to go with an array of five or three language numbers.
48 // The corresponding language numbers will have to be sorted by descending
49 // probability, then the actual probability subscript chosen to match the
50 // closest available entry in this table.
52 // Pattern of probability values:
53 // hi 3/4 1/2 1/4 lo hi mid lo
54 // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4
55 // and mid is one of 3/4 1/2 or 1/4.
56 // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
57 // lo running 1..hi. Only the first group is used for five-entry lookups.
58 // The mid value in the first group is 1/2, the second group 3/4, and the
59 // third group 1/4. For three-entry lookups, this allows the mid entry to be
60 // somewhat higher or lower than the midpoint, to allow a better match to the
61 // original probabilities.
62 static const int kLgProbV2TblSize = 240;
63 static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
64 1,1,1,1,1, 1,1,1, // [0]
65 2,2,2,1,1, 2,2,1, // [1]
67 3,3,2,2,1, 3,2,1, // [3]
70 4,3,3,2,1, 4,3,1, // [6]
74 5,4,3,2,1, 5,3,1, // [10]
79 6,5,4,2,1, 6,4,1, // [15]
85 7,6,4,3,1, 7,4,1, // [21]
92 8,6,5,3,1, 8,5,1, // [28]
100 9,7,5,3,1, 9,5,1, // [36]
109 10,8,6,3,1, 10,6,1, // [45]
117 10,10,10,9,9, 10,10,9,
118 10,10,10,10,10, 10,10,10,
119 11,9,6,4,1, 11,6,1, // [55]
126 11,10,10,9,8, 11,10,8,
127 11,11,10,10,9, 11,10,9,
128 11,11,11,10,10, 11,11,10,
129 11,11,11,11,11, 11,11,11,
130 12,9,7,4,1, 12,7,1, // [66]
136 12,11,10,8,7, 12,10,7,
137 12,11,10,9,8, 12,10,8,
138 12,11,11,10,9, 12,11,9,
139 12,12,11,11,10, 12,11,10,
140 12,12,12,11,11, 12,12,11,
141 12,12,12,12,12, 12,12,12,
195 10,10,9,9,8, 10,10,8,
196 10,10,10,9,9, 10,10,9,
197 10,10,10,10,10, 10,10,10,
202 11,10,8,7,5, 11,10,5,
203 11,10,9,7,6, 11,10,6,
204 11,10,9,8,7, 11,10,7,
205 11,10,10,9,8, 11,10,8,
206 11,11,10,10,9, 11,11,9,
207 11,11,11,10,10, 11,11,10,
208 11,11,11,11,11, 11,11,11,
210 12,10,7,5,2, 12,10,2,
211 12,10,8,5,3, 12,10,3,
212 12,10,8,6,4, 12,10,4,
213 12,10,9,7,5, 12,10,5,
214 12,11,9,8,6, 12,11,6,
215 12,11,10,8,7, 12,11,7,
216 12,11,10,9,8, 12,11,8,
217 12,11,11,10,9, 12,11,9,
218 12,12,11,11,10, 12,12,10,
219 12,12,12,11,11, 12,12,11,
220 12,12,12,12,12, 12,12,12,
275 10,10,10,9,9, 10,9,9,
276 10,10,10,10,10, 10,10,10,
284 11,10,10,9,8, 11,9,8,
285 11,11,10,10,9, 11,10,9,
286 11,11,11,10,10, 11,10,10,
287 11,11,11,11,11, 11,11,11,
294 12,11,10,8,7, 12,8,7,
295 12,11,10,9,8, 12,9,8,
296 12,11,11,10,9, 12,10,9,
297 12,12,11,11,10, 12,11,10,
298 12,12,12,11,11, 12,11,11,
299 12,12,12,12,12, 12,12,12,
301 // Added 2013.01.28 for CJK compatible mapping
310 // Backmap a single desired probability into an entry in kLgProbV2Tbl
311 static const uint8 kLgProbV2TblBackmap[13] = {
313 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
316 // Return address of 8-byte entry[i]
317 inline const uint8* LgProb2TblEntry(int i) {
318 return &kLgProbV2Tbl[i * 8];
321 // Return one of three probabilities in an entry
322 inline uint8 LgProb3(const uint8* entry, int j) {
327 // Routines to access a hash table of <key:wordhash, value:probs> pairs
328 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
329 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
331 // Probs is a packed: three languages plus a subscript for probability table
332 // Buckets have all the keys together, then all the values.Key array never
333 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
334 // Match case may sometimes take an additional cache miss on value access.
336 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
337 // byte buckets with single cache miss.
338 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
341 //----------------------------------------------------------------------------//
342 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
343 //----------------------------------------------------------------------------//
346 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
347 // OVERSHOOTS up to 3 bytes
348 // For runtime use of tables
349 // Does X86 unaligned loads if !defined(NEED_ALIGNED_LOADS)UNALIGNED_LOAD32(_p)
350 uint32 BiHashV2(const char* word_ptr, int bytecount);
352 // QUADGRAM wrapper with surrounding spaces
353 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
354 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
355 // For runtime use of tables
356 uint32 QuadHashV2(const char* word_ptr, int bytecount);
358 // QUADGRAM wrapper with surrounding underscores (offline use)
359 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
360 // OVERSHOOTS up to 3 bytes
361 // For offline construction of tables
362 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount);
364 // OCTAGRAM wrapper with surrounding spaces
365 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
366 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
367 uint64 OctaHash40(const char* word_ptr, int bytecount);
370 // OCTAGRAM wrapper with surrounding underscores (offline use)
371 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
372 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
373 uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
375 // Hash a consecutive pair of tokens/words A B
376 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash);
379 // From 32-bit gram FP, return hash table subscript and remaining key
380 inline void QuadFPJustHash(uint32 quadhash,
383 uint32* subscr, uint32* hashkey) {
384 *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);
385 *hashkey = quadhash & keymask;
388 // From 40-bit gram FP, return hash table subscript and remaining key
389 inline void OctaFPJustHash(uint64 longwordhash,
392 uint32* subscr, uint32* hashkey) {
393 uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1);
395 temp = static_cast<uint32>(longwordhash >> 4);
396 *hashkey = temp & keymask;
400 // Look up 32-bit gram FP in caller-passed table
401 // Typical size 256K entries (1.5MB)
403 inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj,
405 uint32 subscr, hashkey;
406 const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
407 uint32 keymask = gram_obj->kCLDTableKeyMask;
408 int bucketcount = gram_obj->kCLDTableSize;
409 QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
410 const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
411 // Four-way associative, 4 compares
412 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
413 return bucket_ptr->keyvalue[0];
415 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
416 return bucket_ptr->keyvalue[1];
418 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
419 return bucket_ptr->keyvalue[2];
421 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
422 return bucket_ptr->keyvalue[3];
427 // Look up 40-bit gram FP in caller-passed table
428 // Typical size 256K-4M entries (1-16MB)
429 // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
430 // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
431 inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj,
432 uint64 longwordhash) {
433 uint32 subscr, hashkey;
434 const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
435 uint32 keymask = gram_obj->kCLDTableKeyMask;
436 int bucketcount = gram_obj->kCLDTableSize;
437 OctaFPJustHash(longwordhash, keymask, bucketcount,
439 const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
440 // Four-way associative, 4 compares
441 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
442 return bucket_ptr->keyvalue[0];
444 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
445 return bucket_ptr->keyvalue[1];
447 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
448 return bucket_ptr->keyvalue[2];
450 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
451 return bucket_ptr->keyvalue[3];
457 //----------------------------------------------------------------------------//
458 // Finding groups of 1/2/4/8 letters //
459 //----------------------------------------------------------------------------//
461 // Does not advance past space or tab/cr/lf/nul
462 static const uint8 kAdvanceOneCharButSpace[256] = {
463 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
464 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
465 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
466 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
468 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
469 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
470 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
471 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
475 // Advances *only* on space or ASCII vowel (or illegal byte)
476 static const uint8 kAdvanceOneCharSpaceVowel[256] = {
477 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
478 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
479 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
480 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
482 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
483 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
484 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
485 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
489 // src points to a letter. Find the byte length of a unigram starting there.
490 int UniLen(const char* src);
492 // src points to a letter. Find the byte length of a bigram starting there.
493 int BiLen(const char* src);
495 // src points to a letter. Find the byte length of a quadgram starting there.
496 int QuadLen(const char* src);
498 // src points to a letter. Find the byte length of an octagram starting there.
499 int OctaLen(const char* src);
501 } // End namespace CLD2
503 #endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__