1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "encodings/compact_lang_det/tote.h"
6 #include <string.h> // memset
8 #include "encodings/compact_lang_det/win/cld_logging.h"
11 // Take a set of <key, value> pairs and tote them up.
12 // After explicitly sorting, retrieve top key, value pairs
17 memset(key_, 0, sizeof(key_));
18 // No need to initialize values
28 memset(key_, 0, sizeof(key_));
29 // No need to initialize values
32 // Increment count of quadgrams/trigrams/unigrams scored
33 void Tote::AddGram() {
37 // Three-way associative, guaranteeing that the largest two counts are always
38 // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39 // subscript calculations here, which are for 8 sets of 3-way associative
40 // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41 // slightly-weird way: The initial probe point is [N] or [N+8], whichever
42 // is specified by key mod 16. In most cases (nearly *all* cases except Latin
43 // script), this entry matches and we update/return. The second probe is
44 // the other of [N] and [N+8]. The third probe is only used as a fallback to
45 // these two, and is there only for the rare case that there are three or more
46 // languages with Language enum values equal mod 8, contending within the same
47 // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48 // the other scripts have fewer than 17 languages total.
49 // If you change kMaxSize, change the constants 7/8/15/16 below
50 void Tote::Add(uint8 ikey, int idelta) {
54 // Look for existing entry
56 if (key_[sub0] == ikey) {
57 value_[sub0] += idelta;
61 if (key_[sub1] == ikey) {
62 value_[sub1] += idelta;
65 int sub2 = (ikey & 7) + 16;
66 if (key_[sub2] == ikey) {
67 value_[sub2] += idelta;
73 if (key_[sub0] == 0) {
75 } else if (key_[sub1] == 0) {
77 } else if (key_[sub2] == 0) {
80 // All choices allocated, need to replace smallest one
82 if (value_[sub1] < value_[alloc]) {alloc = sub1;}
83 if (value_[sub2] < value_[alloc]) {alloc = sub2;}
86 value_[alloc] = idelta;
90 // Return current top key
91 int Tote::CurrentTopKey() {
94 for (int sub = 0; sub < kMaxSize_; ++sub) {
95 if (key_[sub] == 0) {continue;}
96 if (top_value < value_[sub]) {
97 top_value = value_[sub];
105 // Sort first n entries by decreasing order of value
106 // If key==0 other fields are not valid, treat value as -1
107 void Tote::Sort(int n) {
108 // This is n**2, but n is small
109 for (int sub = 0; sub < n; ++sub) {
110 if (key_[sub] == 0) {value_[sub] = -1;}
112 // Bubble sort key[sub] and entry[sub]
113 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
114 if (key_[sub2] == 0) {value_[sub2] = -1;}
115 if (value_[sub] < value_[sub2]) {
117 uint8 tmpk = key_[sub];
118 key_[sub] = key_[sub2];
120 int tmpv = value_[sub];
121 value_[sub] = value_[sub2];
128 void Tote::Dump(FILE* f) {
129 for (int sub = 0; sub < kMaxSize_; ++sub) {
131 fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
134 fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
140 // Take a set of <key, value> pairs and tote them up.
141 // After explicitly sorting, retrieve top key, value pairs
142 ToteWithReliability::ToteWithReliability() {
143 // No need to initialize score_ or value_
146 memset(closepair_, 0, sizeof(closepair_));
147 memset(key_, 0, sizeof(key_));
150 ToteWithReliability::~ToteWithReliability() {
153 void ToteWithReliability::Reinit() {
154 // No need to initialize score_ or value_
157 memset(closepair_, 0, sizeof(closepair_));
158 memset(key_, 0, sizeof(key_));
162 // Weight reliability by ibytes
163 // Also see three-way associative comments above for Tote
164 void ToteWithReliability::Add(uint8 ikey, int ibytes,
165 int score, int ireliability) {
170 // Look for existing entry
171 int sub0 = ikey & 15;
172 if (key_[sub0] == ikey) {
173 value_[sub0] += ibytes;
174 score_[sub0] += score;
175 reliability_[sub0] += ireliability * ibytes;
179 if (key_[sub1] == ikey) {
180 value_[sub1] += ibytes;
181 score_[sub1] += score;
182 reliability_[sub1] += ireliability * ibytes;
185 int sub2 = (ikey & 7) + 16;
186 if (key_[sub2] == ikey) {
187 value_[sub2] += ibytes;
188 score_[sub2] += score;
189 reliability_[sub2] += ireliability * ibytes;
193 // Allocate new entry
195 if (key_[sub0] == 0) {
197 } else if (key_[sub1] == 0) {
199 } else if (key_[sub2] == 0) {
202 // All choices allocated, need to replace smallest one
204 if (value_[sub1] < value_[alloc]) {alloc = sub1;}
205 if (value_[sub2] < value_[alloc]) {alloc = sub2;}
208 value_[alloc] = ibytes;
209 score_[alloc] = score;
210 reliability_[alloc] = ireliability * ibytes;
214 // Find subscript of a given packed language, or -1
215 int ToteWithReliability::Find(uint8 ikey) {
219 // Linear search if sorted
220 for (int sub = 0; sub < kMaxSize_; ++sub) {
221 if (key_[sub] == ikey) {return sub;}
226 // Look for existing entry
227 int sub0 = ikey & 15;
228 if (key_[sub0] == ikey) {
232 if (key_[sub1] == ikey) {
235 int sub2 = (ikey & 7) + 16;
236 if (key_[sub2] == ikey) {
243 // Return current top key
244 int ToteWithReliability::CurrentTopKey() {
247 for (int sub = 0; sub < kMaxSize_; ++sub) {
248 if (key_[sub] == 0) {continue;}
249 if (top_value < value_[sub]) {
250 top_value = value_[sub];
258 // Sort first n entries by decreasing order of value
259 // If key==0 other fields are not valid, treat value as -1
260 void ToteWithReliability::Sort(int n) {
261 // This is n**2, but n is small
262 for (int sub = 0; sub < n; ++sub) {
263 if (key_[sub] == 0) {value_[sub] = -1;}
265 // Bubble sort key[sub] and entry[sub]
266 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
267 if (key_[sub2] == 0) {value_[sub2] = -1;}
268 if (value_[sub] < value_[sub2]) {
270 uint8 tmpk = key_[sub];
271 key_[sub] = key_[sub2];
274 int tmpv = value_[sub];
275 value_[sub] = value_[sub2];
278 int tmps = score_[sub];
279 score_[sub] = score_[sub2];
282 int tmpr = reliability_[sub];
283 reliability_[sub] = reliability_[sub2];
284 reliability_[sub2] = tmpr;
291 void ToteWithReliability::Dump(FILE* f) {
292 for (int sub = 0; sub < kMaxSize_; ++sub) {
294 fprintf(f, "[%2d] %3d %6d %5d %4d\n",
295 sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
298 fprintf(f, " %d#\n", incr_count_);