1 // Copyright 2013 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 // Author: dsites@google.com (Dick Sites)
20 #include "lang_script.h" // For LanguageCode in Dump
23 #include <string.h> // For memset
27 // Take a set of <key, value> pairs and tote them up.
28 // After explicitly sorting, retrieve top key, value pairs
29 // Normal use is key=per-script language and value = probability score
34 // No need to initialize values
44 // No need to initialize values
46 // Increment count of quadgrams/trigrams/unigrams scored
47 void Tote::AddScoreCount() {
52 void Tote::Add(uint8 ikey, int idelta) {
53 int key_group = ikey >> 2;
54 uint64 groupmask = (1ULL << key_group);
55 if ((in_use_mask_ & groupmask) == 0) {
56 // Initialize this group
57 gscore_[key_group] = 0;
58 in_use_mask_ |= groupmask;
60 score_[ikey] += idelta;
64 // Return current top three keys
65 void Tote::CurrentTopThreeKeys(int* key3) const {
69 int score3[3] = {-1, -1, -1};
70 uint64 tempmask = in_use_mask_;
72 while (tempmask != 0) {
74 // Look at four in-use keys
75 for (int i = 0; i < 4; ++i) {
76 int insert_me = score_[base + i];
77 // Favor lower numbers on ties
78 if (insert_me > score3[2]) {
81 if (insert_me > score3[1]) {
82 score3[2] = score3[1];
85 if (insert_me > score3[0]) {
86 score3[1] = score3[0];
91 score3[insert_at] = insert_me;
92 key3[insert_at] = base + i;
102 // Take a set of <key, value> pairs and tote them up.
103 // After explicitly sorting, retrieve top key, value pairs
104 // 0xFFFF in key signifies unused
106 // No need to initialize score_ or value_
109 memset(closepair_, 0, sizeof(closepair_));
110 memset(key_, 0xFF, sizeof(key_));
113 DocTote::~DocTote() {
116 void DocTote::Reinit() {
117 // No need to initialize score_ or value_
120 memset(closepair_, 0, sizeof(closepair_));
121 memset(key_, 0xFF, sizeof(key_));
122 runningscore_.Reinit();
125 // Weight reliability by ibytes
126 // Also see three-way associative comments above for Tote
127 void DocTote::Add(uint16 ikey, int ibytes,
128 int score, int ireliability) {
131 // Look for existing entry in top 2 positions of 3, times 8 columns
132 int sub0 = ikey & 15;
133 if (key_[sub0] == ikey) {
134 value_[sub0] += ibytes;
135 score_[sub0] += score;
136 reliability_[sub0] += ireliability * ibytes;
139 // Look for existing entry in other of top 2 positions of 3, times 8 columns
141 if (key_[sub1] == ikey) {
142 value_[sub1] += ibytes;
143 score_[sub1] += score;
144 reliability_[sub1] += ireliability * ibytes;
147 // Look for existing entry in third position of 3, times 8 columns
148 int sub2 = (ikey & 7) + 16;
149 if (key_[sub2] == ikey) {
150 value_[sub2] += ibytes;
151 score_[sub2] += score;
152 reliability_[sub2] += ireliability * ibytes;
156 // Allocate new entry
158 if (key_[sub0] == kUnusedKey) {
160 } else if (key_[sub1] == kUnusedKey) {
162 } else if (key_[sub2] == kUnusedKey) {
165 // All choices allocated, need to replace smallest one
167 if (value_[sub1] < value_[alloc]) {alloc = sub1;}
168 if (value_[sub2] < value_[alloc]) {alloc = sub2;}
171 value_[alloc] = ibytes;
172 score_[alloc] = score;
173 reliability_[alloc] = ireliability * ibytes;
177 // Find subscript of a given packed language, or -1
178 int DocTote::Find(uint16 ikey) {
180 // Linear search if sorted
181 for (int sub = 0; sub < kMaxSize_; ++sub) {
182 if (key_[sub] == ikey) {return sub;}
187 // Look for existing entry
188 int sub0 = ikey & 15;
189 if (key_[sub0] == ikey) {
193 if (key_[sub1] == ikey) {
196 int sub2 = (ikey & 7) + 16;
197 if (key_[sub2] == ikey) {
204 // Return current top key
205 int DocTote::CurrentTopKey() {
208 for (int sub = 0; sub < kMaxSize_; ++sub) {
209 if (key_[sub] == kUnusedKey) {continue;}
210 if (top_value < value_[sub]) {
211 top_value = value_[sub];
219 // Sort first n entries by decreasing order of value
220 // If key==0 other fields are not valid, treat value as -1
221 void DocTote::Sort(int n) {
222 // This is n**2, but n is small
223 for (int sub = 0; sub < n; ++sub) {
224 if (key_[sub] == kUnusedKey) {value_[sub] = -1;}
226 // Bubble sort key[sub] and entry[sub]
227 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
228 if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
229 if (value_[sub] < value_[sub2]) {
231 uint16 tmpk = key_[sub];
232 key_[sub] = key_[sub2];
235 int tmpv = value_[sub];
236 value_[sub] = value_[sub2];
239 int tmps = score_[sub];
240 score_[sub] = score_[sub2];
243 int tmpr = reliability_[sub];
244 reliability_[sub] = reliability_[sub2];
245 reliability_[sub2] = tmpr;
252 void DocTote::Dump(FILE* f) {
253 fprintf(f, "DocTote::Dump\n");
254 for (int sub = 0; sub < kMaxSize_; ++sub) {
255 if (key_[sub] != kUnusedKey) {
256 Language lang = static_cast<Language>(key_[sub]);
257 fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
258 value_[sub], score_[sub], reliability_[sub]);
261 fprintf(f, " %d chunks scored<br>\n", incr_count_);
264 } // End namespace CLD2