2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
54 #include "../sim_slm.h"
57 #include "ValueCompress.h"
// NOTE(review): this chunk is a lossy extract -- the leading numbers on each
// line are original-file line numbers and several lines (return types, access
// specifiers, the closing brace) are missing. Comments describe only what the
// visible tokens show.
//
// Extends CSIMSlm with level-by-level iteration over the back-off model trie,
// plus state lookup helpers used when threading the model.
59 class CSIMSlmWithIteration : public CSIMSlm {
// Iterator over one level of the trie. m_history holds one node index per
// level from the root down; m_history.back() is the current node's index
// within the iterated level (see beginLevelIteration/next/adjustIterator).
61 struct TLevelIterator {
62 std::vector<int> m_history;
// Fills `history` with the word ids along the iterator's path (return type on
// a missing line).
79 getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history);
// Positions `it` at the first node of level `lvl`.
82 beginLevelIteration(int lvl, TLevelIterator& it);
// Advances `it` to the next node on its level.
85 next(TLevelIterator& it);
// True when `it` has passed the last node of its level -- presumably; the
// return type line is missing. See definition below.
88 isEnd(TLevelIterator& it);
// Returns a pointer to the node/leaf the iterator currently designates.
91 getNodePtr(TLevelIterator& it);
// Looks up the state reached by the n-word history `hw`; see definition.
94 findState(int n, TSIMWordId*hw);
// Computes the back-off state (level `bol`, node index `bon`) for history
// `hw` of length `n`.
97 findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon);
// Re-synchronizes the parent indices in m_history after next() moved the
// bottom index; see definition below.
102 adjustIterator(TLevelIterator& it);
// Walks the trie from the root, matching the n-word history `hw` one level at
// a time. Returns the index of the reached state within its level, or -1 when
// the path does not exist (the declaration/return lines are missing from this
// extract; `m` is presumably declared on a missing line and returned at the
// end -- TODO confirm against the full source).
106 CSIMSlmWithIteration::findState(int n, TSIMWordId*hw)
// Empty history: the root state is index 0 by convention here.
108 if (n == 0) return 0;
115 void* pstate = ((TNode*)level[0]);
116 for (int lvl = 0; lvl < n && pstate != NULL; ++lvl) {
// [h, t) is the child range of the current node: its own child index up to
// the next sibling's child index (children are stored contiguously per level).
117 int h = ((TNode*)pstate)->child;
118 int t = (((TNode*)pstate) + 1)->child;
// Branch condition line is missing; visibly the deepest level is searched as
// TLeaf entries and inner levels as TNode entries, both via binary_find_id
// over the [p+h, p+t) range for word hw[lvl].
120 TLeaf* p = (TLeaf*)level[lvl + 1];
121 pstate = (void*)binary_find_id(p + h, p + t, hw[lvl]);
122 m = (pstate != NULL) ? (((TLeaf*)pstate) - p) : (-1);
124 TNode* p = (TNode*)level[lvl + 1];
125 pstate = (void*)binary_find_id(p + h, p + t, hw[lvl]);
126 m = (pstate != NULL) ? (((TNode*)pstate) - p) : (-1);
// Finds the back-off state for the n-word history `hw`: the longest suffix of
// the history whose state exists AND has children. Outputs the level (`bol`)
// and node index (`bon`) of that state. Lines that shorten the history and
// retry (presumably recursing with n-1 / hw+1) are missing from this extract.
133 CSIMSlmWithIteration::findBackOffState(int n,
140 int idx = findState(n, hw);
// The full history's state qualifies if it exists (idx >= 0) and its child
// range is non-empty (child index of node < child index of next sibling).
141 if (idx >= 0 && ((TNode*)(level[n]))[idx].child <
142 ((TNode*)(level[n]))[idx + 1].child) {
143 bol = n; bon = idx; return;
// Collects the word ids along the iterator's path into `history`, walking
// m_history from level 1 downward (level 0 is the root and contributes no
// word). The branch condition distinguishing leaf vs inner levels is on a
// missing line; presumably the deepest level uses TLeaf and others TNode.
// (`history` is presumably cleared on a missing line -- TODO confirm.)
151 CSIMSlmWithIteration::getIdString(TLevelIterator& it,
152 std::vector<TSIMWordId>& history)
155 for (int i = 1, tmp_sz = it.m_history.size(); i < tmp_sz; ++i) {
156 int idx = it.m_history[i];
158 history.push_back(((TLeaf*)(level[i]))[idx].id);
160 history.push_back(((TNode*)(level[i]))[idx].id);
// Resets `it` to the first node of level `lvl`: one zero index per level from
// the root (i == 0) down to `lvl` inclusive, so m_history.size() == lvl + 1.
165 CSIMSlmWithIteration::beginLevelIteration(int lvl, TLevelIterator& it)
167 it.m_history.clear();
168 for (int i = 0, tmp_sz = lvl; i <= tmp_sz; ++i)
169 it.m_history.push_back(0);
// Advances the iterator by bumping the bottom-level index; the parent indices
// in m_history are fixed up elsewhere (presumably via adjustIterator, called
// on a line missing from this extract -- TODO confirm).
174 CSIMSlmWithIteration::next(TLevelIterator& it)
176 ++(it.m_history.back());
// End-of-level test: the current index (plus one, skipping the per-level
// sentinel entry) has reached the size of the iterated level. `sz` is
// presumably a per-level size array declared outside this extract.
181 CSIMSlmWithIteration::isEnd(TLevelIterator& it)
183 return((it.m_history.back() + 1 >= sz[it.m_history.size() - 1]));
// After next() bumped the bottom index `ch`, walk the ancestor levels from
// the bottom up and advance each parent index until its child range covers
// `ch` (pn[parent + 1].child is the first child of the NEXT sibling, so the
// loop stops once ch falls below it). The loop body updating `ch` for the
// next-higher level, and the declaration of `len`, are on missing lines.
187 CSIMSlmWithIteration::adjustIterator(TLevelIterator& it)
189 int ch = it.m_history.back();
190 for (int i = it.m_history.size() - 2; i >= 0; --i) {
192 int& parent = it.m_history[i];
193 TNode* pn = (TNode*)(level[i]);
194 while (parent < len && pn[parent + 1].child <= ch)
// Returns a pointer to the node the iterator designates: level index is the
// depth of m_history minus one, node index is the bottom entry. The branch
// condition selecting TLeaf (deepest level) vs TNode is on a missing line.
201 CSIMSlmWithIteration::getNodePtr(TLevelIterator& it)
203 int lvl = it.m_history.size() - 1;
204 int idx = it.m_history.back();
206 return(((TLeaf*)(level[lvl])) + idx);
208 return(((TNode*)(level[lvl])) + idx);
// Usage/help text for the tool. The enclosing function's signature line is
// missing from this extract (these printfs start at original line 217);
// presumably a ShowUsage-style helper -- TODO confirm.
217 printf(" slmthread primitive_slm threaded_slm\n");
218 printf("\nDescription:\n");
220 " slmthread add back-off-state for each slm node in the primitive_slm. ");
221 printf("Also it compresses 32-bit float into 16 bit representation.\n\n");
// Output buffers for the threaded model: one TNode array per inner level
// (up to 16 levels) and one TLeaf array for the deepest level. Allocated in
// main() and written to the output file, then freed at the end of main().
226 CThreadSlm::TNode* levels[16];
227 CThreadSlm::TLeaf* lastLevel;
// Entry point: converts a primitive SLM (argv[1]) into a threaded SLM
// (argv[2]). Pipeline: load model; collect all distinct pr/bow values;
// quantize them into lookup tables of at most 2^BITS_PR / 2^BITS_BOW entries
// (via `vc`, presumably a ValueCompress instance declared on a missing line);
// rebuild every node with table indices and precomputed back-off states;
// write everything out.
// NOTE(review): this is a lossy extract -- many lines (argc check, return
// statements, closing braces, `vc`/`N`/`dummy` declarations, set_* calls)
// are missing; comments describe only the visible code.
230 main(int argc, char* argv[])
233 unsigned int bol, bon;
234 CSIMSlmWithIteration slm;
235 std::vector<TSIMWordId> history;
236 float real_pr, eff_pr, real_bow, eff_bow;
// Quantization working state: effective-value -> representative value,
// effective-value -> frequency, then the final value -> table-index map and
// the table itself.
238 std::map<float, float> pr_eff, bow_eff; // effval --> val
239 std::map<float, int> pr_values, bow_values; // effval --> freq
240 std::map<float, int> pr_map, bow_map; // result: val --> int
241 std::vector<float> pr_table, bow_table; // result: val vector
242 std::vector<float>::iterator itt, itte;
247 printf("Loading original slm..."); fflush(stdout);
248 if (slm.Load(argv[1]) == false)
251 bool usingLogPr = slm.isUseLogPr();
// Conversions between the model's stored representation (natural-log pr or
// plain probability, depending on usingLogPr) and the canonical "effective"
// space used for grouping: pr maps to -log2(p) (or ln-pr rescaled to base 2),
// bow maps to exp(-a) when log-pr is in use. OriginalPr/OriginalBow invert.
253 #define EffectivePr(a) (float((usingLogPr) ? ((a) / log(2.0)) : (-log2((a)))))
254 #define OriginalPr(b) (float((usingLogPr) ? ((b) * log(2.0)) : (exp2(-(b)))))
255 #define EffectiveBow(a) (float((usingLogPr) ? (exp(-(a))) : ((a))))
256 #define OriginalBow(b) (float((usingLogPr) ? (-log((b))) : ((b))))
// First pass: visit every node on every level and record each distinct
// effective pr (and, for non-leaf levels, bow) with its frequency.
258 printf("\nfirst pass..."); fflush(stdout);
259 for (int lvl = 0; lvl <= slm.getN(); ++lvl) {
260 CSIMSlmWithIteration::TLevelIterator it;
261 slm.beginLevelIteration(lvl, it);
262 for (; !slm.isEnd(it); slm.next(it)) {
263 CSIMSlm::TLeaf* pl = slm.getNodePtr(it);
// (real_pr is presumably read from pl->pr on a missing line.)
265 eff_pr = EffectivePr(real_pr);
266 if (pr_eff.find(eff_pr) == pr_eff.end()) {
267 pr_eff[eff_pr] = real_pr;
// When two distinct stored values collapse to one effective value, keep the
// canonical round-trip of the effective value as the representative.
268 } else { // precision error cause non 1:1 mapping
269 pr_eff[eff_pr] = OriginalPr(eff_pr);
271 ++(pr_values[eff_pr]);
// Only inner levels carry a back-off weight.
272 if (lvl < slm.getN()) {
273 real_bow = ((CSIMSlm::TNode*)pl)->bow;
274 eff_bow = EffectiveBow(real_bow);
275 if (bow_eff.find(eff_bow) == bow_eff.end()) {
276 bow_eff[eff_bow] = real_bow;
277 } else { // two values map to same distance value due to precision error
278 bow_eff[eff_bow] = OriginalBow(eff_bow);
280 ++(bow_values[eff_bow]);
// Seed the pr table with power-of-two milestone probabilities (frequency 0)
// so these exact values survive quantization un-grouped.
285 // Following pr value should not be grouped, or as milestone values.
286 static float msprs[] = {
288 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32, 1.0 / 64, 1.0 / 128,
289 1.0 / 256, 1.0 / 512, 1.0 / 1024, 1.0 / 2048, 1.0 / 4096, 1.0 / 8192,
290 1.0 / 16384, 1.0 / 32768, 1.0 / 65536
293 for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) {
294 float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]);
295 float eff_pr = EffectivePr(real_pr);
296 if (pr_eff.find(eff_pr) == pr_eff.end()) {
297 pr_eff[eff_pr] = real_pr;
298 } else { // precision error cause non 1:1 mapping
299 pr_eff[eff_pr] = OriginalPr(eff_pr);
// Frequency 0 marks a milestone: it must be kept as its own table entry.
301 pr_values[eff_pr] = 0;
// Same seeding for back-off weights.
304 // Following bow value should not be grouped, or as milestone values.
305 static float msbows[] = {
306 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
307 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
308 0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001
311 for (unsigned i = 0, sz = sizeof(msbows) / sizeof(float); i < sz; ++i) {
312 float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]);
313 float eff_bow = EffectiveBow(real_bow);
314 if (bow_eff.find(eff_bow) == bow_eff.end()) {
315 bow_eff[eff_bow] = real_bow;
316 } else { // two values map to same distance value due to precision error
317 bow_eff[eff_bow] = OriginalBow(eff_bow);
319 bow_values[eff_bow] = 0;
// Quantize: `vc` reduces the pr value set to at most 2^BITS_PR entries,
// producing pr_map (value -> index) and pr_table (index -> effective value).
322 printf("\nCompressing pr values..."); fflush(stdout);
323 vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR));
// Convert the table entries back from effective space to stored space and
// sanity-check the expected range for each representation.
325 itte = pr_table.end();
326 for (itt = pr_table.begin(); itt != itte; ++itt) {
327 *itt = OriginalPr(*itt);
328 assert(usingLogPr || (*itt > 0.0 && *itt < 1.0));
329 assert(!usingLogPr || *itt > 0.0);
331 printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size());
// Same quantization for back-off weights, limit 2^BITS_BOW.
333 printf("\nCompressing bow values..."); fflush(stdout);
334 vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW));
336 itte = bow_table.end();
337 for (itt = bow_table.begin(); itt != itte; ++itt)
338 *itt = OriginalBow(*itt);
339 printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());
// Second pass: build the threaded inner levels. Each node gets its back-off
// state (bol/bon) and its pr/bow replaced by table indices.
342 printf("\nThreading the new model..."); fflush(stdout);
343 for (int lvl = 0; lvl < slm.getN(); ++lvl) {
344 levels[lvl] = new CThreadSlm::TNode[slm.getLevelSize(lvl)];
346 CSIMSlmWithIteration::TLevelIterator it;
347 slm.beginLevelIteration(lvl, it);
348 for (; !slm.isEnd(it); slm.next(it)) {
349 slm.getIdString(it, history);
// Root node has an empty history; pass NULL instead of &history[0].
350 if (history.size() == 0) {
351 slm.findBackOffState(lvl, NULL, bol, bon);
353 slm.findBackOffState(lvl, &history[0], bol, bon);
356 CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);
357 CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];
// Map the node's pr to its table index; on a float-precision miss, re-derive
// the canonical round-tripped value and retry.
359 std::map<float, int>::iterator prit = pr_map.find(pn->pr);
360 if (prit == pr_map.end()) { // This would be cause by precision error
361 double val = EffectivePr(pn->pr);
362 val = OriginalPr(val);
363 prit = pr_map.find(val);
364 assert(prit != pr_map.end());
366 int idx_pr = prit->second;
// Same precision-tolerant lookup for the back-off weight.
373 std::map<float, int>::iterator bowit = bow_map.find(pn->bow);
374 if (bowit == bow_map.end()) { // precision error
375 double val = EffectiveBow(pn->bow);
376 val = OriginalBow(val);
377 bowit = bow_map.find(val);
378 assert(bowit != bow_map.end());
380 int idx_bow = bowit->second;
// (nn.set_* calls storing bol/bon/idx_pr/idx_bow are on missing lines.)
383 nn.set_ch(pn->child);
386 (pr_table[idx_pr] > 0.0 && pr_table[idx_pr] < 1.0));
387 assert(!usingLogPr || pr_table[idx_pr] > 0.0);
// Else-branch (presumably the pseudo/sentinel tail node of a level): copy
// only the child link -- TODO confirm against the full source.
389 CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);
390 CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];
391 nn.set_ch(pn->child);
// Deepest level: leaves carry pr (no bow, no children).
396 lastLevel = new CThreadSlm::TLeaf [slm.getLevelSize(slm.getN())];
397 CSIMSlmWithIteration::TLevelIterator it;
398 slm.beginLevelIteration(slm.getN(), it);
399 for (int lvl = slm.getN(); !slm.isEnd(it); slm.next(it)) {
400 CSIMSlm::TLeaf* pn = slm.getNodePtr(it);
401 slm.getIdString(it, history);
402 slm.findBackOffState(lvl, &history[0], bol, bon);
404 CThreadSlm::TLeaf& nn = lastLevel[it.m_history.back()];
406 std::map<float, int>::iterator prit = pr_map.find(pn->pr);
407 if (prit == pr_map.end()) { // This would be cause by precision error
408 double val = EffectivePr(pn->pr);
409 val = OriginalPr(val);
410 prit = pr_map.find(val);
411 assert(prit != pr_map.end());
413 int idx_pr = prit->second;
// Serialize: N, usingLogPr flag, per-level sizes, the two value tables
// (padded with `dummy` up to their fixed 2^BITS_* capacity), then the raw
// node and leaf arrays. (`fp`/`N`/`dummy` declarations and fclose/fopen error
// handling are on missing lines.)
422 printf("\nWriting out..."); fflush(stdout);
425 fp = fopen(argv[2], "wb");
427 fwrite(&N, sizeof(int), 1, fp);
428 unsigned ulp = slm.isUseLogPr();
429 fwrite(&ulp, sizeof(unsigned), 1, fp);
431 for (int lvl = 0; lvl <= N; ++lvl) {
432 int len = slm.getLevelSize(lvl);
433 fwrite(&len, sizeof(int), 1, fp);
435 fwrite(&pr_table[0], sizeof(float), pr_table.size(), fp);
436 for (int i = pr_table.size(), sz = (1 << CThreadSlm::BITS_PR); i < sz; ++i)
437 fwrite(&dummy, sizeof(float), 1, fp);
439 fwrite(&bow_table[0], sizeof(float), bow_table.size(), fp);
440 for (int i = bow_table.size(), sz = (1 << CThreadSlm::BITS_BOW);
443 fwrite(&dummy, sizeof(float), 1, fp);
445 for (int lvl = 0; lvl < N; ++lvl)
446 fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(
448 fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N), fp);
451 printf("done!\n"); fflush(stdout);
// Release the per-level buffers allocated above (lastLevel's delete[] is
// presumably on a missing line -- TODO confirm).
454 for (int lvl = 0; lvl < N; ++lvl)
455 delete [] levels[lvl];