src/slm/thread/slmthread.cpp

   1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
   3  *
   4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
   5  *
   6  * The contents of this file are subject to the terms of either the GNU Lesser
   7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
   8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
   9  * file except in compliance with the License. You can obtain a copy of the CDDL at
  10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
  11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
  12  * specific language governing permissions and limitations under the License. When
  13  * distributing the software, include this License Header Notice in each file and
  14  * include the full text of the License in the License file as well as the
  15  * following notice:
  16  *
  17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
  18  * (CDDL)
  19  * For Covered Software in this distribution, this License shall be governed by the
  20  * laws of the State of California (excluding conflict-of-law provisions).
  21  * Any litigation relating to this License shall be subject to the jurisdiction of
  22  * the Federal Courts of the Northern District of California and the state courts
  23  * of the State of California, with venue lying in Santa Clara County, California.
  24  *
  25  * Contributor(s):
  26  *
  27  * If you wish your version of this file to be governed by only the CDDL or only
  28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
  29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
  30  * license." If you don't indicate a single choice of license, a recipient has the
  31  * option to distribute your version of this file under either the CDDL or the LGPL
  32  * Version 2.1, or to extend the choice of license to its licensees as provided
  33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
  34  * Version 2 license, then the option applies only if the new code is made subject
  35  * to such option by the copyright holder.
  36  */
  37
  38 #ifdef HAVE_CONFIG_H
  39 #include "config.h"
  40 #endif
  41
  42 #ifdef HAVE_ASSERT_H
  43 #include <assert.h>
  44 #endif
  45
  46 #include <stdio.h>
  47 #include <unistd.h>
  48 #include <stdlib.h>
  49
  50 #include <vector>
  51 #include <map>
  52 #include <math.h>
  53
  54 #include "../sim_slm.h"
  55 #include "../slm.h"
  56
  57 #include "ValueCompress.h"
  58
  59 class CSIMSlmWithIteration : public CSIMSlm {
  60 public:
  61     struct TLevelIterator {
  62         std::vector<int>    m_history;
  63     };
  64
  65 public:
  66     int
  67     getLevelSize(int lvl)
  68     {
  69         return sz[lvl];
  70     }
  71
  72     int
  73     getN()
  74     {
  75         return N;
  76     }
  77
  78     void
  79     getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history);
  80
  81     void
  82     beginLevelIteration(int lvl, TLevelIterator& it);
  83
  84     void
  85     next(TLevelIterator& it);
  86
  87     bool
  88     isEnd(TLevelIterator& it);
  89
  90     TLeaf*
  91     getNodePtr(TLevelIterator& it);
  92
  93     int
  94     findState(int n, TSIMWordId*hw);
  95
  96     void
  97     findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon);
  98
  99
 100 protected:
 101     void
 102     adjustIterator(TLevelIterator& it);
 103 };
 104
 105 int
 106 CSIMSlmWithIteration::findState(int n, TSIMWordId*hw)
 107 {
 108     if (n == 0) return 0;
 109
 110     int m = -1;
 111     while (n > N) {
 112         --n; ++hw;
 113     }
 114
 115     void* pstate = ((TNode*)level[0]);
 116     for (int lvl = 0; lvl < n && pstate != NULL; ++lvl) {
 117         int h = ((TNode*)pstate)->child;
 118         int t = (((TNode*)pstate) + 1)->child;
 119         if (lvl == N - 1) {
 120             TLeaf* p = (TLeaf*)level[lvl + 1];
 121             pstate = (void*)binary_find_id(p + h, p + t, hw[lvl]);
 122             m = (pstate != NULL) ? (((TLeaf*)pstate) - p) : (-1);
 123         } else {
 124             TNode* p = (TNode*)level[lvl + 1];
 125             pstate = (void*)binary_find_id(p + h, p + t, hw[lvl]);
 126             m = (pstate != NULL) ? (((TNode*)pstate) - p) : (-1);
 127         }
 128     }
 129     return m;
 130 }
 131
 132 void
 133 CSIMSlmWithIteration::findBackOffState(int n,
 134                                        TSIMWordId*hw,
 135                                        unsigned & bol,
 136                                        unsigned& bon)
 137 {
 138     while (n > 1) {
 139         --n; ++hw;
 140         int idx = findState(n, hw);
 141         if (idx >= 0 && ((TNode*)(level[n]))[idx].child <
 142             ((TNode*)(level[n]))[idx + 1].child) {
 143             bol = n; bon = idx; return;
 144         }
 145     }
 146     bol = bon = 0;
 147     return;
 148 }
 149
 150 void
 151 CSIMSlmWithIteration::getIdString(TLevelIterator& it,
 152                                   std::vector<TSIMWordId>& history)
 153 {
 154     history.clear();
 155     for (int i = 1, tmp_sz = it.m_history.size(); i < tmp_sz; ++i) {
 156         int idx = it.m_history[i];
 157         if (i == N)
 158             history.push_back(((TLeaf*)(level[i]))[idx].id);
 159         else
 160             history.push_back(((TNode*)(level[i]))[idx].id);
 161     }
 162 }
 163
 164 void
 165 CSIMSlmWithIteration::beginLevelIteration(int lvl, TLevelIterator& it)
 166 {
 167     it.m_history.clear();
 168     for (int i = 0, tmp_sz = lvl; i <= tmp_sz; ++i)
 169         it.m_history.push_back(0);
 170     adjustIterator(it);
 171 }
 172
 173 void
 174 CSIMSlmWithIteration::next(TLevelIterator& it)
 175 {
 176     ++(it.m_history.back());
 177     adjustIterator(it);
 178 }
 179
 180 bool
 181 CSIMSlmWithIteration::isEnd(TLevelIterator& it)
 182 {
 183     return((it.m_history.back() + 1 >= sz[it.m_history.size() - 1]));
 184 }
 185
 186 void
 187 CSIMSlmWithIteration::adjustIterator(TLevelIterator& it)
 188 {
 189     int ch = it.m_history.back();
 190     for (int i = it.m_history.size() - 2; i >= 0; --i) {
 191         int len = sz[i];
 192         int& parent = it.m_history[i];
 193         TNode* pn = (TNode*)(level[i]);
 194         while (parent < len && pn[parent + 1].child <= ch)
 195             ++parent;
 196         ch = parent;
 197     }
 198 }
 199
 200 CSIMSlm::TLeaf*
 201 CSIMSlmWithIteration::getNodePtr(TLevelIterator& it)
 202 {
 203     int lvl = it.m_history.size() - 1;
 204     int idx = it.m_history.back();
 205     if (lvl == N)
 206         return(((TLeaf*)(level[lvl])) + idx);
 207     else
 208         return(((TNode*)(level[lvl])) + idx);
 209 }
 210
 211
 212
 213 void
 214 ShowUsage()
 215 {
 216     printf("Usage:\n");
 217     printf("    slmthread primitive_slm threaded_slm\n");
 218     printf("\nDescription:\n");
 219     printf(
 220         "    slmthread add back-off-state for each slm node in the primitive_slm. ");
 221     printf("Also it compresses 32-bit float into 16 bit representation.\n\n");
 222     exit(100);
 223 }
 224
 225 FILE* fp = NULL;
 226 CThreadSlm::TNode* levels[16];
 227 CThreadSlm::TLeaf* lastLevel;
 228
 229 int
 230 main(int argc, char* argv[])
 231 {
 232     CValueCompressor vc;
 233     unsigned int bol, bon;
 234     CSIMSlmWithIteration slm;
 235     std::vector<TSIMWordId> history;
 236     float real_pr, eff_pr, real_bow, eff_bow;
 237
 238     std::map<float, float> pr_eff, bow_eff;     // effval --> val
 239     std::map<float, int> pr_values, bow_values; // effval --> freq
 240     std::map<float, int> pr_map, bow_map;       // result: val --> int
 241     std::vector<float>   pr_table, bow_table;   // result: val vector
 242     std::vector<float>::iterator itt, itte;
 243
 244     if (argc != 3)
 245         ShowUsage();
 246
 247     printf("Loading original slm..."); fflush(stdout);
 248     if (slm.Load(argv[1]) == false)
 249         ShowUsage();
 250
 251     bool usingLogPr = slm.isUseLogPr();
 252
 253     #define EffectivePr(a)  (float((usingLogPr) ? ((a) / log(2.0)) : (-log2((a)))))
 254     #define OriginalPr(b)   (float((usingLogPr) ? ((b) * log(2.0)) : (exp2(-(b)))))
 255     #define EffectiveBow(a) (float((usingLogPr) ? (exp(-(a))) : ((a))))
 256     #define OriginalBow(b)  (float((usingLogPr) ? (-log((b))) : ((b))))
 257
 258     printf("\nfirst pass..."); fflush(stdout);
 259     for (int lvl = 0; lvl <= slm.getN(); ++lvl) {
 260         CSIMSlmWithIteration::TLevelIterator it;
 261         slm.beginLevelIteration(lvl, it);
 262         for (; !slm.isEnd(it); slm.next(it)) {
 263             CSIMSlm::TLeaf* pl = slm.getNodePtr(it);
 264             real_pr = pl->pr;
 265             eff_pr = EffectivePr(real_pr);
 266             if (pr_eff.find(eff_pr) == pr_eff.end()) {
 267                 pr_eff[eff_pr] = real_pr;
 268             } else { // precision error cause non 1:1 mapping
 269                 pr_eff[eff_pr] = OriginalPr(eff_pr);
 270             }
 271             ++(pr_values[eff_pr]);
 272             if (lvl < slm.getN()) {
 273                 real_bow = ((CSIMSlm::TNode*)pl)->bow;
 274                 eff_bow = EffectiveBow(real_bow);
 275                 if (bow_eff.find(eff_bow) == bow_eff.end()) {
 276                     bow_eff[eff_bow] = real_bow;
 277                 } else { // two values map to same distance value due to precision error
 278                     bow_eff[eff_bow] = OriginalBow(eff_bow);
 279                 }
 280                 ++(bow_values[eff_bow]);
 281             }
 282         }
 283     }
 284
 285     // Following pr value should not be grouped, or as milestone values.
 286     static float msprs[] = {
 287         0.9, 0.8, 0.7, 0.6,
 288         1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32, 1.0 / 64, 1.0 / 128,
 289         1.0 / 256, 1.0 / 512, 1.0 / 1024, 1.0 / 2048, 1.0 / 4096, 1.0 / 8192,
 290         1.0 / 16384, 1.0 / 32768, 1.0 / 65536
 291     };
 292
 293     for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) {
 294         float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]);
 295         float eff_pr = EffectivePr(real_pr);
 296         if (pr_eff.find(eff_pr) == pr_eff.end()) {
 297             pr_eff[eff_pr] = real_pr;
 298         } else { // precision error cause non 1:1 mapping
 299             pr_eff[eff_pr] = OriginalPr(eff_pr);
 300         }
 301         pr_values[eff_pr] = 0;
 302     }
 303
 304     // Following bow value should not be grouped, or as milestone values.
 305     static float msbows[] = {
 306         1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
 307         0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
 308         0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001
 309     };
 310
 311     for (unsigned i = 0, sz = sizeof(msbows) / sizeof(float); i < sz; ++i) {
 312         float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]);
 313         float eff_bow = EffectiveBow(real_bow);
 314         if (bow_eff.find(eff_bow) == bow_eff.end()) {
 315             bow_eff[eff_bow] = real_bow;
 316         } else { // two values map to same distance value due to precision error
 317             bow_eff[eff_bow] = OriginalBow(eff_bow);
 318         }
 319         bow_values[eff_bow] = 0;
 320     }
 321
 322     printf("\nCompressing pr values..."); fflush(stdout);
 323     vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR));
 324     pr_values.clear();
 325     itte = pr_table.end();
 326     for (itt = pr_table.begin(); itt != itte; ++itt) {
 327         *itt = OriginalPr(*itt);
 328         assert(usingLogPr || (*itt > 0.0 && *itt < 1.0));
 329         assert(!usingLogPr || *itt > 0.0);
 330     }
 331     printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size());
 332
 333     printf("\nCompressing bow values..."); fflush(stdout);
 334     vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW));
 335     bow_values.clear();
 336     itte = bow_table.end();
 337     for (itt = bow_table.begin(); itt != itte; ++itt)
 338         *itt = OriginalBow(*itt);
 339     printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());
 340
 341
 342     printf("\nThreading the new model..."); fflush(stdout);
 343     for (int lvl = 0; lvl < slm.getN(); ++lvl) {
 344         levels[lvl] = new CThreadSlm::TNode[slm.getLevelSize(lvl)];
 345
 346         CSIMSlmWithIteration::TLevelIterator it;
 347         slm.beginLevelIteration(lvl, it);
 348         for (; !slm.isEnd(it); slm.next(it)) {
 349             slm.getIdString(it, history);
 350             if (history.size() == 0) {
 351                 slm.findBackOffState(lvl, NULL, bol, bon);
 352             } else {
 353                 slm.findBackOffState(lvl, &history[0], bol, bon);
 354             }
 355
 356             CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);
 357             CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];
 358
 359             std::map<float, int>::iterator prit = pr_map.find(pn->pr);
 360             if (prit == pr_map.end()) { // This would be cause by precision error
 361                 double val = EffectivePr(pn->pr);
 362                 val = OriginalPr(val);
 363                 prit = pr_map.find(val);
 364                 assert(prit != pr_map.end());
 365             }
 366             int idx_pr = prit->second;
 367             nn.set_pr(idx_pr);
 368
 369             nn.set_wid(pn->id);
 370             nn.set_bon(bon);
 371             nn.set_bol(bol);
 372
 373             std::map<float, int>::iterator bowit = bow_map.find(pn->bow);
 374             if (bowit == bow_map.end()) { // precision error
 375                 double val = EffectiveBow(pn->bow);
 376                 val = OriginalBow(val);
 377                 bowit = bow_map.find(val);
 378                 assert(bowit != bow_map.end());
 379             }
 380             int idx_bow = bowit->second;
 381             nn.set_bow(idx_bow);
 382
 383             nn.set_ch(pn->child);
 384
 385             assert(usingLogPr ||
 386                    (pr_table[idx_pr] > 0.0 && pr_table[idx_pr] < 1.0));
 387             assert(!usingLogPr || pr_table[idx_pr] > 0.0);
 388         }
 389         CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it);
 390         CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()];
 391         nn.set_ch(pn->child);
 392     }
 393     ;
 394
 395
 396     lastLevel = new CThreadSlm::TLeaf [slm.getLevelSize(slm.getN())];
 397     CSIMSlmWithIteration::TLevelIterator it;
 398     slm.beginLevelIteration(slm.getN(), it);
 399     for (int lvl = slm.getN(); !slm.isEnd(it); slm.next(it)) {
 400         CSIMSlm::TLeaf* pn = slm.getNodePtr(it);
 401         slm.getIdString(it, history);
 402         slm.findBackOffState(lvl, &history[0], bol, bon);
 403
 404         CThreadSlm::TLeaf& nn = lastLevel[it.m_history.back()];
 405
 406         std::map<float, int>::iterator prit = pr_map.find(pn->pr);
 407         if (prit == pr_map.end()) { // This would be cause by precision error
 408             double val = EffectivePr(pn->pr);
 409             val = OriginalPr(val);
 410             prit = pr_map.find(val);
 411             assert(prit != pr_map.end());
 412         }
 413         int idx_pr = prit->second;
 414         nn.set_pr(idx_pr);
 415
 416         nn.set_wid(pn->id);
 417         nn.set_bon(bon);
 418         nn.set_bol(bol);
 419     }
 420
 421
 422     printf("\nWriting out..."); fflush(stdout);
 423
 424     float dummy = 0.0;
 425     fp = fopen(argv[2], "wb");
 426     int N = slm.getN();
 427     fwrite(&N, sizeof(int), 1, fp);
 428     unsigned ulp = slm.isUseLogPr();
 429     fwrite(&ulp, sizeof(unsigned), 1, fp);
 430
 431     for (int lvl = 0; lvl <= N; ++lvl) {
 432         int len = slm.getLevelSize(lvl);
 433         fwrite(&len, sizeof(int), 1, fp);
 434     }
 435     fwrite(&pr_table[0], sizeof(float), pr_table.size(), fp);
 436     for (int i = pr_table.size(), sz = (1 << CThreadSlm::BITS_PR); i < sz; ++i)
 437         fwrite(&dummy, sizeof(float), 1, fp);
 438
 439     fwrite(&bow_table[0], sizeof(float), bow_table.size(), fp);
 440     for (int i = bow_table.size(), sz = (1 << CThreadSlm::BITS_BOW);
 441          i < sz;
 442          ++i)
 443         fwrite(&dummy, sizeof(float), 1, fp);
 444
 445     for (int lvl = 0; lvl < N; ++lvl)
 446         fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(
 447                    lvl), fp);
 448     fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N), fp);
 449     fclose(fp);
 450
 451     printf("done!\n"); fflush(stdout);
 452
 453     delete [] lastLevel;
 454     for (int lvl = 0; lvl < N; ++lvl)
 455         delete [] levels[lvl];
 456
 457     bow_values.clear();
 458     bow_map.clear();
 459     bow_table.clear();
 460
 461     pr_values.clear();
 462     pr_map.clear();
 463     pr_table.clear();
 464
 465     slm.Free();
 466
 467
 468     return 0;
 469 }