src/slm/tslmpack/slmpack.cpp

   1 /*
   2  * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
   3  *
   4  * The contents of this file are subject to the terms of either the GNU Lesser
   5  * General Public License Version 2.1 only ("LGPL") or the Common Development and
   6  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
   7  * file except in compliance with the License. You can obtain a copy of the CDDL at
   8  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
   9  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
  10  * specific language governing permissions and limitations under the License. When
  11  * distributing the software, include this License Header Notice in each file and
  12  * include the full text of the License in the License file as well as the
  13  * following notice:
  14  *
  15  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
  16  * (CDDL)
  17  * For Covered Software in this distribution, this License shall be governed by the
  18  * laws of the State of California (excluding conflict-of-law provisions).
  19  * Any litigation relating to this License shall be subject to the jurisdiction of
  20  * the Federal Courts of the Northern District of California and the state courts
  21  * of the State of California, with venue lying in Santa Clara County, California.
  22  *
  23  * Contributor(s):
  24  *
  25  * If you wish your version of this file to be governed by only the CDDL or only
  26  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
  27  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
  28  * license." If you don't indicate a single choice of license, a recipient has the
  29  * option to distribute your version of this file under either the CDDL or the LGPL
  30  * Version 2.1, or to extend the choice of license to its licensees as provided
  31  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
  32  * Version 2 license, then the option applies only if the new code is made subject
  33  * to such option by the copyright holder.
  34  */
  35
  36 /*
  37  * pack ARPA format to a binary format which can be consumed by SunPinyin
  38  */
  39
  40 #ifdef HAVE_CONFIG_H
  41 #include "config.h"
  42 #endif
  43
  44 #ifdef HAVE_ASSERT_H
  45 #include <assert.h>
  46 #endif
  47
  48 #include <stdio.h>
  49 #include <unistd.h>
  50 #include <stdlib.h>
  51
  52 #include <vector>
  53 #include <map>
  54 #include <iostream>
  55 #include <cmath>
  56
  57 //#include "../sim_slm.h"
  58 #include "../slm.h"
  59
  60 #include "../thread/ValueCompress.h"
  61 #include "arpa_slm.h"
  62 #include "arpa_conv.h"
  63
  64
  65 void
  66 ShowUsage(const char* progname)
  67 {
  68     printf("Usage:\n");
  69     printf("    %s arpa_slm dict_file threaded_slm\n", progname);
  70     printf("\n");
  71     printf("Description:\n");
  72     printf(
  73         "    %s converts the ARPA representation of SLM to the binary format of threaded SLM. \n",
  74         progname);
  75     exit(100);
  76 }
  77
  78 /**
  79  * slm [in]
  80  * pr_eff, pr_values [out]
  81  * bow_eff, bow_values [out]
  82  */
  83
  84 void
  85 build_map(const CArpaSlm& slm,
  86           EffRealMap &pr_eff,
  87           FreqMap& pr_values,
  88           EffRealMap &bow_eff,
  89           FreqMap& bow_values)
  90 {
  91     bool usingLogPr = slm.usingLogPr();
  92
  93     printf("\nfirst pass..."); fflush(stdout);
  94
  95     for (unsigned lvl = 0; lvl < slm.getN(); ++lvl) {
  96         typedef CArpaSlm::TNodeLevel TNodeLevel;
  97         const TNodeLevel& level = slm.getLevel(lvl);
  98         for (TNodeLevel::const_iterator node = level.begin();
  99              node != level.end();
 100              ++node) {
 101             float real_pr, eff_pr;
 102             real_pr = node->pr;
 103             eff_pr = EffectivePr(real_pr);
 104             if (pr_eff.find(eff_pr) == pr_eff.end()) {
 105                 pr_eff[eff_pr] = real_pr;
 106             } else { // precision error cause non 1:1 mapping
 107                 pr_eff[eff_pr] = OriginalPr(eff_pr);
 108             }
 109             ++(pr_values[eff_pr]);
 110
 111             float real_bow, eff_bow;
 112             real_bow = node->bow;
 113             eff_bow = EffectiveBow(real_bow);
 114             if (bow_eff.find(eff_bow) == bow_eff.end()) {
 115                 bow_eff[eff_bow] = real_bow;
 116             } else { // two values map to same distance value due to precision error
 117                 bow_eff[eff_bow] = OriginalBow(eff_bow);
 118             }
 119             ++(bow_values[eff_bow]);
 120         }
 121     }
 122     typedef CArpaSlm::TLeafLevel TLeafLevel;
 123     const TLeafLevel& level = slm.getLastLevel();
 124     for (TLeafLevel::const_iterator leaf = level.begin();
 125          leaf != level.end();
 126          ++leaf) {
 127         float real_pr, eff_pr;
 128         real_pr = leaf->pr;
 129         eff_pr = EffectivePr(real_pr);
 130         if (pr_eff.find(eff_pr) == pr_eff.end()) {
 131             pr_eff[eff_pr] = real_pr;
 132         } else { // precision error cause non 1:1 mapping
 133             pr_eff[eff_pr] = OriginalPr(eff_pr);
 134         }
 135         ++(pr_values[eff_pr]);
 136     }
 137     // Following pr value should not be grouped, or as milestone values.
 138     static const float msprs[] = {
 139         0.9, 0.8, 0.7, 0.6,
 140         1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32, 1.0 / 64, 1.0 / 128,
 141         1.0 / 256, 1.0 / 512, 1.0 / 1024, 1.0 / 2048, 1.0 / 4096, 1.0 / 8192,
 142         1.0 / 16384, 1.0 / 32768, 1.0 / 65536
 143     };
 144
 145     for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) {
 146         float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]);
 147         float eff_pr = EffectivePr(real_pr);
 148         assert(usingLogPr || (real_pr > 0.0 && real_pr < 1.0));
 149         assert(!usingLogPr || real_pr > 0.0);
 150
 151         if (pr_eff.find(eff_pr) == pr_eff.end()) {
 152             pr_eff[eff_pr] = real_pr;
 153         } else { // precision error causes non 1:1 mapping
 154             pr_eff[eff_pr] = OriginalPr(eff_pr);
 155         }
 156         pr_values[eff_pr] = 0;
 157     }
 158
 159     // Following bow value should not be grouped, or as milestone values.
 160     static const float msbows[] = {
 161         1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
 162         0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
 163         0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001
 164     };
 165
 166     for (unsigned i = 0; i < sizeof(msbows) / sizeof(msbows[0]); ++i) {
 167         float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]);
 168         float eff_bow = EffectiveBow(real_bow);
 169         if (bow_eff.find(eff_bow) == bow_eff.end()) {
 170             bow_eff[eff_bow] = real_bow;
 171         } else { // two values map to same distance value due to precision error
 172             bow_eff[eff_bow] = OriginalBow(eff_bow);
 173         }
 174         bow_values[eff_bow] = 0;
 175     }
 176 }
 177
 178 /**
 179  * group vaules into a smaller set of their approximations
 180  *
 181  * bow_eff [in], bow_values [in], bow_map [out], bow_table [out]
 182  * pr_eff [in], pr_values [in], pr_map [out], pr_table [out]
 183  *
 184  */
 185 void
 186 group_values(bool usingLogPr,
 187              EffRealMap& pr_eff,
 188              FreqMap& pr_values,
 189              CompressedTable& pr_table,
 190              RealIndexMap& pr_map,
 191              EffRealMap& bow_eff,
 192              FreqMap& bow_values,
 193              CompressedTable& bow_table,
 194              RealIndexMap& bow_map)
 195 {
 196     printf("\nCompressing pr values..."); fflush(stdout);
 197     CValueCompressor vc;
 198     vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR));
 199     CompressedTable::iterator itt, itte;
 200     itte = pr_table.end();
 201     for (itt = pr_table.begin(); itt != itte; ++itt) {
 202         *itt = OriginalPr(*itt);
 203         assert(usingLogPr || (*itt > 0.0 && *itt < 1.0));
 204         assert(!usingLogPr || *itt > 0.0);
 205     }
 206     printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size());
 207
 208     printf("\nCompressing bow values..."); fflush(stdout);
 209     vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW));
 210     itte = bow_table.end();
 211     for (itt = bow_table.begin(); itt != itte; ++itt)
 212         *itt = OriginalBow(*itt);
 213     printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());
 214 }
 215
 216 TLexicon
 217 read_lexicon(const char* filename)
 218 {
 219     printf("Loading lexicon..."); fflush(stdout);
 220     static char word[1024 * 10];
 221     FILE* f_lex = fopen(filename, "r");
 222     TLexicon lexicon;
 223     while (fgets(word, sizeof(word), f_lex)) {
 224         if (strlen(word) > 0) {
 225             // skip to the first non hanzi character
 226             char* p = word;
 227             while (*p == ' ' || *p == '\t')
 228                 ++p;
 229             while (*p != 0 && *p != ' ' && *p != '\t')
 230                 ++p;
 231             if (*p == 0) continue;
 232             *p++ = 0;
 233             // skip to the word_id
 234             while (*p == ' ' || *p == '\t')
 235                 ++p;
 236             if (!(*p >= '0' && *p <= '9')) continue;
 237
 238             int id;
 239             for (id = 0; *p >= '0' && *p <= '9'; ++p)
 240                 id = 10 * id + (*p - '0');
 241             lexicon[std::string(word)] = id;
 242         }
 243     }
 244     fclose(f_lex);
 245     printf("done.\n"); fflush(stdout);
 246
 247     return lexicon;
 248 }
 249
 250
 251
 252 //
 253 // filename [in]
 254 // pr_table [in]
 255 // bow_table [in]
 256 // levels[0] [in]
 257 // ...
 258 // levels[N] [in]
 259 // lastLevel [in]
 260 //
 261 void
 262 write_out(const char* filename, const CArpaSlm& slm,
 263           CompressedTable& pr_table, CompressedTable& bow_table,
 264           const TNodeLevels& levels, const CThreadSlm::TLeaf* lastLevel)
 265 {
 266     printf("\nWriting out..."); fflush(stdout);
 267
 268     FILE* fp = fopen(filename, "wb");
 269     const int N = slm.getN();
 270     fwrite(&N, sizeof(int), 1, fp);
 271     const unsigned usingLogPr = slm.usingLogPr();
 272     fwrite(&usingLogPr, sizeof(unsigned), 1, fp);
 273
 274     for (int lvl = 0; lvl <= N; ++lvl) {
 275         unsigned len = slm.getLevelSize(lvl) + 1;
 276         fwrite(&len, sizeof(unsigned), 1, fp);
 277     }
 278
 279     for (int i = 0, sz = pr_table.size(); i < (1 << CThreadSlm::BITS_PR);
 280          ++i) {
 281         if (i < sz) {
 282             fwrite(&pr_table[i], sizeof(float), 1, fp);
 283         } else {
 284             float dummy = 0.0F;
 285             fwrite(&dummy, sizeof(float), 1, fp);
 286         }
 287     }
 288
 289     for (int i = 0, sz = bow_table.size(); i < (1 << CThreadSlm::BITS_BOW);
 290          ++i) {
 291         if (i < sz) {
 292             fwrite(&bow_table[i], sizeof(float), 1, fp);
 293         } else {
 294             float dummy = 0.0F;
 295             fwrite(&dummy, sizeof(float), 1, fp);
 296         }
 297     }
 298
 299     for (int lvl = 0; lvl < N; ++lvl) {
 300         fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(
 301                    lvl) + 1, fp);
 302     }
 303
 304     fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N) + 1, fp);
 305
 306     fclose(fp);
 307
 308     printf("done!\n"); fflush(stdout);
 309 }
 310
 311
 312 void
 313 cleanup(CompressedTable& pr_table, CompressedTable& bow_table,
 314         TNodeLevels& levels, CThreadSlm::TLeaf* lastLevel)
 315 {
 316     for (unsigned lvl = 0; lvl < levels.size(); ++lvl)
 317         delete[] levels[lvl];
 318     delete[] lastLevel;
 319     bow_table.clear();
 320     pr_table.clear();
 321 }
 322
 323 int
 324 main(int argc, char* argv[])
 325 {
 326     if (argc != 4)
 327         ShowUsage(argv[0]);
 328     const char* arpa_path = argv[1];
 329     const char* lexicon_path = argv[2];
 330     const char* threaded_path = argv[3];
 331
 332     CArpaSlm slm;
 333     TLexicon lexicon = read_lexicon(lexicon_path);
 334     slm.load(arpa_path, lexicon);
 335
 336     if (!slm.good()) {
 337         std::cerr << "Failed to load language model from " << arpa_path <<
 338         "." << std::endl;
 339         exit(1);
 340     }
 341     slm.threading();
 342
 343     EffRealMap pr_eff, bow_eff;    // effval --> val
 344     FreqMap pr_values, bow_values; // effval --> freq
 345     build_map(slm, pr_eff, pr_values, bow_eff, bow_values);
 346
 347     RealIndexMap pr_map, bow_map;               // result: val --> int
 348     CompressedTable pr_table, bow_table;        // result: val vector
 349     group_values(slm.usingLogPr(),
 350                  pr_eff, pr_values, pr_table, pr_map,
 351                  bow_eff, bow_values, bow_table, bow_map);
 352     pr_values.clear();
 353     bow_values.clear();
 354
 355     TNodeLevels levels;
 356     CThreadSlm::TLeaf* lastLevel;
 357     compress(slm, pr_table, pr_map, bow_table, bow_map,
 358              levels, lastLevel);
 359
 360     pr_map.clear();
 361     bow_map.clear();
 362     write_out(threaded_path, slm, pr_table, bow_table, levels, lastLevel);
 363
 364     cleanup(pr_table, bow_table, levels, lastLevel);
 365     return 0;
 366 }