2 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
4 * The contents of this file are subject to the terms of either the GNU Lesser
5 * General Public License Version 2.1 only ("LGPL") or the Common Development and
6 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
7 * file except in compliance with the License. You can obtain a copy of the CDDL at
8 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10 * specific language governing permissions and limitations under the License. When
11 * distributing the software, include this License Header Notice in each file and
12 * include the full text of the License in the License file as well as the
15 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
17 * For Covered Software in this distribution, this License shall be governed by the
18 * laws of the State of California (excluding conflict-of-law provisions).
19 * Any litigation relating to this License shall be subject to the jurisdiction of
20 * the Federal Courts of the Northern District of California and the state courts
21 * of the State of California, with venue lying in Santa Clara County, California.
25 * If you wish your version of this file to be governed by only the CDDL or only
26 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
28 * license." If you don't indicate a single choice of license, a recipient has the
29 * option to distribute your version of this file under either the CDDL or the LGPL
30 * Version 2.1, or to extend the choice of license to its licensees as provided
31 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
32 * Version 2 license, then the option applies only if the new code is made subject
33 * to such option by the copyright holder.
37 * pack ARPA format to a binary format which can be consumed by SunPinyin
57 //#include "../sim_slm.h"
60 #include "../thread/ValueCompress.h"
62 #include "arpa_conv.h"
66 ShowUsage(const char* progname)
69 printf(" %s arpa_slm dict_file threaded_slm\n", progname);
71 printf("Description:\n");
73 " %s converts the ARPA representation of SLM to the binary format of threaded SLM. \n",
80 * pr_eff, pr_values [out]
81 * bow_eff, bow_values [out]
85 build_map(const CArpaSlm& slm,
91 bool usingLogPr = slm.usingLogPr();
93 printf("\nfirst pass..."); fflush(stdout);
95 for (unsigned lvl = 0; lvl < slm.getN(); ++lvl) {
96 typedef CArpaSlm::TNodeLevel TNodeLevel;
97 const TNodeLevel& level = slm.getLevel(lvl);
98 for (TNodeLevel::const_iterator node = level.begin();
101 float real_pr, eff_pr;
103 eff_pr = EffectivePr(real_pr);
104 if (pr_eff.find(eff_pr) == pr_eff.end()) {
105 pr_eff[eff_pr] = real_pr;
106 } else { // precision error cause non 1:1 mapping
107 pr_eff[eff_pr] = OriginalPr(eff_pr);
109 ++(pr_values[eff_pr]);
111 float real_bow, eff_bow;
112 real_bow = node->bow;
113 eff_bow = EffectiveBow(real_bow);
114 if (bow_eff.find(eff_bow) == bow_eff.end()) {
115 bow_eff[eff_bow] = real_bow;
116 } else { // two values map to same distance value due to precision error
117 bow_eff[eff_bow] = OriginalBow(eff_bow);
119 ++(bow_values[eff_bow]);
122 typedef CArpaSlm::TLeafLevel TLeafLevel;
123 const TLeafLevel& level = slm.getLastLevel();
124 for (TLeafLevel::const_iterator leaf = level.begin();
127 float real_pr, eff_pr;
129 eff_pr = EffectivePr(real_pr);
130 if (pr_eff.find(eff_pr) == pr_eff.end()) {
131 pr_eff[eff_pr] = real_pr;
132 } else { // precision error cause non 1:1 mapping
133 pr_eff[eff_pr] = OriginalPr(eff_pr);
135 ++(pr_values[eff_pr]);
137 // Following pr value should not be grouped, or as milestone values.
138 static const float msprs[] = {
140 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32, 1.0 / 64, 1.0 / 128,
141 1.0 / 256, 1.0 / 512, 1.0 / 1024, 1.0 / 2048, 1.0 / 4096, 1.0 / 8192,
142 1.0 / 16384, 1.0 / 32768, 1.0 / 65536
145 for (unsigned i = 0, sz = sizeof(msprs) / sizeof(float); i < sz; ++i) {
146 float real_pr = (usingLogPr) ? (-log(msprs[i])) : (msprs[i]);
147 float eff_pr = EffectivePr(real_pr);
148 assert(usingLogPr || (real_pr > 0.0 && real_pr < 1.0));
149 assert(!usingLogPr || real_pr > 0.0);
151 if (pr_eff.find(eff_pr) == pr_eff.end()) {
152 pr_eff[eff_pr] = real_pr;
153 } else { // precision error causes non 1:1 mapping
154 pr_eff[eff_pr] = OriginalPr(eff_pr);
156 pr_values[eff_pr] = 0;
159 // Following bow value should not be grouped, or as milestone values.
160 static const float msbows[] = {
161 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
162 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
163 0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001
166 for (unsigned i = 0; i < sizeof(msbows) / sizeof(msbows[0]); ++i) {
167 float real_bow = (usingLogPr) ? (-log(msbows[i])) : (msbows[i]);
168 float eff_bow = EffectiveBow(real_bow);
169 if (bow_eff.find(eff_bow) == bow_eff.end()) {
170 bow_eff[eff_bow] = real_bow;
171 } else { // two values map to same distance value due to precision error
172 bow_eff[eff_bow] = OriginalBow(eff_bow);
174 bow_values[eff_bow] = 0;
179 * group vaules into a smaller set of their approximations
181 * bow_eff [in], bow_values [in], bow_map [out], bow_table [out]
182 * pr_eff [in], pr_values [in], pr_map [out], pr_table [out]
186 group_values(bool usingLogPr,
189 CompressedTable& pr_table,
190 RealIndexMap& pr_map,
193 CompressedTable& bow_table,
194 RealIndexMap& bow_map)
196 printf("\nCompressing pr values..."); fflush(stdout);
198 vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR));
199 CompressedTable::iterator itt, itte;
200 itte = pr_table.end();
201 for (itt = pr_table.begin(); itt != itte; ++itt) {
202 *itt = OriginalPr(*itt);
203 assert(usingLogPr || (*itt > 0.0 && *itt < 1.0));
204 assert(!usingLogPr || *itt > 0.0);
206 printf("%lu float values ==> %lu values", pr_eff.size(), pr_table.size());
208 printf("\nCompressing bow values..."); fflush(stdout);
209 vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW));
210 itte = bow_table.end();
211 for (itt = bow_table.begin(); itt != itte; ++itt)
212 *itt = OriginalBow(*itt);
213 printf("%lu float values ==> %lu values", bow_eff.size(), bow_table.size());
217 read_lexicon(const char* filename)
219 printf("Loading lexicon..."); fflush(stdout);
220 static char word[1024 * 10];
221 FILE* f_lex = fopen(filename, "r");
223 while (fgets(word, sizeof(word), f_lex)) {
224 if (strlen(word) > 0) {
225 // skip to the first non hanzi character
227 while (*p == ' ' || *p == '\t')
229 while (*p != 0 && *p != ' ' && *p != '\t')
231 if (*p == 0) continue;
233 // skip to the word_id
234 while (*p == ' ' || *p == '\t')
236 if (!(*p >= '0' && *p <= '9')) continue;
239 for (id = 0; *p >= '0' && *p <= '9'; ++p)
240 id = 10 * id + (*p - '0');
241 lexicon[std::string(word)] = id;
245 printf("done.\n"); fflush(stdout);
262 write_out(const char* filename, const CArpaSlm& slm,
263 CompressedTable& pr_table, CompressedTable& bow_table,
264 const TNodeLevels& levels, const CThreadSlm::TLeaf* lastLevel)
266 printf("\nWriting out..."); fflush(stdout);
268 FILE* fp = fopen(filename, "wb");
269 const int N = slm.getN();
270 fwrite(&N, sizeof(int), 1, fp);
271 const unsigned usingLogPr = slm.usingLogPr();
272 fwrite(&usingLogPr, sizeof(unsigned), 1, fp);
274 for (int lvl = 0; lvl <= N; ++lvl) {
275 unsigned len = slm.getLevelSize(lvl) + 1;
276 fwrite(&len, sizeof(unsigned), 1, fp);
279 for (int i = 0, sz = pr_table.size(); i < (1 << CThreadSlm::BITS_PR);
282 fwrite(&pr_table[i], sizeof(float), 1, fp);
285 fwrite(&dummy, sizeof(float), 1, fp);
289 for (int i = 0, sz = bow_table.size(); i < (1 << CThreadSlm::BITS_BOW);
292 fwrite(&bow_table[i], sizeof(float), 1, fp);
295 fwrite(&dummy, sizeof(float), 1, fp);
299 for (int lvl = 0; lvl < N; ++lvl) {
300 fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(
304 fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N) + 1, fp);
308 printf("done!\n"); fflush(stdout);
313 cleanup(CompressedTable& pr_table, CompressedTable& bow_table,
314 TNodeLevels& levels, CThreadSlm::TLeaf* lastLevel)
316 for (unsigned lvl = 0; lvl < levels.size(); ++lvl)
317 delete[] levels[lvl];
324 main(int argc, char* argv[])
328 const char* arpa_path = argv[1];
329 const char* lexicon_path = argv[2];
330 const char* threaded_path = argv[3];
333 TLexicon lexicon = read_lexicon(lexicon_path);
334 slm.load(arpa_path, lexicon);
337 std::cerr << "Failed to load language model from " << arpa_path <<
343 EffRealMap pr_eff, bow_eff; // effval --> val
344 FreqMap pr_values, bow_values; // effval --> freq
345 build_map(slm, pr_eff, pr_values, bow_eff, bow_values);
347 RealIndexMap pr_map, bow_map; // result: val --> int
348 CompressedTable pr_table, bow_table; // result: val vector
349 group_values(slm.usingLogPr(),
350 pr_eff, pr_values, pr_table, pr_map,
351 bow_eff, bow_values, bow_table, bow_map);
356 CThreadSlm::TLeaf* lastLevel;
357 compress(slm, pr_table, pr_map, bow_table, bow_map,
362 write_out(threaded_path, slm, pr_table, bow_table, levels, lastLevel);
364 cleanup(pr_table, bow_table, levels, lastLevel);