2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
51 #include "sim_slmbuilder.h"
54 CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
58 dis = new double[--n];
59 if (thres > n) thres = n;
60 for (int freq = 1; freq < n; ++freq) {
61 if (nr[freq] == 0 || nr[freq + 1] == 0)
64 dis[freq] = double(nr[freq + 1]) / nr[freq];
65 printf("%lf ", dis[freq]); fflush(stdout);
70 CSlmGTDiscounter::discount(int freq)
72 double newfreq = freq * ((freq < thres) ? dis[freq] : hd);
73 if (newfreq >= double(freq))
79 CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
81 // normally, c should not greater than 1.0, yet when cut-off is used, it could be so.
83 c = double(nr[1]) / (nr[1] + 2.0 * nr[2]);
84 printf("parameter c=%lf", c); fflush(stdout);
86 printf("Using given parameter c=%lf", c); fflush(stdout);
91 CSlmAbsoluteDiscounter::discount(int freq)
93 return (freq > 0) ? (freq - c) : (0.0);
97 CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
99 if (dis <= 0.0 || dis >= 1.0) {
100 dis = 1.0 - double(nr[1]) / nr[0];
101 printf("parameter d=%lf", dis); fflush(stdout);
103 printf("Using given parameter d=%lf", dis); fflush(stdout);
108 CSlmLinearDiscounter::discount(int freq)
113 // n=1 for unigram, n=2 for bigram;
114 // level[0] is for psuedo 0 gram, ...
116 CSlmBuilder::Create(int n)
120 level = new void * [n + 1];
121 for (int i = 0; i < n; ++i) {
122 level[i] = new std::vector<TNode>;
123 if (i) ((TNodeLevel*)level[i])->reserve(1024);
126 level[n] = new std::vector<TLeaf>;
127 ((TLeafLevel*)level[n])->reserve(1024);
129 //Add psuedo root node
130 ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0));
132 //Initialize the nr[n+1][SLM_MAX_R] 2-D array
133 nr = new FREQ_TYPE[n + 1][SLM_MAX_R];
134 for (int lvl = 0; lvl < n + 1; ++lvl)
135 for (int r = 0; r < SLM_MAX_R; ++r)
140 CSlmBuilder::SetCut(FREQ_TYPE threshold[])
144 cut = new FREQ_TYPE[nlevel + 1];
145 for (int i = 0; i < nlevel; ++i)
146 cut[i + 1] = threshold[i];
150 CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[])
152 if (discounter != NULL)
153 delete [] discounter;
154 discounter = new CSlmDiscounter* [nlevel + 1];
155 for (int i = 0; i < nlevel; ++i)
156 discounter[i + 1] = dis[i];
160 CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[])
163 for (int i = 0; i < nId; ++i)
164 breaker.push_back(brks[i]);
165 std::make_heap(breaker.begin(), breaker.end());
166 std::sort_heap(breaker.begin(), breaker.end());
170 CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[])
173 for (int i = 0; i < nId; ++i)
174 m_excludes.push_back(excludes[i]);
175 std::make_heap(m_excludes.begin(), m_excludes.end());
176 std::sort_heap(m_excludes.begin(), m_excludes.end());
180 CSlmBuilder::isBreakId(TSIMWordId id)
182 return std::binary_search(breaker.begin(), breaker.end(), id);
186 CSlmBuilder::isExcludeId(TSIMWordId id)
188 return std::binary_search(m_excludes.begin(), m_excludes.end(), id);
192 CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr)
195 bool brk = isExcludeId(*ngram);
197 for (int i = 1; i < nlevel; ++i) {
198 TNodeLevel* pnl = (TNodeLevel*)(level[i]);
199 if (pnl->capacity() == pnl->size()) {
200 size_t newsz = 2 * pnl->capacity();
201 if (pnl->capacity() > 1024 * 1024)
202 newsz = pnl->capacity() + 1024 * 1024;
206 TLeafLevel* pll = (TLeafLevel*)(level[nlevel]);
207 if (pll->capacity() == pll->size()) {
208 size_t newsz = 2 * pll->capacity();
209 if (pll->capacity() > 1024 * 1024)
210 newsz = pll->capacity() + 1024 * 1024;
215 (*(TNodeLevel*)(level[0]))[0].freq += fr;
218 for (int i = 1; (!brk && i < nlevel); ++i) {
219 std::vector<TNode> & pv = *(TNodeLevel*)(level[i - 1]);
220 std::vector<TNode> & v = *(TNodeLevel*)(level[i]);
221 branch = branch || (pv.back().child >= (int) v.size()) ||
222 (v.back().id != ngram[i - 1]);
225 ch = ((TLeafLevel*)(level[i + 1]))->size();
227 ch = ((TNodeLevel*)(level[i + 1]))->size();
228 v.push_back(TNode(ngram[i - 1], ch, fr));
232 brk = (i > 1 && isBreakId(ngram[i - 1])) || isExcludeId(ngram[i]);
235 // Insert to the leaf level
237 if (fr > cut[nlevel]) {
238 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
239 v.push_back(TLeaf(ngram[nlevel - 1], fr));
242 nr[nlevel][fr] += fr;
248 CSlmBuilder::CountNr()
250 printf("\nCounting Nr..."); fflush(stdout);
251 for (int lvl = 1; lvl < nlevel; ++lvl) {
252 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
253 for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it) {
254 FREQ_TYPE freq = it->freq;
256 if (freq < (int) SLM_MAX_R && freq > 0)
257 nr[lvl][freq] += freq;
260 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
261 for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it) {
262 FREQ_TYPE freq = it->freq;
263 nr[nlevel][0] += freq;
264 if (freq < (int) SLM_MAX_R && freq > 0)
265 nr[nlevel][freq] += freq;
267 printf("\n"); fflush(stdout);
271 CSlmBuilder::CutLeafLevel(TNodeIterator pfirst,
273 TLeafIterator chfirst,
274 TLeafIterator chlast,
277 int idxfirst, idxchk;
278 TLeafIterator chchk = chfirst;
279 for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) {
280 //do not cut item whoese 1. freq > thred; 2. psuedo tail
281 if ((int) chchk->freq > thred || (chchk + 1) == chlast) {
282 if (idxfirst < idxchk)
284 for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst)
285 pfirst->child = idxfirst;
290 assert(pfirst == plast);
295 CSlmBuilder::CutNodeLevel(TNodeIterator pfirst,
297 TNodeIterator chfirst,
298 TNodeIterator chlast,
301 int idxfirst, idxchk;
302 TNodeIterator chchk = chfirst;
303 for (idxfirst = idxchk = 0; chchk != chlast; ++chchk, ++idxchk) {
304 //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children
305 TNodeIterator chnext = chchk + 1;
306 if ((int) chchk->freq > thred || chnext == chlast ||
307 (chnext->child != chchk->child)) {
308 if (idxfirst < idxchk)
310 for (; pfirst != plast && pfirst->child <= idxchk; ++pfirst)
311 pfirst->child = idxfirst;
316 assert(pfirst == plast);
323 printf("\nCuting according freq..."); fflush(stdout);
324 for (int lvl = nlevel; lvl > 0; --lvl) {
325 printf("\n Cut level %d with threshold %d...", lvl, cut[lvl]);
327 TNodeLevel& parent = *(TNodeLevel*)(level[lvl - 1]);
330 TLeafLevel& v = *(TLeafLevel*)(level[lvl]);
331 int newsize = CutLeafLevel(parent.begin(),
332 parent.end(), v.begin(),
334 v.erase(v.begin() + newsize, v.end());
338 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
339 int newsize = CutNodeLevel(parent.begin(),
340 parent.end(), v.begin(),
342 v.erase(v.begin() + newsize, v.end());
346 printf("\n"); fflush(stdout);
350 CSlmBuilder::AppendTails()
352 printf("\nAppending psuedo tail node for each level..."); fflush(stdout);
353 for (int lvl = 0; lvl < nlevel; ++lvl) {
355 if (lvl == nlevel - 1) {
356 child_size = ((TLeafLevel*)(level[lvl + 1]))->size();
358 child_size = ((TNodeLevel*)(level[lvl + 1]))->size();
360 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
361 v.push_back(TNode(0x00FFFFFF, child_size, 1));
363 //also make a psuedo tail node for the leaf level
364 ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1));
365 printf("\n"); fflush(stdout);
368 template<class TChildLevel>
370 DiscountOneLevel(CSlmBuilder::TNodeLevel& v,
372 CSlmDiscounter* disc,
375 CSlmBuilder::TNodeIterator it = v.begin();
376 CSlmBuilder::TNodeIterator ite = v.begin() + (v.size() - 1);
377 for (; it != ite; ++it) { //do not calc the psuedo tail item
378 CSlmBuilder::TNodeIterator itnext = it + 1;
379 double root_freq = it->freq;
380 for (int h = it->child, t = itnext->child; h < t; ++h) {
381 double pr = disc->discount(ch[h].freq) / root_freq;
382 assert(pr > 0.0 && pr < 1.0);
384 ch[h].pr = CSlmBuilder::PR_TYPE(-log(pr));
386 ch[h].pr = CSlmBuilder::PR_TYPE(pr);
393 CSlmBuilder::Discount()
395 printf("\nDiscounting...");
396 for (int lvl = nlevel; lvl > 0; --lvl) {
397 printf("\n Initializing level %d's %s discount method: ",
399 discounter[lvl]->getName());
400 discounter[lvl]->init(SLM_MAX_R, nr[lvl]);
403 for (int lvl = nlevel - 1; lvl >= 0; --lvl) {
404 printf("\n Discounting level %d ...", lvl + 1); fflush(stdout);
405 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
406 if (lvl == nlevel - 1) { //its child is leaf
407 TLeafLevel& ch = *(TLeafLevel*)(level[lvl + 1]);
408 DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr);
410 TNodeLevel& ch = *(TNodeLevel*)(level[lvl + 1]);
411 DiscountOneLevel(v, ch, discounter[lvl + 1], bUseLogPr);
414 printf("\n Giving psuedo root level 0 a distribution...");
415 //make the psuedo 0-gram a equal distribution
416 TNodeLevel& v0 = *(TNodeLevel*)(level[0]);
418 v0[0].pr = PR_TYPE(-log(double(1.0) / m_nWord));
420 v0[0].pr = PR_TYPE(double(1.0) / m_nWord);
422 printf("\n"); fflush(stdout);
425 template<class chIterator>
427 CalcNodeBow(CSlmBuilder* builder,
434 if (chh == cht) return 1.0;
435 double sumnext = 0.0, sum = 0.0;
436 for (; chh < cht; ++chh) {
438 sumnext += exp(-(chh->pr));
440 sumnext += double(chh->pr);
442 words[lvl + 1] = chh->id;
443 sum += builder->getPr(lvl, words + 2);
445 assert(sumnext > 0.0 && sumnext < 1.05);
446 assert(sum < 1.05 && sum > 0.0);
448 if (sumnext >= 1.0 || sum >= 1.0) {
449 double bow = ((sumnext > sum) ? sumnext : sum) + 0.0001;
450 bow = (bow - sumnext) / (bow - sum);
452 "\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:",
457 for (int i = 1; i <= lvl; ++i)
458 printf("%d ", words[i]);
461 return (1.0 - sumnext) / (1.0 - sum);
465 CSlmBuilder::CalcBOW()
467 printf("\nCalculating Back-Off Weight...");
468 for (int lvl = 0; lvl < nlevel; ++lvl) {
469 printf("\n Processing level %d ", lvl); fflush(stdout);
470 TNode* base[16]; //it should be lvl+1, yet some compiler does not support it
471 int idx[16]; //it should be lvl+1, yet some compiler does not support it
472 for (int i = 0; i <= lvl; ++i) {
473 base[i] = &((*(TNodeLevel*)level[i])[0]);
476 TSIMWordId words[17]; //it should be lvl+2, yet some compiler do not support it
477 int sz = ((TNodeLevel*)(level[lvl]))->size() - 1;
478 printf("(%d items)...", sz + 1); fflush(stdout);
479 for (; idx[lvl] < sz; ++idx[lvl]) {
480 words[lvl] = base[lvl][idx[lvl]].id;
481 for (int k = lvl - 1; k >= 0; --k) {
482 while (base[k][idx[k] + 1].child <= idx[k + 1])
484 words[k] = base[k][idx[k]].id;
486 TNode & node = base[lvl][idx[lvl]];
487 TNode & nodenext = *((&node) + 1);
489 if (lvl == nlevel - 1) {
490 TLeaf * ch = &((*(TLeafLevel*)level[lvl + 1])[0]);
491 bow = CalcNodeBow(this,
498 TNode * ch = &((*(TNodeLevel*)level[lvl + 1])[0]);
499 bow = CalcNodeBow(this,
507 node.bow = PR_TYPE(-log(bow));
509 node.bow = PR_TYPE(bow);
513 printf("\n"); fflush(stdout);
517 CSlmBuilder::getPr(int n, TSIMWordId *words)
521 void* pnode = &((*(TNodeLevel*)level[0])[0]);
527 return exp(-((TNode*)pnode)->pr);
529 return ((TNode*)pnode)->pr;
533 for (lvl = 0; pnode != NULL && lvl < n; ++lvl) {
535 bow = exp(-((TNode*)pnode)->bow);
537 bow = ((TNode*)pnode)->bow;
539 pnode = FindChild(lvl, (TNode*)pnode, words[lvl]);
542 if (pnode != NULL) { // find the whole string
544 return exp(-((TLeaf*)pnode)->pr);
546 return ((TLeaf*)pnode)->pr;
548 } else if (lvl == n - 1) { // only find the history
549 return bow * getPr(n - 1, words + 1);
550 } else { //even not find the history
551 return getPr(n - 1, words + 1);
556 CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id)
558 int chh = root->child, cht = (root + 1)->child;
559 if (lvl == nlevel - 1) {
560 TLeaf* pleaf = &((*(TLeafLevel*)level[lvl + 1])[0]);
561 return (void*)binary_find(pleaf, chh, cht, TLeaf(id));
563 TNode* pnode = &((*(TNodeLevel*)level[lvl + 1])[0]);
564 return (void*)binary_find(pnode, chh, cht, TNode(id));
579 CSlmBuilder::Write(FILE *out)
581 fwrite(&nlevel, sizeof(nlevel), 1, out);
582 fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out);
583 for (int lvl = 0; lvl <= nlevel; ++lvl) {
586 sz = ((TLeafLevel*)(level[lvl]))->size();
588 sz = ((TNodeLevel*)(level[lvl]))->size();
589 fwrite(&sz, sizeof(sz), 1, out);
591 for (int lvl = 0; lvl < nlevel; ++lvl) {
592 TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
593 for (TNodeIterator it = v.begin(), ite = v.end(); it != ite; ++it)
594 fwrite(&(*it), sizeof(TNode), 1, out);
596 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
597 for (TLeafIterator it = v.begin(), ite = v.end(); it != ite; ++it)
598 fwrite(&(*it), sizeof(TLeaf), 1, out);
602 CSlmBuilder::Close(void)
605 for (int lvl = 0; lvl <= nlevel; ++lvl) {
607 delete (TLeafLevel*)(level[lvl]);
609 delete (TNodeLevel*)(level[lvl]);
618 if (discounter != NULL) {
619 for (int lvl = 1; lvl <= nlevel; ++lvl) {
620 delete discounter[lvl];
622 delete [] discounter;