3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
5 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
7 * The contents of this file are subject to the terms of either the GNU Lesser
8 * General Public License Version 2.1 only ("LGPL") or the Common Development and
9 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 * file except in compliance with the License. You can obtain a copy of the CDDL at
11 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
13 * specific language governing permissions and limitations under the License. When
14 * distributing the software, include this License Header Notice in each file and
15 * include the full text of the License in the License file as well as the
18 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
20 * For Covered Software in this distribution, this License shall be governed by the
21 * laws of the State of California (excluding conflict-of-law provisions).
22 * Any litigation relating to this License shall be subject to the jurisdiction of
23 * the Federal Courts of the Northern District of California and the state courts
24 * of the State of California, with venue lying in Santa Clara County, California.
28 * If you wish your version of this file to be governed by only the CDDL or only
29 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 * license." If you don't indicate a single choice of license, a recipient has the
32 * option to distribute your version of this file under either the CDDL or the LGPL
33 * Version 2.1, or to extend the choice of license to its licensees as provided
34 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 * Version 2 license, then the option applies only if the new code is made subject
36 * to such option by the copyright holder.
39 #ifndef _SIM_SLM_BUILDER_H
40 #define _SIM_SLM_BUILDER_H
42 #include "../portability.h"
// Extent of the inner dimension of the `nr` member (declared as
// FREQ_TYPE(*nr)[SLM_MAX_R]).
50 static const int SLM_MAX_R = 16;
// Short aliases for the nested CSIMSlm types, so they can be named without the
// CSIMSlm:: qualifier here (and as CSlmBuilder::FREQ_TYPE by the discounters).
51 typedef CSIMSlm::FREQ_TYPE FREQ_TYPE;
52 typedef CSIMSlm::PR_TYPE PR_TYPE;
53 typedef CSIMSlm::TNode TNode;
54 typedef CSIMSlm::TLeaf TLeaf;
// Member-initializer list: integer members (nlevel, bUseLogPr, m_nWord) start at 0,
// pointer members (level, cut, discounter, nr) start at NULL, and the two id
// vectors (breaker, m_excludes) are default-constructed. The body is empty.
58 : nlevel(0), bUseLogPr(0), level(NULL), m_nWord(0), cut(NULL),
59 discounter(NULL), nr(NULL), breaker(), m_excludes() { }
// Stores nWord in the m_nWord member (the number of words, per the method name).
64 void SetNumberOfWord(int nWord) { this->m_nWord = nWord; }
// Configuration setters; declared here, defined elsewhere.
// NOTE(review): the descriptions below are inferred from the parameter names and
// the matching data members -- confirm against the implementation.

// Takes an array of FREQ_TYPE thresholds; no element count is passed.
// Presumably feeds the `cut` member (indexed by n-gram level).
65 void SetCut(FREQ_TYPE threshold[]);
// Takes an array of CSlmDiscounter pointers; no element count is passed.
// Presumably feeds the `discounter` member (indexed by n-gram level).
66 void SetDiscounter(CSlmDiscounter * dis[]);
// Takes nId word ids in brks[]; presumably fills the `breaker` vector.
67 void SetBreakerIds(int nId, TSIMWordId brks[]);
// Takes nId word ids in excludes[]; presumably fills the `m_excludes` vector.
68 void SetExcludeIds(int nId, TSIMWordId excludes[]);
// Takes an int flag bUse. NOTE(review): the body is not visible in this listing;
// presumably it assigns the flag to the bUseLogPr member -- confirm.
69 void SetUseLogPr(int bUse)
// Takes a pointer to the word ids of one n-gram and its frequency `fr`.
// NOTE(review): presumably inserts the n-gram into the level tables; the length of
// `ngram` is not passed, so it is presumably implied by the model order -- confirm.
72 void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr);
// Takes an already-opened C stdio stream. NOTE(review): presumably serializes the
// built model to `out`; the format is not visible here.
74 void Write(FILE* out);
77 // Get pr(w[n-1] | w[0]...w[n-2]) on the constructed partial model (low levels):
// the conditional probability of the last word w[n-1] given the preceding
// words w[0]..w[n-2]. `w` therefore points to n word ids.
78 double getPr(int n, TSIMWordId* w);
// Container types for one level of the model: a vector of TNode or a vector of
// TLeaf, plus the matching iterator types used by the Cut*Level helpers.
81 typedef std::vector<TNode> TNodeLevel;
82 typedef std::vector<TLeaf> TLeafLevel;
83 typedef TNodeLevel::iterator TNodeIterator;
84 typedef TLeafLevel::iterator TLeafIterator;
// Predicates on a single word id; declared here, defined elsewhere.
// NOTE(review): presumably these test membership in the `breaker` and
// `m_excludes` vectors respectively -- confirm against the implementation.
87 bool isBreakId(TSIMWordId id);
88 bool isExcludeId(TSIMWordId id);
// Takes a level index, a parent node and a word id; returns an untyped pointer.
// NOTE(review): presumably looks up the child of `root` with word id `id`, and
// returns void* because the child may be a TNode or a TLeaf depending on `lvl` --
// confirm, including what is returned when no child matches.
94 void*FindChild(int lvl, TNode* root, TSIMWordId id);
// Both helpers take a parent range [pfirst, plast) of nodes, a child range
// [chfirst, chlast) and an int threshold `thred`. They differ only in the child
// type: TNode for CutNodeLevel, TLeaf for CutLeafLevel.
// NOTE(review): presumably they prune children whose frequency falls below
// `thred` and fix up the parents; the meaning of the int return value is not
// visible here -- confirm against the implementation.
95 int CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast,
96 TNodeIterator chfirst, TNodeIterator chlast, int thred);
97 int CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast,
98 TLeafIterator chfirst, TLeafIterator chlast, int thred);
// Builder state; every member here is initialized by the constructor's
// initializer list (ints to 0, pointers to NULL, vectors empty).
101 int nlevel, bUseLogPr;
103 // level[0] is the pseudo root level, level[1] is the unigram level, ...; all are vector type
// (refers to the `level` member, which the constructor initializes to NULL)
106 FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ... NOTE(review): "is not" looks like a typo for "is the" (cf. discounter[1] below) -- confirm
107 CSlmDiscounter** discounter; // discounter[1] is for 1-gram...
108 FREQ_TYPE(*nr)[SLM_MAX_R]; // pointer to rows of SLM_MAX_R counts; nr[1][SLM_MAX_R] is for 1-gram...
109 std::vector<TSIMWordId> breaker; // word ids; see SetBreakerIds()/isBreakId()
110 std::vector<TSIMWordId> m_excludes; // word ids; see SetExcludeIds()/isExcludeId()
// Abstract interface for a frequency discounter: init(), discount() and getName()
// are pure virtual, so concrete discounters must implement all three.
113 class CSlmDiscounter {
// Virtual destructor with an empty body.
115 virtual ~CSlmDiscounter() {}
116 // n is the array size, nr is FREQ_TYPE[n]; nr[0] is the corpus size, or sigma r*nr;
117 // nr[1] is the number of ngram items with freq 1, ...
118 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0;
120 // freq is the ngram frequency, not the conditional probability
121 virtual double discount(int freq) = 0;
// Returns a C string naming the discounter.
122 virtual const char* getName() = 0;
125 // Good-Turing discount
126 class CSlmGTDiscounter : public CSlmDiscounter {
// Stores `threshold` (default 10) in thres and `highfreq_discount` (default 0.95)
// in hd. NOTE(review): the initializer list ends with a comma here, so it
// continues on a line that is not part of this listing.
128 CSlmGTDiscounter(int threshold = 10, double highfreq_discount =
129 0.95) : thres(threshold), hd(highfreq_discount),
131 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
132 virtual double discount(int freq);
// Returns the literal name "Good-Turing".
133 virtual const char* getName()
134 { return "Good-Turing"; }
// Absolute discounter: per the comment on discount(), it returns freq - c.
141 class CSlmAbsoluteDiscounter : public CSlmDiscounter {
// Stores `substract` (default 0.0) in c.
143 CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {}
144 // c == 0 means this value should be calculated according to r[]
// NOTE(review): r[] presumably refers to the nr[] counts passed to init() -- confirm.
145 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
146 virtual double discount(int freq); // return freq - c
// Returns the literal name "Absolution".
// NOTE(review): possibly intended to read "Absolute"; left unchanged because the
// string is runtime-visible -- confirm before altering.
147 virtual const char* getName()
148 { return "Absolution"; }
// Linear discounter: per the comment on discount(), it returns freq * dis.
153 class CSlmLinearDiscounter : public CSlmDiscounter {
// Stores `shrink` (default 0.0) in dis.
155 CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {}
156 // dis == 0 means this value should be calculated according to r[]
// NOTE(review): r[] presumably refers to the nr[] counts passed to init() -- confirm.
157 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
158 virtual double discount(int freq); // return freq * dis
159 virtual const char* getName()