2 * Copyright (c) 2009 Kov Chai <tchaikov@gmail.com>
4 * The contents of this file are subject to the terms of either the GNU Lesser
5 * General Public License Version 2.1 only ("LGPL") or the Common Development and
6 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
7 * file except in compliance with the License. You can obtain a copy of the CDDL at
8 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
9 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
10 * specific language governing permissions and limitations under the License. When
11 * distributing the software, include this License Header Notice in each file and
12 * include the full text of the License in the License file as well as the
15 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
17 * For Covered Software in this distribution, this License shall be governed by the
18 * laws of the State of California (excluding conflict-of-law provisions).
19 * Any litigation relating to this License shall be subject to the jurisdiction of
20 * the Federal Courts of the Northern District of California and the state courts
21 * of the State of California, with venue lying in Santa Clara County, California.
25 * If you wish your version of this file to be governed by only the CDDL or only
26 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
27 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
28 * license." If you don't indicate a single choice of license, a recipient has the
29 * option to distribute your version of this file under either the CDDL or the LGPL
30 * Version 2.1, or to extend the choice of license to its licensees as provided
31 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
32 * Version 2 license, then the option applies only if the new code is made subject
33 * to such option by the copyright holder.
44 * the GNU extension is not always available, so we invent another wheel.
47 getline(char *buf, size_t n, FILE* stream)
51 for (p = buf; p != end; ++p) {
52 int c = fgetc(stream);
53 if (c == '\n' || c == EOF)
66 getwords(char* buf, char** next)
69 char* delim = strstr(buf, " ");
71 cerr << "Unknown format in: " << buf << "." << endl;
80 get_wid(const char* word, const TLexicon& lexicon)
82 TLexicon::const_iterator lexi = lexicon.find(word);
84 if (lexi != lexicon.end()) {
87 cerr << "Error:\"" << word << "\" not found in lexicon." << endl;
94 CArpaSlm::TLeaf::load_words(char* buf, const TLexicon& lexicon)
98 for (word = end = buf; *end != 0; ++end) {
100 assert(nword < N_GRAM);
102 hw[nword++] = get_wid(word, lexicon);
107 wid = hw[nword++] = get_wid(word, lexicon);
113 CArpaSlm::TLeaf::load(istream& is, const TLexicon& lexicon)
116 is.getline(buf, sizeof(buf));
118 char* words = getwords(buf, &next);
119 load_words(words, lexicon);
120 sscanf(next, "%f (%1u, %u)",
125 CArpaSlm::TNode::load(istream& is, const TLexicon& lexicon)
128 is.getline(buf, sizeof(buf));
130 char* words = getwords(buf, &next);
131 load_words(words, lexicon);
132 sscanf(next, "%f %f (%1u, %u)",
133 &pr, &bow, &bol, &bon);
137 CArpaSlm::TNode::load_level0(istream& is)
141 is.getline(buf, sizeof(buf));
142 sscanf(buf, "%f %f (%1u, %u)",
143 &pr, &bow, &bol, &bon);
148 CArpaSlm::load(const char* filename, const TLexicon& lexicon)
150 printf("Loading ARPA slm..."); fflush(stdout);
151 ifstream file(filename);
153 for (int i = 0; i <= N_GRAM; ++i) {
156 file.getline(buf, sizeof(buf));
158 cerr << "Failed to read from" << filename << endl;
161 sscanf(buf, "\\%d-gram\\%d%*[\n]", &lvl, &size);
162 assert(lvl <= N_GRAM);
165 node0.load_level0(file);
166 m_levels[0].push_back(node0);
167 } else if (lvl < m_N) {
168 m_levels[lvl].reserve(size);
169 for (int i = 0; i < size; ++i) {
171 node.load(file, lexicon);
172 m_levels[lvl].push_back(node);
176 m_lastLevel.reserve(size);
177 for (int i = 0; i < size; ++i) {
179 leaf.load(file, lexicon);
180 m_lastLevel.push_back(leaf);
186 template <class NodeT>
188 const unsigned m_lvl;
189 CompareNode(unsigned lvl) : m_lvl(lvl)
193 * @return true if strictly less, false otherwise
196 operator ()(const NodeT& node, const TSIMWordId hw[N_GRAM])
198 for (unsigned i = 0; i < m_lvl; ++i) {
199 if (node.hw[i] < hw[i])
201 if (node.hw[i] > hw[i])
204 // node.hw[:lvl] is the same as hw[:]
210 CArpaSlm::threading()
213 TNode& node = m_levels[0][0];
216 for (unsigned lvl = 1; lvl < m_N; ++lvl) {
217 TNodeLevel& level = m_levels[lvl];
218 unsigned last_child = 0;
219 for (TNodeLevel::iterator node = level.begin();
222 node->ch = last_child = find_1st_child(lvl, *node, last_child);
228 CArpaSlm::find_1st_child(unsigned lvl, const TNode& node, int last_child)
231 if (lvl == m_N - 1) {
232 TLeafLevel::iterator found = lower_bound(
233 m_lastLevel.begin(), m_lastLevel.end(), node.hw,
234 CompareNode<TLeaf>(lvl));
235 return distance(m_lastLevel.begin(), found);
237 const TNodeLevel& level = m_levels[lvl + 1];
238 TNodeLevel::const_iterator found = lower_bound(level.begin(), level.end(
240 CompareNode<TNode>(lvl));
241 return distance(level.begin(), found);