3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
5 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
7 * The contents of this file are subject to the terms of either the GNU Lesser
8 * General Public License Version 2.1 only ("LGPL") or the Common Development and
9 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 * file except in compliance with the License. You can obtain a copy of the CDDL at
11 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
13 * specific language governing permissions and limitations under the License. When
14 * distributing the software, include this License Header Notice in each file and
15 * include the full text of the License in the License file as well as the
18 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
20 * For Covered Software in this distribution, this License shall be governed by the
21 * laws of the State of California (excluding conflict-of-law provisions).
22 * Any litigation relating to this License shall be subject to the jurisdiction of
23 * the Federal Courts of the Northern District of California and the state courts
24 * of the State of California, with venue lying in Santa Clara County, California.
28 * If you wish your version of this file to be governed by only the CDDL or only
29 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 * license." If you don't indicate a single choice of license, a recipient has the
32 * option to distribute your version of this file under either the CDDL or the LGPL
33 * Version 2.1, or to extend the choice of license to its licensees as provided
34 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 * Version 2 license, then the option applies only if the new code is made subject
36 * to such option by the copyright holder.
39 #ifndef _SUN_AGC_SLM_H
40 #define _SUN_AGC_SLM_H
46 #include "../portability.h"
51 * Thread slm makes the following modifications to the simple back-off language model
52 * -# Word ids are limited to 18 bits, about 240K word ids
53 * -# Compact all float values of -log(pr) into 65536 (16 bits)
54 * levels and use a table to map the index to a float value;
55 * -# Compact all float values of -log(bow) into 16384 (14 bits)
56 * levels and use a table to map the index to a float value;
57 * -# Threading information is embedded into the binary model file. Threading includes
58 * - bol(back-off-level) from current level
59 * - bon(back-off-node)'s index in the bol level array
61 * The thread could be used:
62 * - when a leaf node is reached, it could use (bol,bon) as history for
64 * - when a word cannot be found among the children of the current node (cl, cn),
65 * searching can be transferred to (bol, bon) directly and continue
66 * searching for the target word
67 * -# Add a basic type TState in Language model, a state is pair of\n
68 * (level, array_idx_of_the level)
69 * -# change all get probability interface to\n
70 * double transfer(TState& history, unsigned int wid, TState& result);
81 * (level:idx) located a state in the language model very well
82 * Please note the pseudo unigram state, with level == 0 but idx > 0;
83 * it is used with the bigram cache model
86 TState(const TState &b) : m_all(b.m_all) {
88 TState(unsigned level = 0, unsigned idx = 0) {
89 anony.m_Level = level; anony.m_Idx = idx;
// Prefix increment: adds one to the index field (anony.m_Idx) and returns
// a reference to this state. The level field is not touched.
92 TState& operator++() { ++anony.m_Idx; return *this; }
// Stores idx into the index field (anony.m_Idx).
94 void setIdx(unsigned int idx) { anony.m_Idx = idx; }
// Stores lvl into the level field (anony.m_Level).
95 void setLevel(unsigned int lvl) { anony.m_Level = lvl; }
// Returns the level field (anony.m_Level) of this state.
97 unsigned int getLevel() const { return anony.m_Level; }
// Returns the index field (anony.m_Idx) of this state.
98 unsigned int getIdx() const { return anony.m_Idx; }
// Implicit conversion to unsigned: yields the raw combined value m_all.
// operator< compares two states through this conversion.
99 operator unsigned() const { return m_all; }
// Reports whether the index field is at most 1 (i.e. getIdx() is 0 or 1).
// NOTE(review): why these two indices count as "tail" states is not visible
// in this header excerpt -- confirm against the model file layout.
101 bool isTailState() const { return getIdx() <= 1; }
103 bool operator==(const TState & b) const {
104 return m_all == b.m_all;
106 bool operator<(const TState & b) const {
107 return unsigned(*this) < unsigned(b);
109 TState& operator=(const TState& b) {
110 if (m_all == b.m_all)
119 #ifndef WORDS_BIGENDIAN
122 unsigned m_Level : 8;
126 unsigned m_Level : 8;
137 unsigned int wid() const {
141 unsigned int bow() const {
145 unsigned int pr() const {
149 unsigned int bon() const {
153 unsigned int bol() const {
157 unsigned int ch() const {
158 return((m_ch_hi << 16) + m_ch_lo);
161 void set_wid(unsigned int wid){
165 void set_bow(unsigned int bow){
169 void set_pr(unsigned int pr){
173 void set_bon(unsigned int bon){
177 void set_bol(unsigned int bol){
181 void set_ch(unsigned int ch){
182 m_ch_hi = ((ch >> 16) & 0x7F);
183 m_ch_lo = (ch & 0xFFFF);
187 #ifndef WORDS_BIGENDIAN
191 unsigned m_ch_lo : 16;
194 unsigned m_ch_hi : 7;
196 unsigned m_ch_hi : 7;
199 unsigned m_ch_lo : 16;
// Builds the value from a raw unsigned word, stored as-is in m_all.
// The constructor is not marked explicit, so an unsigned converts implicitly.
210 inline TChildIdx(unsigned val) : m_all(val) { }
// Copy constructor: copies the raw word m_all from b.
211 inline TChildIdx(const TChildIdx& b) : m_all(b.m_all) { }
// Builds the value from separate hi and lo parts. m_all is zeroed first,
// then anony.m_hi and anony.m_lo are assigned from the arguments.
212 inline TChildIdx(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi = hi; anony.m_lo = lo; }
// Returns the lo part (anony.m_lo).
// NOTE(review): not const-qualified, so it cannot be called on a const object.
214 inline unsigned int lo() { return anony.m_lo; }
// Returns the hi part (anony.m_hi).
// NOTE(review): not const-qualified, so it cannot be called on a const object.
215 inline unsigned int hi() { return anony.m_hi; }
// Returns the raw combined word m_all.
// NOTE(review): not const-qualified, so it cannot be called on a const object.
216 inline unsigned int all(){ return m_all; }
// Assigns lo to anony.m_lo and returns the result of that assignment.
218 inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); }
// Assigns hi to anony.m_hi and returns the result of that assignment.
219 inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); }
// Overwrites the raw combined word m_all with all and returns the new value.
220 inline unsigned int set_all(unsigned int all) { return (m_all = all); }
224 *#ifndef WORDS_BIGENDIAN
// Returns the stored word id field (m_wid).
246 inline unsigned int wid() const { return m_wid; }
// Returns the m_bon field; the file header describes "bon" as the
// back-off node's index in the back-off level array.
247 inline unsigned int bon() const { return m_bon; }
// Returns the m_bol field; the file header describes "bol" as the
// back-off level.
248 inline unsigned int bol() const { return m_bol; }
// Returns the pr value reassembled from its two stored pieces:
// m_pr_hi supplies the bits from bit 14 upward, m_pr_lo the low 14 bits.
249 inline unsigned int pr() const { return((m_pr_hi << 14) + m_pr_lo); }
// Stores wid into the m_wid field.
251 inline void set_wid(unsigned int wid) { m_wid = wid; }
// Stores bon into the m_bon field.
252 inline void set_bon(unsigned int bon) { m_bon = bon; }
// Stores bol into the m_bol field.
253 inline void set_bol(unsigned int bol) { m_bol = bol; }
// Splits pr across the two storage fields: bits 14-15 (masked with 0x3) go
// to m_pr_hi and the low 14 bits (masked with 0x3FFF) go to m_pr_lo.
// Any bits of pr above bit 15 are discarded by the masks.
254 inline void set_pr(unsigned int pr) { m_pr_hi = ((pr >> 14) & 0x3);
255 m_pr_lo = pr & 0x3FFF; }
258 #ifndef WORDS_BIGENDIAN
260 unsigned m_pr_lo : 14;
263 unsigned m_pr_hi : 2;
265 unsigned m_pr_hi : 2;
268 unsigned m_pr_lo : 14;
// Builds the value from a raw unsigned word, stored as-is in m_all.
// The constructor is not marked explicit, so an unsigned converts implicitly.
276 inline TPr(unsigned int val) : m_all(val) { }
// Copy constructor: copies the raw word m_all from b.
277 inline TPr(const TPr & b) : m_all(b.m_all) { }
// Builds the value from separate hi and lo parts. m_all is zeroed first,
// then anony.m_hi and anony.m_lo are assigned (in that order, joined by the
// comma operator).
278 inline TPr(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi=hi, anony.m_lo=lo; }
// Returns the lo part (anony.m_lo).
// NOTE(review): not const-qualified, so it cannot be called on a const object.
280 inline unsigned int lo() { return anony.m_lo; }
// Returns the hi part (anony.m_hi).
// NOTE(review): not const-qualified, so it cannot be called on a const object.
281 inline unsigned int hi() { return anony.m_hi; }
// Returns the raw combined word m_all.
// NOTE(review): not const-qualified, so it cannot be called on a const object.
282 inline unsigned int all(){ return m_all; }
// Assigns lo to anony.m_lo and returns the result of that assignment.
284 inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); }
// Assigns hi to anony.m_hi and returns the result of that assignment.
285 inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); }
// Overwrites the raw combined word m_all with all and returns the new value.
286 inline unsigned int set_all(unsigned int all) { return (m_all = all); }
290 #ifndef WORDS_BIGENDIAN
309 : m_N(0), m_UseLogPr(0), m_Levels(NULL), m_LevelSizes(NULL),
310 m_bowTable(NULL), m_prTable(NULL), m_bMMap(false), m_buf(NULL) { }
// Destructor: calls free() with no arguments.
// NOTE(review): presumably this is the class's own free() member releasing
// what load() acquired -- its declaration is not visible here; confirm.
312 ~CThreadSlm() { free(); }
315 load(const char* fname, bool MMap = false);
// Returns the m_UseLogPr member as an unsigned value (the constructor
// initializes it to 0).
317 unsigned isUseLogPr() const
318 { return m_UseLogPr; }
324 transferNegLog(TState history, unsigned int wid, TState& result);
327 transfer(TState history, unsigned int wid, TState& result);
330 history_state_of(TState st);
333 historify(TState& st);
336 lastWordId(TState st);
340 rawTransfer(TState history, unsigned int wid, TState& result);
343 typedef void* PtrVoid;
348 unsigned *m_LevelSizes;