3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
5 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
7 * The contents of this file are subject to the terms of either the GNU Lesser
8 * General Public License Version 2.1 only ("LGPL") or the Common Development and
9 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 * file except in compliance with the License. You can obtain a copy of the CDDL at
11 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
13 * specific language governing permissions and limitations under the License. When
14 * distributing the software, include this License Header Notice in each file and
15 * include the full text of the License in the License file as well as the
18 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
20 * For Covered Software in this distribution, this License shall be governed by the
21 * laws of the State of California (excluding conflict-of-law provisions).
22 * Any litigation relating to this License shall be subject to the jurisdiction of
23 * the Federal Courts of the Northern District of California and the state courts
24 * of the State of California, with venue lying in Santa Clara County, California.
28 * If you wish your version of this file to be governed by only the CDDL or only
29 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 * license." If you don't indicate a single choice of license, a recipient has the
32 * option to distribute your version of this file under either the CDDL or the LGPL
33 * Version 2.1, or to extend the choice of license to its licensees as provided
34 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 * Version 2 license, then the option applies only if the new code is made subject
36 * to such option by the copyright holder.
39 #ifndef _SUN_AGC_SLM_H
40 #define _SUN_AGC_SLM_H
46 #include "../portability.h"
51 * Thread slm makes the following modifications to the simple back-off language model
52 * -# Word ids are limited to 18 bits, about 240K word ids
53 * -# Compact all float values of -log(pr) into 65536 (16 bits)
54 * levels and use a table to map the index to a float value;
55 * -# Compact all float values of -log(bow) into 16384 (14 bits)
56 * levels and use a table to map the index to a float value;
57 * -# Threading information is embedded into the binary model file. Threading includes
58 * - bol (back-off level) from the current level
59 * - bon (back-off node)'s index in the bol level array
61 * The thread can be used:
62 * - when a leaf node is reached, it can use (bol,bon) as history for
64 * - when a word cannot be found among the children of the current node (cl, cn),
65 * searching can be transferred to (bol, bon) directly and continue
66 * searching for the target word
67 * -# Add a basic type TState to the language model; a state is a pair of\n
68 * (level, array_idx_of_the_level)
69 * -# Change all get-probability interfaces to\n
70 * double transfer(TState history, unsigned int wid, TState& result);
81 * (level:idx) locates a state in the language model very well.
82 * Please note the pseudo unigram state, with level == 0 but idx > 0;
83 * it is used with the bigram cache model.
86 TState(const TState &b) : m_all(b.m_all) {
88 TState(unsigned level = 0, unsigned idx = 0) {
89 anony.m_Level = level; anony.m_Idx = idx;
/** Pre-increment: bumps the index field by one and returns this state. */
92 TState& operator++() { anony.m_Idx += 1; return *this; }
/** Stores idx into the index bit-field of this state. */
94 void setIdx(unsigned idx) { anony.m_Idx = idx; }
/** Stores lvl into the level bit-field of this state. */
95 void setLevel(unsigned lvl) { anony.m_Level = lvl; }
/** @return the level bit-field of this state. */
97 unsigned getLevel() const { return anony.m_Level; }
/** @return the index bit-field of this state. */
98 unsigned getIdx() const { return anony.m_Idx; }
/** Implicit conversion to the raw m_all word (presumably the level/index fields viewed as one value). */
99 operator unsigned int() const { return m_all; }
/** @return true when the index of this state is 0 or 1. */
101 bool isTailState() const { return getIdx() < 2; }
103 bool operator==(const TState & b) const {
104 return m_all == b.m_all;
106 bool operator<(const TState & b) const {
107 return unsigned(*this) < unsigned(b);
112 #ifndef WORDS_BIGENDIAN
115 unsigned m_Level : 8;
119 unsigned m_Level : 8;
130 unsigned int wid() const {
134 unsigned int bow() const {
138 unsigned int pr() const {
142 unsigned int bon() const {
146 unsigned int bol() const {
150 unsigned int ch() const {
151 return((m_ch_hi << 16) + m_ch_lo);
154 void set_wid(unsigned int wid){
158 void set_bow(unsigned int bow){
162 void set_pr(unsigned int pr){
166 void set_bon(unsigned int bon){
170 void set_bol(unsigned int bol){
174 void set_ch(unsigned int ch){
175 m_ch_hi = ((ch >> 16) & 0x7F);
176 m_ch_lo = (ch & 0xFFFF);
180 #ifndef WORDS_BIGENDIAN
184 unsigned m_ch_lo : 16;
187 unsigned m_ch_hi : 7;
189 unsigned m_ch_hi : 7;
192 unsigned m_ch_lo : 16;
/** Builds a child index directly from its raw packed word. */
203 TChildIdx(unsigned int val) : m_all(val) {}
/** Copies the raw packed word of b. */
204 TChildIdx(const TChildIdx& b) : m_all(b.m_all) {}
/** Starts from an all-zero word, then fills the low and high fields. */
205 TChildIdx(unsigned int hi, unsigned int lo) : m_all(0) { anony.m_lo = lo; anony.m_hi = hi; }
/* Read-only accessors. They do not modify the object, so they are
 * const-qualified and can be called on const TChildIdx objects. */
/** @return the low field of the packed value. */
207 inline unsigned int lo() const { return anony.m_lo; }
/** @return the high field of the packed value. */
208 inline unsigned int hi() const { return anony.m_hi; }
/** @return the whole raw word. */
209 inline unsigned int all() const { return m_all; }
/** Stores lo in the low field; @return the value read back from that field. */
211 unsigned int set_lo(unsigned int lo) { anony.m_lo = lo; return anony.m_lo; }
/** Stores hi in the high field; @return the value read back from that field. */
212 unsigned int set_hi(unsigned int hi) { anony.m_hi = hi; return anony.m_hi; }
/** Overwrites the whole raw word; @return the new raw word. */
213 unsigned int set_all(unsigned int all) { m_all = all; return m_all; }
217 *#ifndef WORDS_BIGENDIAN
/** @return the stored word id field. */
239 unsigned int wid() const { return m_wid; }
/** @return the stored bon field (back-off node index, per the header comment). */
240 unsigned int bon() const { return m_bon; }
/** @return the stored bol field (back-off level, per the header comment). */
241 unsigned int bol() const { return m_bol; }
/** @return the pr index, rebuilt from its high part (shifted left by 14) and its low part. */
242 unsigned int pr() const { return m_pr_lo + (m_pr_hi << 14); }
/** Stores wid into the word id field. */
244 void set_wid(unsigned int wid) { m_wid = wid; }
/** Stores bon into the bon field. */
245 void set_bon(unsigned int bon) { m_bon = bon; }
/** Stores bol into the bol field. */
246 void set_bol(unsigned int bol) { m_bol = bol; }
/** Splits pr into two fields: bits 0..13 go to m_pr_lo, bits 14..15 to m_pr_hi; higher bits are dropped. */
247 void set_pr(unsigned int pr) { m_pr_lo = (pr & 0x3FFF);
248 m_pr_hi = ((pr >> 14) & 0x3); }
251 #ifndef WORDS_BIGENDIAN
253 unsigned m_pr_lo : 14;
256 unsigned m_pr_hi : 2;
258 unsigned m_pr_hi : 2;
261 unsigned m_pr_lo : 14;
/** Builds a TPr directly from its raw packed word. */
269 TPr(unsigned int val) : m_all(val) {}
/** Copies the raw packed word of b. */
270 TPr(const TPr& b) : m_all(b.m_all) {}
/** Starts from an all-zero word, then fills the high and low fields. */
271 TPr(unsigned int hi, unsigned int lo) : m_all(0) { anony.m_hi = hi; anony.m_lo = lo; }
/* Read-only accessors. They do not modify the object, so they are
 * const-qualified and can be called on const TPr objects. */
/** @return the low field of the packed value. */
273 inline unsigned int lo() const { return anony.m_lo; }
/** @return the high field of the packed value. */
274 inline unsigned int hi() const { return anony.m_hi; }
/** @return the whole raw word. */
275 inline unsigned int all() const { return m_all; }
/** Stores lo in the low field; @return the value read back from that field. */
277 unsigned int set_lo(unsigned int lo) { anony.m_lo = lo; return anony.m_lo; }
/** Stores hi in the high field; @return the value read back from that field. */
278 unsigned int set_hi(unsigned int hi) { anony.m_hi = hi; return anony.m_hi; }
/** Overwrites the whole raw word; @return the new raw word. */
279 unsigned int set_all(unsigned int all) { m_all = all; return m_all; }
283 #ifndef WORDS_BIGENDIAN
302 : m_N(0), m_UseLogPr(0), m_Levels(NULL), m_LevelSizes(NULL),
303 m_bowTable(NULL), m_prTable(NULL), m_bMMap(false), m_buf(NULL) { }
/** Destructor; all cleanup is delegated to the member function free(). */
305 ~CThreadSlm() { this->free(); }
308 load(const char* fname, bool MMap = false);
/** @return the m_UseLogPr flag (0 on a default-constructed object); presumably non-zero when the model stores log probabilities. */
310 unsigned int isUseLogPr() const
311 { const unsigned int flag = m_UseLogPr; return flag; }
317 transferNegLog(TState history, unsigned int wid, TState& result);
320 transfer(TState history, unsigned int wid, TState& result);
323 history_state_of(TState st);
326 historify(TState& st);
329 lastWordId(TState st);
333 rawTransfer(TState history, unsigned int wid, TState& result);
/** Alias for an untyped (void) pointer. */
336 typedef void* PtrVoid;
341 unsigned *m_LevelSizes;