2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
43 #include "pytrie_gen.h"
44 #include "../slm/slm.h"
45 #include "trie_writer.h"
// CUnigramSorter derives from CWordEvaluator and answers getCost()/isSeen()
// queries by delegating to its m_Model member, which open() fills from a file.
// NOTE(review): the declaration of m_Model is not visible in this excerpt;
// its use with CThreadSlm::TState below suggests a CThreadSlm -- confirm.
47 class CUnigramSorter : public CWordEvaluator {
// Cost of word id `wid`; defined out of line after the class.
50 getCost(unsigned int wid);
// Whether word id `wid` is known to the loaded model; defined out of line after the class.
53 isSeen(unsigned int wid);
// Loads the file named by `lm_file` into m_Model and hands back load()'s result unchanged.
56 open(const char* lm_file)
58 return m_Model.load(lm_file);
// Returns the value produced by m_Model.transferNegLog() for word id `wid`,
// evaluated from a freshly constructed TState(0, 0).
// NOTE(review): presumably this is the negative log probability of the word
// with no history (unigram cost) -- confirm against CThreadSlm.
73 CUnigramSorter::getCost(unsigned int wid)
// `st` is passed both as the starting state and as the result-state argument.
75 CThreadSlm::TState st(0, 0);
76 return m_Model.transferNegLog(st, wid, st);
// Reports whether the state reached after feeding word id `wid` to the model
// from TState(0, 0) has level 1.
// NOTE(review): presumably level 1 means the word exists as a unigram in the
// model, while another level means the model backed off -- confirm.
80 CUnigramSorter::isSeen(unsigned int wid)
82 CThreadSlm::TState st(0, 0);
// The returned cost is ignored here; the call is made only for the state it leaves in `st`.
83 m_Model.transferNegLog(st, wid, st);
84 //printf(" -log(pr(%d)) = %lf\n", wid, logpr);
85 return(st.getLevel() == 1);
89 * This program is used to generate the PINYIN Lexicon. It
90 * only works in the zh_CN.utf8 locale.\n
92 * -# dictionary file, in utf8 encoding, line-based text file,
93 * each line looks like\n
94 * CCC id [pinyin'pinyin'pinyin]*
95 * -# output binary PINYIN Lexicon file name
96 * -# log file to print the generated PINYIN Lexicon
97 * -# language model to sort the words of each node
// Emits the command-line help text: the option synopsis
// (-i, -o, -l, -s and the optional -e le|be) followed by a short description.
// NOTE(review): the output call itself is not visible in this excerpt;
// `progname` presumably fills the %s in the synopsis line -- confirm.
100 ShowUsage(const char* progname)
105 " %s -i lexicon_file -o result_file -l log_file -s slm_file [-e le|be]\n",
110 " This program is used to generate the PINYIN Lexicon. It Only works on zh_CN.utf8 locale\n"
116 main(int argc, char* argv[])
118 setlocale(LC_ALL, "");
120 const char* lexicon_file = NULL;
121 const char* result_file = NULL;
122 const char* log_file = NULL;
123 const char* slm_file = NULL;
124 int build_endian = get_host_endian();
126 while ((opt = getopt(argc, argv, "i:o:l:s:e:")) != -1) {
129 lexicon_file = optarg;
132 result_file = optarg;
140 build_endian = parse_endian(optarg);
144 if (!lexicon_file || !result_file || !log_file || !slm_file ||
145 build_endian == -1) {
149 printf("Opening language model..."); fflush(stdout);
151 if (!srt.open(slm_file)) {
155 printf("done!\n"); fflush(stdout);
157 CPinyinTrieMaker maker;
159 maker.constructFromLexicon(lexicon_file);
161 printf("Writing out..."); fflush(stdout);
162 maker.write(result_file, &srt, get_host_endian() != build_endian);
163 printf("done!\n"); fflush(stdout);
167 if (get_host_endian() != build_endian) {
168 printf("host endian is different from build endian. "
169 "log_file will not be written.\n");
174 printf("Printing the lexicon out to log_file..."); fflush(stdout);
178 FILE *fp = fopen(log_file, "w");