2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
// Usage text written to cerr: the command synopsis first, then one line each
// for the -s default, the -v flag and the -e flag.
// NOTE(review): the enclosing function header and whatever follows the last
// message are not visible in this excerpt.
59 cerr << "getWordFreq [-s corpus_size] [-v] [-e] -m slm_file -l lexicon\n";
60 cerr << " default corpus_size is 300000000 if not given\n";
// NOTE(review): the next line is a bare string literal; the statement that
// opens it is not visible here (presumably a `cerr <<` split across two
// lines) -- confirm against the full file.
62 " -v means output other information after word and freq for each line\n";
63 cerr << " -e give format for ervin\n";
// File-scope command-line settings. getParameters() assigns slm_file,
// lexicon_file, corpus_size and verbose; tagFile() reads corpus_size and
// verbose, and main() reads slm_file and lexicon_file.

// Language-model path handed to slm.load() in main(); NULL until an option
// supplies it (the usage text names it as the -m argument).
67 static char* slm_file = NULL;
// Lexicon path opened with fopen(..., "r") in main(); NULL until an option
// supplies it (the usage text names it as the -l argument).
68 static char* lexicon_file = NULL;
// Multiplier applied to exp(-neglogpr) in tagFile() to produce an integer
// frequency. The default matches the value quoted in the usage text.
69 static int corpus_size = 300000000;
// When true (and an id string was parsed), tagFile() prints the id string
// between the word and its frequency.
70 static bool verbose = false;
// The usage text describes -e as "give format for ervin".
// NOTE(review): the statements that set and test this flag are not visible
// in this excerpt.
71 static bool ervin = false;
// Parses the command line with getopt() and stores the results in the
// file-scope settings slm_file, lexicon_file, corpus_size and verbose.
//
// argc, argv: the arguments as received by main().
// Result: truthy only when both slm_file and lexicon_file were supplied and
// corpus_size is greater than 10.
//
// NOTE(review): the return type, the switch/case labels and the handling of
// unknown options are not visible in this excerpt; the option-to-variable
// mapping noted below is taken from the usage text -- confirm.
74 getParameters(int argc, char* argv[])
// Option string "m:l:s:ve": m, l and s take an argument; v and e are flags.
77 while ((ch = getopt(argc, argv, "m:l:s:ve")) != -1) {
// strdup() keeps a private copy of the option argument; no matching free()
// appears in the visible code.
80 slm_file = strdup(optarg); break;
82 lexicon_file = strdup(optarg); break;
// atoi() reports no errors: a non-numeric argument yields 0, which is then
// rejected by the `corpus_size > 10` test in the return expression.
84 corpus_size = atoi(optarg); break;
86 verbose = true; break;
// Both file names are mandatory; a corpus size of 10 or less is treated as
// invalid.
93 return(slm_file && lexicon_file && corpus_size > 10);
// Line buffer used by tagFile(): each fgets() call stores at most
// sizeof(buf) - 1 characters here, so a longer input line is delivered in
// several reads. strtok() then splits the contents in place.
96 static char buf[8192];
// Reads fp line by line, splits each line into a word, an id string and the
// remainder of the line, derives a frequency for the id from the language
// model, and writes the word with its frequency to cout in one of three
// layouts.
//
// fp:  input stream, read with fgets() until it returns NULL.
// slm: language model queried through slm.transfer().
//
// NOTE(review): several lines of this function (return type, the
// declarations of freq and pyv, the conditions selecting the first output
// layout, closing braces) are not visible in this excerpt.
99 tagFile(FILE *fp, CThreadSlm& slm)
102 while (fgets(buf, sizeof(buf), fp) != NULL) {
// strtok() tokenizes buf in place: the first two tokens end at any
// whitespace, the third takes everything up to the end of the line.
104 char* wrd = strtok(buf, "\n\r \t");
105 char* idstr = strtok(NULL, "\n\r \t");
106 char* info = strtok(NULL, "\n\r");
// NOTE(review): idstr is NULL when the line has fewer than two tokens, and
// atoi(NULL) is undefined behaviour. A guard may sit on a line not visible
// here (the later `idstr && verbose` test shows NULL is expected) -- confirm.
108 int id = atoi(idstr);
// A freshly declared state object is passed as both the first and the third
// argument of transfer().
// NOTE(review): presumably st starts at the model's root state and is
// overwritten with the state reached after `id` -- confirm against
// CThreadSlm.
110 CThreadSlm::TState st;
111 double neglogpr = slm.transfer(st, (unsigned int)id, st);
// A frequency is computed only when the resulting state reports level 1.
// NOTE(review): presumably level 1 means the id was found in the model as a
// single word -- confirm.
112 if (st.getLevel() == 1) {
// The return value is used as a negative natural-log probability:
// exp(-neglogpr) scaled by corpus_size, with int() truncating toward zero.
113 freq = int(exp(-neglogpr) * corpus_size);
// Walks the whitespace-separated tokens of info.
// NOTE(review): the continuation call's delimiter string is " \t\n\t" while
// the initial call uses " \t\n\r" -- the second repeats \t and omits \r,
// which looks like a typo. It has no effect here because info was already
// cut at the first \n or \r by the strtok() call that produced it.
121 for (char *p = strtok(info, " \t\n\r");
123 p = strtok(NULL, " \t\n\t"))
// First layout: one output line per entry of pyv -- word, entry, frequency.
126 for (int i = 0, sz = pyv.size(); i < sz; ++i) {
127 cout << wrd << " " << pyv[i] << " " << freq << "\n";
// Verbose layout: word, id string, frequency. No newline is written by this
// statement; the rest of the output line is produced on lines not visible
// here.
129 } else if (idstr && verbose) {
130 cout << wrd << " " << idstr << " " << freq;
// Plain layout: word and frequency only.
135 cout << wrd << " " << freq << "\n";
142 main(int argc, char*argv[])
144 if (!getParameters(argc, argv))
149 if (slm.load(slm_file, true) && (fp = fopen(lexicon_file, "r")) != NULL) {