3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
5 # Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
7 # The contents of this file are subject to the terms of either the GNU Lesser
8 # General Public License Version 2.1 only ("LGPL") or the Common Development and
9 # Distribution License ("CDDL")(collectively, the "License"). You may not use this
10 # file except in compliance with the License. You can obtain a copy of the CDDL at
11 # http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
12 # http://www.opensource.org/licenses/lgpl-license.php. See the License for the
13 # specific language governing permissions and limitations under the License. When
14 # distributing the software, include this License Header Notice in each file and
15 # include the full text of the License in the License file as well as the
18 # NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
20 # For Covered Software in this distribution, this License shall be governed by the
21 # laws of the State of California (excluding conflict-of-law provisions).
22 # Any litigation relating to this License shall be subject to the jurisdiction of
23 # the Federal Courts of the Northern District of California and the state courts
24 # of the State of California, with venue lying in Santa Clara County, California.
28 # If you wish your version of this file to be governed by only the CDDL or only
29 # the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
30 # include this software in this distribution under the [CDDL or LGPL Version 2.1]
31 # license." If you don't indicate a single choice of license, a recipient has the
32 # option to distribute your version of this file under either the CDDL or the LGPL
33 # Version 2.1, or to extend the choice of license to its licensees as provided
34 # above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
35 # Version 2 license, then the option applies only if the new code is made subject
36 # to such option by the copyright holder.
43 from imdict import IMDict
44 from trie import match_longest, get_ambiguious_length
45 from utils import read_ch_sentences
50 mmseg.py -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID] corpus_file
53 The dictionary file (in UTF-8 encoding) to be used.
55 Output format, can be 'text' or 'bin'. Default is 'bin'.
56 Normally, in text mode, word text are output, while in binary mode,
57 the integer of the word-ids are writed to stdout.
59 Show Id info. In text output format, attach id after known words.
61 Sentence token id. Default 10.
62 It will be write to output in binary mode after every sentence.
64 Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0),
65 The sequence ABC will not be segmented, in binary mode, the AMBI-ID
66 is written out; in text mode, <ambi>ABC</ambi> will be output. Default
# Option table shared by the functions in this file: parse_options() stores
# the parsed command-line values in it, and output_word() / process_file()
# read them back.  'show-id' starts out disabled.
70 options={'show-id': False,
def parse_options(args):
    """Parse command-line arguments into the module-level ``options`` dict.

    ``args`` is the argument list without the program name (the caller
    passes ``sys.argv[1:]``).  Recognised switches:

      -h / --help          print help and exit
      -d / --dict FILE     stored as options['dict']
      -i / --show-id       sets options['show-id'] to True
      -f / --format FMT    stored as options['format'] when FMT is
                           'bin' or 'text'; any other value is ignored
      -s / --stok-id INT   stored as options['stok-id']
      -a / --ambi-id INT   stored as options['ambi-id']

    The first positional argument, when present, is stored as
    options['corpus'].

    Exits through sys.exit() on an unknown switch, on -h, and when no
    dictionary was given.  A non-integer value for -s / -a raises
    ValueError from int().
    """
    # NOTE(review): usage() is assumed to be this module's help printer; its
    # definition lies outside the lines visible for this review -- confirm.
    try:
        opts, args = getopt.getopt(
            args, "hid:f:s:a:",
            ["help", "show-id", "dict=", "format=", "stok-id=", "ambi-id="])
    except getopt.GetoptError:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so the handler parses on every Python version.
        err = sys.exc_info()[1]
        sys.stderr.write('%s\n' % err)
        usage()
        sys.exit(1)

    for opt, val in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-d', '--dict'):
            options['dict'] = val
        elif opt in ('-i', '--show-id'):
            options['show-id'] = True
        elif opt in ('-f', '--format'):
            # Unknown format names are ignored; the current value stays.
            if val in ('bin', 'text'):
                options['format'] = val
        elif opt in ('-s', '--stok-id'):
            options['stok-id'] = int(val)
        elif opt in ('-a', '--ambi-id'):
            # getopt reports the short switch declared by "a:" as '-a'.
            # The previous test against '-val' could never match, so the
            # short form of the option was silently dropped.
            options['ambi-id'] = int(val)

    # A dictionary is mandatory; the corpus file is not.
    if 'dict' not in options:
        usage()
        sys.exit(1)

    if args:
        options['corpus'] = args[0]
def output_word(wid, word):
    """Write one token to stdout in the format selected by options['format'].

    Binary format (anything other than 'text'): ``wid`` is packed with the
    struct format 'I' and written; ``word`` is not used.

    Text format: ``word`` is written UTF-8 encoded and followed by a single
    space.  It is wrapped in <ambi>...</ambi> when ``wid`` equals
    options['ambi-id'], and gets "(wid)" appended when options['show-id']
    is set.
    """
    if options['format'] != 'text':
        sys.stdout.write(struct.pack('I', wid))
        return

    text = word
    if wid == options['ambi-id']:
        text = ''.join(('<ambi>', text, '</ambi>'))
    if options['show-id']:
        text = ''.join((text, '(', str(wid), ')'))
    sys.stdout.write('%s ' % text.encode('UTF-8'))
# Segment the text read from `file` and emit the result through
# output_word(), working on one item yielded by read_ch_sentences() at a time.
#   file: input object handed to read_ch_sentences().
#   dict: dictionary object handed to match_longest() and
#         get_ambiguious_length().
# NOTE(review): several statements of this function are not visible in the
# lines under review (how `strbuf` is taken from the sentence, the loop that
# advances through it, and the condition guarding the ambi-id override).
# Confirm the notes below against the complete function.
115 def process_file(file, dict):
116 for line in read_ch_sentences(file):
# Echo the sentence being processed to stderr, encoded as UTF-8.
117 print >> sys.stderr, line.encode('UTF-8')
# Look up a match at the start of strbuf (presumably the longest dictionary
# match): wid is the id passed on to output_word, l the matched length.
122 wid, l = match_longest(dict, strbuf)
# Length reported by get_ambiguious_length for the same position and match
# length (presumably the extent of an ambiguous span -- verify in trie).
126 ambi_len = get_ambiguious_length(dict, strbuf, l)
# Replace the match by the ambi-id token covering ambi_len characters.
128 wid, l = options['ambi-id'], ambi_len
# Emit the chosen id together with the first l characters of strbuf.
130 output_word (wid, strbuf[:l])
# Emit the sentence-token id (options['stok-id']) with a newline as its text.
133 output_word (options['stok-id'], '\n')
if __name__ == "__main__":
    # Script entry point: parse the command line, load the dictionary, open
    # the corpus (or stdin) as a UTF-8 text stream and segment it.
    parse_options(sys.argv[1:])

    # Local names that do not rebind the builtins `dict` and `file`.
    word_dict = IMDict(options['dict'])

    # options['corpus'] is only set when a positional argument was given;
    # without it the corpus is read from stdin.
    corpus_path = options.get('corpus')
    infile = None
    if corpus_path is not None:
        try:
            infile = codecs.open(corpus_path, "r", "UTF-8")
        except IOError:
            # Keep the historical fallback to stdin, but say so instead of
            # silently waiting for input, and let unrelated errors
            # (KeyboardInterrupt, programming mistakes) propagate.
            sys.stderr.write(
                "cannot open corpus file %r, reading from stdin instead\n"
                % corpus_path)
    if infile is None:
        infile = codecs.getreader('UTF-8')(sys.stdin)

    process_file(infile, word_dict)