2 # -*- coding: UTF-8 -*-
4 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
6 # Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
8 # The contents of this file are subject to the terms of either the GNU Lesser
9 # General Public License Version 2.1 only ("LGPL") or the Common Development and
10 # Distribution License ("CDDL")(collectively, the "License"). You may not use this
11 # file except in compliance with the License. You can obtain a copy of the CDDL at
12 # http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
13 # http://www.opensource.org/licenses/lgpl-license.php. See the License for the
14 # specific language governing permissions and limitations under the License. When
15 # distributing the software, include this License Header Notice in each file and
16 # include the full text of the License in the License file as well as the
19 # NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
21 # For Covered Software in this distribution, this License shall be governed by the
22 # laws of the State of California (excluding conflict-of-law provisions).
23 # Any litigation relating to this License shall be subject to the jurisdiction of
24 # the Federal Courts of the Northern District of California and the state courts
25 # of the State of California, with venue lying in Santa Clara County, California.
29 # If you wish your version of this file to be governed by only the CDDL or only
30 # the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
31 # include this software in this distribution under the [CDDL or LGPL Version 2.1]
32 # license." If you don't indicate a single choice of license, a recipient has the
33 # option to distribute your version of this file under either the CDDL or the LGPL
34 # Version 2.1, or to extend the choice of license to its licensees as provided
35 # above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
36 # Version 2 license, then the option applies only if the new code is made subject
37 # to such option by the copyright holder.
49 def __init__(self, key, freq):
def __cmp__(self,other):
    """Three-way comparison of two n-grams, ordered by their ``key``.

    Returns -1, 0 or 1 when this object's key sorts before, equal to,
    or after ``other.key``.  Only ``key`` takes part in the comparison.
    """
    mine, theirs = self.key, other.key
    # The cmp() builtin does not exist in Python 3 (calling it raises
    # NameError); the difference of the two boolean tests produces the
    # same -1/0/1 answer on every Python version.
    return (mine > theirs) - (mine < theirs)
57 return "ngram: " + self.key.__str__() + " freq: " + str(self.freq)
59 def read_ch_sentences(file):
62 if buf and (line[0].isspace() or len(buf) <= 40):
67 if ch.isspace() or ch == u'—':
80 def mergesort (iters):
85 heap.append((it.next(), it))
96 heapq.heapreplace(heap, (it.next(),it))
100 def read_ngrams (fname, n):
101 file = open(fname, "r")
102 fsize = os.path.getsize(fname)
103 mem = mmap.mmap(file.fileno(), fsize, mmap.MAP_SHARED, mmap.PROT_READ)
106 ngram = mem.read((n+1)*4)
108 data = struct.unpack('%dl' % (n+1), ngram)
109 yield NGram(data[:n], data[n])
# Defaults for the backing file object and its memory map: both start unset.
__file = __mem = None
# Both sizes are counted in elements, not bytes: __realsize is the number of
# elements in use, __capsize the number of elements the mapping has room for.
__realsize = __capsize = 0
120 def __init__(self, elmsize=1, fname=None, capsize=1024*1024):
121 self.__elmsize = elmsize
124 fno, self.__fname = tempfile.mkstemp("-mmarray", "pyslm-")
125 self.__file = os.fdopen (fno, "w+")
126 self.__enlarge(capsize)
130 def fromfile(self, fname):
131 if not os.path.exists(fname):
132 raise "The file '%s' does not exist!"
134 fsize = os.path.getsize(fname)
136 raise "The size of file '%s' is zero!" % fname
138 if self.__mem: self.__mem.close()
139 if self.__file: self.__file.close()
141 self.__file = open (fname, "r+")
142 self.__mem = mmap.mmap(self.__file.fileno(), fsize)
143 self.__realsize = self.__capsize = fsize/self.__elmsize
145 def tofile(self, fname):
146 if fname == self.__file.name:
147 raise "Can not dump the array to currently mapping file!"
148 tf = open(fname, "w+")
149 bsize = self.__realsize * self.__elmsize
150 tf.write (self.__mem[:bsize])
153 def __enlarge(self, capsize):
154 if self.__capsize >= capsize:
157 self.__capsize = capsize
158 self.__file.seek(self.__elmsize * self.__capsize - 1)
159 self.__file.write('\0')
162 if (self.__mem): self.__mem.close()
163 self.__mem = mmap.mmap(self.__file.fileno(), self.__file.tell())
166 bsize = self.__realsize * self.__elmsize
167 self.__file.truncate (bsize)
169 if self.__mem: self.__mem.close()
170 os.remove(self.__fname)
172 def __getitem__(self, idx):
173 if idx < -self.__realsize or idx >= self.__realsize:
175 return self.__access(idx)
177 def __setitem__(self, idx, buf):
178 if idx < -self.__realsize or idx >= self.__realsize:
180 if type(buf) != type("") or len(buf) != self.__elmsize:
181 raise "Not a string, or the buffer size is incorrect!"
182 self.__access(idx, buf)
def __access (self, idx, buf=None):
    """Read or overwrite one fixed-size element of the mapping.

    A negative ``idx`` is taken relative to the current element count.
    With a falsy ``buf`` (the default ``None`` included) the element's
    bytes are returned; otherwise ``buf`` is stored in the element's
    slot and nothing is returned.  No range check is made here.
    """
    position = self.__realsize + idx if idx < 0 else idx
    # Each element occupies __elmsize consecutive bytes of the mapping.
    first = position * self.__elmsize
    last = first + self.__elmsize
    if buf:
        self.__mem[first:last] = buf
        return None
    return self.__mem[first:last]
192 return self.__realsize
194 def append(self, buf):
195 if type(buf) != type("") or len(buf) != self.__elmsize:
196 raise "Not a string, or the buffer size is incorrect!"
198 if self.__realsize >= self.__capsize:
199 self.__enlarge(self.__capsize*2)
201 self.__access(self.__realsize, buf)
205 for i in xrange(0, self.__realsize):
206 yield self.__access(i)
def truncate(self, tsize):
    """Lower the logical element count to ``tsize``.

    When ``tsize`` is greater than the current count, the count is left
    unchanged.
    """
    if self.__realsize >= tsize:
        # Only the element counter is lowered here.
        self.__realsize = tsize