2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
41 #include "pinyin_seg.h"
42 #include "quanpin_trie.h"
// Functor call: scans m_correctionPairs for a pair whose first string is a
// suffix of pystr.
// NOTE(review): the return statements and any assignment to matched_len are
// not visible in this view -- confirm what is returned on a match and on a miss.
45 CGetCorrectionPairOp::operator ()(std::string& pystr, unsigned& matched_len)
47 CCorrectionPairVec::iterator it = m_correctionPairs.begin();
48 CCorrectionPairVec::iterator ite = m_correctionPairs.end();
// Visit every correction pair; k is the pair's first string, v its second.
50 for (; it != ite; ++it) {
51 std::string& k = it->first;
52 std::string& v = it->second;
53 unsigned l = k.size();
// Suffix test: compare() yields 0 when the last l characters of pystr equal k.
// The size check keeps pystr.size() - l from wrapping around.
55 if (pystr.size() >= l && !pystr.compare(pystr.size() - l, l, k)) {
// Fills m_fuzzyFinalMap, m_fuzzyPreMap and m_fuzzyProMap from the flat
// unsigned tables handed out by CPinyinData.
65 CGetFuzzySegmentsOp::_initMaps()
67 unsigned num_of_fuzzy_finals;
// NOTE(review): the argument list of this call is not visible here;
// presumably it fills num_of_fuzzy_finals -- confirm.
68 const unsigned * fuzzy_final_map = CPinyinData::getInnerFuzzyFinalMap(
// Each table entry is three consecutive values (f, _f, l), stored as
// f -> (_f, l).
71 for (size_t i = 0; i < num_of_fuzzy_finals; ++i) {
72 unsigned f = *(fuzzy_final_map++);
73 unsigned _f = *(fuzzy_final_map++);
74 unsigned l = *(fuzzy_final_map++);
76 m_fuzzyFinalMap.insert(std::make_pair(f, std::make_pair(_f, l)));
79 const unsigned *fuzzy_pre_syls, *fuzzy_pro_syls;
80 CPinyinData::getFuzzyPreProSyllables(&fuzzy_pre_syls, &fuzzy_pro_syls);
// The "pre" table is read as (s, c, _s) triples until a zero value is found
// at the start of a triple; the middle value is narrowed to char.
82 while (*fuzzy_pre_syls) {
83 unsigned s = *(fuzzy_pre_syls++);
84 char c = *(fuzzy_pre_syls++);
85 unsigned _s = *(fuzzy_pre_syls++);
86 m_fuzzyPreMap.insert(std::make_pair(s, std::make_pair(c, _s)));
// The "pro" table uses the same triple layout and the same zero terminator.
89 while (*fuzzy_pro_syls) {
90 unsigned s = *(fuzzy_pro_syls++);
91 char c = *(fuzzy_pro_syls++);
92 unsigned _s = *(fuzzy_pro_syls++);
93 m_fuzzyProMap.insert(std::make_pair(s, std::make_pair(c, _s)));
// Drops trailing entries of fuzzy_segs, examined two at a time from the back,
// and returns the m_start of the earliest pair it visited past the cut-off
// test. The result stays UINT_MAX when invalidatedFrom is never assigned.
98 CGetFuzzySegmentsOp::_invalidateSegments(IPySegmentor::TSegmentVec& fuzzy_segs,
99 IPySegmentor::TSegment& seg)
101 unsigned invalidatedFrom = UINT_MAX;
103 IPySegmentor::TSegmentVec::reverse_iterator it = fuzzy_segs.rbegin();
104 IPySegmentor::TSegmentVec::reverse_iterator ite = fuzzy_segs.rend();
// Step by 2: the loop treats fuzzy_segs as consecutive pairs and so assumes
// an even number of elements.
106 for (; it != ite; it += 2) {
// With reverse iterators, *(it + 1) is the element that precedes *it in
// forward order, so seg1 is the first of the pair and seg2 the second.
107 IPySegmentor::TSegment& seg1 = *(it + 1);
108 IPySegmentor::TSegment& seg2 = *it;
// r is the index one past the end of the pair's second segment.
110 unsigned r = seg2.m_start + seg2.m_len;
// NOTE(review): the statement controlled by this test is not visible here;
// presumably it leaves the loop once a pair ends at or before seg -- confirm.
111 if (r <= seg.m_start)
114 invalidatedFrom = seg1.m_start;
// it.base() is the forward iterator just after the element `it` designates,
// so this erases every pair the loop stepped over.
117 fuzzy_segs.erase(it.base(), fuzzy_segs.end());
119 return invalidatedFrom;
// Functor call: after the last element of segs has changed, prunes stale
// entries from fuzzy_segs and appends alternative two-segment readings for the
// tail of segs. Returns the smaller of the position where fuzzy segments were
// invalidated and the position where new ones were added.
// NOTE(review): the third parameter (used below as `input`) is declared on a
// line not visible in this view.
123 CGetFuzzySegmentsOp::operator ()(IPySegmentor::TSegmentVec& segs,
124 IPySegmentor::TSegmentVec& fuzzy_segs,
127 IPySegmentor::TSegment& seg = segs.back();
128 unsigned invalidatedFrom = _invalidateSegments(fuzzy_segs, seg);
// UINT_MAX means "nothing added"; std::min at the end then returns
// invalidatedFrom.
130 unsigned updatedFrom = UINT_MAX;
131 TSyllable syl = (TSyllable)seg.m_syllables[0];
// Case 1: split the last segment in two, keyed on the final of its first
// syllable.
133 if (m_bInnerFuzzyEnabled) { // xian -> xian, xi'an
134 CInnerFuzzyFinalMap::iterator it = m_fuzzyFinalMap.find(syl.final);
136 if (it != m_fuzzyFinalMap.end()) {
// The mapped pair supplies the syllable value and the character length of the
// second half.
137 unsigned an_syl = it->second.first;
138 unsigned an_len = it->second.second;
// The first half takes whatever length remains.
140 unsigned xi_len = seg.m_len - an_len;
141 wstring wstr = input.substr(seg.m_start, xi_len);
// NOTE(review): a range construction and a push_back loop over the same wstr
// are both visible here. They are presumably alternative preprocessor branches
// whose directives are not visible; if both ran, xi_str would hold the text
// twice -- confirm.
144 std::string xi_str(wstr.begin(), wstr.end());
147 for (wstring::iterator it = wstr.begin(); it != wstr.end(); ++it)
148 xi_str.push_back(*it);
151 unsigned xi_syl = CPinyinData::encodeSyllable(xi_str.c_str());
// First half: a copy of the last segment with its syllable replaced.
// NOTE(review): no m_len adjustment for xi or an is visible here -- confirm.
156 IPySegmentor::TSegment xi = segs.back();
158 xi.m_syllables[0] = xi_syl;
// Second half: starts xi_len characters later and is flagged as inner-fuzzy.
160 IPySegmentor::TSegment an = segs.back();
162 an.m_start += xi_len;
163 an.m_syllables[0] = an_syl;
164 an.m_inner_fuzzy = true;
166 fuzzy_segs.push_back(xi);
167 fuzzy_segs.push_back(an);
169 updatedFrom = xi.m_start;
// Case 2: re-read the last two segments with a different boundary.
174 if (segs.size() >= 2) { // fangan -> fang'an, fan'gan
175 IPySegmentor::TSegment& pre_seg = *(segs.end() - 2);
177 CFuzzySyllableMap::iterator pre_it = m_fuzzyPreMap.find(
178 pre_seg.m_syllables[0]);
179 CFuzzySyllableMap::iterator it = m_fuzzyProMap.find(syl);
// Both lookups must succeed and the first members of their mapped pairs must
// be equal.
181 if (pre_it != m_fuzzyPreMap.end() && it != m_fuzzyProMap.end() &&
182 pre_it->second.first == it->second.first) {
183 IPySegmentor::TSegment fang = segs[segs.size() - 2];
185 fang.m_syllables[0] = pre_it->second.second;
187 IPySegmentor::TSegment an = segs.back();
// NOTE(review): any m_start / m_len adjustment for fang and an is not visible
// here -- confirm.
190 an.m_syllables[0] = it->second.second;
192 fuzzy_segs.push_back(fang);
193 fuzzy_segs.push_back(an);
195 updatedFrom = fang.m_start;
202 return std::min(updatedFrom, invalidatedFrom);
// Default constructor: the three optional operator pointers start as NULL, and
// m_pytrie is built from the base/check/value arrays together with the element
// count of base.
// NOTE(review): the initializer list continues past the trailing comma on
// lines not visible in this view.
206 CQuanpinSegmentor::CQuanpinSegmentor ()
207 : m_pGetFuzzySyllablesOp(NULL),
208 m_pGetCorrectionPairOp(NULL),
209 m_pGetFuzzySegmentsOp(NULL),
210 m_pytrie(base, check, value, sizeof(base) / sizeof(*base)),
// Loads the pinyin trie from the given file by delegating to m_pytrie.load()
// and returns that call's result unchanged.
217 CQuanpinSegmentor::load(const char * pyTrieFileName)
219 return m_pytrie.load(pyTrieFileName);
// Debug helper: writes the characters of pystr to stdout with printf.
// NOTE(review): the loop's increment clause is on a line not visible here.
224 print_pystr(const std::string pystr)
226 for (const char* c = pystr.c_str();
227 c != pystr.c_str() + pystr.length();
// Bit 0x80 is cleared so that only the low seven bits of each char are printed.
229 printf("%c", *c & 0x7f);
// Appends one input character. If a correction-pair operator is installed and
// enabled, a corrected re-segmentation is tried first; otherwise the character
// goes straight to _push(). The result is stored in m_updatedFrom and returned.
// NOTE(review): several lines of this body are not visible in this view (the
// declaration of l, the test on v, else branches, closing braces), so the
// branch structure described below is partly inferred -- confirm.
236 CQuanpinSegmentor::push(unsigned ch)
238 m_inputBuf.push_back(ch);
240 if (m_pGetCorrectionPairOp && m_pGetCorrectionPairOp->isEnabled()) {
// ch is appended temporarily so the operator can inspect the tail of m_pystr
// including the new character.
241 m_pystr.push_back(ch);
243 const char * v = (*m_pGetCorrectionPairOp)(m_pystr, l);
// Remember the segment count, cut m_pystr.size() - l back via _clear(), and
// feed the replacement text v through _updateWith().
246 unsigned orig_size = m_segs.size();
247 _clear(m_pystr.size() - l);
248 m_updatedFrom = _updateWith(v);
250 if (m_segs.size() >= orig_size) {
251 // does not get better segmentation, revert to original
252 _clear(m_pystr.size() - strlen(v));
// Rebuild from the last l characters of the raw input buffer.
253 std::string new_pystr;
254 std::copy(m_inputBuf.end() - l, m_inputBuf.end(),
255 back_inserter(new_pystr));
256 m_updatedFrom = _updateWith(new_pystr);
// The replacement and the original text differ in length: the last segment's
// length is adjusted by the difference and m_pystr is resized to the length of
// m_inputBuf.
258 if (l != strlen(v)) {
260 m_segs.back().m_len += l - strlen(v);
261 m_pystr.resize(m_inputBuf.length());
// NOTE(review): the destination argument of this copy is not visible here.
263 std::copy(m_inputBuf.end() - l, m_inputBuf.end(),
266 return m_updatedFrom;
// Remove the last character of m_pystr again; _push() below appends ch itself.
269 m_pystr.resize(m_pystr.size() - 1);
272 return m_updatedFrom = _push(ch);
// Removes the last input character and re-segments what is left of the last
// segment. The result is stored in m_updatedFrom and returned.
// NOTE(review): the conditions guarding the two early returns, and any removal
// of the last element of m_segs, are on lines not visible here -- confirm.
276 CQuanpinSegmentor::pop()
279 return m_updatedFrom = 0;
// Shorten both buffers by one character; size keeps the length before removal.
281 unsigned size = m_inputBuf.size();
282 m_inputBuf.resize(size - 1);
283 m_pystr.resize(size - 1);
285 unsigned l = m_segs.back().m_len;
289 return m_updatedFrom = size - 1;
// size - l is the index where the last segment began. Take the remaining l - 1
// characters from there, cut m_pystr back to that index, and push them again.
291 std::string new_pystr = m_pystr.substr(size - l);
292 m_pystr.resize(size - l);
294 m_updatedFrom = _updateWith(new_pystr);
296 return m_updatedFrom;
// Inserts ch at position idx of both buffers and re-segments from the located
// position onward. The result is stored in m_updatedFrom and returned.
// NOTE(review): the declarations of i and j are not visible here. Judging by
// their use below, i is a string index and j a segment index set by
// _locateSegment -- confirm.
300 CQuanpinSegmentor::insertAt(unsigned idx, unsigned ch)
303 _locateSegment(idx, i, j);
305 m_inputBuf.insert(idx, 1, ch);
306 m_pystr.insert(idx, 1, ch);
// Everything from string index i onward is replayed below.
// NOTE(review): no truncation of m_pystr to length i is visible, although
// _push() appends to m_pystr -- confirm it happens on a line not visible here.
308 std::string new_pystr = m_pystr.substr(i);
// Drop segment j and all segments after it.
310 m_segs.erase(m_segs.begin() + j, m_segs.end());
312 m_updatedFrom = _updateWith(new_pystr);
314 return m_updatedFrom;
// Erases one character from both buffers and re-segments from the located
// position onward. The result is stored in m_updatedFrom and returned.
// NOTE(review): the declarations of i and j are not visible here. Judging by
// their use below, i is a string index and j a segment index set by
// _locateSegment -- confirm.
318 CQuanpinSegmentor::deleteAt(unsigned idx, bool backward)
// When backward is false the character erased is the one at idx + 1.
321 if (!backward) idx += 1;
322 _locateSegment(idx, i, j);
324 m_inputBuf.erase(idx, 1);
325 m_pystr.erase(idx, 1);
// Everything from string index i onward is replayed below.
// NOTE(review): no truncation of m_pystr to length i is visible, although
// _push() appends to m_pystr -- confirm it happens on a line not visible here.
327 std::string new_pystr = m_pystr.substr(i);
// Drop segment j and all segments after it.
329 m_segs.erase(m_segs.begin() + j, m_segs.end());
331 m_updatedFrom = _updateWith(new_pystr);
333 return m_updatedFrom;
// Truncates the raw input buffer to `from` characters.
// NOTE(review): the rest of this function, including its return statement, is
// on lines not visible in this view.
337 CQuanpinSegmentor::clear(unsigned from)
339 m_inputBuf.resize(from);
// Discards segmentation at and after position `from`, replaying the part of
// the located segment that lies before `from`. The result is stored in
// m_updatedFrom and returned.
// NOTE(review): the declarations of i and j, and any truncation of m_pystr,
// are on lines not visible here -- confirm.
344 CQuanpinSegmentor::_clear(unsigned from)
347 _locateSegment(from, i, j);
// The characters between string index i and `from`.
350 std::string new_pystr = m_pystr.substr(i, from - i);
// Drop segment j and all segments after it.
352 m_segs.erase(m_segs.begin() + j, m_segs.end());
// `from` is also passed as the starting minimum for the returned position.
354 m_updatedFrom = _updateWith(new_pystr, from);
356 return m_updatedFrom;
// Walks m_segs while summing segment lengths, testing for the first segment
// whose character range extends past idx.
// NOTE(review): the remaining parameters (callers pass two more arguments),
// the initialisation of strIdx, and the statement under the `if` are on lines
// not visible here -- confirm how the result is reported.
360 CQuanpinSegmentor::_locateSegment(unsigned idx,
366 TSegmentVec::iterator it = m_segs.begin();
367 TSegmentVec::iterator ite = m_segs.end();
369 for (; it != ite; ++it) {
// strIdx is the running start offset of the current segment. The test is true
// when the current segment ends after idx.
370 if (strIdx + (*it).m_len > idx)
373 strIdx += (*it).m_len;
// Appends ch to m_pystr and updates m_segs to match: a new segment is added,
// the last segment is extended, or several trailing segments are rebuilt.
// `ret` is assigned the string position from which segmentation changed.
// NOTE(review): the declarations of l and ret, the final return, and several
// closing braces / else branches are on lines not visible in this view.
379 CQuanpinSegmentor::_push(unsigned ch)
382 m_pystr.push_back(ch);
// The trie is queried with reverse iterators, so m_pystr is read from its last
// character backwards; l is presumably set to the length matched -- confirm.
383 int v = m_pytrie.match_longest(m_pystr.rbegin(), m_pystr.rend(), l);
385 if (l == 0) { // not a valid syllable character, e.g., \', i, u, or A-Z
// Classify the unmatched character: an apostrophe that is not the first input
// character, a lowercase letter, or anything else.
386 IPySegmentor::ESegmentType seg_type;
387 if (ch == '\'' && m_inputBuf.size() > 1)
388 seg_type = IPySegmentor::SYLLABLE_SEP;
389 else if (islower(ch))
390 seg_type = IPySegmentor::INVALID;
392 seg_type = IPySegmentor::STRING;
// The character becomes a one-character segment of that type at the end.
394 ret = m_pystr.size() - 1;
395 m_segs.push_back(TSegment(ch, ret, 1, seg_type));
396 } else if (l == 1) { // possible a new segment
// last_idx is signed so it becomes -1, failing the check below, when m_pystr
// holds only the new character.
397 int last_idx = m_pystr.size() - 2;
398 if (last_idx >= 0 && (m_pystr[last_idx] & 0x80)) {
399 // check if the last syllable character's highest bitmask is set
400 // e.g., feN, so [feN] + g -> [feng]
401 m_pystr[last_idx] &= 0x7f;
// Retry the match with the mark removed. This inner v shadows the outer one.
403 int v = m_pytrie.match_longest(m_pystr.rbegin(), m_pystr.rend(), l);
405 TSegment &last_seg = m_segs.back();
// The retry matched exactly the last segment plus the new character.
406 if (l == (unsigned) last_seg.m_len + 1) {
// NOTE(review): the update of last_seg.m_len and the exit from this branch are
// on lines not visible here.
408 last_seg.m_syllables[0] = v;
409 ret = m_pystr.size() - l;
413 // in case not extensible, change highest bitmask back
414 m_pystr[last_idx] |= 0x80;
417 // push the new 1-length segment
418 ret = m_pystr.size() - 1;
419 m_segs.push_back(TSegment(v, ret, 1));
420 } else if (l == (unsigned) m_segs.back().m_len + 1) {
421 // current segment is extensible, e.g., [xia] + n -> [xian]
422 TSegment &last_seg = m_segs.back();
// NOTE(review): the update of last_seg.m_len is on a line not visible here.
424 last_seg.m_syllables[0] = v;
425 ret = m_pystr.size() - l;
426 } else { // other cases
// isum: characters covered by the old trailing segments taken so far, plus the
// new character. lsum: characters covered by the newly matched segments.
// The loop below runs until the two are equal.
427 TSegment &last_seg = m_segs.back();
428 int i = 0, isum = last_seg.m_len + 1, lsum = l;
// new_segs collects replacement segments from the end of the string backwards,
// starting with the match that ends at the new character.
429 TSegmentVec new_segs(1, TSegment(v, m_pystr.size() - l, l));
431 // e.g., [zh] [o] [n] + g -> [zhonG],
// Set the high bit on the final character.
// NOTE(review): the condition under which this marking happens is on a line
// not visible here.
433 unsigned end_idx = m_pystr.size() - 1;
434 m_pystr[end_idx] |= 0x80;
437 while (isum != lsum) {
438 if (lsum < isum) { // e.g., [die] + r -> [di] [er]
// Match again, skipping the lsum characters already covered at the end.
439 v = m_pytrie.match_longest(
440 m_pystr.rbegin() + lsum, m_pystr.rend(), l);
// The new segment ends where the previously collected one starts.
441 TSegment &last_seg = new_segs.back();
442 new_segs.push_back(TSegment(v, last_seg.m_start - l, l));
443 _addFuzzySyllables(new_segs.back());
// NOTE(review): the updates of lsum and i, and the else branch this line
// belongs to, are on lines not visible here.
447 isum += (m_segs.rbegin() + i)->m_len;
// Replace the last i + 1 old segments with the new ones; new_segs is copied in
// reverse to restore left-to-right order.
451 m_segs.erase(m_segs.end() - (i + 1), m_segs.end());
452 std::copy(new_segs.rbegin(), new_segs.rend(), back_inserter(m_segs));
453 ret = m_pystr.size() - lsum;
// Optional fuzzy-segment pass.
// NOTE(review): the start of the statement that consumes this call's result is
// on lines not visible here.
458 if (m_pGetFuzzySegmentsOp && m_pGetFuzzySegmentsOp->isEnabled())
461 (*m_pGetFuzzySegmentsOp)(m_segs, m_fuzzy_segs, m_inputBuf));
// Optional fuzzy-syllable pass over the last segment and, when any fuzzy
// segments exist, over the last two of them.
463 if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled()) {
464 if (m_segs.back().m_type == SYLLABLE)
465 _addFuzzySyllables(m_segs.back());
467 if (m_fuzzy_segs.size()) {
468 _addFuzzySyllables(*(m_fuzzy_segs.end() - 1));
469 _addFuzzySyllables(*(m_fuzzy_segs.end() - 2));
// Rebuilds seg.m_fuzzy_syllables from the fuzzy-syllable operator's result for
// the segment's first syllable. m_pGetFuzzySyllablesOp is dereferenced without
// a null check, so callers must make sure it is set.
477 CQuanpinSegmentor::_addFuzzySyllables(TSegment& seg)
// Only syllable segments may be passed in.
479 assert(seg.m_type == SYLLABLE);
// Start from an empty list so results from earlier calls do not accumulate.
481 seg.m_fuzzy_syllables.clear();
483 CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp)(seg.m_syllables.front());
484 CSyllables::const_iterator it = fuzzy_set.begin();
485 CSyllables::const_iterator ite = fuzzy_set.end();
// Copy every element of the returned set into the segment.
487 for (; it != ite; ++it)
488 seg.m_fuzzy_syllables.push_back(*it);
// Feeds new_pystr to _push() one character at a time and returns the smallest
// value seen among `from` and the positions _push() reported.
492 CQuanpinSegmentor::_updateWith(const std::string& new_pystr, unsigned from)
// `from` is the starting minimum; it is returned as is when new_pystr is empty
// or no push reports a smaller position.
494 unsigned minUpdatedFrom = from;
495 std::string::const_iterator it = new_pystr.begin();
496 for (; it != new_pystr.end(); ++it) {
// Bit 0x80 is cleared before the character is pushed.
497 unsigned updatedFrom = _push(*it & 0x7f);
499 if (updatedFrom < minUpdatedFrom) minUpdatedFrom = updatedFrom;
501 return minUpdatedFrom;