2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6 * The contents of this file are subject to the terms of either the GNU Lesser
7 * General Public License Version 2.1 only ("LGPL") or the Common Development and
8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
9 * file except in compliance with the License. You can obtain a copy of the CDDL at
10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12 * specific language governing permissions and limitations under the License. When
13 * distributing the software, include this License Header Notice in each file and
14 * include the full text of the License in the License file as well as the
17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19 * For Covered Software in this distribution, this License shall be governed by the
20 * laws of the State of California (excluding conflict-of-law provisions).
21 * Any litigation relating to this License shall be subject to the jurisdiction of
22 * the Federal Courts of the Northern District of California and the state courts
23 * of the State of California, with venue lying in Santa Clara County, California.
27 * If you wish your version of this file to be governed by only the CDDL or only
28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
30 * license." If you don't indicate a single choice of license, a recipient has the
31 * option to distribute your version of this file under either the CDDL or the LGPL
32 * Version 2.1, or to extend the choice of license to its licensees as provided
33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34 * Version 2 license, then the option applies only if the new code is made subject
35 * to such option by the copyright holder.
40 #include "shuangpin_seg.h"
// Definition of the static member s_shpData; being static, this single
// CShuangpinData object is shared by every CShuangpinSegmentor.
42 CShuangpinData CShuangpinSegmentor::s_shpData;
// Constructor: starts with m_updatedFrom, m_nAlpha and m_nLastValidPos at 0
// and m_hasInvalid cleared, then selects the scheme `shpType` on s_shpData.
// NOTE(review): s_shpData is a static member, so the scheme set here is
// presumably seen by all segmentor instances -- confirm this is intended.
44 CShuangpinSegmentor::CShuangpinSegmentor (EShuangpinType shpType)
45 : m_updatedFrom(0), m_nAlpha(0), m_hasInvalid(false), m_nLastValidPos(0)
49 s_shpData.setShuangpinType(shpType);
// Appends `ch` to the raw input buffer and segments it through _push().
// The value returned by _push() is stored in m_updatedFrom and returned.
53 CShuangpinSegmentor::push(unsigned ch)
55 m_inputBuf.push_back(ch);
57 return m_updatedFrom = _push(ch);
// Removes the last character from m_inputBuf and m_pystr, then feeds the
// detached tail of m_pystr back through _push() so it is segmented again.
61 CShuangpinSegmentor::pop()
// Early return that reports 0 as the update position.
64 return m_updatedFrom = 0;
66 unsigned size = m_inputBuf.size();
68 EShuangpinType shpType = s_shpData.getShuangpinType();
// The last character counts as shuangpin input when it is a lowercase
// letter, or a ';' while the scheme is MS2003 or ZIGUANG.
// NOTE(review): islower() receives a plain char here; if char is signed and
// the value is negative the call is undefined -- consider casting to
// unsigned char. Assumes m_pystr is a std::string; confirm.
69 bool isInputPy = (islower(m_pystr[size - 1]) ||
70 (m_pystr[size - 1] == ';' &&
71 (shpType == MS2003 || shpType == ZIGUANG)));
// Drop the last character from both buffers.
76 m_inputBuf.resize(size - 1);
77 m_pystr.resize(size - 1);
// Length of the segment currently at the back of m_segs.
79 unsigned l = m_segs.back().m_len;
// True when the buffer held a single character, or when the segment at the
// back of m_segs is not of type INVALID.
82 if (size == 1 || m_segs.back().m_type != IPySegmentor::INVALID) {
// Return path that reports the new end of input (size - 1).
86 return m_updatedFrom = size - 1;
// Detach everything from offset (size - l) onwards for re-segmentation.
88 std::string new_pystr = m_pystr.substr(size - l);
89 m_pystr.resize(size - l);
// Re-push each detached character (masked to 7 bits) and keep the smallest
// value returned by _push() in m_updatedFrom.
91 m_updatedFrom = UINT_MAX;
92 std::string::const_iterator it = new_pystr.begin();
93 for (; it != new_pystr.end(); ++it) {
94 unsigned tmp = _push((*it) & 0x7f);
95 if (tmp < m_updatedFrom) m_updatedFrom = tmp;
// Inserts `ch` at position `idx` of both m_inputBuf and m_pystr, then
// re-segments the text from pyIdx onwards. Returns m_updatedFrom, the
// smallest value _push() returned while re-segmenting.
102 CShuangpinSegmentor::insertAt(unsigned idx, unsigned ch)
// pyIdx / segIdx are filled in by _locateSegment(); presumably the string
// offset and segment index of the segment covering idx -- confirm.
104 unsigned pyIdx, segIdx;
105 _locateSegment(idx, pyIdx, segIdx);
107 m_inputBuf.insert(idx, 1, ch);
108 m_pystr.insert(idx, 1, ch);
// Detach the text from pyIdx onwards and drop the segments from segIdx
// onwards; both are rebuilt by the _push() loop below.
110 std::string new_pystr = m_pystr.substr(pyIdx);
111 m_pystr.resize(pyIdx);
112 m_segs.erase(m_segs.begin() + segIdx, m_segs.end());
// Clear m_hasInvalid and adjust m_nLastValidPos depending on where idx lies
// relative to m_nLastValidPos.
114 if (m_nLastValidPos == idx) {
115 m_hasInvalid = false;
116 } else if (m_nLastValidPos + 1 == idx) {
117 m_hasInvalid = false;
118 int nSize = m_pystr.size();
// When the character now ending m_pystr is lowercase, move it to the front
// of new_pystr and remove the segment at segIdx - 1 so it is re-segmented
// together with the inserted text.
// NOTE(review): m_pystr[nSize - 1] is out of range if nSize is 0 (pyIdx == 0)
// -- confirm that cannot happen in this branch. islower() on a plain char is
// also undefined for negative values.
119 if (islower(m_pystr[nSize - 1])) {
120 m_nLastValidPos = idx - 1;
121 new_pystr.insert((size_t)0, 1, m_pystr[nSize - 1]);
122 m_pystr.erase(nSize - 1, 1);
123 m_segs.erase(m_segs.begin() + segIdx - 1);
125 } else if (m_nLastValidPos + 1 > idx) {
126 m_hasInvalid = false;
127 m_nLastValidPos = idx;
// Recompute m_nAlpha from the truncated m_pystr.
129 m_nAlpha = _getNumberOfNonAlpha();
// Re-push each detached character (masked to 7 bits) and keep the smallest
// value returned by _push() in m_updatedFrom.
131 m_updatedFrom = UINT_MAX;
132 std::string::const_iterator it = new_pystr.begin();
133 for (; it != new_pystr.end(); ++it) {
134 unsigned tmp = _push((*it) & 0x7f);
135 if (tmp < m_updatedFrom) m_updatedFrom = tmp;
138 return m_updatedFrom;
// Erases one character at `idx` from m_inputBuf and m_pystr and re-segments
// the affected text. When `backward` is false, idx is first advanced by one.
// Returns m_updatedFrom on the normal path.
142 CShuangpinSegmentor::deleteAt(unsigned idx, bool backward)
144 unsigned pyIdx, segIdx;
145 if (!backward) idx += 1;
146 _locateSegment(idx, pyIdx, segIdx);
148 m_inputBuf.erase(idx, 1);
149 m_pystr.erase(idx, 1);
// Detach the text from pyIdx onwards; keep a copy of the segments after
// segIdx in tmp_segs before erasing the segments from segIdx onwards.
151 std::string new_pystr = m_pystr.substr(pyIdx);
152 m_pystr.resize(pyIdx);
153 TSegmentVec tmp_segs(m_segs.begin() + segIdx + 1, m_segs.end());
154 m_segs.erase(m_segs.begin() + segIdx, m_segs.end());
156 if (m_nLastValidPos + 1 < idx) {
157 // Deleting an invalid char; this does not affect the current status.
// Put the detached text and the saved segments back without re-segmenting.
// NOTE(review): m_pystr was just resized to pyIdx, so std::string::insert
// throws std::out_of_range if idx > pyIdx -- confirm idx == pyIdx always
// holds here. This return also leaves m_updatedFrom unassigned, unlike the
// other return paths in this class.
158 m_pystr.insert(idx, new_pystr);
159 m_segs.insert(m_segs.end(), tmp_segs.begin(), tmp_segs.end());
160 return m_inputBuf.size() - 1;
162 m_hasInvalid = false;
// Recompute m_nAlpha from the truncated m_pystr.
163 m_nAlpha = _getNumberOfNonAlpha();
// Re-push each detached character (masked to 7 bits) and keep the smallest
// value returned by _push() in m_updatedFrom.
166 m_updatedFrom = UINT_MAX;
167 std::string::const_iterator it = new_pystr.begin();
168 for (; it != new_pystr.end(); ++it) {
169 unsigned tmp = _push((*it) & 0x7f);
170 if (tmp < m_updatedFrom) m_updatedFrom = tmp;
173 return m_updatedFrom;
// Truncates (or resizes) the raw input buffer to `from` characters.
177 CShuangpinSegmentor::clear(unsigned from)
179 m_inputBuf.resize(from);
// Re-segments the characters of m_pystr in [i, from), where i and j come
// from _locateSegment(from, i, j). Returns m_updatedFrom.
184 CShuangpinSegmentor::_clear(unsigned from)
187 _locateSegment(from, i, j);
// Characters between offset i and `from` are pushed again below.
189 std::string new_pystr = m_pystr.substr(i, from - i);
191 m_nAlpha = _getNumberOfNonAlpha();
// Drop the segments from index j onwards.
193 m_segs.erase(m_segs.begin() + j, m_segs.end());
// Clear m_hasInvalid when `from` is at most m_nLastValidPos + 1.
195 if (m_nLastValidPos + 1 >= from) {
196 m_hasInvalid = false;
// Start from `from` and lower m_updatedFrom to the smallest value returned
// by _push() for the re-pushed characters (each masked to 7 bits).
199 m_updatedFrom = from;
201 for (std::string::const_iterator it = new_pystr.begin();
202 it != new_pystr.end(); ++it) {
203 unsigned tmp = _push((*it) & 0x7f);
204 if (tmp < m_updatedFrom) m_updatedFrom = tmp;
207 return m_updatedFrom;
// Const helper whose result callers store in m_nAlpha. It scans m_pystr one
// character at a time via c_str() and stops at the first NUL byte.
211 CShuangpinSegmentor::_getNumberOfNonAlpha() const
214 for (const char* c = m_pystr.c_str(); *c != 0; ++c) {
// Walks m_segs from the front; the test below is true for a segment whose
// end (strIdx + m_len) lies beyond idx.
222 CShuangpinSegmentor::_locateSegment(unsigned idx,
228 TSegmentVec::const_iterator it = m_segs.begin();
229 TSegmentVec::const_iterator ite = m_segs.end();
231 for (; it != ite; ++it) {
232 if (strIdx + it->m_len > idx)
// Maps the key string `buf` to candidate syllable strings via
// s_shpData.getMapString() and records their encoded forms in a segment.
241 CShuangpinSegmentor::_encode(const char* buf, char ch, bool isComplete)
// syls receives the strings that s_shpData maps `buf` to.
245 s_shpData.getMapString(buf, syls);
249 const int len = m_pystr.size();
250 CMappedYin::const_iterator iter = syls.begin();
251 CMappedYin::const_iterator iter_end = syls.end();
// Update the segment at the back of m_segs in place: recompute its start
// from the current m_pystr length, mark it SYLLABLE, and replace its
// syllable list with the encoded form of every mapped string.
254 TSegment &s = m_segs.back();
256 s.m_start = len - s.m_len;
257 s.m_syllables.clear();
258 s.m_type = IPySegmentor::SYLLABLE;
259 for (; iter != iter_end; iter++) {
260 s.m_syllables.push_back(s_shpData.encodeSyllable(iter->c_str()));
262 m_nLastValidPos += 1;
// Second path: again sets the start offset and advances m_nLastValidPos,
// then appends the encoded syllable of each mapped string.
267 s.m_start = len - s.m_len;
268 m_nLastValidPos += 1;
270 for (; iter != iter_end; ++iter) {
271 TSyllable syl = s_shpData.encodeSyllable(iter->c_str());
273 s.m_syllables.push_back(syl);
// Appends a one-character STRING segment for `ch` starting at s.m_start.
276 m_segs.push_back(TSegment(ch, s.m_start, 1,
277 IPySegmentor::STRING));
// Appends `ch` to m_pystr and creates or updates the matching segment in
// m_segs: an INVALID, SYLLABLE_SEP or STRING segment, or a syllable encoded
// through _encode().
285 CShuangpinSegmentor::_push(unsigned ch)
289 EShuangpinType shpType;
291 m_pystr.push_back(ch);
292 const int len = m_pystr.size();
// Records `ch` as a one-character INVALID segment starting at startFrom.
295 m_segs.push_back(TSegment(ch, startFrom, 1, IPySegmentor::INVALID));
// `ch` counts as shuangpin input when it is a lowercase letter, or a ';'
// while the scheme is MS2003 or ZIGUANG.
299 shpType = s_shpData.getShuangpinType();
300 isInputPy = (islower(ch) ||
301 (ch == ';' && (shpType == MS2003 || shpType == ZIGUANG)));
// Segment type for a non-syllable character: SYLLABLE_SEP for an apostrophe
// when m_inputBuf holds more than one character; STRING is the other value
// assigned below.
306 IPySegmentor::ESegmentType seg_type;
307 if (ch == '\'' && m_inputBuf.size() > 1)
308 seg_type = IPySegmentor::SYLLABLE_SEP;
310 seg_type = IPySegmentor::STRING;
311 m_segs.push_back(TSegment(ch, startFrom, 1, seg_type));
313 m_nLastValidPos += 1;
// bCompleted is true when (len - m_nAlpha) is even and `ch` is an input char.
315 bool bCompleted = !((len - m_nAlpha) % 2) && isInputPy;
// buf receives either the previous character plus `ch`, or `ch` alone.
// NOTE(review): sprintf is unbounded; buf must hold at least 3 bytes for the
// two-character form. Its declaration is not visible here -- confirm the
// size, and consider snprintf.
318 sprintf(buf, "%c%c", m_pystr[len - 2], ch);
320 sprintf(buf, "%c", ch);
322 startFrom = _encode(buf, ch, bCompleted);
// Records `ch` as a one-character INVALID segment at the last m_pystr offset.
325 startFrom = m_pystr.size() - 1;
326 m_segs.push_back(TSegment(ch, startFrom, 1, IPySegmentor::INVALID));
// When a fuzzy-syllable operator is set and enabled, expand the last
// segment if it is a SYLLABLE.
332 if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled())
333 if (m_segs.back().m_type == SYLLABLE)
334 _addFuzzySyllables(m_segs.back());
// Rebuilds seg.m_fuzzy_syllables: for every syllable in seg.m_syllables,
// calls *m_pGetFuzzySyllablesOp and appends each element of the result.
// `seg` must be of type SYLLABLE (asserted). m_pGetFuzzySyllablesOp is
// dereferenced without a null check in this block.
340 CShuangpinSegmentor::_addFuzzySyllables(TSegment& seg)
342 assert(seg.m_type == SYLLABLE);
// Discard the fuzzy syllables from any earlier call.
344 seg.m_fuzzy_syllables.clear();
346 std::vector<unsigned>::iterator it = seg.m_syllables.begin();
347 std::vector<unsigned>::iterator ite = seg.m_syllables.end();
348 for (; it != ite; ++it) {
// Result of applying the fuzzy operator to this one syllable.
349 CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp)(*it);
351 CSyllables::const_iterator _it = fuzzy_set.begin();
352 CSyllables::const_iterator _ite = fuzzy_set.end();
353 for (; _it != _ite; ++_it)
354 seg.m_fuzzy_syllables.push_back(*_it);