5 * Created by cererd.long on 10-10-8.
6 * Copyright 2010 __MyCompanyName__. All rights reserved.
10 #include "hunpin_seg.h"
14 #include "pinyin_seg.h"
15 #include "quanpin_trie.h"
// Out-of-class definition of the static member s_shpData; being static, this one
// CShuangpinData object is shared by every CHunpinSegmentor instance.
18 CShuangpinData CHunpinSegmentor::s_shpData;
// Constructor. `shpType` selects the Shuangpin scheme to use.
// The visible initializers set m_pGetFuzzySyllablesOp to NULL (no fuzzy-syllable functor yet)
// and build m_pytrie from the base/check/value tables, passing the element count of `base`.
// NOTE(review): further initializers/statements may exist on lines not shown in this view.
20 CHunpinSegmentor::CHunpinSegmentor (EShuangpinType shpType)
21 : m_pGetFuzzySyllablesOp(NULL),
22 m_pytrie(base, check, value, sizeof(base) / sizeof(*base)),
// s_shpData is a static member, so the scheme set here is shared class-wide,
// not stored per instance.
26 s_shpData.setShuangpinType(shpType);
// Two-argument overload: maps the key string `buf` to syllables and stores them in a
// NEWLY appended segment at the end of m_segs.
// NOTE(review): the return type, the use of `ret`, the returned value and any early
// exits are on lines not shown in this view — confirm against the full file.
31 CHunpinSegmentor::_encode(const char* buf, int ret)
// Ask the shared Shuangpin data to fill `syls` with the strings that `buf` maps to.
35 s_shpData.getMapString(buf, syls);
40 CMappedYin::const_iterator iter = syls.begin();
41 CMappedYin::const_iterator iter_end = syls.end();
// Append a SYLLABLE segment built with placeholder arguments (0, 0, 1) and keep a
// reference to it so it can be filled in below.
43 m_segs.push_back(TSegment(0, 0, 1, IPySegmentor::SYLLABLE));
44 TSegment &s = m_segs.back();
48 s.m_syllables.clear();
49 s.m_type = IPySegmentor::SYLLABLE;
// Encode every mapped string and record it as a syllable of the new segment.
50 for (; iter != iter_end; iter++) {
51 s.m_syllables.push_back(s_shpData.encodeSyllable(iter->c_str()));
// One-argument overload: maps the key string `buf` to syllables and OVERWRITES the
// existing last segment of m_segs with them (no new segment is appended here).
// NOTE(review): the return type, returned value and any early exits are on lines not
// shown in this view; a caller in this file treats a result >= 0 as success.
58 CHunpinSegmentor::_encode(const char* buf)
// Ask the shared Shuangpin data to fill `syls` with the strings that `buf` maps to.
62 s_shpData.getMapString(buf, syls);
66 CMappedYin::const_iterator iter = syls.begin();
67 CMappedYin::const_iterator iter_end = syls.end();
// Requires m_segs to be non-empty: back() on an empty container is invalid.
69 TSegment &s = m_segs.back();
// Re-anchor the segment so that it ends at the current end of m_pystr.
71 s.m_start = m_pystr.size() - s.m_len;
72 s.m_syllables.clear();
73 s.m_type = IPySegmentor::SYLLABLE;
// Encode every mapped string and record it as a syllable of the segment.
74 for (; iter != iter_end; iter++) {
75 s.m_syllables.push_back(s_shpData.encodeSyllable(iter->c_str()));
// Loads the pinyin trie from the file named `pyTrieFileName` by delegating to
// m_pytrie.load() and returning its result unchanged.
82 CHunpinSegmentor::load(const char * pyTrieFileName)
84 return m_pytrie.load(pyTrieFileName);
// Debug helper: prints the characters of `pystr` to stdout with printf, clearing the
// high bit of each byte (& 0x7f) before printing.
// Note: `pystr` is taken by value, so each call copies the string.
89 print_pystr(const std::string pystr)
// Walk the raw character buffer from the first byte up to (not including) the end.
91 for (const char* c = pystr.c_str();
92 c != pystr.c_str() + pystr.length();
94 printf("%c", *c & 0x7f);
// Appends the input character `ch` to the raw input buffer and feeds it to the
// segmentation step (_push). Returns the value _push() reports, which is also cached in
// m_updatedFrom.
101 CHunpinSegmentor::push(unsigned ch)
103 m_inputBuf.push_back(ch);
105 m_updatedFrom = _push(ch);
108 return m_updatedFrom;
// Removes the last input character and re-segments the affected tail.
// Returns (and caches in m_updatedFrom) the position from which segments changed.
112 CHunpinSegmentor::pop()
// Early exit returning 0; the guarding condition is on a line not shown in this view.
115 return m_updatedFrom = 0;
// Drop the last character from both buffers. `size` is taken from m_inputBuf and also
// used to resize m_pystr — assumes both buffers have the same length; TODO confirm.
117 unsigned size = m_inputBuf.size();
118 m_inputBuf.resize(size - 1);
119 m_pystr.resize(size - 1);
// Length of the last segment, i.e. the one that contained the removed character.
121 unsigned l = m_segs.back().m_len;
// Early exit; its guarding condition (and any removal of the last segment) is on
// lines not shown in this view.
125 return m_updatedFrom = size - 1;
// Take the characters from offset (size - l) onward, cut them off m_pystr, and push
// them again so they are segmented afresh.
127 std::string new_pystr = m_pystr.substr(size - l);
128 m_pystr.resize(size - l);
130 m_updatedFrom = _updateWith(new_pystr);
132 return m_updatedFrom;
// Inserts character `ch` at position `idx` of the input and re-segments everything from
// the affected segment to the end. Returns the value cached in m_updatedFrom.
136 CHunpinSegmentor::insertAt(unsigned idx, unsigned ch)
// Locate the segment covering `idx`; `i` is later used as a string offset and `j` as a
// segment index (their declarations are on lines not shown).
139 _locateSegment(idx, i, j);
// Insert one copy of `ch` into both the raw input buffer and the pinyin string.
141 m_inputBuf.insert(idx, 1, ch);
142 m_pystr.insert(idx, 1, ch);
// Everything from offset `i` onward must be segmented again.
144 std::string new_pystr = m_pystr.substr(i);
// Discard segment `j` and all segments after it; they are rebuilt by _updateWith().
146 m_segs.erase(m_segs.begin() + j, m_segs.end());
148 m_updatedFrom = _updateWith(new_pystr);
150 return m_updatedFrom;
// Deletes one character relative to position `idx` and re-segments everything from the
// affected segment to the end. Returns the value cached in m_updatedFrom.
154 CHunpinSegmentor::deleteAt(unsigned idx, bool backward)
// For a forward delete (`backward` is false) the target is the character after `idx`.
157 if (!backward) idx += 1;
// Locate the segment covering the target; `i` is later used as a string offset and `j`
// as a segment index (their declarations are on lines not shown).
158 _locateSegment(idx, i, j);
// Remove the target character from both the raw input buffer and the pinyin string.
160 m_inputBuf.erase(idx, 1);
161 m_pystr.erase(idx, 1);
// Everything from offset `i` onward must be segmented again.
163 std::string new_pystr = m_pystr.substr(i);
// Discard segment `j` and all segments after it; they are rebuilt by _updateWith().
165 m_segs.erase(m_segs.begin() + j, m_segs.end());
167 m_updatedFrom = _updateWith(new_pystr);
169 return m_updatedFrom;
// Discards input from position `from` onward.
// NOTE(review): only the truncation of the raw input buffer is visible here; the rest of
// the body (presumably a delegation to _clear(from)) is on lines not shown — confirm.
173 CHunpinSegmentor::clear(unsigned from)
175 m_inputBuf.resize(from);
// Internal helper for clearing input from position `from` onward: drops the affected
// segments and re-segments the part of the affected segment that lies before `from`.
// Returns the value cached in m_updatedFrom.
180 CHunpinSegmentor::_clear(unsigned from)
// Locate the segment covering `from`; `i` is used as a string offset and `j` as a
// segment index (their declarations are on lines not shown).
183 _locateSegment(from, i, j);
// Keep only the characters in [i, from): the surviving prefix of the affected segment.
186 std::string new_pystr = m_pystr.substr(i, from - i);
// Discard segment `j` and all segments after it.
188 m_segs.erase(m_segs.begin() + j, m_segs.end());
// Re-push the surviving characters; `from` is passed as the upper bound of the result.
190 m_updatedFrom = _updateWith(new_pystr, from);
192 return m_updatedFrom;
// Finds the segment that contains character position `idx`.
// NOTE(review): the remaining parameters are on lines not shown; callers in this file pass
// two further arguments, later used as a string offset and a segment index, so these
// are presumably out-parameters — confirm against the full signature.
196 CHunpinSegmentor::_locateSegment(unsigned idx,
202 TSegmentVec::iterator it = m_segs.begin();
203 TSegmentVec::iterator ite = m_segs.end();
// Walk the segments in order; strIdx is the running sum of the lengths of the segments
// already passed.
205 for (; it != ite; ++it) {
// True when the current segment extends past `idx`, i.e. it contains that position
// (the action taken is on a line not shown).
206 if (strIdx + (*it).m_len > idx)
209 strIdx += (*it).m_len;
// Core segmentation step: appends `ch` to m_pystr and updates m_segs, either by merging
// the new character with trailing segments or by adding a one-character segment.
// Returns the start offset of the last segment after the update.
// NOTE(review): several lines of this function (conditions, braces, declarations of
// buf/v/tmpl/ret) are not shown in this view; the comments below cover only what is visible.
216 CHunpinSegmentor::_push(unsigned ch)
218 m_pystr.push_back(ch);
// Start at the last segment, or at the sentinel begin() - 1 when there are no segments.
// NOTE(review): forming begin() - 1 steps outside the valid iterator range, which the
// C++ standard does not define; consider an index-based or reverse-iterator loop.
220 TSegmentVec::iterator ite = m_segs.size() > 0 ? m_segs.end() -
221 1 : m_segs.begin() - 1;
// Look back over trailing segments, summing their lengths, with a cap of 6 characters.
// The update of syllableCount and the action taken when the cap is exceeded are on
// lines not shown.
222 const unsigned maxStringCount = 6;
223 unsigned syllableCount = 0;
224 unsigned stringCount = 0;
225 for (; ite != m_segs.begin() - 1; ite--) {
226 stringCount += (*ite).m_len;
228 if (stringCount > maxStringCount) {
// Note: this local is named `strlen`, the same name as the C library function.
234 unsigned strlen = m_pystr.size();
// Try to merge the new character with the last `index` segments, from the largest
// look-back down to none.
237 for (int index = syllableCount; index >= 0; index--) {
238 TSegmentVec::iterator it = m_segs.end() - index;
// Exactly two characters from this segment's start to the end of the input: try to
// encode the pair via the one-argument _encode(); a result >= 0 ends the search.
242 if ((strlen - (*it).m_start) == 2) {
244 sprintf(buf, "%c%c", m_pystr[(*it).m_start],
245 m_pystr[(*it).m_start + 1]);
246 int startFrom = _encode(buf);
247 if (startFrom >= 0) break;
// Look up the tail of m_pystr starting at this segment's start in the trie. The range
// is given with reverse iterators, so the characters are fed from the end backwards.
250 v = m_pytrie.match_longest(m_pystr.rbegin(),
251 m_pystr.rbegin() + strlen -
252 (*it).m_start, tmpl);
// tmpl equal to the whole tail length: replace the last `index` segments with a single
// segment that starts at the same offset and spans the tail.
254 if (tmpl == (strlen - (*it).m_start)) {
255 TSegmentVec new_segs(1, TSegment(v, (*it).m_start, tmpl));
256 m_segs.erase(m_segs.end() - index, m_segs.end());
257 std::copy(new_segs.rbegin(), new_segs.rend(),
258 back_inserter(m_segs));
// Look up just the newly added character on its own.
263 v = m_pytrie.match_longest(m_pystr.rbegin(),
264 m_pystr.rbegin() + 1, tmpl);
// Classify a single character (the condition selecting this path is on a line not
// shown): an apostrophe that is not the first input character is a syllable separator,
// another lowercase letter is INVALID, anything else is a plain STRING.
266 IPySegmentor::ESegmentType seg_type;
267 if (ch == '\'' && m_inputBuf.size() > 1) {
268 seg_type = IPySegmentor::SYLLABLE_SEP;
269 } else if (islower(ch)) {
270 seg_type = IPySegmentor::INVALID;
272 seg_type = IPySegmentor::STRING;
// Append a one-character segment of the chosen type at the last position.
274 ret = m_pystr.size() - 1;
275 m_segs.push_back(TSegment(ch, ret, 1, seg_type));
// Alternative path: append a one-character segment carrying the trie result `v`.
277 ret = m_pystr.size() - 1;
278 m_segs.push_back(TSegment(v, ret, 1));
// When a fuzzy-syllable functor is installed and enabled, and the last segment is a
// syllable, fill in its fuzzy alternatives.
283 TSegment &last_seg = m_segs.back();
284 if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled())
285 if (m_segs.back().m_type == SYLLABLE)
286 _addFuzzySyllables(last_seg);
288 return last_seg.m_start;
// Second definition of _push with the same signature as the one above.
// NOTE(review): two live definitions cannot coexist, so this one is presumably disabled
// (block comment or preprocessor guard) by lines not shown in this view — confirm.
// Visible behaviour: scans m_pystr forward from `_start`, trying candidate lengths `_tn`
// from the largest down to 1 at each position, and appends a segment for each match.
293 unsigned CHunpinSegmentor::_push (unsigned ch)
// Debug trace, printed on every call.
295 printf("using hunpin_seg");
296 //translation for positive match arithmetic
297 //m_segs.erase (m_segs.begin(), m_segs.end());
299 m_pystr.push_back (ch);
// Note: this local is named `strlen`, the same name as the C library function.
300 unsigned strlen = m_pystr.size();
304 while (_start < strlen) {
// Candidate length: `maxlen` when that many characters remain, otherwise the rest.
305 for (unsigned _tn = (maxlen+_start) <= strlen ? maxlen : (strlen - _start) ; _tn > 0 ; _tn--) {
310 //printf("\nout,shuang pin output,char1:%c,char2:%c\n",m_pystr[_start],m_pystr[_start+1]);
// Format the two characters at `_start` and try the two-argument _encode(), which
// appends a new segment (the guarding condition is on a line not shown).
312 sprintf(buf, "%c%c", m_pystr[_start], m_pystr[_start+1]);
313 int startFrom = _encode(buf,_start);
315 //printf("\nin,shuang pin output,char1:%c,char2:%c\n",m_pystr[_start],m_pystr[_start+1]);
// Trie lookup for characters [_start, _start + _tn), expressed with reverse iterators.
323 int v = m_pytrie.match_longest ((m_pystr.rbegin() + strlen - _start - _tn), (m_pystr.rbegin() + strlen - _start), tmpl);
325 //printf("\n input match len is %d,size is %d,_start is %d ,end is %d ,start char is %c,end char is %c\n",tmpl,strlen,_start,_start + _tn,m_pystr[_start],m_pystr[_start+_tn-1]);
// Append a segment for the candidate (the condition guarding this is not shown).
328 m_segs.push_back (TSegment (v, _start, _tn));
// Single character with no trie match: classify it as separator, INVALID or STRING.
332 else if(_tn == 1 && tmpl == 0) {
333 IPySegmentor::ESegmentType seg_type;
334 if (m_pystr[_start] == '\'' && m_inputBuf.size() > 1)
335 seg_type = IPySegmentor::SYLLABLE_SEP;
336 else if (islower (m_pystr[_start]))
337 seg_type = IPySegmentor::INVALID;
339 seg_type = IPySegmentor::STRING;
341 m_segs.push_back (TSegment (m_pystr[_start], _start, 1, seg_type));
// When a fuzzy-syllable functor is installed and enabled, and the last segment is a
// syllable, fill in its fuzzy alternatives.
348 TSegment &last_seg = m_segs.back();
349 if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled())
350 if ( m_segs.back().m_type == SYLLABLE)
351 _addFuzzySyllables (last_seg);
353 return last_seg.m_start;
// Rebuilds `seg.m_fuzzy_syllables` from the segment's first syllable using the installed
// fuzzy-syllable functor.
// Preconditions: `seg` is a SYLLABLE segment (asserted) and m_pGetFuzzySyllablesOp is
// non-null, since it is dereferenced without a check here; callers in this file test it first.
358 CHunpinSegmentor::_addFuzzySyllables(TSegment& seg)
360 assert(seg.m_type == SYLLABLE);
// Start from an empty list so stale alternatives from an earlier call do not linger.
362 seg.m_fuzzy_syllables.clear();
// Only the first syllable of the segment is used to look up alternatives.
364 CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp)(seg.m_syllables.front());
365 CSyllables::const_iterator it = fuzzy_set.begin();
366 CSyllables::const_iterator ite = fuzzy_set.end();
// Copy every alternative into the segment.
368 for (; it != ite; ++it)
369 seg.m_fuzzy_syllables.push_back(*it);
// Pushes every character of `new_pystr` through _push() and returns the smallest
// position reported by any of those pushes, capped from above by `from`.
// Some callers in this file omit `from`, so it presumably has a default value in the
// header — confirm.
373 CHunpinSegmentor::_updateWith(const std::string& new_pystr, unsigned from)
// `from` is the starting value of the minimum, so the result never exceeds it.
375 unsigned minUpdatedFrom = from;
376 std::string::const_iterator it = new_pystr.begin();
377 for (; it != new_pystr.end(); ++it) {
// Clear the high bit of each byte (& 0x7f) before pushing it.
378 unsigned updatedFrom = _push(*it & 0x7f);
380 if (updatedFrom < minUpdatedFrom) minUpdatedFrom = updatedFrom;
382 return minUpdatedFrom;