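2 * Generates the list of candidates for a segment (bunsetsu).
3 * make_candidates() is called from the context management part.
5 * Candidates are generated by the following methods:
6 * (1) proc_splitter_info(), applied to the parts of speech assigned by the splitter
8 * (2) generate the hiragana-only and the katakana-only candidates
9 * (3) forcibly generate a candidate by interpreting the last character as a particle
12 * Funded by IPA Mitou (Exploratory) Software Creation Project 2001 9/30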
13 * Copyright (C) 2000-2005 TABATA Yusuke
14 * Copyright (C) 2004-2005 YOSHIDA Yuichi
15 * Copyright (C) 2002 UGAWA Tomoharu
17 * $Id: compose.c,v 1.25 2005/08/19 04:20:25 oxy Exp $
20 This library is free software; you can redistribute it and/or
21 modify it under the terms of the GNU Lesser General Public
22 License as published by the Free Software Foundation; either
23 version 2 of the License, or (at your option) any later version.
25 This library is distributed in the hope that it will be useful,
26 but WITHOUT ANY WARRANTY; without even the implied warranty of
27 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28 Lesser General Public License for more details.
30 You should have received a copy of the GNU Lesser General Public
31 License along with this library; if not, write to the Free Software
32 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38 #include <anthy/dic.h>
39 #include <anthy/splitter.h>
40 #include <anthy/segment.h>
41 #include "wordborder.h"
/*
 * Allocate a struct cand_ent with malloc() and give it initial field values.
 * The initializations visible here are core_elm_index = -1 and
 * dep_word_hash = 0.
 * NOTE(review): the line carrying the function name and several body lines
 * are not present in this listing; other functions in this file call
 * alloc_cand_ent(), which is presumably this function -- confirm.
 */
44 static struct cand_ent *
/* NOTE(review): no NULL check of the malloc() result is visible here. */
48 ce = (struct cand_ent *)malloc(sizeof(struct cand_ent));
52 ce->core_elm_index = -1;
53 ce->dep_word_hash = 0;
/*
 * Duplicate a candidate.
 * A new cand_ent is obtained from alloc_cand_ent(); nr_words, flag,
 * core_elm_index, score and dep_word_hash are copied, the candidate string
 * is duplicated with anthy_xstr_dup_str(), and the nr_words entries of
 * ce->elm are copied by struct assignment into a freshly malloc'ed array.
 * NOTE(review): the return statement is not in this listing; callers use
 * the result as the new candidate.
 */
60 static struct cand_ent *
61 dup_candidate(struct cand_ent *ce)
63 struct cand_ent *ce_new;
65 ce_new = alloc_cand_ent();
66 ce_new->nr_words = ce->nr_words;
67 ce_new->str.len = ce->str.len;
68 ce_new->str.str = anthy_xstr_dup_str(&ce->str);
/* NOTE(review): the malloc() result is not checked in the visible lines. */
69 ce_new->elm = malloc(sizeof(struct cand_elm)*ce->nr_words);
70 ce_new->flag = ce->flag;
71 ce_new->core_elm_index = ce->core_elm_index;
73 ce_new->score = ce->score;
74 ce_new->dep_word_hash = ce->dep_word_hash;
/* copy every element; pointers held inside each cand_elm are copied as-is */
76 for (i = 0 ; i < ce->nr_words ; i++) {
77 ce_new->elm[i] = ce->elm[i];
82 /** Append a candidate to the segment. */
84 push_back_candidate(struct seg_ent *seg, struct cand_ent *ce)
86 /* Append candidate ce to the seg_ent. */
/*
 * Resize seg->cands to hold seg->nr_cands pointers.
 * NOTE(review): the realloc() result is assigned straight back to
 * seg->cands, so on failure the previous array pointer would be lost and the
 * store below would dereference NULL.
 */
88 seg->cands = (struct cand_ent **)
89 realloc(seg->cands, sizeof(struct cand_ent *) * seg->nr_cands);
/* ce goes into the last slot; presumably seg->nr_cands was incremented on a
 * line that is not in this listing -- TODO confirm */
90 seg->cands[seg->nr_cands - 1] = ce;
/* print the candidate when the SPLITTER_DEBUG_CAND debug flag is set */
92 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) {
93 anthy_print_candidate(ce);
/*
 * Add a "guessed" candidate (flag CEF_GUESS) to seg: the result of
 * anthy_xstr_hira_to_kata() on the segment string with its last character
 * put back to the original one.
 * NOTE(review): the bodies of the two guard conditions below are not in this
 * listing; presumably they return early -- confirm.
 */
99 push_back_guessed_candidate(struct seg_ent *seg)
/* segments shorter than two characters are tested for here */
104 if (seg->str.len < 2) {
107 /* Is the last character a particle? */
108 xc = seg->str.str[seg->str.len - 1];
109 if (!(anthy_get_xchar_type(xc) & XCT_DEP)) {
112 /* Try converting everything except the last character to katakana */
113 ce = alloc_cand_ent();
114 xs = anthy_xstr_hira_to_kata(&seg->str);
/* restore the original last character saved in xc */
115 xs->str[xs->len-1] = xc;
116 ce->str.str = anthy_xstr_dup_str(xs);
117 ce->str.len = xs->len;
118 ce->flag = CEF_GUESS;
120 push_back_candidate(seg, ce);
123 /** Recursively assign candidates one word at a time. */
/*
 * As far as the visible lines show: n indexes the word being processed
 * (ce->elm[n]) and from is the offset into seg->str.str at which that word's
 * reading starts. When n equals ce->mw->nr_parts, the remaining tail of the
 * segment is appended to ce->str and a copy of ce is pushed onto seg.
 * Otherwise every dictionary entry whose word type matches is tried, and the
 * function recurses for the following word.
 * NOTE(review): the rest of the parameter list, the local declarations and
 * the return statements are not in this listing. The result appears to be a
 * count, since it is accumulated into nr_cands below -- confirm.
 */
125 enum_candidates(struct seg_ent *seg,
130 struct cand_ent *cand;
/* all words have been assigned */
134 if (n == ce->mw->nr_parts) {
136 /* Append the unanalyzed tail part of the segment to the candidate string */
138 tail.len = seg->len - from;
139 tail.str = &seg->str.str[from];
140 anthy_xstrcat(&ce->str, &tail);
141 if (ce->str.str && (0 < ce->str.len)) { /* safeguard for a corrupted dictionary or learning data */
142 push_back_candidate(seg, dup_candidate(ce));
/* p = number of dictionary entries for this word's reading */
147 p = anthy_get_nr_dic_ents(ce->elm[n].se, &ce->elm[n].str);
149 /* A part of speech has been assigned, so assign entries that match it */
150 for (i = 0; i < p; i++) {
/* compound entries are tested for here; the branch body is not in this
 * listing (presumably they are skipped -- confirm) */
152 if (anthy_get_nth_dic_ent_is_compound(ce->elm[n].se, i)) {
155 anthy_get_nth_dic_ent_wtype(ce->elm[n].se, &ce->elm[n].str, i, &wt);
/* the word's own wtype is rewritten with conjugation type CT_NONE before
 * the inclusion test */
157 ce->elm[n].wt = anthy_get_wtype_with_ct(ce->elm[n].wt, CT_NONE);
158 if (anthy_wtype_include(ce->elm[n].wt, wt)) {
161 yomi.len = ce->elm[n].str.len;
162 yomi.str = &seg->str.str[from];
163 cand = dup_candidate(ce);
164 anthy_get_nth_dic_ent_str(cand->elm[n].se,
166 cand->elm[n].nth = i;
167 cand->elm[n].id = anthy_xstr_hash(&word);
170 anthy_xstrcat(&cand->str, &word);
172 /* Call ourselves recursively to assign the rest */
173 nr_cands += enum_candidates(seg, cand,
176 anthy_release_cand_ent(cand);
180 /* If the part of speech is undetermined, go on to the next word without conversion */
181 pos = anthy_wtype_get_pos(ce->elm[n].wt);
182 if (nr_cands == 0 || pos == POS_INVAL || pos == POS_NONE) {
184 xs.len = ce->elm[n].str.len;
185 xs.str = &seg->str.str[from];
186 cand = dup_candidate(ce);
/* nth = -1 and id = -1 mark this word as left unconverted */
187 cand->elm[n].nth = -1;
188 cand->elm[n].id = -1;
189 anthy_xstrcat(&cand->str, &xs);
/* NOTE(review): this assigns rather than accumulates (compare the "+="
 * above). When this branch is entered because pos is POS_INVAL or POS_NONE
 * while nr_cands is already non-zero, the earlier count is overwritten --
 * confirm this is intended. */
190 nr_cands = enum_candidates(seg,cand,
193 anthy_release_cand_ent(cand);
201 * Generate single-word candidates (including single kanji) that cover the whole segment
/*
 * Look the whole segment string up in the dictionary and push one candidate
 * (flag CEF_SINGLEWORD) for every entry whose conjugation type is CT_SYUSI
 * or CT_NONE.
 * NOTE(review): the remaining parameters and the local declarations are not
 * in this listing.
 */
204 push_back_singleword_candidate(struct seg_ent *seg,
213 se = anthy_get_seq_ent_from_xstr(&seg->str, is_reverse);
214 n = anthy_get_nr_dic_ents(se, &seg->str);
215 /* For each dictionary entry */
216 for (i = 0; i < n; i++) {
/* compound entries are tested for here; the branch body is not in this
 * listing (presumably they are skipped -- confirm) */
218 if (anthy_get_nth_dic_ent_is_compound(se, i)) {
221 /* Get the part of speech */
222 anthy_get_nth_dic_ent_wtype(se, &seg->str, i, &wt);
223 ct = anthy_wtype_get_ct(wt);
224 /* If it is the terminal form, or the base form of a non-conjugating word */
225 if (ct == CT_SYUSI || ct == CT_NONE) {
226 ce = alloc_cand_ent();
227 anthy_get_nth_dic_ent_str(se,&seg->str, i, &xs);
/* the buffer filled in by anthy_get_nth_dic_ent_str() is stored directly,
 * without a further copy */
228 ce->str.str = xs.str;
229 ce->str.len = xs.len;
230 ce->flag = CEF_SINGLEWORD;
231 push_back_candidate(seg, ce);
/*
 * Push the unconverted forms of the segment as candidates: a copy of the
 * segment string (CEF_HIRAGANA), its anthy_xstr_hira_to_kata() form
 * (CEF_KATAKANA), and the result of anthy_conv_half_wide().
 * NOTE(review): the flag given to the third candidate, any NULL check on the
 * anthy_conv_half_wide() result and the release of the temporary xs strings
 * are not in this listing -- confirm against the full file.
 */
237 push_back_noconv_candidate(struct seg_ent *seg)
239 /* Add the unconverted candidates: the katakana one and the hiragana-only one */
/* hiragana: the segment string as it is */
244 ce = alloc_cand_ent();
245 ce->str.str = anthy_xstr_dup_str(&seg->str);
246 ce->str.len = seg->str.len;
247 ce->flag = CEF_HIRAGANA;
248 push_back_candidate(seg, ce);
/* katakana */
251 ce = alloc_cand_ent();
252 xs = anthy_xstr_hira_to_kata(&seg->str);
253 ce->str.str = anthy_xstr_dup_str(xs);
254 ce->str.len = xs->len;
255 ce->flag = CEF_KATAKANA;
257 push_back_candidate(seg, ce);
/* half-width / full-width conversion */
260 xs = anthy_conv_half_wide(&seg->str);
262 ce = alloc_cand_ent();
263 ce->str.str = anthy_xstr_dup_str(xs);
264 ce->str.len = xs->len;
267 push_back_candidate(seg, ce);
271 /* Build the cand_elm array from the part_info array held in a word_list */
/*
 * For every non-empty part i of wl, fills ce->elm[i + index] with the
 * sequence entry, reading, word type and ratio of that part. Also records
 * ce->core_elm_index for PART_CORE and ce->dep_word_hash for PART_DEPWORD.
 * NOTE(review): the remaining parameters (ce, index and is_reverse are used
 * in the body) and the line advancing `from` are not in this listing.
 */
273 make_cand_elem_from_word_list(struct seg_ent *se,
275 struct word_list *wl,
/* offset of the word list relative to the start of the segment */
280 int from = wl->from - se->from;
282 for (i = 0; i < NR_PARTS; ++i) {
283 struct part_info *part = &wl->part[i];
285 if (part->len == 0) {
286 /* Ignore parts that have no length */
289 if (i == PART_CORE) {
290 ce->core_elm_index = i + index;
/* core_xs is a view into the segment string; nothing is copied */
292 core_xs.str = &se->str.str[from];
293 core_xs.len = part->len;
294 if (i == PART_DEPWORD) {
295 ce->dep_word_hash = anthy_dep_word_hash(&core_xs);
297 ce->elm[i + index].se = anthy_get_seq_ent_from_xstr(&core_xs, is_reverse);
298 ce->elm[i + index].str.str = core_xs.str;
299 ce->elm[i + index].str.len = core_xs.len;
300 ce->elm[i + index].wt = part->wt;
301 ce->elm[i + index].ratio = RATIO_BASE * wl->len;
307 /** First, extract the meta_word from a metaword that has a wordlist */
/*
 * Builds a template cand_ent with mw->nr_parts elements from mw->wl, sets
 * its flag, and lets enum_candidates() push the actual candidates onto se.
 * The template itself is released afterwards.
 * NOTE(review): the last parameter (is_reverse is used in the body) and the
 * local declarations are not in this listing.
 */
309 make_candidate_from_simple_metaword(struct seg_ent *se,
310 struct meta_word *mw,
311 struct meta_word *top_mw,
/* The next line is the remainder of a block comment whose delimiters are not
 * in this listing; it reads roughly: "committed in a state where the part of
 * speech of each word has been decided." */
315 * ³Æñ¸ì¤ÎÉʻ줬·èÄꤵ¤ì¤¿¾õÂ֤ǥ³¥ß¥Ã¥È¤µ¤ì¤ë¡£
319 /* Assign words to a segment made up of several (possibly just one) words */
320 ce = alloc_cand_ent();
321 ce->nr_words = mw->nr_parts;
/* NOTE(review): calloc() is called as (size, count); the allocated size is
 * the same, but the conventional argument order is (count, size). */
324 ce->elm = calloc(sizeof(struct cand_elm),ce->nr_words);
328 /* prefix, independent-word part, suffix, dependent word */
329 make_cand_elem_from_word_list(se, ce, mw->wl, 0, is_reverse);
331 /* If it was wrapped, treat it the same as GUESS and lower the score */
332 if (anthy_metaword_type_tab[top_mw->type].status != MW_STATUS_WRAPPED) {
333 ce->flag = (se->best_mw == mw) ? CEF_BEST : CEF_NONE;
335 ce->flag = CEF_GUESS;
/* start the enumeration at word 0, offset 0 */
338 enum_candidates(se, ce, 0, 0);
339 anthy_release_cand_ent(ce);
342 /** A combined metaword joins two words and emits them as a single word */
/*
 * Builds a template cand_ent with mw->nr_parts elements: the elements from
 * index 0 come from mw->mw1->wl and those from index NR_PARTS come from
 * mw->mw2->mw1->wl. enum_candidates() then pushes the actual candidates onto
 * se, and the template is released.
 * NOTE(review): the last parameter (is_reverse is used in the body) and the
 * local declarations are not in this listing.
 */
344 make_candidate_from_combined_metaword(struct seg_ent *se,
345 struct meta_word *mw,
346 struct meta_word *top_mw,
/* The next line is the remainder of a block comment whose delimiters are not
 * in this listing; it reads roughly: "committed in a state where the part of
 * speech of each word has been decided." */
350 * ³Æñ¸ì¤ÎÉʻ줬·èÄꤵ¤ì¤¿¾õÂ֤ǥ³¥ß¥Ã¥È¤µ¤ì¤ë¡£
354 /* Assign words to a segment made up of several (possibly just one) words */
355 ce = alloc_cand_ent();
356 ce->nr_words = mw->nr_parts;
/* NOTE(review): calloc() is called as (size, count); the allocated size is
 * the same, but the conventional argument order is (count, size). */
360 ce->elm = calloc(sizeof(struct cand_elm),ce->nr_words);
363 /* prefix, independent-word part, suffix, dependent word */
364 make_cand_elem_from_word_list(se, ce, mw->mw1->wl, 0, is_reverse);
366 make_cand_elem_from_word_list(se, ce, mw->mw2->mw1->wl, NR_PARTS, is_reverse);
369 /* If it was wrapped, treat it the same as GUESS and lower the score */
370 if (anthy_metaword_type_tab[top_mw->type].status != MW_STATUS_WRAPPED) {
371 ce->flag = (se->best_mw == mw) ? CEF_BEST : CEF_NONE;
373 ce->flag = CEF_GUESS;
/* start the enumeration at word 0, offset 0 */
376 enum_candidates(se, ce, 0, 0);
377 anthy_release_cand_ent(ce);
/*
 * Dispatches on the metaword: one with a non-empty word list goes to
 * make_candidate_from_simple_metaword(); otherwise the status looked up in
 * anthy_metaword_type_tab selects recursion into mw->mw1 (WRAPPED),
 * make_candidate_from_combined_metaword() (COMBINED), or a candidate built
 * directly from mw->cand_hint (COMPOUND, COMPOUND_PART, OCHAIRE).
 * NOTE(review): the switch header, break/return statements and the closing
 * delimiters of some comments are not in this listing.
 */
381 /** Generate candidates using the splitter's information
384 proc_splitter_info(struct seg_ent *se,
385 struct meta_word *mw,
386 /* "top" means the top of the tree */
387 struct meta_word *top_mw,
393 /* First, the case of a metaword that has a wordlist */
394 if (mw->wl && mw->wl->len) {
395 make_candidate_from_simple_metaword(se, mw, top_mw, is_reverse);
399 st = anthy_metaword_type_tab[mw->type].status;
401 case MW_STATUS_WRAPPED:
402 /* Extract the information of the wrapped metaword */
403 proc_splitter_info(se, mw->mw1, top_mw, is_reverse);
405 case MW_STATUS_COMBINED:
406 make_candidate_from_combined_metaword(se, mw, top_mw, is_reverse);
408 case MW_STATUS_COMPOUND:
/* the candidate text is a copy of mw->cand_hint, flagged CEF_COMPOUND */
412 ce = alloc_cand_ent();
413 ce->str.str = anthy_xstr_dup_str(&mw->cand_hint);
414 ce->str.len = mw->cand_hint.len;
415 ce->flag = CEF_COMPOUND;
417 push_back_candidate(se, ce);
420 case MW_STATUS_COMPOUND_PART:
421 /* The individual segments of a compound segment, joined and treated as one segment */
423 case MW_STATUS_OCHAIRE:
425 /* A candidate string that has no metaword
428 ce = alloc_cand_ent();
429 ce->str.str = anthy_xstr_dup_str(&mw->cand_hint);
430 ce->str.len = mw->cand_hint.len;
432 ce->flag = (st == MW_STATUS_OCHAIRE) ? CEF_OCHAIRE : CEF_COMPOUND_PART;
434 if (mw->len < se->len) {
435 /* Append the string of the region not covered by the metaword */
437 xs.str = &se->str.str[mw->len];
438 xs.len = se->len - mw->len;
439 anthy_xstrcat(&ce->str ,&xs);
441 push_back_candidate(se, ce);
/*
 * Fills se with candidates, in this order: candidates derived from each
 * metaword in se->mw_array, single-word candidates, the unconverted
 * candidates, and finally the guessed candidate.
 * NOTE(review): the return type, local declarations and return statement
 * are not in this listing.
 */
451 /** The top-level routine called from context.c
452 * Always generates at least one candidate
455 anthy_do_make_candidates(struct splitter_context *sc,
456 struct seg_ent *se, int is_reverse)
460 /* Generate candidates from the metawords */
461 for (i = 0; i < se->nr_metaword; i++) {
462 struct meta_word *mw = se->mw_array[i];
463 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) {
464 anthy_print_metaword(sc, mw);
/* each metaword is passed as its own top-level metaword */
466 proc_splitter_info(se, mw, mw, is_reverse);
468 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) {
471 /* Candidates such as single kanji */
472 push_back_singleword_candidate(se, is_reverse);
474 /* Create the unconverted hiragana and katakana entries */
475 push_back_noconv_candidate(se);
477 /* When there are only two candidates, try to build one whose last character is a particle and whose remainder is kana */
/* NOTE(review): no check of the candidate count is visible in this listing;
 * the call below appears unconditional -- confirm against the full file. */
478 push_back_guessed_candidate(se);