2 * One or more segments (bunsetsu) or words are grouped into a set and handled as a metaword.
3 * The various kinds of metaword are generated here.
5 * init_metaword_tab() builds the information used for metaword processing
6 * anthy_make_metaword_all() builds the metawords in the context
7 * anthy_print_metaword() prints the specified metaword
9 * Funded by IPA Exploratory Software Project (Mitou Software Creation Project) 2001 10/29
10 * Copyright (C) 2000-2006 TABATA Yusuke
11 * Copyright (C) 2004-2006 YOSHIDA Yuichi
12 * Copyright (C) 2000-2003 UGAWA Tomoharu
18 #include <anthy/record.h>
19 #include <anthy/splitter.h>
20 #include <anthy/xchar.h>
21 #include <anthy/xstr.h>
22 #include <anthy/segment.h>
23 #include <anthy/segclass.h>
24 #include "wordborder.h"
26 /* How each kind of meta_word is to be processed */
27 struct metaword_type_tab_ anthy_metaword_type_tab[] = {
28 {MW_DUMMY,"dummy",MW_STATUS_NONE,MW_CHECK_SINGLE},
29 {MW_SINGLE,"single",MW_STATUS_NONE,MW_CHECK_SINGLE},
30 {MW_WRAP,"wrap",MW_STATUS_WRAPPED,MW_CHECK_WRAP},
31 {MW_COMPOUND_HEAD,"compound_head",MW_STATUS_NONE,MW_CHECK_COMPOUND},
32 {MW_COMPOUND,"compound",MW_STATUS_NONE,MW_CHECK_NONE},
33 {MW_COMPOUND_LEAF,"compound_leaf",MW_STATUS_COMPOUND,MW_CHECK_NONE},
34 {MW_COMPOUND_PART,"compound_part",MW_STATUS_COMPOUND_PART,MW_CHECK_SINGLE},
35 {MW_V_RENYOU_A,"v_renyou_a",MW_STATUS_COMBINED,MW_CHECK_BORDER},
36 {MW_V_RENYOU_NOUN,"v_renyou_noun",MW_STATUS_COMBINED,MW_CHECK_BORDER},
37 {MW_NUMBER,"number",MW_STATUS_COMBINED,MW_CHECK_NUMBER},
38 {MW_OCHAIRE,"ochaire",MW_STATUS_OCHAIRE,MW_CHECK_OCHAIRE},
40 {MW_END,"end",MW_STATUS_NONE,MW_CHECK_NONE}
44 combine_metaword(struct splitter_context *sc, struct meta_word *mw);
46 /* Add a metaword to the context */
48 anthy_commit_meta_word(struct splitter_context *sc,
51 struct word_split_info_cache *info = sc->word_split_info;
52 /* List of nodes that share the same start point */
53 mw->next = info->cnode[mw->from].mw;
54 info->cnode[mw->from].mw = mw;
56 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_MW) {
57 anthy_print_metaword(sc, mw);
62 print_metaword_features(int features)
64 if (features & MW_FEATURE_SV) {
67 if (features & MW_FEATURE_WEAK_CONN) {
70 if (features & MW_FEATURE_SUFFIX) {
73 if (features & MW_FEATURE_NUM) {
76 if (features & MW_FEATURE_CORE1) {
79 if (features & MW_FEATURE_HIGH_FREQ) {
85 anthy_do_print_metaword(struct splitter_context *sc,
90 for (i = 0; i < indent; i++) {
93 printf("*meta word type=%s(%d-%d):score=%d:seg_class=%s",
94 anthy_metaword_type_tab[mw->type].name,
95 mw->from, mw->len, mw->score,
96 anthy_seg_class_name(mw->seg_class));
97 print_metaword_features(mw->mw_features);
98 printf(":can_use=%d*\n", mw->can_use);
100 anthy_print_word_list(sc, mw->wl);
102 if (mw->cand_hint.str) {
104 anthy_putxstr(&mw->cand_hint);
108 anthy_do_print_metaword(sc, mw->mw1, indent + 1);
111 anthy_do_print_metaword(sc, mw->mw2, indent + 1);
116 anthy_print_metaword(struct splitter_context *sc,
117 struct meta_word *mw)
119 anthy_do_print_metaword(sc, mw, 0);
122 static struct meta_word *
123 alloc_metaword(struct splitter_context *sc)
125 struct meta_word *mw;
126 mw = anthy_smalloc(sc->word_split_info->MwAllocator);
127 mw->type = MW_SINGLE;
129 mw->struct_score = 0;
130 mw->dep_word_hash = 0;
131 mw->core_wt = anthy_wt_none;
133 mw->dep_class = DEP_NONE;
137 mw->cand_hint.str = NULL;
138 mw->cand_hint.len = 0;
139 mw->seg_class = SEG_HEAD;
146 * Extract the prefix part and the suffix part of wl as strings
149 get_surrounding_text(struct splitter_context* sc,
150 struct word_list* wl,
151 xstr* xs_pre, xstr* xs_post)
153 int post_len = wl->part[PART_DEPWORD].len + wl->part[PART_POSTFIX].len;
154 int pre_len = wl->part[PART_PREFIX].len;
156 xs_pre->str = sc->ce[wl->from].c;
157 xs_pre->len = pre_len;
158 xs_post->str = sc->ce[wl->from + wl->len - post_len].c;
159 xs_post->len = post_len;
165 for (i = 0; i < xs->len; i++) {
166 if (xs->str[i] == KK_VU) {
174 * Take the n-th part out of the compound word wl and turn it into a mw
176 static struct meta_word*
177 make_compound_nth_metaword(struct splitter_context* sc,
178 compound_ent_t ce, int nth,
179 struct word_list* wl,
180 enum metaword_type type)
185 int seg_num = anthy_compound_get_nr_segments(ce);
186 struct meta_word* mw;
187 xstr xs_pre, xs_core, xs_post;
189 get_surrounding_text(sc, wl, &xs_pre, &xs_post);
191 for (i = 0; i <= nth; ++i) {
194 len = anthy_compound_get_nth_segment_len(ce, i);
195 part.str = sc->ce[from].c;
197 len -= count_vu(&part);
201 if (i == seg_num - 1) {
206 mw = alloc_metaword(sc);
211 mw->seg_class = wl->seg_class;
213 anthy_compound_get_nth_segment_xstr(ce, nth, &xs_core);
215 anthy_xstrcat(&mw->cand_hint, &xs_pre);
217 anthy_xstrcat(&mw->cand_hint, &xs_core);
218 if (nth == seg_num - 1) {
219 anthy_xstrcat(&mw->cand_hint, &xs_post);
226 * Actually combine the metawords
228 static struct meta_word *
229 anthy_do_cons_metaword(struct splitter_context *sc,
230 enum metaword_type type,
231 struct meta_word *mw, struct meta_word *mw2)
235 n = alloc_metaword(sc);
237 n->len = mw->len + (mw2 ? mw2->len : 0);
240 n->score = sqrt(mw->score) * sqrt(mw2->score);
242 n->score = mw->score;
248 n->seg_class = mw2->seg_class;
249 n->nr_parts = mw->nr_parts + mw2->nr_parts;
250 n->dep_word_hash = mw2->dep_word_hash;
252 n->seg_class = mw->seg_class;
253 n->nr_parts = mw->nr_parts;
254 n->dep_word_hash = mw->dep_word_hash;
256 anthy_commit_meta_word(sc, n);
261 * Create the meta_words for a compound word.
264 make_compound_metaword(struct splitter_context* sc, struct word_list* wl)
267 seq_ent_t se = wl->part[PART_CORE].seq;
268 int ent_num = anthy_get_nr_dic_ents(se, NULL);
270 for (i = 0; i < ent_num; ++i) {
273 struct meta_word *mw = NULL;
274 struct meta_word *mw2 = NULL;
275 if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
278 ce = anthy_get_nth_compound_ent(se, i);
279 seg_num = anthy_compound_get_nr_segments(ce);
281 for (j = seg_num - 1; j >= 0; --j) {
282 enum metaword_type type;
283 mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_LEAF);
284 anthy_commit_meta_word(sc, mw);
286 type = j == 0 ? MW_COMPOUND_HEAD : MW_COMPOUND;
287 mw2 = anthy_do_cons_metaword(sc, type, mw, mw2);
293 * Create meta_words that join the individual segments inside a compound word.
296 make_compound_part_metaword(struct splitter_context* sc, struct word_list* wl)
299 seq_ent_t se = wl->part[PART_CORE].seq;
300 int ent_num = anthy_get_nr_dic_ents(se, NULL);
302 for (i = 0; i < ent_num; ++i) {
305 struct meta_word *mw = NULL;
306 struct meta_word *mw2 = NULL;
308 if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
312 ce = anthy_get_nth_compound_ent(se, i);
313 seg_num = anthy_compound_get_nr_segments(ce);
316 for (j = seg_num - 1; j >= 0; --j) {
317 mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_PART);
318 for (k = j - 1; k >= 0; --k) {
319 mw2 = make_compound_nth_metaword(sc, ce, k, wl, MW_COMPOUND_PART);
321 mw2->score += mw->score;
322 anthy_xstrcat(&mw2->cand_hint, &mw->cand_hint);
324 anthy_commit_meta_word(sc, mw2);
335 make_simple_metaword(struct splitter_context *sc, struct word_list* wl)
337 struct meta_word *mw = alloc_metaword(sc);
342 mw->type = MW_SINGLE;
343 mw->dep_class = wl->part[PART_DEPWORD].dc;
344 mw->seg_class = wl->seg_class;
345 if (wl->part[PART_CORE].len) {
346 mw->core_wt = wl->part[PART_CORE].wt;
348 mw->nr_parts = NR_PARTS;
349 mw->dep_word_hash = wl->dep_word_hash;
350 mw->mw_features = wl->mw_features;
351 anthy_commit_meta_word(sc, mw);
355 * Create a metaword that consists of a single word_list
358 make_metaword_from_word_list(struct splitter_context *sc)
361 for (i = 0; i < sc->char_count; i++) {
362 struct word_list *wl;
363 for (wl = sc->word_split_info->cnode[i].wl;
365 if (wl->is_compound) {
366 make_compound_part_metaword(sc, wl);
367 make_compound_metaword(sc, wl);
369 make_simple_metaword(sc, wl);
376 * Combine metawords in a list-like fashion
378 static struct meta_word *
379 list_metaword(struct splitter_context *sc,
380 enum metaword_type type,
381 struct meta_word *mw, struct meta_word *mw2)
383 struct meta_word *wrapped_mw = anthy_do_cons_metaword(sc, type, mw2, NULL);
384 struct meta_word *n = anthy_do_cons_metaword(sc, type, mw, wrapped_mw);
386 n->mw_features = mw->mw_features | mw2->mw_features;
392 * Verb continuative form (renyou) + adjectivizing suffix, e.g. "~shiyasui"
395 try_combine_v_renyou_a(struct splitter_context *sc,
396 struct meta_word *mw, struct meta_word *mw2)
399 if (!mw->wl || !mw2->wl) return;
401 w2 = mw2->wl->part[PART_CORE].wt;
403 if (mw->wl->head_pos == POS_V &&
404 mw->wl->tail_ct == CT_RENYOU &&
405 anthy_wtype_get_pos(w2) == POS_D2KY) {
406 /* It is an adjective, so perform the next check */
407 if (anthy_get_seq_ent_wtype_freq(mw2->wl->part[PART_CORE].seq,
408 anthy_wtype_a_tail_of_v_renyou)) {
409 list_metaword(sc, MW_V_RENYOU_A, mw, mw2);
415 * Verb continuative form (renyou) + nominalizing suffix (#D2T35), e.g. "ire tate (no ocha)"
418 try_combine_v_renyou_noun(struct splitter_context *sc,
419 struct meta_word *mw, struct meta_word *mw2)
422 if (!mw->wl || !mw2->wl) return;
424 w2 = mw2->wl->part[PART_CORE].wt;
425 if (mw->wl->head_pos == POS_V &&
426 mw->wl->tail_ct == CT_RENYOU &&
427 anthy_wtype_get_pos(w2) == POS_NOUN &&
428 anthy_wtype_get_scos(w2) == SCOS_T40) {
429 list_metaword(sc, MW_V_RENYOU_NOUN, mw, mw2);
437 try_combine_number(struct splitter_context *sc,
438 struct meta_word *mw1, struct meta_word *mw2)
440 struct word_list *wl1 = mw1->wl;
441 struct word_list *wl2 = mw2->wl;
442 struct meta_word *combined_mw;
443 int recursive = wl2 ? 0 : 1; /* 1 when combining with an already-combined mw */
447 if (anthy_wtype_get_pos(wl1->part[PART_CORE].wt) != POS_NUMBER) return;
449 /* The right mw is a mw made by combining numbers */
450 if (mw2->type != MW_NUMBER) return;
454 if (anthy_wtype_get_pos(wl2->part[PART_CORE].wt) != POS_NUMBER) return;
456 /* If no characters are attached after the left mw */
457 if (wl1->part[PART_POSTFIX].len == 0 &&
458 wl1->part[PART_DEPWORD].len == 0) {
459 int scos1 = anthy_wtype_get_scos(wl1->part[PART_CORE].wt);
460 int scos2 = anthy_wtype_get_scos(wl2->part[PART_CORE].wt);
463 if (scos2 == SCOS_NONE) return;
465 The kinds of right mw that may follow depend on the kind of the left mw.
466 For example, only the ten-thousands (man) or hundred-millions (oku) units may follow 1-9,
467 whereas 10-90 may additionally be followed by 1-9 and so on.
471 if (scos2 == SCOS_N1) return; /* 1-9 must not follow */
473 if (scos2 == SCOS_N10) return; /* 10-90 must not follow */
475 if (scos2 == SCOS_N100) return; /* 100-900 must not follow */
477 if (scos2 == SCOS_N1000) return; /* 1000-9000 must not follow */
479 /* The ten-thousands (man), hundred-millions (oku), ... units
480 can always follow */
487 combined_mw = anthy_do_cons_metaword(sc, MW_NUMBER, mw1, mw2);
489 /* When combining for the first time, append a null at the end to make it a list */
490 combined_mw = list_metaword(sc, MW_NUMBER, mw1, mw2);
492 combine_metaword(sc, combined_mw);
496 /* Check whether this can be combined with the metaword to its right */
498 try_combine_metaword(struct splitter_context *sc,
499 struct meta_word *mw1, struct meta_word *mw2)
501 if (!mw1->wl) return;
503 /* To combine metawords, the following
504 metaword must not have a prefix */
505 if (mw2->wl && mw2->wl->part[PART_PREFIX].len > 0) {
509 try_combine_v_renyou_a(sc, mw1, mw2);
510 try_combine_v_renyou_noun(sc, mw1, mw2);
511 try_combine_number(sc, mw1, mw2);
515 combine_metaword(struct splitter_context *sc, struct meta_word *mw)
517 struct word_split_info_cache *info = sc->word_split_info;
520 if (mw->mw_features & MW_FEATURE_DEP_ONLY) {
521 /* Do not combine with a segment made only of dependent words */
525 for (i = mw->from - 1; i >= 0; i--) {
526 struct meta_word *mw_left;
527 for (mw_left = info->cnode[i].mw; mw_left; mw_left = mw_left->next) {
528 if (mw_left->from + mw_left->len == mw->from) {
529 /* Check whether they can be combined */
530 try_combine_metaword(sc, mw_left, mw);
537 combine_metaword_all(struct splitter_context *sc)
541 struct word_split_info_cache *info = sc->word_split_info;
542 /* Loop over the left edge of the metaword */
543 for (i = sc->char_count - 1; i >= 0; i--){
544 struct meta_word *mw;
545 /* Loop over each metaword */
546 for (mw = info->cnode[i].mw;
548 combine_metaword(sc, mw);
554 make_dummy_metaword(struct splitter_context *sc, int from,
555 int len, int orig_len)
558 struct meta_word *mw, *n;
560 for (mw = sc->word_split_info->cnode[from].mw; mw; mw = mw->next) {
561 if (mw->len != orig_len) continue;
562 if (mw->score > score) {
567 n = alloc_metaword(sc);
571 n->score = 3 * score * len / orig_len;
575 anthy_commit_meta_word(sc, n);
579 * When a segment has been extended, remember that
582 make_expanded_metaword_all(struct splitter_context *sc)
585 if (anthy_select_section("EXPANDPAIR", 0) == -1) {
588 for (i = 0; i < sc->char_count; i++) {
589 for (j = 1; j < sc->char_count - i; j++) {
590 /* For every substring */
593 xs.str = sc->ce[i].c;
594 if (anthy_select_row(&xs, 0) == 0) {
595 /* This substring was the target of an expansion in the past */
597 int nr = anthy_get_nr_values();
598 for (k = 0; k < nr; k++) {
600 exs = anthy_get_nth_xstr(k);
601 if (exs && exs->len <= sc->char_count - i) {
603 txs.str = sc->ce[i].c;
605 if (!anthy_xstrcmp(&txs, exs)) {
606 make_dummy_metaword(sc, i, txs.len, j);
615 /* Create the metawords for ochaire learning */
617 make_ochaire_metaword(struct splitter_context *sc,
620 struct meta_word *mw;
631 count = anthy_get_nth_value(0);
632 /* Compute the total number of characters excluding the rightmost segment */
633 for (s = 0, j = 0; j < count - 1; j++) {
634 s += anthy_get_nth_value(j * 2 + 1);
636 /* Build the metaword for the rightmost segment */
637 xs = anthy_get_nth_xstr((count - 1) * 2 + 2);
641 seg_len = anthy_get_nth_value((count - 1) * 2 + 1);
642 mw = alloc_metaword(sc);
643 mw->type = MW_OCHAIRE;
646 mw->score = OCHAIRE_SCORE;
647 mw->cand_hint.str = malloc(sizeof(xchar)*xs->len);
648 anthy_xstrcpy(&mw->cand_hint, xs);
649 anthy_commit_meta_word(sc, mw);
651 /* Build metawords for the remaining segments */
652 for (j-- ; j >= 0; j--) {
654 seg_len = anthy_get_nth_value(j * 2 + 1);
656 xs = anthy_get_nth_xstr(j * 2 + 2);
660 n = alloc_metaword(sc);
661 n->type = MW_OCHAIRE;
662 /* Link the metaword on the right */
666 n->score = OCHAIRE_SCORE;
667 n->cand_hint.str = malloc(sizeof(xchar)*xs->len);
668 anthy_xstrcpy(&n->cand_hint, xs);
669 anthy_commit_meta_word(sc, n);
676 * Search the history for a set of multiple segments
679 make_ochaire_metaword_all(struct splitter_context *sc)
682 if (anthy_select_section("OCHAIRE", 0) == -1) {
686 for (i = 0; i < sc->char_count; i++) {
688 xs.len = sc->char_count - i;
689 xs.str = sc->ce[i].c;
690 if (anthy_select_longest_row(&xs) == 0) {
693 anthy_mark_row_used();
694 key = anthy_get_index_xstr();
697 make_ochaire_metaword(sc, i, len);
698 /* Start from the character following the meta_word found this time */
706 add_dummy_metaword(struct splitter_context *sc,
710 n = alloc_metaword(sc);
715 n->seg_class = SEG_BUNSETSU;
716 anthy_commit_meta_word(sc, n);
719 /* Wrap the specified metaword to create a meta_word that is j characters longer */
721 expand_meta_word(struct splitter_context *sc,
722 struct meta_word *mw, int from, int len,
723 int destroy_seg_class, int j)
726 n = alloc_metaword(sc);
732 n->score = mw->score;
733 n->nr_parts = mw->nr_parts;
734 if (destroy_seg_class) {
735 n->seg_class = SEG_BUNSETSU;
738 n->seg_class = mw->seg_class;
743 n->seg_class = SEG_BUNSETSU;
745 anthy_commit_meta_word(sc, n);
749 * Build a metaword with the miscellaneous characters after the metaword attached to it
752 make_metaword_with_depchar(struct splitter_context *sc,
753 struct meta_word *mw)
756 int destroy_seg_class = 0;
757 int from = mw ? mw->from : 0;
758 int len = mw ? mw->len : 0;
760 /* Examine the type of the character immediately after the metaword */
762 if (sc->char_count <= from + len) {
765 type = anthy_get_xchar_type(*sc->ce[from + len].c);
766 if (!(type & XCT_SYMBOL) &&
767 !(type & XCT_PART)) {
770 if (type & XCT_PUNCTUATION) {
771 /* If it is punctuation, make it a separate segment */
775 /* Stop attaching once a character of a different type is reached */
776 for (j = 0; from + len + j < sc->char_count; j++) {
777 int p = from + len + j;
778 if ((anthy_get_xchar_type(*sc->ce[p].c) != type)) {
781 if (!(p + 1 < sc->char_count) ||
782 *sc->ce[p].c != *sc->ce[p + 1].c) {
783 destroy_seg_class = 1;
787 /* On leaving the loop above, j holds the number of characters that cannot stand alone */
789 /* There are characters that cannot stand alone, so create a metaword with them attached */
791 expand_meta_word(sc, mw, from, len, destroy_seg_class, j);
796 make_metaword_with_depchar_all(struct splitter_context *sc)
799 struct word_split_info_cache *info = sc->word_split_info;
801 /* For every metaword */
802 for (i = 0; i < sc->char_count; i++) {
803 struct meta_word *mw;
804 for (mw = info->cnode[i].mw;
806 make_metaword_with_depchar(sc, mw);
808 if (!info->cnode[i].mw) {
810 add_dummy_metaword(sc, i);
813 /* The one that starts at the left edge of the sentence */
814 make_metaword_with_depchar(sc, NULL);
822 for (i = xs->len - 1; i >= 1; --i) {
823 xct = anthy_get_xchar_type(xs->str[i]);
824 if (!(xct & XCT_PART)) {
832 bias_to_single_char_metaword(struct splitter_context *sc)
836 for (i = sc->char_count - 1; i >= 0; --i) {
837 struct meta_word *mw;
841 struct char_node *cnode = &sc->word_split_info->cnode[i];
843 /* A bracket can form a segment with a single character */
844 xct = anthy_get_xchar_type(*sc->ce[i].c);
845 if (xct & (XCT_OPEN|XCT_CLOSE)) {
849 xs.str = sc->ce[i].c;
850 for (mw = cnode->mw; mw; mw = mw->next) {
851 /* Do not penalize a segment made only of dependent words */
852 if (mw->mw_features & MW_FEATURE_DEP_ONLY) {
855 /* Lower the score of a single character (+ repetitions of characters that attach to the preceding one) */
857 if (is_single(&xs)) {
865 anthy_mark_border_by_metaword(struct splitter_context* sc,
866 struct meta_word* mw)
868 struct word_split_info_cache* info = sc->word_split_info;
876 case MW_COMPOUND_PART:
877 info->seg_border[mw->from] = 1;
879 case MW_COMPOUND_LEAF:
880 info->seg_border[mw->from] = 1;
881 info->best_mw[mw->from] = mw;
884 case MW_COMPOUND_HEAD:
889 info->best_mw[mw->mw1->from] = mw->mw1;
890 anthy_mark_border_by_metaword(sc, mw->mw1);
891 anthy_mark_border_by_metaword(sc, mw->mw2);
895 case MW_V_RENYOU_NOUN:
896 info->seg_border[mw->from] = 1;
899 anthy_mark_border_by_metaword(sc, mw->mw1);
902 info->seg_border[mw->from] = 1;
903 anthy_mark_border_by_metaword(sc, mw->mw1);
911 anthy_make_metaword_all(struct splitter_context *sc)
913 /* First, create the metawords made of a single word_list */
914 make_metaword_from_word_list(sc);
916 /* Combine the metawords */
917 combine_metaword_all(sc);
919 /* Process the segments that were expanded */
920 make_expanded_metaword_all(sc);
922 /* Process symbols such as voiced-sound marks and long-vowel marks, and other symbols */
923 make_metaword_with_depchar_all(sc);
926 make_ochaire_metaword_all(sc);
928 /* Penalize single-character segments */
929 bias_to_single_char_metaword(sc);
933 * Count the metawords that cover the specified range
936 anthy_get_nr_metaword(struct splitter_context *sc,
939 struct meta_word *mw;
942 for (n = 0, mw = sc->word_split_info->cnode[from].mw;
944 if (mw->len == len && mw->can_use == ok) {
952 anthy_get_nth_metaword(struct splitter_context *sc,
953 int from, int len, int nth)
955 struct meta_word *mw;
957 for (n = 0, mw = sc->word_split_info->cnode[from].mw;
959 if (mw->len == len && mw->can_use == ok) {