2 * Builds word_lists, the smallest units that make up a segment (bunsetsu)
4 * anthy_make_word_list_all()
5 * enumerates the substrings that satisfy the form of a segment
6 * word_lists enumerated through the several paths are
7 * added to the splitter_context by anthy_commit_word_list
9 * Funded by IPA Exploratory Software Project (Mitou) 2002 2/27
10 * Copyright (C) 2000-2006 TABATA Yusuke
11 * Copyright (C) 2004-2006 YOSHIDA Yuichi
12 * Copyright (C) 2000-2003 UGAWA Tomoharu
14 * $Id: wordlist.c,v 1.50 2002/11/17 14:45:47 yusuke Exp $
21 #include <arpa/inet.h>
23 #include <anthy/alloc.h>
24 #include <anthy/record.h>
25 #include <anthy/xstr.h>
26 #include <anthy/diclib.h>
27 #include <anthy/wtype.h>
28 #include <anthy/ruleparser.h>
29 #include <anthy/dic.h>
30 #include <anthy/splitter.h>
31 #include <anthy/feature_set.h>
32 #include "wordborder.h"
/* Weak-word table. Assigned from anthy_file_dic_get_section("weak_words")
   in anthy_make_word_list_all() and later read through an int pointer. */
36 static void *weak_word_array;
/* Prints one word_list; anthy_commit_word_list() calls this when the
   SPLITTER_DEBUG_WL debug flag is set.
   xs is pointed in turn at each region of the word_list inside sc->ce.
   The first region is the text in front of the core (from wl->from up to
   the start of PART_CORE). The statements that consume xs after each
   assignment are not part of this excerpt. */
40 anthy_print_word_list(struct splitter_context *sc,
49 xs.len = wl->part[PART_CORE].from - wl->from;
50 xs.str = sc->ce[wl->from].c;
/* PART_CORE itself */
54 xs.len = wl->part[PART_CORE].len;
55 xs.str = sc->ce[wl->part[PART_CORE].from].c;
/* PART_POSTFIX, located right after the core */
59 xs.len = wl->part[PART_POSTFIX].len;
60 xs.str = sc->ce[wl->part[PART_CORE].from + wl->part[PART_CORE].len].c;
/* PART_DEPWORD, located after the core and the postfix */
64 xs.len = wl->part[PART_DEPWORD].len;
65 xs.str = sc->ce[wl->part[PART_CORE].from +
66 wl->part[PART_CORE].len +
67 wl->part[PART_POSTFIX].len].c;
/* word type of the core, then the segment class name plus a ",compound"
   marker when is_compound is set */
69 anthy_print_wtype(wl->part[PART_CORE].wt);
70 printf(" %s%s\n", anthy_seg_class_name(wl->seg_class),
71 (wl->is_compound ? ",compound" : ""));
/* Hash of the string xs: anthy_xstr_hash(xs) reduced modulo WORD_HASH_MAX. */
75 anthy_dep_word_hash(xstr *xs)
77 return anthy_xstr_hash(xs) % WORD_HASH_MAX;
80 /** Compare two word_lists. This is only used for pruning, so
81 the comparison does not have to be strict */
83 word_list_same(struct word_list *wl1, struct word_list *wl2)
85 if (wl1->node_id != wl2->node_id ||
86 wl1->from != wl2->from ||
87 wl1->len != wl2->len ||
88 wl1->mw_features != wl2->mw_features ||
89 wl1->tail_ct != wl2->tail_ct ||
90 wl1->part[PART_CORE].len != wl2->part[PART_CORE].len ||
91 wl1->is_compound != wl2->is_compound ||
92 !anthy_wtype_equal(wl1->part[PART_CORE].wt, wl2->part[PART_CORE].wt) ||
93 wl1->head_pos != wl2->head_pos) {
/* the dc field of the dependent-word part is compared separately */
96 if (wl1->part[PART_DEPWORD].dc != wl2->part[PART_DEPWORD].dc) {
/* Sets mw_features bits on wl according to the properties of its parts. */
104 set_features(struct word_list *wl)
/* core is a noun for which anthy_wtype_get_sv() is non-zero */
106 if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NOUN &&
107 anthy_wtype_get_sv(wl->part[PART_CORE].wt)) {
108 wl->mw_features |= MW_FEATURE_SV;
/* a postfix or a prefix is attached (both set the SUFFIX feature) */
110 if (wl->part[PART_POSTFIX].len || wl->part[PART_PREFIX].len) {
111 wl->mw_features |= MW_FEATURE_SUFFIX;
/* core has the number part of speech */
113 if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NUMBER) {
114 wl->mw_features |= MW_FEATURE_NUM;
/* core is exactly one character long */
116 if (wl->part[PART_CORE].len == 1) {
117 wl->mw_features |= MW_FEATURE_CORE1;
/* core is empty */
119 if (wl->part[PART_CORE].len == 0) {
120 wl->mw_features |= MW_FEATURE_DEP_ONLY;
/* core frequency is above HF_THRESH */
122 if (wl->part[PART_CORE].freq > HF_THRESH) {
123 wl->mw_features |= MW_FEATURE_HIGH_FREQ;
127 /** Compute the score of the word_list that was built, then commit it */
129 anthy_commit_word_list(struct splitter_context *sc,
130 struct word_list *wl)
132 struct word_list *tmp;
135 /* word_lists that consist only of dependent words and have length 0 also arrive here, so skip them */
136 if (wl->len == 0) return;
138 wl->last_part = PART_DEPWORD;
142 /* set the class that is used in the segment boundary search */
143 anthy_set_seg_class(wl);
/* hash the dependent-word string, which starts right after the postfix */
145 xs.len = wl->part[PART_DEPWORD].len;
146 xs.str = sc->ce[wl->part[PART_POSTFIX].from + wl->part[PART_POSTFIX].len].c;
147 wl->dep_word_hash = anthy_dep_word_hash(&xs);
/* when a postfix exists, point xs at it (the use of xs is on lines not in this excerpt) */
148 if (wl->part[PART_POSTFIX].len) {
149 xs.len = wl->part[PART_POSTFIX].len;
150 xs.str = sc->ce[wl->part[PART_POSTFIX].from].c;
153 /* check whether a word_list with the same contents already exists */
154 for (tmp = sc->word_split_info->cnode[wl->from].wl; tmp; tmp = tmp->next) {
155 if (word_list_same(tmp, wl)) {
159 /* add to the list of wordlists (pushed at the head of the list for position wl->from) */
160 wl->next = sc->word_split_info->cnode[wl->from].wl;
161 sc->word_split_info->cnode[wl->from].wl = wl;
163 /* debug print */
164 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_WL) {
165 anthy_print_word_list(sc, wl);
/* Returns the result of anthy_smalloc() on sc->word_split_info->WlAllocator. */
170 anthy_alloc_word_list(struct splitter_context *sc)
172 return anthy_smalloc(sc->word_split_info->WlAllocator);
175 /* Attach the following conjugation endings, particles and auxiliary verbs */
177 make_following_word_list(struct splitter_context *sc,
178 struct word_list *tmpl)
180 /* this xs is the string that follows the independent-word part */
182 xs.str = sc->ce[tmpl->from+tmpl->len].c;
183 xs.len = sc->char_count - tmpl->from - tmpl->len;
/* the dependent-word part begins where the postfix ends */
184 tmpl->part[PART_DEPWORD].from =
185 tmpl->part[PART_POSTFIX].from + tmpl->part[PART_POSTFIX].len;
187 if (tmpl->node_id >= 0) {
188 /* ordinary word_list */
189 anthy_scan_node(sc, tmpl, &xs, tmpl->node_id);
191 /* word_list without an independent word */
192 struct wordseq_rule rule;
193 struct word_list new_tmpl;
195 int nr_rule = anthy_get_nr_dep_rule();
197 /* for each rule that follows noun 35 (POS_NOUN with SCOS_T35) */
198 for (i = 0; i < nr_rule; ++i) {
199 anthy_get_nth_dep_rule(i, &rule);
200 if (anthy_wtype_get_pos(rule.wt) == POS_NOUN
201 && anthy_wtype_get_scos(rule.wt) == SCOS_T35) {
/* NOTE(review): the initialisation of new_tmpl is not visible in this excerpt; only these fields are set here */
202 new_tmpl.part[PART_CORE].wt = rule.wt;
203 new_tmpl.node_id = rule.node_id;
204 new_tmpl.head_pos = anthy_wtype_get_pos(new_tmpl.part[PART_CORE].wt);
205 anthy_scan_node(sc, &new_tmpl, &xs, new_tmpl.node_id);
/* Grows PART_POSTFIX of tmpl by len, records its word type and seq_ent,
   and marks the postfix as the last part of the word_list. */
212 push_part_back(struct word_list *tmpl, int len,
213 seq_ent_t se, wtype_t wt)
216 tmpl->part[PART_POSTFIX].len += len;
217 tmpl->part[PART_POSTFIX].wt = wt;
218 tmpl->part[PART_POSTFIX].seq = se;
219 tmpl->last_part = PART_POSTFIX;
222 /* Attach suffixes */
224 make_suc_words(struct splitter_context *sc,
225 struct word_list *tmpl)
229 wtype_t core_wt = tmpl->part[PART_CORE].wt;
230 /* a suffix attaches to a numeral, a name, or a sa-hen (suru-verb) noun */
232 int core_is_name = 0;
233 int core_is_sv_noun = 0;
235 /* first, check whether this is an independent word that can take a suffix */
236 if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) {
239 if (anthy_wtype_include(anthy_wtype_name_noun, core_wt)) {
242 if (anthy_wtype_get_sv(core_wt)) {
245 if (!core_is_num && !core_is_name && !core_is_sv_noun) {
249 right = tmpl->part[PART_CORE].from + tmpl->part[PART_CORE].len;
250 /* for the string on the right-hand side of the independent word */
252 i <= sc->word_split_info->seq_len[right];
256 xs.str = sc->ce[right].c;
258 suc = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
259 if (anthy_get_seq_ent_pos(suc, POS_SUC)) {
260 /* the right-hand string is a suffix, so check it against the part of speech of the independent word */
261 struct word_list new_tmpl;
263 anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_num_postfix)) {
265 push_part_back(&new_tmpl, i, suc, anthy_wtype_num_postfix);
266 make_following_word_list(sc, &new_tmpl);
269 anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_name_postfix)) {
271 push_part_back(&new_tmpl, i, suc, anthy_wtype_name_postfix);
272 make_following_word_list(sc, &new_tmpl);
274 if (core_is_sv_noun &&
275 anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_sv_postfix)) {
277 push_part_back(&new_tmpl, i, suc, anthy_wtype_sv_postfix);
278 make_following_word_list(sc, &new_tmpl);
/* Extends tmpl to the left by len: the start moves back, the total length
   grows, and PART_PREFIX is updated with the new start, word type and seq_ent. */
285 push_part_front(struct word_list *tmpl, int len,
286 seq_ent_t se, wtype_t wt)
288 tmpl->from = tmpl->from - len;
289 tmpl->len = tmpl->len + len;
290 tmpl->part[PART_PREFIX].from = tmpl->from;
291 tmpl->part[PART_PREFIX].len += len;
292 tmpl->part[PART_PREFIX].wt = wt;
293 tmpl->part[PART_PREFIX].seq = se;
296 /* Attach a prefix first, then attach suffixes */
298 make_pre_words(struct splitter_context *sc,
299 struct word_list *tmpl)
302 wtype_t core_wt = tmpl->part[PART_CORE].wt;
304 /* is the independent word a numeral? */
305 if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) {
308 /* enumerate the prefixes */
310 i <= sc->word_split_info->rev_seq_len[tmpl->part[PART_CORE].from];
313 /* this xs is the string in front of the independent-word part */
315 xs.str = sc->ce[tmpl->part[PART_CORE].from - i].c;
317 pre = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
318 if (anthy_get_seq_ent_pos(pre, POS_PRE)) {
319 struct word_list new_tmpl;
321 anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_num_prefix)) {
323 push_part_front(&new_tmpl, i, pre, anthy_wtype_num_prefix);
324 make_following_word_list(sc, &new_tmpl);
325 /* for numbers, attach suffixes as well */
326 make_suc_words(sc, &new_tmpl);
327 }/* else if (anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_prefix)) {
329 push_part_front(&new_tmpl, i, pre, anthy_wtype_prefix);
330 make_following_word_list(sc, &new_tmpl);
336 /* Initialise a wordlist */
338 setup_word_list(struct word_list *wl, int from, int len,
339 int is_compound, int is_weak)
344 wl->is_compound = is_compound;
345 /* initialise the array of parts */
346 for (i = 0; i < NR_PARTS; i++) {
347 wl->part[i].from = 0;
349 wl->part[i].wt = anthy_wt_none;
351 wl->part[i].freq = 1;/* treat it as a low-frequency word */
352 wl->part[i].dc = DEP_NONE;
354 /* set up the independent-word (core) part */
355 wl->part[PART_CORE].from = from;
356 wl->part[PART_CORE].len = len;
358 wl->mw_features = MW_FEATURE_NONE;
360 wl->last_part = PART_CORE;
361 wl->head_pos = POS_NONE;
362 wl->tail_ct = CT_NONE;
/* NOTE(review): presumably guarded by is_weak on a line not in this excerpt -- confirm */
364 wl->mw_features |= MW_FEATURE_WEAK_SEQ;
369 * For a given independent word, attach prefixes, suffixes and dependent words,
370 * and add the results to the cache as segment candidates (= word_list)
373 make_word_list(struct splitter_context *sc,
379 struct word_list tmpl;
380 struct wordseq_rule rule;
381 int nr_rule = anthy_get_nr_dep_rule();
384 /* initialise the template */
385 setup_word_list(&tmpl, from, len, is_compound, is_weak);
386 tmpl.part[PART_CORE].seq = se;
388 /* compare against each rule to see whether it matches */
389 for (i = 0; i < nr_rule; ++i) {
391 anthy_get_nth_dep_rule(i, &rule);
/* frequency of se for the rule's word type; one of two lookups is used
   (the condition that selects between them is not in this excerpt) */
393 freq = anthy_get_seq_ent_wtype_freq(se, rule.wt);
395 freq = anthy_get_seq_ent_wtype_compound_freq(se, rule.wt);
399 /* the part of speech of the independent word matches this rule */
400 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_ID) {
401 /* for debugging the part-of-speech table */
403 xs.str = sc->ce[tmpl.part[PART_CORE].from].c;
404 xs.len = tmpl.part[PART_CORE].len;
406 printf(" freq=%d rule_id=%d node_id=%d\n",
407 freq, i, rule.node_id);
409 /* copy the information of the rule that was transitioned to */
410 tmpl.part[PART_CORE].wt = rule.wt;
411 tmpl.part[PART_CORE].freq = freq;
412 tmpl.node_id = rule.node_id;
413 tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt);
/* the postfix part starts right after the core */
416 tmpl.part[PART_POSTFIX].from =
417 tmpl.part[PART_CORE].from +
418 tmpl.part[PART_CORE].len;
420 if (anthy_wtype_get_pos(rule.wt) == POS_NOUN ||
421 anthy_wtype_get_pos(rule.wt) == POS_NUMBER) {
422 /* prefixes and suffixes are only allowed to attach to nouns and numerals */
423 make_pre_words(sc, &tmpl);
424 make_suc_words(sc, &tmpl);
426 /* attach particles and auxiliary verbs without any prefix or suffix */
427 make_following_word_list(sc, &tmpl);
/* Builds a zero-length template at position 0 whose core is typed as
   anthy_wtype_noun, then runs the suffix attachment on it. */
433 make_dummy_head(struct splitter_context *sc)
435 struct word_list tmpl;
436 setup_word_list(&tmpl, 0, 0, 0, 0);
437 tmpl.part[PART_CORE].seq = 0;
438 tmpl.part[PART_CORE].wt = anthy_wtype_noun;
440 tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt);
441 make_suc_words(sc, &tmpl);
/* Comparator passed to bsearch(): kp is the search key, cp points at a table
   element. The table element is converted with ntohl() before comparing. */
445 compare_hash(const void *kp, const void *cp)
/* NOTE(review): returning a difference as the comparison result can give the
   wrong sign when the two values are far apart; confirm against the operand
   types declared on the lines missing from this excerpt. */
449 return (*h) - ntohl(*c);
/* Looks up the hash of xs in the weak-word table.
   NOTE(review): the function header is not in this excerpt; presumably this
   is check_weak(), which anthy_make_word_list_all() calls -- confirm. */
455 const int *array = (int *)weak_word_array;
/* the element count is read from array[1] through ntohl() */
461 nr = ntohl(array[1]);
462 h = anthy_xstr_hash(xs);
/* the searched entries start at array[16] */
463 if (bsearch(&h, &array[16], nr,
464 sizeof(int), compare_hash)) {
470 /* Enumerate every word_list from the substrings of the string set in the context */
472 anthy_make_word_list_all(struct splitter_context *sc)
478 struct depword_ent *next;
484 struct word_split_info_cache *info;
487 weak_word_array = anthy_file_dic_get_section("weak_words");
489 info = sc->word_split_info;
/* temporary allocator for depword_ent records; released at the end of this function */
491 de_ator = anthy_create_allocator(sizeof(struct depword_ent), 0);
/* hand the whole input string to anthy_gang_load_dic() before the lookups below */
493 xs.str = sc->ce[0].c;
494 xs.len = sc->char_count;
495 anthy_gang_load_dic(&xs, sc->is_reverse);
497 /* enumerate all independent words */
498 /* loop over the start position */
499 for (i = 0; i < sc->char_count ; i++) {
500 int search_len = sc->char_count - i;
/* NOTE(review): the body of this check is not in the excerpt; presumably it caps search_len -- confirm */
502 if (search_len > 30) {
506 /* loop over the string length (longest first) */
507 for (j = search_len; j > search_from; j--) {
508 /* obtain the seq_ent */
510 xs.str = sc->ce[i].c;
511 se = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
513 /* cannot be recognised as a word */
518 /* if the substring is a word, find the maximum length of
519 prefixes and suffixes and record it */
520 if (j > info->seq_len[i] &&
521 anthy_get_seq_ent_pos(se, POS_SUC)) {
522 info->seq_len[i] = j;
524 if (j > info->rev_seq_len[i + j] &&
525 anthy_get_seq_ent_pos(se, POS_PRE)) {
526 info->rev_seq_len[i + j] = j;
529 /* add the independent word that was found to the list */
530 if (anthy_get_seq_ent_indep(se) &&
531 /* make sure there is a candidate that is not a compound word */
532 anthy_has_non_compound_ents(se)) {
533 de = (struct depword_ent *)anthy_smalloc(de_ator);
538 de->is_weak = check_weak(&xs);
543 /* add the compound word that was found to the list */
544 if (anthy_has_compound_ents(se)) {
545 de = (struct depword_ent *)anthy_smalloc(de_ator);
558 /* search the dependent-word patterns for every independent word that was found */
559 for (de = head; de; de = de->next) {
560 make_word_list(sc, de->se, de->from, de->len,
561 de->is_compound, de->is_weak);
564 /* word_lists without an independent word */
565 for (i = 0; i < sc->char_count; i++) {
566 struct word_list tmpl;
567 setup_word_list(&tmpl, i, 0, 0, 0);
569 make_following_word_list(sc, &tmpl);
/* NOTE(review): reads the character before position i; the guard that keeps i > 0 here is presumably on a line not in this excerpt -- confirm */
571 int type = anthy_get_xchar_type(*sc->ce[i - 1].c);
572 if ((type & (XCT_CLOSE | XCT_SYMBOL)) &&
573 !(type & XCT_PUNCTUATION)) {
574 /* a symbol other than punctuation */
575 make_following_word_list(sc, &tmpl);
580 /* attach a zero-length independent word at the head */
583 anthy_free_allocator(de_ator);