2 * ³ÎΨ¤òɾ²Á¤·¥Ó¥¿¥Ó¥¢¥ë¥´¥ê¥º¥à(viterbi algorithm)¤Ë¤è¤Ã¤Æ
3 * ʸÀá¤Î¶èÀÚ¤ê¤ò·èÄꤷ¤Æ¥Þ¡¼¥¯¤¹¤ë¡£
6 * ³°Éô¤«¤é¸Æ¤Ó½Ð¤µ¤ì¤ë´Ø¿ô
9 * Copyright (C) 2006-2007 TABATA Yusuke
10 * Copyright (C) 2004-2006 YOSHIDA Yuichi
11 * Copyright (C) 2006 HANAOKA Toshiyuki
15 This library is free software; you can redistribute it and/or
16 modify it under the terms of the GNU Lesser General Public
17 License as published by the Free Software Foundation; either
18 version 2 of the License, or (at your option) any later version.
20 This library is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 Lesser General Public License for more details.
25 You should have received a copy of the GNU Lesser General Public
26 License along with this library; if not, write to the Free Software
27 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 * ¥³¥ó¥Æ¥¥¹¥ÈÃæ¤Ë¸ºß¤¹¤ëmeta_word¤ò¤Ä¤Ê¤¤¤Ç¥°¥é¥Õ¤ò¹½À®¤·¤Þ¤¹¡£
31 * (¤³¤Î¥°¥é¥Õ¤Î¤³¤È¤ò¥é¥Æ¥£¥¹(lattice/«)¤â¤·¤¯¤Ï¥È¥ì¥ê¥¹(trellis)¤È¸Æ¤Ó¤Þ¤¹)
32 * meta_word¤É¤¦¤·¤ÎÀܳ¤¬¥°¥é¥Õ¤Î¥Î¡¼¥É¤È¤Ê¤ê¡¢¹½Â¤ÂÎlattice_node¤Î
33 * ¥ê¥ó¥¯¤È¤·¤Æ¹½À®¤µ¤ì¤Þ¤¹¡£
35 * ¤³¤³¤Ç¤Î½èÍý¤Ï¼¡¤ÎÆó¤Ä¤ÎÍ×ÁǤǹ½À®¤µ¤ì¤Þ¤¹
36 * (1) ¥°¥é¥Õ¤ò¹½À®¤·¤Ä¤Ä¡¢³Æ¥Î¡¼¥É¤Ø¤ÎÅþã³ÎΨ¤òµá¤á¤ë
37 * (2) ¥°¥é¥Õ¤ò¸å¤í(±¦)¤«¤é¤¿¤É¤Ã¤ÆºÇŬ¤Ê¥Ñ¥¹¤òµá¤á¤ë
45 #include <anthy/alloc.h>
46 #include <anthy/xstr.h>
47 #include <anthy/segclass.h>
48 #include <anthy/splitter.h>
49 #include <anthy/feature_set.h>
50 #include <anthy/diclib.h>
51 #include "wordborder.h"
53 static float anthy_normal_length = 20.0; /* ʸÀá¤Î´üÂÔ¤µ¤ì¤ëŤµ */
54 static void *trans_info_array;
56 #define NODE_MAX_SIZE 50
58 /* ¥°¥é¥Õ¤Î¥Î¡¼¥É(Á«°Ü¾õÂÖ) */
60 int border; /* ʸ»úÎóÃæ¤Î¤É¤³¤«¤é»Ï¤Þ¤ë¾õÂÖ¤« */
61 enum seg_class seg_class; /* ¤³¤Î¾õÂÖ¤ÎÉÊ»ì */
64 double real_probability; /* ¤³¤³¤Ë»ê¤ë¤Þ¤Ç¤Î³ÎΨ(ʸÀá¿ôÊäÀµÌµ¤·) */
65 double adjusted_probability; /* ¤³¤³¤Ë»ê¤ë¤Þ¤Ç¤Î³ÎΨ(ʸÀá¿ôÊäÀµÍ¤ê) */
68 struct lattice_node* before_node; /* °ì¤ÄÁ°¤ÎÁ«°Ü¾õÂÖ */
69 struct meta_word* mw; /* ¤³¤ÎÁ«°Ü¾õÂÖ¤ËÂбþ¤¹¤ëmeta_word */
71 struct lattice_node* next; /* ¥ê¥¹¥È¹½Â¤¤Î¤¿¤á¤Î¥Ý¥¤¥ó¥¿ */
74 struct node_list_head {
75 struct lattice_node *head;
80 /* Á«°Ü¾õÂ֤Υꥹ¥È¤ÎÇÛÎó */
81 struct node_list_head *lattice_node_list;
82 struct splitter_context *sc;
83 /* ¥Î¡¼¥É¤Î¥¢¥í¥±¡¼¥¿ */
84 allocator node_allocator;
90 print_lattice_node(struct lattice_info *info, struct lattice_node *node)
93 printf("**lattice_node (null)*\n");
96 printf("**lattice_node probability=%.128f\n", node->real_probability);
98 anthy_print_metaword(info->sc, node->mw);
103 get_poisson(double lambda, int r)
108 /* Íפ¹¤ë¤Ë¥Ý¥ï¥½¥óʬÉÛ */
109 result = pow(lambda, r) * exp(-lambda);
110 for (i = 2; i <= r; ++i) {
117 /* ʸÀá¤Î·Á¼°¤«¤é¥¹¥³¥¢¤òÄ´À°¤¹¤ë */
119 get_form_bias(struct meta_word *mw)
123 /* wrap¤µ¤ì¤Æ¤¤¤ë¾ì¹ç¤ÏÆâÉô¤Î¤ò»È¤¦ */
124 while (mw->type == MW_WRAP) {
127 /* ʸÀáŤˤè¤ëÄ´À° */
135 if (mw->seg_class == SEG_RENTAI_SHUSHOKU &&
140 bias = get_poisson(anthy_normal_length, r);
145 build_feature_list(struct lattice_node *node,
146 struct feature_list *features)
150 cc = node->seg_class;
154 anthy_feature_list_set_cur_class(features, cc);
155 if (node && node->before_node) {
156 pc = node->before_node->seg_class;
160 anthy_feature_list_set_class_trans(features, pc, cc);
162 if (node && node->mw) {
163 struct meta_word *mw = node->mw;
164 anthy_feature_list_set_dep_class(features, mw->dep_class);
165 anthy_feature_list_set_dep_word(features,
167 anthy_feature_list_set_mw_features(features, mw->mw_features);
168 anthy_feature_list_set_noun_cos(features, mw->core_wt);
171 anthy_feature_list_sort(features);
175 calc_probability(int cc, struct feature_list *fl)
177 struct feature_freq *res, arg;
181 res = anthy_find_feature_freq(trans_info_array,
185 double pos = res->f[15];
186 double neg = res->f[14];
187 prob = 1 - (neg) / (double) (pos + neg);
190 /* ÎãʸÃæ¤Ë¸ºß¤·¤Ê¤¤¥Ñ¥¿¡¼¥ó¤Ê¤Î¤Ç0¤Ë¶á¤¤¥¹¥³¥¢ */
191 prob = 1.0f / (double)(10000 * 100);
194 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
195 anthy_feature_list_print(fl);
196 printf(" cc=%d(%s), P=%f\n", cc, anthy_seg_class_name(cc), prob);
202 get_transition_probability(struct lattice_node *node)
204 struct feature_list features;
208 anthy_feature_list_init(&features);
209 build_feature_list(node, &features);
210 probability = calc_probability(node->seg_class, &features);
211 anthy_feature_list_free(&features);
213 /* ʸÀá¤Î·Á¤ËÂФ¹¤ëɾ²Á */
214 probability *= get_form_bias(node->mw);
218 static struct lattice_info*
219 alloc_lattice_info(struct splitter_context *sc, int size)
222 struct lattice_info* info = (struct lattice_info*)malloc(sizeof(struct lattice_info));
224 info->lattice_node_list = (struct node_list_head*)
225 malloc((size + 1) * sizeof(struct node_list_head));
226 for (i = 0; i < size + 1; i++) {
227 info->lattice_node_list[i].head = NULL;
228 info->lattice_node_list[i].nr_nodes = 0;
230 info->node_allocator = anthy_create_allocator(sizeof(struct lattice_node),
236 calc_node_parameters(struct lattice_node *node)
238 /* Âбþ¤¹¤ëmetaword¤¬Ìµ¤¤¾ì¹ç¤ÏʸƬ¤ÈȽÃǤ¹¤ë */
239 node->seg_class = node->mw ? node->mw->seg_class : SEG_HEAD;
241 if (node->before_node) {
242 /* º¸¤ËÎÙÀܤ¹¤ë¥Î¡¼¥É¤¬¤¢¤ë¾ì¹ç */
243 node->real_probability = node->before_node->real_probability *
244 get_transition_probability(node);
245 node->adjusted_probability = node->real_probability *
246 (node->mw ? node->mw->score : 1000);
248 /* º¸¤ËÎÙÀܤ¹¤ë¥Î¡¼¥É¤¬Ìµ¤¤¾ì¹ç */
249 node->real_probability = 1.0;
250 node->adjusted_probability = node->real_probability;
254 static struct lattice_node*
255 alloc_lattice_node(struct lattice_info *info,
256 struct lattice_node* before_node,
257 struct meta_word* mw, int border)
259 struct lattice_node* node;
260 node = anthy_smalloc(info->node_allocator);
261 node->before_node = before_node;
262 node->border = border;
266 calc_node_parameters(node);
272 release_lattice_node(struct lattice_info *info, struct lattice_node* node)
274 anthy_sfree(info->node_allocator, node);
278 release_lattice_info(struct lattice_info* info)
280 anthy_free_allocator(info->node_allocator);
281 free(info->lattice_node_list);
286 cmp_node_by_type(struct lattice_node *lhs, struct lattice_node *rhs,
287 enum metaword_type type)
289 if (lhs->mw->type == type && rhs->mw->type != type) {
291 } else if (lhs->mw->type != type && rhs->mw->type == type) {
299 cmp_node_by_type_to_type(struct lattice_node *lhs, struct lattice_node *rhs,
300 enum metaword_type type1, enum metaword_type type2)
302 if (lhs->mw->type == type1 && rhs->mw->type == type2) {
304 } else if (lhs->mw->type == type2 && rhs->mw->type == type1) {
315 * 1: lhs¤ÎÊý¤¬³ÎΨ¤¬¹â¤¤
317 * -1: rhs¤ÎÊý¤¬³ÎΨ¤¬¹â¤¤
320 cmp_node(struct lattice_node *lhs, struct lattice_node *rhs)
322 struct lattice_node *lhs_before = lhs;
323 struct lattice_node *rhs_before = rhs;
326 if (lhs && !rhs) return 1;
327 if (!lhs && rhs) return -1;
328 if (!lhs && !rhs) return 0;
330 while (lhs_before && rhs_before) {
331 if (lhs_before->mw && rhs_before->mw &&
332 lhs_before->mw->from + lhs_before->mw->len == rhs_before->mw->from + rhs_before->mw->len) {
333 /* ³Ø½¬¤«¤éºî¤é¤ì¤¿¥Î¡¼¥É¤«¤É¤¦¤«¤ò¸«¤ë */
334 ret = cmp_node_by_type(lhs_before, rhs_before, MW_OCHAIRE);
335 if (ret != 0) return ret;
337 /* COMPOUND_PART¤è¤ê¤ÏCOMPOUND_HEAD¤òÍ¥Àè */
338 ret = cmp_node_by_type_to_type(lhs_before, rhs_before, MW_COMPOUND_HEAD, MW_COMPOUND_PART);
339 if (ret != 0) return ret;
343 lhs_before = lhs_before->before_node;
344 rhs_before = rhs_before->before_node;
347 /* ºÇ¸å¤ËÁ«°Ü³ÎΨ¤ò¸«¤ë */
348 if (lhs->adjusted_probability > rhs->adjusted_probability) {
350 } else if (lhs->adjusted_probability < rhs->adjusted_probability) {
358 * ¹½À®Ãæ¤Î¥é¥Æ¥£¥¹¤Ë¥Î¡¼¥É¤òÄɲ乤ë
361 push_node(struct lattice_info* info, struct lattice_node* new_node,
364 struct lattice_node* node;
365 struct lattice_node* previous_node = NULL;
367 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
368 print_lattice_node(info, new_node);
371 /* ÀèƬ¤Înode¤¬Ìµ¤±¤ì¤Ð̵¾ò·ï¤ËÄɲà */
372 node = info->lattice_node_list[position].head;
374 info->lattice_node_list[position].head = new_node;
375 info->lattice_node_list[position].nr_nodes ++;
380 /* ;·×¤Ê¥Î¡¼¥É¤òÄɲ䷤ʤ¤¤¿¤á¤Î»Þ´¢¤ê */
381 if (new_node->seg_class == node->seg_class &&
382 new_node->border == node->border) {
383 /* segclass¤¬Æ±¤¸¤Ç¡¢»Ï¤Þ¤ë°ÌÃÖ¤¬Æ±¤¸¤Ê¤é */
384 switch (cmp_node(new_node, node)) {
387 /* ¿·¤·¤¤Êý¤¬³ÎΨ¤¬Â礤¤¤«³Ø½¬¤Ë¤è¤ë¤â¤Î¤Ê¤é¡¢¸Å¤¤¤Î¤ÈÃÖ¤´¹¤¨*/
389 previous_node->next = new_node;
391 info->lattice_node_list[position].head = new_node;
393 new_node->next = node->next;
394 release_lattice_node(info, node);
397 /* ¤½¤¦¤Ç¤Ê¤¤¤Ê¤éºï½ü */
398 release_lattice_node(info, new_node);
403 previous_node = node;
407 /* ºÇ¸å¤Î¥Î¡¼¥É¤Î¸å¤í¤ËÄɲà */
408 node->next = new_node;
409 info->lattice_node_list[position].nr_nodes ++;
412 /* °ìÈÖ³ÎΨ¤ÎÄ㤤¥Î¡¼¥É¤ò¾Ãµî¤¹¤ë*/
414 remove_min_node(struct lattice_info *info, struct node_list_head *node_list)
416 struct lattice_node* node = node_list->head;
417 struct lattice_node* previous_node = NULL;
418 struct lattice_node* min_node = node;
419 struct lattice_node* previous_min_node = NULL;
421 /* °ìÈÖ³ÎΨ¤ÎÄ㤤¥Î¡¼¥É¤òõ¤¹ */
423 if (cmp_node(node, min_node) < 0) {
424 previous_min_node = previous_node;
427 previous_node = node;
431 /* °ìÈÖ³ÎΨ¤ÎÄ㤤¥Î¡¼¥É¤òºï½ü¤¹¤ë */
432 if (previous_min_node) {
433 previous_min_node->next = min_node->next;
435 node_list->head = min_node->next;
437 release_lattice_node(info, min_node);
438 node_list->nr_nodes --;
441 /* ¤¤¤ï¤æ¤ë¥Ó¥¿¥Ó¥¢¥ë¥´¥ê¥º¥à¤ò»ÈÍѤ·¤Æ·ÐÏ©¤òÁª¤Ö */
443 choose_path(struct lattice_info* info, int to)
445 /* ºÇ¸å¤Þ¤ÇÅþ㤷¤¿Á«°Ü¤Î¤Ê¤«¤Ç°ìÈÖ³ÎΨ¤ÎÂ礤¤¤â¤Î¤òÁª¤Ö */
446 struct lattice_node* node;
447 struct lattice_node* best_node = NULL;
449 while (!info->lattice_node_list[last].head) {
450 /* ºÇ¸å¤Îʸ»ú¤Þ¤ÇÁ«°Ü¤·¤Æ¤¤¤Ê¤«¤Ã¤¿¤é¸åÌá¤ê */
453 for (node = info->lattice_node_list[last].head; node; node = node->next) {
454 if (cmp_node(node, best_node) > 0) {
462 /* Á«°Ü¤òµÕ¤Ë¤¿¤É¤ê¤Ä¤ÄʸÀá¤ÎÀÚ¤ìÌܤòµÏ¿ */
464 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
465 printf("choose_path()\n");
467 while (node->before_node) {
468 info->sc->word_split_info->best_seg_class[node->border] =
470 anthy_mark_border_by_metaword(info->sc, node->mw);
472 if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
473 print_lattice_node(info, node);
476 node = node->before_node;
481 build_graph(struct lattice_info* info, int from, int to)
484 struct lattice_node* node;
485 struct lattice_node* left_node;
487 /* »ÏÅÀ¤È¤Ê¤ë¥Î¡¼¥É¤òÄɲà */
488 node = alloc_lattice_node(info, NULL, NULL, from);
489 push_node(info, node, from);
491 /* info->lattice_node_list[index]¤Ë¤Ïindex¤Þ¤Ç¤ÎÁ«°Ü¤¬Æþ¤Ã¤Æ¤¤¤ë¤Î¤Ç¤¢¤Ã¤Æ¡¢
492 * index¤«¤é¤ÎÁ«°Ü¤¬Æþ¤Ã¤Æ¤¤¤ë¤Î¤Ç¤Ï¤Ê¤¤
495 /* Á´¤Æ¤ÎÁ«°Ü¤òº¸¤«¤é»î¤¹ */
496 for (i = from; i < to; ++i) {
497 for (left_node = info->lattice_node_list[i].head; left_node;
498 left_node = left_node->next) {
499 struct meta_word *mw;
500 /* iʸ»úÌܤËÅþ㤹¤ëlattice_node¤Î¥ë¡¼¥× */
502 for (mw = info->sc->word_split_info->cnode[i].mw; mw; mw = mw->next) {
504 struct lattice_node* new_node;
505 /* iʸ»úÌܤ«¤é¤Îmeta_word¤Î¥ë¡¼¥× */
507 if (mw->can_use != ok) {
508 continue; /* ·è¤á¤é¤ì¤¿Ê¸Àá¤Î¶èÀÚ¤ê¤ò¤Þ¤¿¤°metaword¤Ï»È¤ï¤Ê¤¤ */
510 position = i + mw->len;
511 new_node = alloc_lattice_node(info, left_node, mw, i);
512 push_node(info, new_node, position);
514 /* ²ò¤Î¸õÊ䤬¿¤¹¤®¤¿¤é¡¢³ÎΨ¤ÎÄ㤤Êý¤«¤éºï¤ë */
515 if (info->lattice_node_list[position].nr_nodes >= NODE_MAX_SIZE) {
516 remove_min_node(info, &info->lattice_node_list[position]);
523 for (node = info->lattice_node_list[to].head; node; node = node->next) {
524 struct feature_list features;
525 anthy_feature_list_init(&features);
526 build_feature_list(NULL, &features);
527 node->adjusted_probability = node->adjusted_probability *
528 calc_probability(SEG_TAIL, &features);
529 anthy_feature_list_free(&features);
534 anthy_mark_borders(struct splitter_context *sc, int from, int to)
536 struct lattice_info* info = alloc_lattice_info(sc, to);
537 trans_info_array = anthy_file_dic_get_section("trans_info");
538 build_graph(info, from, to);
539 choose_path(info, to);
540 release_lattice_info(info);