3 * Copyright (C) 2006 Higashiyama Masahiko (thanks google summer of code program)
4 * Copyright (C) 2002-2007 TABATA Yusuke
6 * anthy_reorder_candidates_by_relation()
10 This library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2 of the License, or (at your option) any later version.
15 This library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License for more details.
20 You should have received a copy of the GNU Lesser General Public
21 License along with this library; if not, write to the Free Software
22 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 #include <arpa/inet.h>
28 #include <anthy/segclass.h>
29 #include <anthy/segment.h>
30 #include <anthy/ordering.h>
31 #include <anthy/dic.h>
32 #include <anthy/diclib.h>
33 #include <anthy/feature_set.h>
34 #include <anthy/corpus.h>
/* Number of consecutive hash buckets probed by find_first_pos() for one key. */
37 #define MAX_COLLISION 4
/* Value passed as the `limit` argument of find_first_from_corpus(). */
38 #define SEARCH_LIMIT 100
/* Bound used by push_id() on how many ids a struct neighbor may hold. */
39 #define MAX_NEIGHBOR 10
42 /* Corpus used for full-text search */
43 static struct corpus_ {
55 /* Iterator used for searching */
57 /* Search key and current position */
69 /** Look in segment @seg for candidates that have a co-occurrence relation
70 * with the word @from_word_id and, if any exist, raise their score.
/* Marks candidates of @seg with CEF_USEDICT when their core element's word id
 * is related to @from_word_id (per anthy_dic_check_word_relation()) and their
 * part of speech equals `pos`.  Only part of the body is visible in this excerpt. */
73 reorder_candidate(int from_word_id, struct seg_ent *seg)
77 if (NULL == seg->cands) { /* guard against a corrupted dictionary or learning data */
81 if (ce->core_elm_index == -1) {
84 /* part of speech of candidate 0 */
85 pos = anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt);
87 for (i = 0; i < seg->nr_cands; i++) {
/* candidates without a core element (index -1) are tested for and handled here */
90 if (ce->core_elm_index == -1) {
93 word_id = ce->elm[ce->core_elm_index].id;
94 if (anthy_dic_check_word_relation(from_word_id, word_id) &&
95 anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt) == pos) {
96 /* matched a usage example, so update the candidate's score */
97 ce->flag |= CEF_USEDICT;
/* Returns the word id stored in the core element of candidate @nth of @seg.
 * The early-exit paths for the two checks below are not visible in this excerpt. */
104 get_indep_word_id(struct seg_ent *seg, int nth)
107 if (NULL == seg->cands) { /* guard against a corrupted dictionary or learning data */
110 if (seg->cands[nth]->core_elm_index == -1) {
111 /* the candidate was not built from a seq_ent (the original comment says "the first candidate", but the check is on candidate nth) */
114 ce = seg->cands[nth];
115 /* extract the id of the independent word */
116 return ce->elm[ce->core_elm_index].id;
119 /* Reorder using the usage-example dictionary */
121 reorder_by_use_dict(struct segment_list *sl, int nth)
124 struct seg_ent *cur_seg;
127 cur_seg = anthy_get_nth_segment(sl, nth);
/* word id of the core element of candidate 0 of segment nth */
128 word_id = get_indep_word_id(cur_seg, 0);
133 /* walk over the neighbouring segments in order */
/* NOTE(review): the bound `i < nth + 2` visits nth-2 .. nth+1 only, whereas
 * collect_user_context() uses `i <= nth + 2`; confirm the asymmetry is intended. */
134 for (i = nth - 2; i < nth + 2 && i < sl->nr_segments; i++) {
135 struct seg_ent *target_seg;
/* indices before the first segment and the segment itself are tested for here */
136 if (i < 0 || i == nth) {
139 /* for the i-th segment and the j-th segments before and after it */
140 target_seg = anthy_get_nth_segment(sl, i);
141 reorder_candidate(word_id, target_seg);
/* Works on corpus entry @idx: reads the int at array[idx * 2], converts it from
 * network byte order, and loops while its ELM_WORD_BORDER bit is clear.
 * The rest of the loop is not visible in this excerpt. */
146 find_border_of_this_word(int idx)
152 val = ntohl(corpus_info.array[idx * 2]);
153 while (!(val & ELM_WORD_BORDER) &&
/* Reads corpus entry @idx and finally returns find_border_of_this_word(idx);
 * how idx is adjusted in between is not visible in this excerpt
 * (presumably it is moved left first; confirm). */
161 find_left_word_border(int idx)
167 val = ntohl(corpus_info.array[idx * 2]);
172 return find_border_of_this_word(idx);
/* Scans corpus entries while idx stays below array_size - 2, testing each
 * entry's ELM_WORD_BORDER bit.  How idx advances and what is returned are not
 * visible in this excerpt. */
176 find_right_word_border(int idx)
181 while (idx < corpus_info.array_size - 2) {
184 val = ntohl(corpus_info.array[idx * 2]);
188 if (val & ELM_WORD_BORDER) {
/* Stores @id at slot ctx->nr of ctx->id, but only while ctx->nr is below
 * MAX_NEIGHBOR - 1.  The update of ctx->nr is not visible in this excerpt. */
196 push_id(struct neighbor *ctx,
199 if (ctx->nr < MAX_NEIGHBOR - 1) {
200 ctx->id[ctx->nr] = id;
/* Extracts the key bits (CORPUS_KEY_MASK) of corpus entry @idx as a word id;
 * presumably the id is then recorded in @ctx (that line is not visible here). */
206 collect_word_context(struct neighbor *ctx, int idx)
208 int id = ntohl(corpus_info.array[idx * 2]) & CORPUS_KEY_MASK;
209 /*printf(" id=%d\n", id);*/
213 /* Collect surrounding information within the example sentence */
215 collect_corpus_context(struct neighbor *ctx,
/* word border for the iterator's current position, as computed by find_border_of_this_word() */
221 this_idx = find_border_of_this_word(it->idx);
223 /*printf(" key=%d\n", it->key);*/
/* up to two steps towards the left, collecting the word found at each step */
226 for (i = 0; i < 2; i++) {
227 idx = find_left_word_border(idx);
231 collect_word_context(ctx, idx);
/* up to two steps towards the right, collecting the word found at each step */
235 for (i = 0; i < 2; i++) {
236 idx = find_right_word_border(idx);
240 collect_word_context(ctx, idx);
244 /* Collect surrounding information for the string being converted */
246 collect_user_context(struct neighbor *ctx,
247 struct segment_list *sl, int nth)
/* visits segments nth-2 .. nth+2 (inclusive), clipped to the end of the list */
251 for (i = nth - 2; i <= nth + 2 && i < sl->nr_segments; i++) {
/* indices before the first segment and the segment itself are tested for here */
253 if ((i < 0) || (i == nth)) {
/* word id of the core element of the neighbour's candidate 0 */
256 id = get_indep_word_id(anthy_get_nth_segment(sl, i), 0);
/* keep only the bits used as a corpus key */
258 id &= CORPUS_KEY_MASK;
259 /*printf("user_ctx=%d\n", id);*/
265 /* Compare information of adjacent segments */
267 do_compare_context(struct neighbor *n1,
/* compares every id in n1 against every id in n2 */
272 for (i = 0; i < n1->nr; i++) {
273 for (j = 0; j < n2->nr; j++) {
274 if (n1->id[i] == n2->id[j]) {
282 /* Collect and compare adjacent-segment information */
284 compare_context(struct neighbor *user,
287 struct neighbor sample;
291 /* gather the surrounding information in the example sentence */
292 collect_corpus_context(&sample, it);
/* an example with no collected neighbours is tested for here */
293 if (sample.nr == 0) {
297 nr = do_compare_context(user, &sample);
/* threshold: the comparison result must reach half of the sample's ids (integer division) */
298 if (nr >= sample.nr / 2) {
304 /* Find the first occurrence of key.
305 * Returns -1 if it is not found.
/* Looks @key up in the bucket table: probes up to MAX_COLLISION slots starting
 * at key % bucket_size.  Each slot is a pair of ints in network byte order:
 * [bkt * 2] is compared with the key, [bkt * 2 + 1] is the value returned on a match.
 * NOTE(review): no guard for bucket_size == 0 or a negative key is visible
 * here; both would make the `%` below unsafe. Confirm callers exclude them. */
308 find_first_pos(int key)
311 for (i = 0; i < MAX_COLLISION; i++) {
312 int bkt = (key + i) % corpus_info.bucket_size;
313 if ((int)ntohl(corpus_info.bucket[bkt * 2]) == key) {
314 return ntohl(corpus_info.bucket[bkt * 2 + 1]);
320 /* Initialise the iterator at the first occurrence of key.
321 * Returns -1 if it is not found.
/* Masks @key down to its CORPUS_KEY_MASK bits and sets it->idx to the result
 * of find_first_pos().  The handling of @limit is not visible in this excerpt. */
324 find_first_from_corpus(int key, struct iterator *it, int limit)
326 key &= CORPUS_KEY_MASK;
327 it->idx = find_first_pos(key);
333 /* Set the iterator to the next occurrence of key
/* Advances the iterator by following the link stored in the second int of the
 * current corpus entry, then range-checks the new index against array_size. */
336 find_next_from_corpus(struct iterator *it)
344 it->idx = ntohl(corpus_info.array[it->idx * 2 + 1]);
345 if (it->idx < 0 || it->idx >= corpus_info.array_size ||
/* Walks the corpus occurrences of one candidate's core word and accumulates
 * compare_context() results against the @user context.  The candidate is
 * marked with CEF_CONTEXT; the condition for that is not visible in this excerpt. */
353 check_candidate_context(struct seg_ent *cur_seg,
355 struct neighbor *user)
360 word_id = get_indep_word_id(cur_seg, i);
364 /* scan each occurrence */
365 find_first_from_corpus(word_id, &it, SEARCH_LIMIT);
366 /*printf("word_id=%d %d\n", word_id, it.idx);*/
/* the loop runs while the iterator holds a non-negative index */
367 while (it.idx > -1) {
368 nr += compare_context(user, &it);
370 find_next_from_corpus(&it);
374 cur_seg->cands[i]->flag |= CEF_CONTEXT;
378 /* Reorder candidates using full-text search */
380 reorder_by_corpus(struct segment_list *sl, int nth)
382 struct seg_ent *cur_seg;
383 struct neighbor user;
385 /* gather the segment's surrounding information */
386 collect_user_context(&user, sl, nth);
390 cur_seg = anthy_get_nth_segment(sl, nth);
391 if (NULL == cur_seg->cands) { /* guard against a corrupted dictionary or learning data */
/* evaluate every candidate of this segment against the user context */
395 for (i = 0; i < cur_seg->nr_cands; i++) {
396 check_candidate_context(cur_seg, i, &user);
398 /* if the top candidate has a usage example, do not look at the other candidates */
399 if (cur_seg->cands[0]->flag & CEF_CONTEXT) {
400 cur_seg->cands[0]->flag &= ~CEF_CONTEXT;
403 /* score bonus from usage examples */
/* starts at 1: candidate 0 was handled above */
404 for (i = 1; i < cur_seg->nr_cands; i++) {
405 if (cur_seg->cands[i]->flag & CEF_CONTEXT) {
406 cur_seg->cands[i]->score *= 2;
412 * Reorder candidates using usage examples.
413 * Targets the segments from the @nth one onward.
/* For every segment from index @nth to the end of @sl, applies the
 * usage-dictionary pass and then the corpus pass, in that order. */
416 anthy_reorder_candidates_by_relation(struct segment_list *sl, int nth)
419 for (i = nth; i < sl->nr_segments; i++) {
420 reorder_by_use_dict(sl, i);
421 reorder_by_corpus(sl, i);
/* Looks up the "corpus_array" and "corpus_bucket" dictionary sections and
 * caches their sizes and data pointers in corpus_info.
 * Layout as read below: int [1] of each section holds its size in network
 * byte order, and the data starts at int index 16.
 * Fix: the availability check tested corpus_array twice, so a missing
 * corpus_bucket section was dereferenced; both sections are now checked. */
426 anthy_relation_init(void)
428 corpus_info.corpus_array = anthy_file_dic_get_section("corpus_array");
429 corpus_info.corpus_bucket = anthy_file_dic_get_section("corpus_bucket");
430 if (!corpus_info.corpus_array ||
431 !corpus_info.corpus_bucket) {
434 corpus_info.array_size = ntohl(((int *)corpus_info.corpus_array)[1]);
435 corpus_info.bucket_size = ntohl(((int *)corpus_info.corpus_bucket)[1]);
436 corpus_info.array = &(((int *)corpus_info.corpus_array)[16]);
437 corpus_info.bucket = &(((int *)corpus_info.corpus_bucket)[16]);
/* NOTE(review): debug dump of every corpus entry (raw value and key bits);
 * presumably disabled in the complete file. Confirm before enabling. */
441 for (i = 0; i < corpus_info.array_size; i++) {
442 int v = ntohl(corpus_info.array[i * 2]);
443 printf("%d: %d %d\n", i, v, v & CORPUS_KEY_MASK);