3 * Interface to the file-based dictionary. Data that exists gets
4 * cached, so what matters here is making the search for words
5 * that do not exist fast.
7 * anthy_gang_fill_seq_ent() is the central function:
8 * from the given word_dic, it takes the entries indexed by the given string,
9 * attaches word endings and adds them to the seq_ent
11 * This source handles a) the dictionary format, b) speeding up dictionary access
12 * and c) the dictionary file encoding all at once, so it has become fairly complicated.
14 * Copyright (C) 2000-2007 TABATA Yusuke
15 * Copyright (C) 2005-2006 YOSHIDA Yuichi
16 * Copyright (C) 2001-2002 TAKAI Kosuke
20 This library is free software; you can redistribute it and/or
21 modify it under the terms of the GNU Lesser General Public
22 License as published by the Free Software Foundation; either
23 version 2 of the License, or (at your option) any later version.
25 This library is distributed in the hope that it will be useful,
26 but WITHOUT ANY WARRANTY; without even the implied warranty of
27 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28 Lesser General Public License for more details.
30 You should have received a copy of the GNU Lesser General Public
31 License along with this library; if not, write to the Free Software
32 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
41 #include <anthy/anthy.h>
42 #include <anthy/alloc.h>
43 #include <anthy/dic.h>
44 #include <anthy/word_dic.h>
45 #include <anthy/logger.h>
46 #include <anthy/xstr.h>
47 #include <anthy/diclib.h>
54 static allocator word_dic_ator;
56 struct lookup_context {
57 struct gang_elm **array;
63 /* Look at the first byte and return how many bytes the character occupies */
65 mb_fragment_len(const char *str)
67 unsigned char c = *((const unsigned char *)str);
87 is_printable(char *str)
89 unsigned char *tmp = (unsigned char *)str;
90 if (*tmp > 31 && *tmp < 127) {
93 if (mb_fragment_len(str) > 1) {
99 /* Build an xchar from the dictionary's encoding */
101 form_mb_char(const char *str)
104 anthy_utf8_to_ucs4_xchar(str, &xc);
111 return anthy_xstr_hash(x)&
112 (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
116 check_hash_ent(struct word_dic *wdic, xstr *xs)
119 int idx = (val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1);
120 int bit = val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1);
121 return wdic->hash_ent[idx] & (1<<bit);
125 wtype_str_len(const char *str)
128 for (i = 0; str[i] && str[i]!= ' '; i++);
132 /* State kept while scanning through a dictionary line */
138 int order_bonus;/* frequency bonus derived from the order within the dictionary */
139 int offset;/* offset within the string */
144 * Parse the Cannadic format "#XX*123"
150 parse_wtype_str(struct wt_stat *ws)
157 /* Copy into the buffer */
158 len = wtype_str_len(&ws->line[ws->offset]);
159 buf = alloca(len + 1);
160 strncpy(buf, &ws->line[ws->offset], len);
164 feature_part = strchr(buf, ',');
171 /* Parse the frequency */
172 freq_part = strchr(buf, '*');
176 ws->freq = atoi(freq_part) * FREQ_RATIO;
178 ws->freq = FREQ_RATIO - 2;
182 wt_name = anthy_type_to_wtype(buf, &ws->wt);
184 ws->wt = anthy_wt_none;
192 normalize_freq(struct wt_stat* ws)
197 return ws->freq + ws->order_bonus;
200 /* Copy that handles escaping with '\\' */
202 copy_to_buf(char *buf, const char *src, int char_count)
207 for (i = 0; i < char_count; i++){
208 if (src[i] == '\\') {
209 if (src[i + 1] == ' ') {
211 } else if (src[i + 1] == '\\') {
221 /** Add a dic_ent to the seq_ent */
223 add_dic_ent(struct seq_ent *seq, struct wt_stat *ws,
224 xstr* yomi, int is_reverse)
227 /* Number of bytes in the dictionary file */
233 const char *s = &ws->line[ws->offset];
235 /* Compute the number of characters in the word */
236 for (i = 0, char_count = 0;
237 s[i] && (s[i] != ' ') && (s[i] != '#'); i++) {
245 /* Ignore it because the part of speech is not defined */
250 /* A negative freq is for reverse conversion */
251 if (!is_reverse && ws->freq < 0) {
255 /* Copy the word into buf */
256 buf = alloca(char_count+1);
257 copy_to_buf(buf, s, char_count);
259 xs = anthy_cstr_to_xstr(buf, ws->encoding);
261 /* A positive freq is for forward conversion */
262 if (is_reverse && ws->freq > 0) {
263 /* To cope with re-conversion where already-converted and not-yet-converted parts are mixed,
264 for a part made up only of hiragana, generate a dic_ent if the forward dictionary has a word with that reading.
266 if (anthy_get_xstr_type(yomi) & XCT_HIRA) {
267 freq = normalize_freq(ws);
268 anthy_mem_dic_push_back_dic_ent(seq, 0, yomi, w,
269 ws->wt_name, freq, 0);
275 freq = normalize_freq(ws);
277 anthy_mem_dic_push_back_dic_ent(seq, 0, xs, w, ws->wt_name, freq, 0);
278 if (anthy_wtype_get_meisi(w)) {
279 /* For words whose continuative form can be nominalized, also add the nominalized form */
280 w = anthy_get_wtype_with_ct(w, CT_MEISIKA);
281 anthy_mem_dic_push_back_dic_ent(seq, 0, xs, w, ws->wt_name, freq, 0);
288 add_compound_ent(struct seq_ent *seq, struct wt_stat *ws,
292 int len = wtype_str_len(&ws->line[ws->offset]);
293 char *buf = alloca(len);
299 /* A negative freq is for reverse conversion */
300 if (!is_reverse && ws->freq < 0) {
301 /* Not needed for ordinary conversion */
305 /* A positive freq is for forward conversion */
306 if (is_reverse && ws->freq > 0) {
308 /* To cope with re-conversion where already-converted and not-yet-converted parts are mixed,
309 for a part made up only of hiragana, generate a dic_ent if the forward dictionary has a word with that reading.
312 A string with "#_" etc. appended to yomi would need to be built
313 if (anthy_get_xstr_type(yomi) & (XCT_HIRA | XCT_KATA)) {
314 freq = normalize_freq(ws);
315 anthy_mem_dic_push_back_compound_ent(seq, xs, ws->wt, freq);
321 strncpy(buf, &ws->line[ws->offset + 1], len - 1);
323 xs = anthy_cstr_to_xstr(buf, ws->encoding);
325 freq = normalize_freq(ws);
326 anthy_mem_dic_push_back_dic_ent(seq, 1, xs, ws->wt,
327 ws->wt_name, freq, 0);
334 init_wt_stat(struct wt_stat *ws, char *line)
342 ws->encoding = ANTHY_EUC_JP_ENCODING;
343 if (*(ws->line) == 'u') {
344 ws->encoding = ANTHY_UTF8_ENCODING;
349 /** Fill in the seq_ent based on the information in the dictionary entry */
351 fill_dic_ent(char *line, struct seq_ent *seq,
352 xstr* yomi, int is_reverse)
355 init_wt_stat(&ws, line);
357 while (ws.line[ws.offset]) {
358 if (ws.line[ws.offset] == '#') {
359 if (isalpha(ws.line[ws.offset + 1])) {
361 ws.wt_name = parse_wtype_str(&ws);
363 ws.order_bonus = FREQ_RATIO - 1;
366 ws.offset += add_compound_ent(seq, &ws,
372 ws.offset += add_dic_ent(seq, &ws, yomi,
374 if (ws.order_bonus > 0) {
378 if (ws.line[ws.offset] == ' ') {
385 * Modify x according to the string written in s
386 * The return value is the number of bytes consumed
389 mkxstr(char *s, xstr *x)
392 /* s[0] holds the number of characters to rewind */
393 x->len -= (s[0] - 1);
394 for (i = 1; is_printable(&s[i]); i ++) {
395 len = mb_fragment_len(&s[i]);
398 x->str[x->len] = form_mb_char(&s[i]);
403 x->str[x->len] = s[i];
411 set_next_idx(struct lookup_context *lc)
414 while (lc->nth < lc->nr) {
415 if (lc->array[lc->nth]->tmp.idx != NO_WORD) {
423 /** Find where the words are located within the page */
425 search_words_in_page(struct lookup_context *lc, int page, char *s)
431 /* A length able to hold the longest word in this page */
432 buf = alloca(sizeof(xchar)*strlen(s)/2);
439 r = anthy_xstrcmp(&xs, &lc->array[lc->nth]->xs);
441 lc->array[lc->nth]->tmp.idx = o + page * WORDS_PER_PAGE;
443 if (!set_next_idx(lc)) {
446 /* Look for the next word within the same page */
451 /* If not a single word was found in this page, this word does not exist */
452 lc->array[lc->nth]->tmp.idx = NO_WORD;
455 /* The current word will be searched for on the next call */
460 compare_page_index(struct word_dic *wdic, const char *key, int page)
463 char *s = &wdic->page[anthy_dic_ntohl(wdic->page_index[page])];
466 for (i = 0; is_printable(&s[i]);) {
467 int j, l = mb_fragment_len(&s[i]);
468 for (j = 0; j < l; j++) {
474 return strcmp(key ,buf);
477 /* Perform a binary search recursively */
479 get_page_index_search(struct word_dic *wdic, const char *key, int f, int t)
481 /* Look for the point where anthy_xstrcmp stops being -1 */
487 p = compare_page_index(wdic, key, c);
489 return get_page_index_search(wdic, key, f, c);
492 return get_page_index_search(wdic, key, c, t);
497 /** Get the number of the page that may contain key;
498 * does the range checks and calls get_page_index_search, which performs the binary search
501 get_page_index(struct word_dic *wdic, struct lookup_context *lc)
504 const char *key = lc->array[lc->nth]->key;
505 /* Smaller than the reading of the first page */
506 if (compare_page_index(wdic, key, 0) < 0) {
509 /* Larger than the reading of the last page, so it may be contained in the last page */
510 if (compare_page_index(wdic, key, wdic->nr_pages-1) >= 0) {
511 return wdic->nr_pages-1;
514 page = get_page_index_search(wdic, key, 0, wdic->nr_pages);
519 get_nr_page(struct word_dic *h)
522 for (i = 1; anthy_dic_ntohl(h->page_index[i]); i++);
527 get_section(struct word_dic *wdic, int section)
529 int *p = (int *)wdic->dic_file;
530 int offset = anthy_dic_ntohl(p[section]);
531 return &wdic->dic_file[offset];
534 /** mmap the dictionary file and obtain the pointers to each section in word_dic */
536 get_word_dic_sections(struct word_dic *wdic)
538 wdic->entry_index = (int *)get_section(wdic, 2);
539 wdic->entry = (char *)get_section(wdic, 3);
540 wdic->page = (char *)get_section(wdic, 4);
541 wdic->page_index = (int *)get_section(wdic, 5);
542 wdic->uc_section = (char *)get_section(wdic, 6);
543 wdic->hash_ent = (unsigned char *)get_section(wdic, 7);
548 /** Look up the index of the specified word within the dictionary */
550 search_yomi_index(struct word_dic *wdic, struct lookup_context *lc)
555 /* Already known not to exist */
556 if (lc->array[lc->nth]->tmp.idx == NO_WORD) {
561 p = get_page_index(wdic, lc);
563 lc->array[lc->nth]->tmp.idx = NO_WORD;
568 page_number = anthy_dic_ntohl(wdic->page_index[p]);
569 search_words_in_page(lc, p, &wdic->page[page_number]);
573 find_words(struct word_dic *wdic, struct lookup_context *lc)
577 for (i = 0; i < lc->nr; i++) {
578 lc->array[i]->tmp.idx = NO_WORD;
579 if (lc->array[i]->xs.len > 31) {
580 /* Words of 32 or more characters are not supported */
583 /* Drop it if it is not in the hash */
584 if (!check_hash_ent(wdic, &lc->array[i]->xs)) {
587 /* Setting a value other than NO_WORD marks it as a search target */
588 lc->array[i]->tmp.idx = 0;
592 while (lc->nth < lc->nr) {
593 search_yomi_index(wdic, lc);
598 load_words(struct word_dic *wdic, struct lookup_context *lc)
601 for (i = 0; i < lc->nr; i++) {
603 yomi_index = lc->array[i]->tmp.idx;
604 if (yomi_index != NO_WORD) {
607 seq = anthy_cache_get_seq_ent(&lc->array[i]->xs,
609 entry_index = anthy_dic_ntohl(wdic->entry_index[yomi_index]);
610 fill_dic_ent(&wdic->entry[entry_index],
614 anthy_validate_seq_ent(seq, &lc->array[i]->xs, lc->is_reverse);
619 /** Search for words in the word_dic
620 * Called from the dictionary cache
621 * (consider turning this into a gang lookup)
624 anthy_gang_fill_seq_ent(struct word_dic *wdic,
625 struct gang_elm **array, int nr,
628 struct lookup_context lc;
631 lc.is_reverse = is_reverse;
633 /* Find the location of each word */
634 find_words(wdic, &lc);
635 /* Load the information for the words */
636 load_words(wdic, &lc);
640 anthy_create_word_dic(void)
642 struct word_dic *wdic;
645 wdic = anthy_smalloc(word_dic_ator);
646 memset(wdic, 0, sizeof(*wdic));
648 /* Map the dictionary file */
649 wdic->dic_file = anthy_file_dic_get_section("word_dic");
651 /* Obtain the pointers to each section */
652 if (get_word_dic_sections(wdic) == -1) {
653 anthy_sfree(word_dic_ator, wdic);
656 wdic->nr_pages = get_nr_page(wdic);
658 /* Map the usage-example dictionary */
659 p = wdic->uc_section;
664 anthy_release_word_dic(struct word_dic *wdic)
666 anthy_sfree(word_dic_ator, wdic);
670 anthy_init_word_dic(void)
672 word_dic_ator = anthy_create_allocator(sizeof(struct word_dic), NULL);