2 * Functions for managing the personal dictionary
6 * the eucjp dictionary uses texttrie
7 * and record is used as well, which makes things very confusing
12 * new registrations should be made to textdict <- todo
13 * words in texttrie should be migrated
17 * Funded by IPA Exploratory Software Project (Mitou) 2001 10/24
19 * Copyright (C) 2001-2007 TABATA Yusuke
23 This library is free software; you can redistribute it and/or
24 modify it under the terms of the GNU Lesser General Public
25 License as published by the Free Software Foundation; either
26 version 2 of the License, or (at your option) any later version.
28 This library is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 Lesser General Public License for more details.
33 You should have received a copy of the GNU Lesser General Public
34 License along with this library; if not, write to the Free Software
35 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
41 #include <anthy/anthy.h>
42 #include <anthy/conf.h>
43 #include <anthy/dic.h>
44 #include <anthy/texttrie.h>
45 #include <anthy/textdict.h>
46 #include <anthy/dicutil.h>
49 #include "dic_personality.h"
52 * When the personal dictionary is stored in the texttrie, entries
53 * take the form " headword number" -> "#wordtype*frequency word"
54 * (for UTF8: " pheadword number" -> "#wordtype*frequency word").
55 * The leading two-character blank marks the word-information section,
56 * and the number part is used to distinguish homophones.
60 /* 32 characters x 3 bytes in UTF8 */
61 #define MAX_KEY_LEN 96
/* Encoding used by this utility API; assigned in anthy_dic_util_init()
 * and anthy_dic_util_set_encoding(). */
64 static int dic_util_encoding;
/* The private text trie and private text dictionary accessed throughout
 * this file (declared here, defined in another translation unit). */
66 extern struct text_trie *anthy_private_tt_dic;
67 extern struct textdict *anthy_private_text_dic;
68 /* the currently selected reading */
69 static struct iterate_contex {
/* key buffer handed to anthy_trie_find_next_key() with size MAX_KEY_LEN+32 */
73 char key_buf[MAX_KEY_LEN+32];
74 /* for searching the textdict */
/*
 * Replace the iterator's saved line and index strings.
 * A previously stored current_line / current_index is freed and reset to
 * NULL; the new values are stored as strdup() copies of `line` and `index`.
 * NOTE(review): callers pass NULL for both arguments (e.g.
 * anthy_dic_util_quit); the guards that presumably skip strdup() for NULL
 * are not visible in this listing -- confirm against the full file.
 */
89 set_current_line(const char *index, const char *line)
91 if (word_iterator.current_line) {
92 free(word_iterator.current_line);
93 word_iterator.current_line = NULL;
96 word_iterator.current_line = strdup(line);
98 if (word_iterator.current_index) {
99 free(word_iterator.current_index);
100 word_iterator.current_index = NULL;
103 word_iterator.current_index = strdup(index);
107 /** Initialize the personal dictionary library */
/* Selects the "default" personality, sets the API encoding to EUC-JP and
 * resets the word iterator (empty key buffer, in_tt = 1). */
109 anthy_dic_util_init(void)
/* a return value of -1 from anthy_init_dic() is special-cased; the branch
 * body is not visible in this listing */
114 if (anthy_init_dic() == -1) {
117 anthy_dic_set_personality("default");
119 dic_util_encoding = ANTHY_EUC_JP_ENCODING;
121 word_iterator.key_buf[0] = 0;
122 word_iterator.in_tt = 1;
125 /** Release the dictionary library */
127 anthy_dic_util_quit(void)
/* drop the iterator's saved line and index strings */
132 set_current_line(NULL, NULL);
136 /** Set the encoding of the dictionary utility API */
/* Only ANTHY_UTF8_ENCODING and ANTHY_EUC_JP_ENCODING are accepted; any other
 * value leaves the current setting untouched. Returns the encoding in effect
 * after the call. */
138 anthy_dic_util_set_encoding(int enc)
140 if (enc == ANTHY_UTF8_ENCODING ||
141 enc == ANTHY_EUC_JP_ENCODING) {
142 dic_util_encoding = enc;
144 return dic_util_encoding;
/* Forward the personality id to anthy_dic_set_personality(). */
148 anthy_dic_util_set_personality(const char *id)
150 anthy_dic_set_personality(id);
/*
 * Ask the private texttrie for the key that follows word_iterator.key_buf
 * (anthy_trie_find_next_key() is given key_buf and its size) and test
 * whether the result starts with the same two bytes as `prefix`.
 * NOTE(review): the return statements are not visible in this listing;
 * callers treat a non-zero result as "a key with this prefix was found".
 */
154 find_next_key(const char *prefix)
157 v = anthy_trie_find_next_key(anthy_private_tt_dic,
158 word_iterator.key_buf, MAX_KEY_LEN+32);
160 if (v && v[0] == prefix[0] && v[1] == prefix[1]) {
161 /* the next key also has the specified prefix */
/* key_buf is rewritten to the bare prefix here (presumably on the
 * no-match path; the intervening lines are not shown) */
165 sprintf(word_iterator.key_buf, "%s", prefix);
/*
 * Delete texttrie entries for as long as find_next_key(prefix) succeeds.
 * key_buf is reset to `prefix` before the loop and after every deletion,
 * and the whole operation runs between anthy_priv_dic_lock() and
 * anthy_priv_dic_unlock().
 */
170 delete_prefix(const char *prefix)
172 sprintf(word_iterator.key_buf, "%s", prefix);
173 anthy_priv_dic_lock();
174 /* if word_iterator.key_buf holds the prefix string, find_next_key()
176 while (find_next_key(prefix)) {
177 anthy_trie_delete(anthy_private_tt_dic, word_iterator.key_buf);
178 sprintf(word_iterator.key_buf, "%s", prefix);
180 anthy_priv_dic_unlock();
/*
 * Map an encoding constant to a key prefix string; ANTHY_UTF8_ENCODING is
 * tested explicitly. Callers compare only the first two bytes of the result
 * (see find_next_key).
 * NOTE(review): the returned literals are not visible in this listing.
 */
184 encoding_prefix(int encoding)
186 if (encoding == ANTHY_UTF8_ENCODING) {
193 /** (API) Delete the entire personal dictionary */
195 anthy_priv_dic_delete(void)
/* remove every texttrie entry carrying the EUC-JP prefix */
197 delete_prefix(encoding_prefix(ANTHY_EUC_JP_ENCODING));
/* keep deleting the line at offset 0 of the text dictionary while
 * anthy_textdict_delete_line() returns 0 */
199 while (!anthy_textdict_delete_line(anthy_private_text_dic, 0)) {
/*
 * Scan callback: stores `key` and line `n` as the iterator's current
 * index/line and remembers `next_offset` in word_iterator.dicfile_offset.
 * NOTE(review): the return value is not visible in this listing.
 */
205 scan_one_word_cb(void *p, int next_offset, const char *key, const char *n)
208 set_current_line(key, n);
209 word_iterator.dicfile_offset = next_offset;
/*
 * Restart iteration at the beginning of the private text dictionary:
 * the offset is reset to 0, the saved line is cleared, and
 * anthy_textdict_scan() is run from that offset. If a current line is set
 * afterwards the iterator is switched to the textdict (in_tt = 0).
 * ANTHY_DIC_UTIL_ERROR is one of the return values; the success return is
 * not visible in this listing.
 */
214 select_first_entry_in_textdict(void)
216 word_iterator.dicfile_offset = 0;
217 set_current_line(NULL, NULL);
218 anthy_textdict_scan(anthy_private_text_dic,
219 word_iterator.dicfile_offset, NULL,
221 if (word_iterator.current_line) {
222 word_iterator.in_tt = 0;
226 return ANTHY_DIC_UTIL_ERROR;
229 /** (API) Select the first word */
231 anthy_priv_dic_select_first_entry(void)
/* with UTF8 as the API encoding only the text dictionary is consulted */
233 if (dic_util_encoding == ANTHY_UTF8_ENCODING) {
234 return select_first_entry_in_textdict();
/* otherwise start from the texttrie, when one is available */
236 if (anthy_private_tt_dic) {
237 sprintf(word_iterator.key_buf, "%s", encoding_prefix(dic_util_encoding));
238 /* the entry following the prefix is the first entry */
239 if (find_next_key(encoding_prefix(dic_util_encoding))) {
240 word_iterator.in_tt = 1;
244 /* no word found, so try moving on to the textdict */
245 return select_first_entry_in_textdict();
248 /** (API) Select the word following the currently selected word */
250 anthy_priv_dic_select_next_entry(void)
/* iterator is in the textdict: clear the saved line and continue the scan
 * from the saved offset */
252 if (!word_iterator.in_tt) {
253 set_current_line(NULL, NULL);
254 anthy_textdict_scan(anthy_private_text_dic, word_iterator.dicfile_offset,
257 if (word_iterator.current_line) {
260 return ANTHY_DIC_UTIL_ERROR;
/* iterator is in the texttrie: step to the next key carrying the prefix of
 * the current encoding */
262 if (find_next_key(encoding_prefix(dic_util_encoding))) {
265 /* no word found, so try moving on to the textdict */
266 return select_first_entry_in_textdict();
/* NOTE(review): only the signature is visible in this listing; the body and
 * return value of this function could not be reviewed. */
271 anthy_priv_dic_select_entry(const char *index)
277 /** Get the reading of the currently selected word */
279 anthy_priv_dic_get_index(char *buf, int len)
/* texttrie entry: the reading starts after the 2-byte prefix of the key;
 * otherwise the index saved by set_current_line() is used */
283 if (word_iterator.in_tt) {
284 src_buf = &word_iterator.key_buf[2];
286 src_buf = word_iterator.current_index;
/* a textdict entry requested through the EUC-JP API is converted from
 * UTF-8 to EUC */
288 if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) {
290 src_buf = anthy_conv_utf8_to_euc(src_buf);
/* otherwise work on a strdup() copy -- presumably so that src_buf can be
 * released the same way on both paths (the free is not visible here) */
292 src_buf = strdup(src_buf);
294 /* copy up to the first space or \0 */
295 for (i = 0; src_buf[i] && src_buf[i] != ' '; i++) {
307 /** Get the frequency of the currently selected word */
309 anthy_priv_dic_get_freq(void)
311 struct word_line res;
/* texttrie entry: look the value up by the current key and parse it.
 * NOTE(review): no NULL check on v is visible between the lookup and the
 * parse (embedded lines 314-315 are consecutive). */
313 if (word_iterator.in_tt) {
314 v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
315 anthy_parse_word_line(v, &res);
/* textdict entry: parse the line saved by set_current_line() */
318 anthy_parse_word_line(word_iterator.current_line, &res);
323 /** Get the word type (part of speech) of the currently selected word */
325 anthy_priv_dic_get_wtype(char *buf, int len)
327 struct word_line res;
/* texttrie entry: look the value up by the current key and parse it */
329 if (word_iterator.in_tt) {
330 v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
331 anthy_parse_word_line(v, &res);
/* textdict entry: parse the line saved by set_current_line() */
334 anthy_parse_word_line(word_iterator.current_line, &res);
/* res.wt plus its terminator does not fit into len bytes; the handling of
 * this case is not visible in this listing */
336 if (len - 1 < (int)strlen(res.wt)) {
/* copy the word type name into the caller's buffer */
339 sprintf(buf, "%s", res.wt);
343 /** Get the currently selected word */
345 anthy_priv_dic_get_word(char *buf, int len)
/* v is the stored value: looked up in the texttrie by the current key, or
 * the line saved from the textdict scan */
349 if (word_iterator.in_tt) {
350 v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
352 v = word_iterator.current_line;
357 /* extract the word that follows the word type */
/* a textdict entry requested through the EUC-JP API is converted from
 * UTF-8 to EUC before being copied; both paths copy with snprintf() so the
 * output is bounded by len */
360 if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) {
361 s = anthy_conv_utf8_to_euc(s);
362 snprintf(buf, len, "%s", s);
365 snprintf(buf, len, "%s", s);
/* texttrie-only step whose body is not visible here -- NOTE(review):
 * presumably releases the value returned by anthy_trie_find(); confirm */
367 if (word_iterator.in_tt) {
/*
 * Scan callback that looks for an existing entry. `p` is a
 * struct scan_context carrying the yomi / wt_name / word being sought;
 * `key` and `n` are the reading and the line of the entry being visited.
 * NOTE(review): return values and the update of the "found" state are not
 * visible in this listing.
 */
374 find_cb(void *p, int next_offset, const char *key, const char *n)
376 struct scan_context *sc = p;
377 struct word_line res;
/* reading differs: remember the offset of the following line */
378 if (strcmp(key, sc->yomi)) {
379 sc->offset = next_offset;
/* same reading: parse the line and compare word type and word */
382 anthy_parse_word_line(n, &res);
383 if (!strcmp(res.wt, sc->wt_name) &&
384 !strcmp(res.word, sc->word)) {
388 sc->offset = next_offset;
/*
 * Scan callback that compares each visited `key` with sc->yomi and records
 * `next_offset` in sc->offset.
 * NOTE(review): what happens inside the `>= 0` branch and the return values
 * are not visible in this listing.
 */
393 order_cb(void *p, int next_offset, const char *key, const char *n)
395 struct scan_context *sc = p;
/* key compares equal to or greater than the target reading */
397 if (strcmp(key, sc->yomi) >= 0) {
401 sc->offset = next_offset;
/*
 * Format one dictionary line as "<yomi> <wt_name>*<freq> <word>\n" and
 * insert it into `td` at `offset` via anthy_textdict_insert_line().
 * NOTE(review): the allocation check, the release of buf and the return
 * statement are not visible in this listing.
 */
407 do_add_word_to_textdict(struct textdict *td, int offset,
408 const char *yomi, const char *word,
409 const char *wt_name, int freq)
/* the extra 20 bytes cover the two spaces, '*', '\n', the terminator and
 * the decimal digits of freq (at most 11 characters for a 32-bit int) */
411 char *buf = malloc(strlen(yomi) + strlen(word) + strlen(wt_name) + 20);
416 sprintf(buf, "%s %s*%d %s\n", yomi, wt_name, freq, word);
417 rv = anthy_textdict_insert_line(td, offset, buf);
/*
 * Parse the stored value `v` and compare it against `word` and `wt`.
 * NOTE(review): the return statements are not visible in this listing.
 */
423 dup_word_check(const char *v, const char *word, const char *wt)
425 struct word_line res;
/* a non-zero result from anthy_parse_word_line() is special-cased */
427 if (anthy_parse_word_line(v, &res)) {
431 /* compare the reading and the word */
/* (the code below compares the word type and the word) */
432 if (!strcmp(res.wt, wt) &&
433 !strcmp(res.word, word)) {
/*
 * Search the private texttrie for an entry whose reading is `yomi` and
 * whose value matches `word` and `wt_name` (checked by dup_word_check()).
 * `idx_buf` is used as the trie key buffer and is passed to the trie with
 * size yomi_len + 12; the caller in add_word_to_textdict allocates it with
 * exactly that size.
 * NOTE(review): loop entry, the use of `found` and the return value are
 * not visible in this listing.
 */
440 find_same_word(char *idx_buf, const char *yomi,
441 const char *word, const char *wt_name, int yomi_len)
/* build the starting key: encoding prefix, reading, then a space */
444 sprintf(idx_buf, "%s%s ",
445 encoding_prefix(dic_util_encoding),
447 anthy_trie_find_next_key(anthy_private_tt_dic,
448 idx_buf, yomi_len + 12);
450 /* search the trie index */
/* key layout relied on here: 2 prefix bytes, the reading, then ' ' */
453 if (strncmp(&idx_buf[2], yomi, yomi_len) ||
454 idx_buf[yomi_len+2] != ' ') {
455 /* the headword differs, so end the loop */
458 /* access the texttrie and check whether everything besides the headword matches too */
459 v = anthy_trie_find(anthy_private_tt_dic, idx_buf);
461 found = dup_word_check(v, word, wt_name);
467 } while (anthy_trie_find_next_key(anthy_private_tt_dic,
468 idx_buf, yomi_len + 12));
/*
 * Register (yomi, word, wt_name, freq) in the private text dictionary.
 * A matching entry is first removed from the texttrie (if one is loaded)
 * and from the text dictionary, then the new line is inserted at the
 * offset determined by a second scan.
 * Returns ANTHY_DIC_UTIL_OK or ANTHY_DIC_UTIL_ERROR.
 */
474 add_word_to_textdict(const char *yomi, const char *word,
475 const char *wt_name, int freq)
477 struct scan_context sc;
479 int yomi_len = strlen(yomi);
/* reject empty readings and readings longer than MAX_KEY_LEN bytes */
481 if (yomi_len > MAX_KEY_LEN || yomi_len == 0) {
482 return ANTHY_DIC_UTIL_ERROR;
/* a word type name has to start with '#' */
485 if (wt_name[0] != '#') {
486 return ANTHY_DIC_UTIL_ERROR;
489 /* if it exists in the texttrie, delete it */
490 if (anthy_private_tt_dic) {
/* NOTE(review): the malloc() result is handed to find_same_word() on the
 * very next line without a NULL check */
491 char *idx_buf = malloc(yomi_len + 12);
492 if (find_same_word(idx_buf, yomi, word, wt_name, yomi_len)) {
493 anthy_trie_delete(anthy_private_tt_dic, idx_buf);
498 /* if the same entry exists, delete it */
501 sc.wt_name = wt_name;
505 anthy_textdict_scan(anthy_private_text_dic, 0, &sc,
507 if (sc.found_word == 1) {
508 anthy_textdict_delete_line(anthy_private_text_dic, sc.offset);
/* early success return; its condition is not visible in this listing
 * (NOTE(review): presumably the freq == 0 "delete only" case -- confirm) */
511 return ANTHY_DIC_UTIL_OK;
513 /* find the place to insert */
516 anthy_textdict_scan(anthy_private_text_dic, 0, &sc,
519 rv = do_add_word_to_textdict(anthy_private_text_dic, sc.offset,
520 yomi, word, wt_name, freq);
522 return ANTHY_DIC_UTIL_OK;
524 return ANTHY_DIC_UTIL_ERROR;
528 * if the frequency is 0, the entry is deleted
/*
 * Add an entry to the personal dictionary.
 * With UTF8 as the API encoding the arguments are passed straight to
 * add_word_to_textdict(); on the remaining path `yomi` and `word` are first
 * converted from EUC to UTF-8 (wt_name and freq are passed unchanged).
 * NOTE(review): the release of the converted strings and the final return
 * are not visible in this listing.
 */
531 anthy_priv_dic_add_entry(const char *yomi, const char *word,
532 const char *wt_name, int freq)
534 if (dic_util_encoding == ANTHY_UTF8_ENCODING) {
535 return add_word_to_textdict(yomi, word, wt_name, freq);
538 char *yomi_utf8 = anthy_conv_euc_to_utf8(yomi);
539 char *word_utf8 = anthy_conv_euc_to_utf8(word);
540 rv = add_word_to_textdict(yomi_utf8, word_utf8, wt_name, freq);
/* Return the value of the "ANTHYDIR" configuration entry as reported by
 * anthy_conf_get_str(). */
548 anthy_dic_util_get_anthydir(void)
550 return anthy_conf_get_str("ANTHYDIR");
553 /* function for searching the dictionary of the look command */
/* Reads `fp` with fgets() and compares what was read against `word`.
 * NOTE(review): the buffer declaration, newline handling and return values
 * are not visible in this listing. */
555 do_search(FILE *fp, const char *word)
559 int word_len = strlen(word);
/* each fgets() call stores at most 31 characters plus the terminator */
560 while (fgets(buf, 32, fp)) {
561 int len = strlen(buf);
/* text read is longer than the search word */
564 if (len > word_len) {
/* case-insensitive comparison over the first len bytes */
567 if (!strncasecmp(buf, word, len)) {
577 /* API for searching the dictionary of the look command */
579 anthy_dic_search_words_file(const char *word)
/* the path of the word list comes from the "WORDS_FILE" configuration entry */
583 const char *words_dict_fn = anthy_conf_get_str("WORDS_FILE");
/* no file configured; the handling of this case is not visible here */
584 if (!words_dict_fn) {
587 fp = fopen(words_dict_fn, "r");
/* NOTE(review): the fopen() failure check, fclose() and the return
 * statement are not visible in this listing */
591 res = do_search(fp, word);