2 * Build a dictionary file from files in cannadic format
4 * Funded by IPA Exploratory Software Project 2002 1/1
6 * Copyright (C) 2000-2007 TABATA Yusuke
7 * Copyright (C) 2005 YOSHIDA Yuichi
8 * Copyright (C) 2001-2002 TAKAI Kousuke
11 * The dictionary uses the reading (yomi) as its index and is structured so that
12 * the part of speech and the converted word (= entry) can be looked up from it.
14 * reading -> word, word, ...
16 * The dictionary file uses network byte order.
18 * The dictionary file is made up of multiple sections
20 * 2 index of readings (one per 512 readings)
23 * 5 index of pages
27 * source: the original dictionary file
28 * file_dic: the file to be generated
30 * yomi_hash: bitmap of the hash that is written to the dictionary file
31 * index_hash: hash used within this source file to look up struct yomi_entry
35 #include <sys/types.h>
45 #include <anthy/anthy.h>
46 #include <anthy/xstr.h>
47 #include <anthy/wtype.h>
48 #include <anthy/ruleparser.h>
49 #include <anthy/word_dic.h>
50 #include <anthy/diclib.h>
/* Size limits and defaults used in this file: MAX_LINE_LEN bounds the fgets()
 * in read_line(), SECTION_ALIGNMENT is the padding unit in get_file_size() and
 * copy_file(), MAX_WTYPE_LEN is the size of the wtbuf scratch buffer. */
53 #define MAX_LINE_LEN 10240
54 #define NR_HEADER_SECTIONS 16
55 #define SECTION_ALIGNMENT 8
56 #define MAX_WTYPE_LEN 20
58 #define DEFAULT_FN "anthy.wdic"
/* Program name used as the prefix of diagnostics written to stderr */
60 static const char *progname;
62 /* Global variables, so that they can be accessed from writewords.c */
63 FILE *yomi_entry_index_out, *yomi_entry_out;
64 FILE *page_out, *page_index_out;
67 static FILE *yomi_hash_out;
68 /* Number of hash collisions (statistics) */
69 static int yomi_hash_collision;
71 /* Listed in the order in which they appear in the file */
76 {&yomi_entry_index_out, NULL},
77 {&yomi_entry_out, NULL},
79 {&page_index_out, NULL},
81 {&yomi_hash_out, NULL},
/* Members of the build state (struct mkdic_stat, per the mds parameters below);
 * the struct header is not visible in this excerpt. */
88 struct yomi_entry_list yl;
90 struct adjust_command ac_list;
94 const char *output_fn;
99 char **excluded_wtypes;
102 /* Open the files that the dictionary is written to.
 * Walks file_array until an entry with a NULL fpp and stores a temporary
 * read/write stream through each fs->fpp. */
104 open_output_files(void)
106 struct file_section *fs;
107 for (fs = file_array; fs->fpp; fs ++) {
108 char *tmpdir = getenv("TMPDIR");
111 /* tmpfile() does not honour TMPDIR, so mkstemp is used when TMPDIR is specified. */
114 snprintf(buf, sizeof(buf), "%s/mkanthydic.XXXXXX", tmpdir);
119 *(fs->fpp) = fdopen(fd, "w+");
/* Remember the generated path in the section entry */
120 fs->fn = strdup(buf);
123 *(fs->fpp) = tmpfile();
127 fprintf (stderr, "%s: cannot open temporary file: %s\n",
128 progname, strerror (errno));
/* Verify and flush every output stream listed in file_array.
 * The first loop reports streams whose error indicator is set (ferror);
 * the second calls fflush on each one and reports a failure together with
 * strerror(errno). */
136 flush_output_files (void)
138 struct file_section *fs;
139 for (fs = file_array; fs->fpp; fs ++) {
140 if (ferror(*(fs->fpp))) {
141 fprintf (stderr, "%s: write error\n", progname);
145 for (fs = file_array; fs->fpp; fs ++) {
146 if (fflush(*(fs->fpp))) {
147 fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno));
153 /* Write out 4 bytes in network byte order */
/* fp: destination stream; i: value, converted with anthy_dic_htonl() first.
 * NOTE(review): the write is sizeof(int) bytes, which matches the "4 bytes"
 * above only where int is 32 bits wide, and the fwrite() result is not used
 * on this line. */
155 write_nl(FILE *fp, int i)
157 i = anthy_dic_htonl(i);
158 fwrite(&i, sizeof(int), 1, fp);
/* Notice printed to stdout. The enclosing function's header is not visible in
 * this excerpt (presumably the usage/help routine - confirm). */
164 printf("please do not use mkanthydic command directly.\n");
/* Read the next usable line from fp into buf.
 * fgets() is called with MAX_LINE_LEN, so buf must provide at least that
 * many bytes. */
169 read_line(FILE *fp, char *buf)
171 /* Ignore lines that are too long */
174 while (fgets(buf, MAX_LINE_LEN, fp)) {
175 int len = strlen(buf);
/* No trailing '\n': fgets() cut the line short (or it is an unterminated
 * last line). */
179 if (buf[len - 1] != '\n') {
194 /** Extract the part that serves as the index from a line of a cannadic-format dictionary */
196 get_index_from_line(struct mkdic_stat *mds, char *buf)
/* Locate the first space in the line */
200 sp = strchr(buf, ' ');
202 /* The dictionary format is malformed */
/* Convert buf to an xstr using the configured input encoding */
206 xs = anthy_cstr_to_xstr(buf, mds->input_encoding);
211 /** Extract the part other than the index from a line of a cannadic-format dictionary */
213 get_entry_from_line(char *buf)
/* Locate the first space in the line */
216 sp = strchr(buf, ' ');
/* Body of a hash routine whose header is not visible in this excerpt
 * (presumably index_hash(), which find_yomi_entry() calls - confirm).
 * Each character of xs, multiplied by 11, is added to h; the result is
 * reduced modulo YOMI_HASH. */
228 for (i = 0; i < xs->len; i++) {
229 h += xs->str[i] * 11;
231 return (int)(h % YOMI_HASH);
/* Look up a part-of-speech name.
 * "#T35" is tested for explicitly; other names are handed to
 * anthy_type_to_wtype(), whose result is kept in res. */
235 get_wt_name(const char *name)
239 if (!strcmp(name, "#T35")) {
242 res = anthy_type_to_wtype(name, &dummy);
249 /** Add one word to a reading */
/* mds: build state (supplies input_encoding); ye: reading that receives the word;
 * wt_name: part-of-speech name; freq: raw frequency; word: the word text;
 * order: stored as source_order. */
251 push_back_word_entry(struct mkdic_stat *mds,
252 struct yomi_entry *ye, const char *wt_name,
253 int freq, const char *word, int order)
260 if (!anthy_type_to_wtype(wt_name, &wt)) {
261 /* Part of speech unknown to anthy */
/* Grow the entry array by one slot.
 * NOTE(review): the realloc() result is stored straight back into ye->entries
 * and dereferenced on the next line without a NULL check. */
264 ye->entries = realloc(ye->entries,
265 sizeof(struct word_entry) *
266 (ye->nr_entries + 1));
267 ye->entries[ye->nr_entries].ye = ye;
268 ye->entries[ye->nr_entries].wt_name = get_wt_name(wt_name);
269 ye->entries[ye->nr_entries].raw_freq = freq;
270 ye->entries[ye->nr_entries].feature = 0;
271 ye->entries[ye->nr_entries].source_order = order;
/* EUC-JP input is converted so that word_utf8 holds UTF-8 */
272 if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) {
273 s = anthy_conv_euc_to_utf8(word);
277 ye->entries[ye->nr_entries].word_utf8 = s;
/* Parse a part-of-speech token of the form "#XX*freq" found at cur, using
 * wtbuf as the output buffer. push_back_word_entry_line() takes the return
 * value as the frequency. */
282 parse_wtype(char *wtbuf, char *cur)
/* Tokens that would not fit in a MAX_WTYPE_LEN buffer are detected here */
287 if (strlen(cur) >= MAX_WTYPE_LEN) {
/* Locate the '*' in wtbuf */
292 t = strchr(wtbuf, '*');
306 /* The length of a compound-word element is written as 1,2,3, ... 9,a,b,c */
/* Decode one length character. Digits '1'..'9' are accepted ('0' is excluded
 * by the strict comparison); letters map to xc - 'a' + 10.
 * NOTE(review): the letter range accepted here is 'a'..'z', wider than the
 * a,b,c mentioned above. */
308 get_element_len(xchar xc)
310 if (xc > '0' && xc <= '9') {
313 if (xc >= 'a' && xc <= 'z') {
314 return xc - 'a' + 10;
319 /** Format check for compound candidates */
/* mds: build state (input encoding); index: the reading; cur: candidate text. */
321 check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur)
323 /* Count the total number of characters of the readings */
324 xstr *xs = anthy_cstr_to_xstr(cur, mds->input_encoding);
/* Every '_' is followed by a length character; sum the decoded lengths */
326 for (i = 0; i < xs->len - 1; i++) {
327 if (xs->str[i] == '_') {
328 total += get_element_len(xs->str[i+1]);
/* The summed lengths must equal the length of the reading */
333 if (total != index->len) {
334 fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n",
/* Linear search of mds->excluded_wtypes (nr_excluded entries) for a string
 * equal to wt. */
342 is_excluded_wtype(struct mkdic_stat *mds, char *wt)
345 for (i = 0; i < mds->nr_excluded; i++) {
346 if (!strcmp(mds->excluded_wtypes[i], wt)) {
/* Scan forward from cur; the loop stops at the first ' ' or at the
 * terminating '\0'. */
354 find_token_end(char *cur)
357 for (n = cur; *n != ' ' && *n != '\0'; n++) {
368 /** Split the line that belongs to a reading and build the entry array */
370 push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye,
/* Scratch buffer on the stack (alloca), sized from ent */
373 char *buf = alloca(strlen(ent) + 1);
376 char wtbuf[MAX_WTYPE_LEN];
384 /* Terminate the token with \0: find the space or \0 that follows cur */
385 n = find_token_end(cur);
387 fprintf(stderr, "invalid \\ at the end of line (%s).\n",
398 if (isalpha((unsigned char)cur[1])) {
399 /* Parse #XX*?? */
400 freq = parse_wtype(wtbuf, cur);
403 check_compound_candidate(mds, ye->index_xstr, &cur[1])) {
405 push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
410 /* Check whether the part of speech is on the exclusion list */
411 if (!is_excluded_wtype(mds, wtbuf)) {
413 push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
415 }/* :to extract excluded words
417 anthy_putxstr(ye->index_xstr);
418 printf(" %s*%d %s\n", wtbuf, freq, cur);
430 /** Check whether the same word is already present */
/* Compares entry idx of ye with every earlier entry (idx-1 down to 0).
 * Frequency, part-of-speech name and UTF-8 word are each compared. */
432 check_same_word(struct yomi_entry *ye, int idx)
434 struct word_entry *base = &ye->entries[idx];
436 for (i = idx -1; i >= 0; i--) {
437 struct word_entry *cur = &ye->entries[i];
438 if (base->raw_freq != cur->raw_freq) {
441 if (strcmp(base->wt_name, cur->wt_name)) {
444 if (strcmp(base->word_utf8, cur->word_utf8)) {
453 /** Comparison function for qsort */
/* Orders struct word_entry elements by raw_freq, larger values first.
 * NOTE(review): the comparison is done by subtraction, which can overflow
 * int when the two frequencies are far apart. */
455 compare_word_entry_by_freq(const void *p1, const void *p2)
457 const struct word_entry *e1 = p1;
458 const struct word_entry *e2 = p2;
459 return e2->raw_freq - e1->raw_freq;
462 /** Comparison function for qsort */
/* Compares the wt_name strings of two struct word_entry elements; the
 * frequency comparison below serves as the secondary key. */
464 compare_word_entry_by_wtype(const void *p1, const void *p2)
466 const struct word_entry *e1 = p1;
467 const struct word_entry *e2 = p2;
468 int ret = strcmp(e1->wt_name, e2->wt_name);
472 return compare_word_entry_by_freq(p1, p2);
476 /** Sort the words of a reading by frequency and remove the words that are not needed */
/* Returns the number of entries of ye minus nr_dup. */
478 normalize_word_entry(struct yomi_entry *ye)
/* First pass: order by frequency */
485 qsort(ye->entries, ye->nr_entries,
486 sizeof(struct word_entry),
487 compare_word_entry_by_freq);
488 /* Duplicates get a score of 0 */
489 for (i = 0; i < ye->nr_entries; i++) {
490 if (check_same_word(ye, i)) {
491 ye->entries[i].raw_freq = 0;
/* Second pass: order by part of speech, then frequency */
496 qsort(ye->entries, ye->nr_entries,
497 sizeof(struct word_entry),
498 compare_word_entry_by_wtype);
499 return ye->nr_entries - nr_dup;
502 /* Return the yomi_entry that corresponds to the given reading
505 find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create)
507 struct yomi_entry *ye;
508 int hash = index_hash(index);
510 /* Search the hash chain */
511 for (ye = yl->hash[hash];ye ; ye = ye->hash_next) {
513 if (!anthy_xstrcmp(ye->index_xstr, index)) {
/* A new entry is allocated and initialised from index below; presumably this
 * is reached only when nothing matched and create is set - confirm against
 * the lines not shown here. */
522 ye = malloc(sizeof(struct yomi_entry));
526 ye->index_xstr = anthy_xstr_dup(index);
527 ye->index_str = NULL;
529 /* Link into the hash chain */
530 ye->hash_next = yl->hash[hash];
543 /* Set the mark in the hash bitmap that goes into the dictionary file */
/* hash_array: bitmap of YOMI_HASH_ARRAY_SIZE bytes; xs: reading to mark. */
545 mark_hash_array(unsigned char *hash_array, xstr *xs)
547 int val, idx, bit, mask;
548 val = anthy_xstr_hash(xs);
/* Reduce the hash to the bitmap's bit range, then split it into a byte
 * index (idx) and a bit position (bit) */
549 val &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
550 idx=(val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1);
551 bit= val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1);
/* A bit that is already set counts as a collision (statistics only) */
553 if (hash_array[idx] & mask) {
554 yomi_hash_collision ++;
556 hash_array[idx] |= mask;
559 /* Build the bitmap of the reading hash */
/* yomi_hash_out: stream that receives the bitmap; yl: list whose first
 * nr_valid_entries elements of ye_array are marked. */
561 mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl)
563 unsigned char *hash_array;
565 struct yomi_entry *ye;
566 hash_array = (unsigned char *)malloc(YOMI_HASH_ARRAY_SIZE);
567 for (i = 0; i < YOMI_HASH_ARRAY_SIZE; i++) {
570 for (i = 0; i < yl->nr_valid_entries; i++) {
571 ye = yl->ye_array[i];
572 mark_hash_array(hash_array, ye->index_xstr);
/* The whole bitmap is written as a single item, then statistics are printed */
574 fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out);
575 printf("generated yomi hash bitmap (%d collisions/%d entries)\n",
576 yomi_hash_collision, yl->nr_valid_entries);
/* Parse the argument part of a "\modify_freq" line.
 * The text is split on spaces into four fields: reading, part of speech,
 * word and adjustment type ("up", "down" or "kill"). A newly allocated
 * adjust_command is filled in from them. */
580 static struct adjust_command *
581 parse_modify_freq_command(const char *buf)
/* Stack copy (alloca) used for tokenising; strtok() modifies its input */
583 char *line = alloca(strlen(buf) + 1);
584 char *yomi, *wt, *word, *type_str;
585 struct adjust_command *cmd;
588 yomi = strtok(line, " ");
589 wt = strtok(NULL, " ");
590 word = strtok(NULL, " ");
591 type_str = strtok(NULL, " ");
/* All four fields are required */
592 if (!yomi || !wt || !word || !type_str) {
595 if (!strcmp(type_str, "up")) {
596 type = ADJUST_FREQ_UP;
598 if (!strcmp(type_str, "down")) {
599 type = ADJUST_FREQ_DOWN;
601 if (!strcmp(type_str, "kill")) {
602 type = ADJUST_FREQ_KILL;
607 cmd = malloc(sizeof(struct adjust_command));
/* The command text is treated as EUC-JP here, independent of mds->input_encoding */
609 cmd->yomi = anthy_cstr_to_xstr(yomi, ANTHY_EUC_JP_ENCODING);
610 cmd->wt = get_wt_name(wt);
611 cmd->word = anthy_conv_euc_to_utf8(word);
/* Recognise an adjust command in buf and link it in behind ac_list.
 * Only the "\modify_freq " prefix (13 characters) is handled on the lines
 * visible here. */
616 parse_adjust_command(const char *buf, struct adjust_command *ac_list)
618 struct adjust_command *cmd = NULL;
619 if (!strncmp("\\modify_freq ", buf, 13)) {
620 cmd = parse_modify_freq_command(&buf[13]);
/* Insert at the front of the list */
623 cmd->next = ac_list->next;
628 /** Read the dictionary one line at a time and build the list.
629 * This is the core of this command. */
631 parse_dict_file(FILE *fin, struct mkdic_stat *mds)
634 char buf[MAX_LINE_LEN];
636 struct yomi_entry *ye = NULL;
639 while (read_line(fin, buf)) {
/* A line starting with a backslash that is not followed by a space is an
 * adjust command */
640 if (buf[0] == '\\' && buf[1] != ' ') {
641 parse_adjust_command(buf, &mds->ac_list);
644 index_xs = get_index_from_line(mds, buf);
648 ent = get_entry_from_line(buf);
650 /* Readings longer than 30 characters are ignored */
651 if (index_xs->len < 31) {
652 ye = find_yomi_entry(&mds->yl, index_xs, 1);
653 push_back_word_entry_line(mds, ye, ent);
657 anthy_free_xstr(index_xs);
661 /* Get the word structure from the triple of reading, part of speech and word */
662 static struct word_entry *
663 find_word_entry(struct yomi_entry_list *yl, xstr *yomi,
664 const char *wt, char *word)
/* Look the reading up without creating it (create == 0) */
666 struct yomi_entry *ye = find_yomi_entry(yl, yomi, 0);
/* Linear scan for an entry whose part-of-speech name and UTF-8 word both match */
671 for (i = 0; i < ye->nr_entries; i++) {
672 struct word_entry *we = &ye->entries[i];
673 if (!strcmp(we->wt_name, wt) &&
674 !strcmp(we->word_utf8, word)) {
681 /* Apply the frequency-adjustment commands */
/* yl: word list to modify; ac_list: list head, the commands start at
 * ac_list->next. */
683 apply_adjust_command(struct yomi_entry_list *yl,
684 struct adjust_command *ac_list)
686 struct adjust_command *cmd;
687 for (cmd = ac_list->next; cmd; cmd = cmd->next) {
688 struct word_entry *we = find_word_entry(yl, cmd->yomi,
/* Diagnostic for a command whose target word could not be found */
691 char *yomi = anthy_xstr_to_cstr(cmd->yomi, ANTHY_UTF8_ENCODING);
692 printf("failed to find target of adjust command (%s, %s, %s)\n",
693 yomi, cmd->wt, cmd->word);
697 if (cmd->type == ADJUST_FREQ_UP) {
700 if (cmd->type == ADJUST_FREQ_DOWN) {
702 if (we->raw_freq == 0) {
706 if (cmd->type == ADJUST_FREQ_KILL) {
712 /* Comparison function for qsort */
/* The array elements are pointers to struct yomi_entry; they are ordered by
 * strcmp() on index_str, so index_str must already be set. */
714 compare_yomi_entry(const void *p1, const void *p2)
716 const struct yomi_entry *const *y1 = p1;
717 const struct yomi_entry *const *y2 = p2;
718 return strcmp((*y1)->index_str, (*y2)->index_str);
721 /* Sort by yomi_entry */
723 sort_word_dict(struct yomi_entry_list *yl)
726 struct yomi_entry *ye;
727 yl->nr_valid_entries = 0;
728 /* Repack only the readings that have words into yl->ye_array */
729 yl->ye_array = malloc(sizeof(struct yomi_entry *) * yl->nr_entries);
730 for (i = 0, ye = yl->head; i < yl->nr_entries; i++, ye = ye->next) {
731 if (ye->nr_entries > 0) {
732 yl->ye_array[yl->nr_valid_entries] = ye;
733 yl->nr_valid_entries ++;
/* Produce the C-string form of every index in the index encoding; it is the
 * sort key used by compare_yomi_entry() */
737 for (i = 0; i < yl->nr_valid_entries; i++) {
738 struct yomi_entry *ye = yl->ye_array[i];
739 ye->index_str = anthy_xstr_to_cstr(ye->index_xstr, yl->index_encoding);
742 qsort(yl->ye_array, yl->nr_valid_entries,
743 sizeof(struct yomi_entry *),
745 /* Remove the words that are not needed */
/* nr_words accumulates the per-reading counts returned by normalize_word_entry() */
747 for (i = 0; i < yl->nr_valid_entries; i++) {
748 struct yomi_entry *ye = yl->ye_array[i];
749 yl->nr_words += normalize_word_entry(ye);
753 /** Get the size of the file */
/* The value returned is the position reported by ftell(), rounded up to the
 * next multiple of SECTION_ALIGNMENT. */
755 get_file_size(FILE *fp)
760 return (ftell (fp) + SECTION_ALIGNMENT - 1) & (-SECTION_ALIGNMENT);
/* Append the contents of in to out, after padding out to a
 * SECTION_ALIGNMENT boundary. mds supplies output_fn for diagnostics. */
764 copy_file(struct mkdic_stat *mds, FILE *in, FILE *out)
770 /* Pad OUT to the next aligned offset. */
771 for (i = ftell (out); i & (SECTION_ALIGNMENT - 1); i++) {
775 /* Copy the contents. */
777 while ((nread = fread (buf, 1, sizeof buf, in)) > 0) {
778 if (fwrite (buf, 1, nread, out) < nread) {
779 /* Handle short write (maybe disk full). */
780 fprintf (stderr, "%s: %s: write error: %s\n",
781 progname, mds->output_fn, strerror (errno));
/* Write the header to fp: NR_HEADER_SECTIONS integers, each emitted through
 * write_nl(). */
788 generate_header(FILE *fp)
790 int buf[NR_HEADER_SECTIONS];
792 struct file_section *fs;
796 for (i = 0; i < NR_HEADER_SECTIONS; i++) {
/* Slot 0 holds the size of the header itself in bytes */
801 buf[0] = NR_HEADER_SECTIONS * sizeof(int);
804 /* Offset of each section */
/* Section slots start at index 2; off advances by the aligned size of each
 * section file */
806 for (i = 2, fs = file_array; fs->fpp; fs ++, i++) {
808 off += get_file_size(*(fs->fpp));
811 /* Write out to the file */
812 for (i = 0; i < NR_HEADER_SECTIONS; i++) {
813 write_nl(fp, buf[i]);
817 /* Merge the per-section files into a single dictionary file */
819 link_dics(struct mkdic_stat *mds)
822 struct file_section *fs;
/* The result is created at mds->output_fn */
824 fp = fopen (mds->output_fn, "w");
826 fprintf (stderr, "%s: %s: cannot create: %s\n",
827 progname, mds->output_fn, strerror (errno));
831 /* Write the header */
834 for (fs = file_array; fs->fpp; fs ++) {
835 /* Concatenate the file of each section */
836 copy_file(mds, *(fs->fpp), fp);
843 fprintf (stderr, "%s: %s: write error: %s\n",
844 progname, mds->output_fn, strerror (errno));
/* Read the dictionary source file fn into mds via parse_dict_file(); the
 * file name is reported on stdout, as is a failure. */
850 read_dict_file(struct mkdic_stat *mds, const char *fn)
853 /* A file name was specified, so read it */
856 printf("file = %s\n", fn);
857 parse_dict_file(fp, mds);
860 printf("failed file = %s\n", fn);
/* Finish the word list held in mds and emit the word sections: apply the
 * adjust commands, sort, output the word dictionary, build the reading hash. */
865 complete_words(struct mkdic_stat *mds)
867 /* Apply the frequency adjustments */
868 apply_adjust_command(&mds->yl, &mds->ac_list);
873 /* Sort by reading */
874 sort_word_dict(&mds->yl);
876 /* Prepare the files */
878 /* Output the word dictionary */
879 output_word_dict(&mds->yl);
881 /* Build the reading hash */
882 mk_yomi_hash(yomi_hash_out, &mds->yl);
/* Read the use-case dictionary file fn into mds->ud and report the file name
 * on stdout. */
886 read_udict_file(struct mkdic_stat *mds, const char *fn)
889 mds->ud = create_uc_dict();
892 read_uc_file(mds->ud, fn);
893 printf("uc = %s\n", fn);
/* Append n characters from src to the end of xs, growing xs->str first.
 * NOTE(review): the realloc() result is assigned straight back to xs->str, so
 * the old buffer would be lost if the call failed. */
897 xstr_strncat(xstr* xs, xchar* src, int n)
900 xs->str = realloc(xs->str, sizeof(xchar) * (xs->len + n + 1));
902 for (i = 0; i < n; ++i) {
903 xs->str[xs->len + i] = src[i];
/* Register the reverse-direction entry for a multi-segment ("#_...") word.
 * mds: build state; we: the forward entry to reverse. */
910 reverse_multi_segment_word(struct mkdic_stat *mds, struct word_entry *we)
913 ¡Ö¤«¤Ê¤«¤ó¤¸¤Ø¤ó¤«¤ó¤¨¤ó¤¸¤ó #T35 #_2²¾Ì¾_3´Á»ú_4ÊÑ´¹_4¥¨¥ó¥¸¥ó¡×
915 ¡Ö²¾Ì¾´Á»úÊÑ´¹¥¨¥ó¥¸¥ó #T35 #_2¤«¤Ê_2¤«¤ó¤¸_2¤Ø¤ó¤«¤ó_4¤¨¤ó¤¸¤ó¡×
/* NOTE(review): the two lines above are mis-encoded text with no comment
 * delimiters visible in this excerpt; they look like a before/after example
 * of the "#_<len>segment" notation - confirm against the original file. */
919 /* yomi is mixed kana and kanji; word consists of hiragana only */
920 int yomi_seg_start = 0;
921 int word_seg_start = 0;
922 int word_seg_len = 0;
923 xstr *yomibuf = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
924 xstr *wordbuf = we->ye->index_xstr;
925 xstr *yomi_xs = anthy_cstr_to_xstr("", 0);
926 xstr *word_xs = anthy_cstr_to_xstr("#", 0);
929 struct yomi_entry *target_ye;
/* Walk yomibuf; a segment ends at each '_' and at the end of the string */
931 for (j = 0; j <= yomibuf->len; ++j) {
932 if (j == yomibuf->len || yomibuf->str[j] == '_') {
933 if (yomi_seg_start != 0) {
/* Emit "_<len>" followed by the matching slice of the original reading;
 * only the first character of the hex-formatted length is appended */
934 anthy_xstrappend(word_xs, '_');
935 snprintf(ch, 256, "%x", j - yomi_seg_start);
936 anthy_xstrappend(word_xs, (xchar)ch[0]);
937 xstr_strncat(word_xs, &wordbuf->str[word_seg_start], word_seg_len);
938 xstr_strncat(yomi_xs, &yomibuf->str[yomi_seg_start], j - yomi_seg_start);
940 if (j == yomibuf->len) {
/* Skip the '_' and the length character, and advance through the reading */
943 yomi_seg_start = j + 2;
944 word_seg_start += word_seg_len;
945 word_seg_len = get_element_len(yomibuf->str[j + 1]);
949 target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
950 word = anthy_xstr_to_cstr(word_xs, mds->input_encoding);
952 /* Entries of the reverse-conversion dictionary have a negative freq */
953 push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
954 word, we->source_order);
957 anthy_free_xstr(yomibuf);
958 anthy_free_xstr(yomi_xs);
959 anthy_free_xstr(word_xs);
962 /* Build the dictionary for reverse conversion */
964 build_reverse_dict(struct mkdic_stat *mds)
966 struct yomi_entry *ye;
968 struct word_entry *we_array;
969 printf("building reverse index\n");
971 /* Count the number of words */
973 for (ye = mds->yl.head; ye; ye = ye->next) {
974 for (i = 0; i < ye->nr_entries; i++) {
979 * (pointers into the original dictionary are moved by realloc, so a copy is needed)
/* Snapshot array of n word entries, filled by value from every reading */
981 we_array = malloc(sizeof(struct word_entry )* n);
983 for (ye = mds->yl.head; ye; ye = ye->next) {
984 for (i = 0; i < ye->nr_entries; i++) {
985 we_array[n] = ye->entries[i];
990 /* Add them to the dictionary */
991 for (i = 0; i < n; i++) {
992 struct word_entry *we;
993 struct yomi_entry *target_ye;
/* Words starting with "#_" are multi-segment words and get their own handling */
996 if (we->word_utf8[0] == '#') {
997 if (we->word_utf8[1] == '_') {
998 reverse_multi_segment_word(mds, we);
1001 /* yomi is mixed kana and kanji; word consists of hiragana only */
/* Swap the roles: the word becomes the lookup key, the reading becomes the word */
1005 yomi_xs = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
1006 target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
1007 word = anthy_xstr_to_cstr(we->ye->index_xstr, mds->input_encoding);
1009 /* Entries of the reverse-conversion dictionary have a negative freq */
1010 push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
1011 word, we->source_order);
1013 anthy_free_xstr(yomi_xs);
/* Free every string in mds->excluded_wtypes and the array itself, then reset
 * the list to empty (NULL, 0). */
1022 clear_exclude_wtypes(struct mkdic_stat *mds)
1025 for (i = 0; i < mds->nr_excluded; i++) {
1026 free(mds->excluded_wtypes[i]);
1028 free(mds->excluded_wtypes);
1030 mds->excluded_wtypes = NULL;
1031 mds->nr_excluded = 0;
/* Replace the exclusion list with copies of tokens[1] .. tokens[nr-1].
 * execute_batch() passes the whole command line, so tokens[0] is the command
 * name and nr - 1 entries are stored. */
1035 set_exclude_wtypes(struct mkdic_stat *mds, int nr, char **tokens)
1038 mds->nr_excluded = nr - 1;
1039 mds->excluded_wtypes = malloc(sizeof(char *) * (nr - 1));
1041 for (i = 1; i < nr; i++) {
1042 mds->excluded_wtypes[i - 1] = strdup(tokens[i]);
/* Select the body encoding of the generated dictionary; "utf8" is the only
 * name handled on the lines visible here. */
1047 set_dict_encoding(struct mkdic_stat *mds, const char *enc)
1049 if (!strcmp(enc, "utf8")) {
1050 mds->yl.body_encoding = ANTHY_UTF8_ENCODING;
/* Select the encoding of the dictionary source files: "utf8" or "eucjp". */
1055 set_input_encoding(struct mkdic_stat *mds, const char *enc)
1057 if (!strcmp(enc, "utf8")) {
1058 mds->input_encoding = ANTHY_UTF8_ENCODING;
1060 if (!strcmp(enc, "eucjp")) {
1061 mds->input_encoding = ANTHY_EUC_JP_ENCODING;
/* Produce the final dictionary file from the state in mds. */
1066 write_dict_file(struct mkdic_stat *mds)
1069 printf("can not build without use case dict\n");
1073 /* Build the use-case dictionary */
1074 make_ucdict(uc_out, mds->ud);
1076 /* Combine everything into the dictionary file */
1077 flush_output_files();
/* Echo a batch command to stdout: each of the nr tokens is printed preceded
 * by a space. */
1082 show_command(char **tokens, int nr)
1086 for (i = 0; i < nr; i++) {
1087 printf(" %s", tokens[i]);
/* Run the batch script fn: each line is read as tokens, echoed, and
 * dispatched on its first token. Commands that take one argument are
 * accepted only when the line has exactly two tokens. */
1093 execute_batch(struct mkdic_stat *mds, const char *fn)
1097 if (anthy_open_file(fn)) {
1098 printf("mkanthydic: failed to open %s\n", fn);
/* anthy_read_line() returns 0 while lines are available */
1101 while (!anthy_read_line(&tokens, &nr)) {
1102 char *cmd = tokens[0];
1103 show_command(tokens, nr);
1104 if (!strcmp(cmd, "read") && nr == 2) {
1105 read_dict_file(mds, tokens[1]);
1106 } else if (!strcmp(cmd, "read_uc") && nr == 2) {
1107 read_udict_file(mds, tokens[1]);
1108 } else if (!strcmp(cmd, "build_reverse_dict")) {
1109 build_reverse_dict(mds);
1110 } else if (!strcmp(cmd, "write")) {
1111 write_dict_file(mds);
1112 } else if (!strcmp(cmd, "set_exclude_wtypes")) {
1113 set_exclude_wtypes(mds, nr, tokens);
1114 } else if (!strcmp(cmd, "clear_exclude_wtypes")) {
1115 clear_exclude_wtypes(mds);
1116 } else if (!strcmp(cmd, "set_dict_encoding") && nr == 2) {
1117 set_dict_encoding(mds, tokens[1]);
1118 } else if (!strcmp(cmd, "set_input_encoding") && nr == 2) {
1119 set_input_encoding(mds, tokens[1]);
1120 } else if (!strcmp(cmd, "done")) {
1124 printf("Unknown command(%s).\n", cmd);
1132 /* Initialize the variables used for dictionary generation */
1134 init_mds(struct mkdic_stat *mds)
/* Output goes to DEFAULT_FN unless changed later */
1137 mds->output_fn = DEFAULT_FN;
1140 /* Initialize the word dictionary */
1141 mds->yl.head = NULL;
1142 mds->yl.nr_entries = 0;
1143 for (i = 0; i < YOMI_HASH; i++) {
1144 mds->yl.hash[i] = NULL;
/* Defaults: index in UTF-8, body in EUC-JP, source files in EUC-JP */
1146 mds->yl.index_encoding = ANTHY_UTF8_ENCODING;
1147 mds->yl.body_encoding = ANTHY_EUC_JP_ENCODING;
/* Empty adjust-command list */
1149 mds->ac_list.next = NULL;
1151 mds->input_encoding = ANTHY_EUC_JP_ENCODING;
/* Empty part-of-speech exclusion list */
1153 mds->nr_excluded = 0;
1154 mds->excluded_wtypes = NULL;
1157 /* Initialize only the parts of libanthy that are used */
/* The enclosing function's header is not visible in this excerpt. */
1162 res = anthy_init_xstr();
1164 fprintf (stderr, "failed to init dic lib\n");
/* Entry point: scans the arguments for "--help" and for a script name given
 * after "-f", then runs that script with execute_batch(), whose result is
 * the return value. */
1171 main(int argc, char **argv)
1173 struct mkdic_stat mds;
1175 char *script_fn = NULL;
1178 anthy_init_wtypes();
/* Each argument is examined together with the one before it; on the first
 * iteration (i == 1) prev_arg is argv[0] */
1182 for (i = 1; i < argc; i++) {
1183 char *arg = argv[i];
1184 char *prev_arg = argv[i-1];
1185 if (!strcmp(arg, "--help")) {
1188 if (!strcmp(prev_arg, "-f")) {
/* Help was requested, or no script file was given */
1193 if (help_mode || !script_fn) {
1197 return execute_batch(&mds, script_fn);