2 * Build a dictionary file from files in cannadic format
4 * Funded by the IPA Exploratory Software Creation Project 2002 1/1
6 * Copyright (C) 2000-2007 TABATA Yusuke
7 * Copyright (C) 2005 YOSHIDA Yuichi
8 * Copyright (C) 2001-2002 TAKAI Kousuke
11 * The dictionary uses the reading (yomi) as its index and is structured so
12 * that the part of speech and the converted word (= entry) can be looked up.
14 * reading -> word, word, ...
16 * The dictionary file uses network byte order.
18 * The dictionary file consists of multiple sections
20 * 2 index of the readings (one per 512 readings)
23 * 5 index of the pages
27 * source the original dictionary file
28 * file_dic the file to be generated
30 * yomi_hash bitmap of the hash that is written to the dictionary file
31 * index_hash hash used inside this source to look up struct yomi_entry
35 #include <sys/types.h>
45 #include <anthy/anthy.h>
46 #include <anthy/xstr.h>
47 #include <anthy/wtype.h>
48 #include <anthy/ruleparser.h>
49 #include <anthy/word_dic.h>
50 #include <anthy/diclib.h>
53 #define MAX_LINE_LEN 10240
54 #define NR_HEADER_SECTIONS 16
55 #define SECTION_ALIGNMENT 8
56 #define MAX_WTYPE_LEN 20
58 #define DEFAULT_FN "anthy.wdic"
/* Program name; used as the prefix of the error messages in this file. */
60 static const char *progname;
62 /* Global (non-static) variables so that writewords.c can access them. */
63 FILE *yomi_entry_index_out, *yomi_entry_out;
64 FILE *page_out, *page_index_out;
/* Stream that receives the yomi hash bitmap (passed to mk_yomi_hash()). */
67 static FILE *yomi_hash_out;
68 /* Number of hash collisions; statistical information only. */
69 static int yomi_hash_collision;
71 /* Listed in the order in which the sections appear in the file.
 * Each entry holds the address of a section's FILE pointer and a second
 * field that starts out as NULL.
 * NOTE(review): the declaration of this table is not visible here;
 * presumably it is file_array, whose entries are read as fs->fpp / fs->fn. */
76 {&yomi_entry_index_out, NULL},
77 {&yomi_entry_out, NULL},
79 {&page_index_out, NULL},
81 {&yomi_hash_out, NULL},
/* List of readings; looked up and extended through find_yomi_entry(). */
88 struct yomi_entry_list yl;
/* Head of the list of frequency-adjust commands (see parse_adjust_command()). */
90 struct adjust_command ac_list;
/* Name of the dictionary file to create; init_mds() sets it to DEFAULT_FN. */
94 const char *output_fn;
/* Word-type names that is_excluded_wtype() matches against. */
99 char **excluded_wtypes;
102 /* Open the files that the dictionary output is written to:
 * one stream per entry of file_array, stored through fs->fpp. */
104 open_output_files(void)
106 struct file_section *fs;
107 for (fs = file_array; fs->fpp; fs ++) {
108 char *tmpdir = getenv("TMPDIR");
111 /* tmpfile() does not look at TMPDIR, so mkstemp is used when TMPDIR is specified. */
114 snprintf(buf, sizeof(buf), "%s/mkanthydic.XXXXXX", tmpdir);
119 *(fs->fpp) = fdopen(fd, "w+");
/* Keep a copy of the temporary file's name in the section record. */
120 fs->fn = strdup(buf);
123 *(fs->fpp) = tmpfile();
/* Failure is reported together with the errno description. */
127 fprintf (stderr, "%s: cannot open temporary file: %s\n",
128 progname, strerror (errno));
/*
 * Check every section stream of file_array for a recorded write error and
 * then flush each of them; problems are reported on stderr.
 */
136 flush_output_files (void)
138 struct file_section *fs;
/* First pass: look for an error flag left by earlier writes. */
139 for (fs = file_array; fs->fpp; fs ++) {
140 if (ferror(*(fs->fpp))) {
141 fprintf (stderr, "%s: write error\n", progname);
/* Second pass: flush buffered data; a failing fflush() is reported with errno. */
145 for (fs = file_array; fs->fpp; fs ++) {
146 if (fflush(*(fs->fpp))) {
147 fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno));
153 /* Write 4 bytes in network byte order.
 * i is converted with anthy_dic_htonl() and then sizeof(int) bytes are
 * written to fp; the fwrite() result is not checked here. */
155 write_nl(FILE *fp, int i)
157 i = anthy_dic_htonl(i);
158 fwrite(&i, sizeof(int), 1, fp);
164 printf("please do not use mkanthydic command directly.\n");
/*
 * Read the next line from fp into buf using fgets() with a limit of
 * MAX_LINE_LEN bytes. parse_dict_file() loops while the result is non-zero.
 */
169 read_line(FILE *fp, char *buf)
171 /* Ignore lines that are too long. */
174 while (fgets(buf, MAX_LINE_LEN, fp)) {
175 int len = strlen(buf);
/* A missing trailing newline means fgets() did not return a complete line. */
179 if (buf[len - 1] != '\n') {
194 /** Extract the part that serves as the index from a line of a
 * cannadic-format dictionary. The first space in buf is located with strchr()
 * and buf is converted to an xstr using mds->input_encoding. */
196 get_index_from_line(struct mkdic_stat *mds, char *buf)
200 sp = strchr(buf, ' ');
202 /* The format of the dictionary is broken. */
206 xs = anthy_cstr_to_xstr(buf, mds->input_encoding);
211 /** Extract the part other than the index from a line of a
 * cannadic-format dictionary; the split point is found by searching buf for
 * the first space. */
213 get_entry_from_line(char *buf)
216 sp = strchr(buf, ' ');
/*
 * NOTE(review): the function header is not visible in this excerpt;
 * presumably this is the body of index_hash(), which find_yomi_entry() calls.
 * Every character of xs, multiplied by 11, is added to h and the sum is
 * reduced modulo YOMI_HASH.
 */
228 for (i = 0; i < xs->len; i++) {
229 h += xs->str[i] * 11;
231 return (int)(h % YOMI_HASH);
/*
 * Resolve a word-type name. "#T35" is special-cased; other names are
 * looked up with anthy_type_to_wtype(). Callers store the result as the
 * wt_name of a word entry or adjust command.
 * NOTE(review): the returned value is not visible in this excerpt.
 */
235 get_wt_name(const char *name)
239 if (!strcmp(name, "#T35")) {
242 res = anthy_type_to_wtype(name, &dummy);
249 /** Append one word to a reading.
 *
 * mds     supplies the input encoding of word
 * ye      reading whose entries array is extended
 * wt_name word-type name; checked with anthy_type_to_wtype()
 * freq    stored as raw_freq
 * word    the word text in the input encoding
 * order   stored as source_order
 */
251 push_back_word_entry(struct mkdic_stat *mds,
252 struct yomi_entry *ye, const char *wt_name,
253 int freq, const char *word, int order)
260 if (!anthy_type_to_wtype(wt_name, &wt)) {
261 /* A part of speech that anthy does not know. */
/* Grow the entries array by one slot and fill in the new last element. */
264 ye->entries = realloc(ye->entries,
265 sizeof(struct word_entry) *
266 (ye->nr_entries + 1));
267 ye->entries[ye->nr_entries].ye = ye;
268 ye->entries[ye->nr_entries].wt_name = get_wt_name(wt_name);
269 ye->entries[ye->nr_entries].raw_freq = freq;
270 ye->entries[ye->nr_entries].feature = 0;
271 ye->entries[ye->nr_entries].source_order = order;
/* The word is kept in UTF-8, so EUC-JP input is converted first. */
272 if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) {
273 s = anthy_conv_euc_to_utf8(word);
277 ye->entries[ye->nr_entries].word_utf8 = s;
/*
 * Parse the word-type token at cur, using wtbuf as the output buffer.
 * The caller uses the return value as the frequency.
 * NOTE(review): the copy into wtbuf and the return statements are not visible
 * in this excerpt.
 */
282 parse_wtype(char *wtbuf, char *cur)
/* The caller's wtbuf is MAX_WTYPE_LEN bytes, so longer tokens are checked for first. */
287 if (strlen(cur) >= MAX_WTYPE_LEN) {
/* Look for the '*' separator in the token. */
292 t = strchr(wtbuf, '*');
306 /* The length of a compound-word element is written as 1,2,3, ... 9,a,b,c.
 * Digits '1'..'9' and letters 'a'..'z' are accepted; a letter maps to
 * xc - 'a' + 10, so 'a' means 10. */
308 get_element_len(xchar xc)
310 if (xc > '0' && xc <= '9') {
313 if (xc >= 'a' && xc <= 'z') {
314 return xc - 'a' + 10;
319 /** Format check of a compound candidate.
 * The element lengths that follow each '_' in cur are summed and compared
 * with the length of index; a mismatch is reported on stderr. */
321 check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur)
323 /* Count the total number of characters of the reading. */
324 xstr *xs = anthy_cstr_to_xstr(cur, mds->input_encoding);
/* Stop one short of the end because xs->str[i+1] is read. */
326 for (i = 0; i < xs->len - 1; i++) {
327 if (xs->str[i] == '_') {
328 total += get_element_len(xs->str[i+1]);
333 if (total != index->len) {
334 fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n",
/*
 * Test whether the word-type name wt matches one of the mds->nr_excluded
 * names stored in mds->excluded_wtypes (exact strcmp() match).
 */
342 is_excluded_wtype(struct mkdic_stat *mds, char *wt)
345 for (i = 0; i < mds->nr_excluded; i++) {
346 if (!strcmp(mds->excluded_wtypes[i], wt)) {
/*
 * Scan forward from cur until a space or the terminating '\0' is reached.
 * NOTE(review): the loop body and the return value are not visible here.
 */
354 find_token_end(char *cur)
357 for (n = cur; *n != ' ' && *n != '\0'; n++) {
368 /** Split the line that belongs to a reading and build the entry array. */
370 push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye,
/* Stack buffer sized to hold ent and its terminating NUL. */
373 char *buf = alloca(strlen(ent) + 1);
376 char wtbuf[MAX_WTYPE_LEN];
384 /* Cut the token with \0: look for the space or \0 that follows cur. */
385 n = find_token_end(cur);
387 fprintf(stderr, "invalid \\ at the end of line (%s).\n",
398 if (isalpha((unsigned char)cur[1])) {
399 /* Parse "#XX*??". */
400 freq = parse_wtype(wtbuf, cur);
403 check_compound_candidate(mds, ye->index_xstr, &cur[1])) {
405 push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
410 /* Check whether the word type is on the exclusion list. */
411 if (!is_excluded_wtype(mds, wtbuf)) {
413 push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
415 }/* :to extract excluded words
417 anthy_putxstr(ye->index_xstr);
418 printf(" %s*%d %s\n", wtbuf, freq, cur);
430 /** Check whether the same word already exists.
 * The entry at idx is compared with every entry before it; raw_freq,
 * wt_name and word_utf8 are the fields examined. */
432 check_same_word(struct yomi_entry *ye, int idx)
434 struct word_entry *base = &ye->entries[idx];
/* Walk backwards over the entries that precede idx. */
436 for (i = idx -1; i >= 0; i--) {
437 struct word_entry *cur = &ye->entries[i];
438 if (base->raw_freq != cur->raw_freq) {
441 if (strcmp(base->wt_name, cur->wt_name)) {
444 if (strcmp(base->word_utf8, cur->word_utf8)) {
453 /** Comparison function for qsort.
 * Orders struct word_entry elements by descending raw_freq. */
455 compare_word_entry_by_freq(const void *p1, const void *p2)
457 const struct word_entry *e1 = p1;
458 const struct word_entry *e2 = p2;
/* e2 minus e1: the entry with the larger raw_freq sorts first. */
459 return e2->raw_freq - e1->raw_freq;
462 /** Comparison function for qsort.
 * Compares struct word_entry elements by wt_name (strcmp) and uses
 * compare_word_entry_by_freq() as the secondary key. */
464 compare_word_entry_by_wtype(const void *p1, const void *p2)
466 const struct word_entry *e1 = p1;
467 const struct word_entry *e2 = p2;
468 int ret = strcmp(e1->wt_name, e2->wt_name);
472 return compare_word_entry_by_freq(p1, p2);
476 /** Sort the words of a reading by frequency and get rid of unneeded words.
 * Returns the number of entries minus the number counted in nr_dup. */
478 normalize_word_entry(struct yomi_entry *ye)
/* Sort by frequency before looking for duplicates. */
485 qsort(ye->entries, ye->nr_entries,
486 sizeof(struct word_entry),
487 compare_word_entry_by_freq);
488 /* A duplicated word gets a score of 0. */
489 for (i = 0; i < ye->nr_entries; i++) {
490 if (check_same_word(ye, i)) {
491 ye->entries[i].raw_freq = 0;
/* Final order: by word type, then by frequency. */
496 qsort(ye->entries, ye->nr_entries,
497 sizeof(struct word_entry),
498 compare_word_entry_by_wtype);
499 return ye->nr_entries - nr_dup;
502 /* Return the yomi_entry that corresponds to the given reading
505 find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create)
507 struct yomi_entry *ye;
508 int hash = index_hash(index);
510 /* Search the hash chain. */
511 for (ye = yl->hash[hash];ye ; ye = ye->hash_next) {
513 if (!anthy_xstrcmp(ye->index_xstr, index)) {
/* No match was found: allocate a new entry that owns a copy of index. */
522 ye = malloc(sizeof(struct yomi_entry));
526 ye->index_xstr = anthy_xstr_dup(index);
527 ye->index_str = NULL;
529 /* Link it into the hash chain. */
530 ye->hash_next = yl->hash[hash];
543 /* Set a mark in the hash bitmap that goes into the dictionary file.
 * The hash of xs selects one bit of hash_array; if that bit is already set
 * the collision counter is incremented. */
545 mark_hash_array(unsigned char *hash_array, xstr *xs)
547 int val, idx, bit, mask;
548 val = anthy_xstr_hash(xs);
/* Reduce the hash to the number of bits in the bitmap. */
549 val &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
/* High part selects the array element, low part the bit within it. */
550 idx=(val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1);
551 bit= val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1);
553 if (hash_array[idx] & mask) {
554 yomi_hash_collision ++;
556 hash_array[idx] |= mask;
559 /* Build the bitmap of the reading hash.
 * Every valid reading in yl->ye_array is marked in a bitmap of
 * YOMI_HASH_ARRAY_SIZE bytes, which is then written to yomi_hash_out.
 * Note that the parameter shadows the file-level variable of the same name. */
561 mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl)
563 unsigned char *hash_array;
565 struct yomi_entry *ye;
566 hash_array = (unsigned char *)malloc(YOMI_HASH_ARRAY_SIZE);
567 for (i = 0; i < YOMI_HASH_ARRAY_SIZE; i++) {
570 for (i = 0; i < yl->nr_valid_entries; i++) {
571 ye = yl->ye_array[i];
572 mark_hash_array(hash_array, ye->index_xstr);
574 fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out);
/* Report the collision statistics on stdout. */
575 printf("generated yomi hash bitmap (%d collisions/%d entries)\n",
576 yomi_hash_collision, yl->nr_valid_entries);
/*
 * Parse the arguments of a modify_freq command.
 * buf holds four space-separated fields: reading, word type, word and the
 * adjustment type ("up", "down" or "kill"). The reading and the word are
 * interpreted as EUC-JP. The result is a newly allocated adjust_command.
 * NOTE(review): the handling of missing fields or an unknown type is not
 * visible in this excerpt.
 */
581 static struct adjust_command *
582 parse_modify_freq_command(const char *buf)
/* strtok() modifies its input, so it works on a stack buffer sized for buf. */
584 char *line = alloca(strlen(buf) + 1);
585 char *yomi, *wt, *word, *type_str;
586 struct adjust_command *cmd;
589 yomi = strtok(line, " ");
590 wt = strtok(NULL, " ");
591 word = strtok(NULL, " ");
592 type_str = strtok(NULL, " ");
593 if (!yomi || !wt || !word || !type_str) {
596 if (!strcmp(type_str, "up")) {
597 type = ADJUST_FREQ_UP;
599 if (!strcmp(type_str, "down")) {
600 type = ADJUST_FREQ_DOWN;
602 if (!strcmp(type_str, "kill")) {
603 type = ADJUST_FREQ_KILL;
608 cmd = malloc(sizeof(struct adjust_command));
610 cmd->yomi = anthy_cstr_to_xstr(yomi, ANTHY_EUC_JP_ENCODING);
611 cmd->wt = get_wt_name(wt);
612 cmd->word = anthy_conv_euc_to_utf8(word);
/*
 * Parse one adjust-command line. Only lines that start with the 13
 * characters "\modify_freq " are recognised; the rest of the line is handed
 * to parse_modify_freq_command() and the new command's next pointer is set
 * to the current first element of ac_list.
 */
617 parse_adjust_command(const char *buf, struct adjust_command *ac_list)
619 struct adjust_command *cmd = NULL;
620 if (!strncmp("\\modify_freq ", buf, 13)) {
621 cmd = parse_modify_freq_command(&buf[13]);
624 cmd->next = ac_list->next;
629 /** Read the dictionary one line at a time and build the list.
630 * This is the core of this command. */
632 parse_dict_file(FILE *fin, struct mkdic_stat *mds)
635 char buf[MAX_LINE_LEN];
637 struct yomi_entry *ye = NULL;
640 while (read_line(fin, buf)) {
/* A backslash not followed by a space introduces an adjust command. */
641 if (buf[0] == '\\' && buf[1] != ' ') {
642 parse_adjust_command(buf, &mds->ac_list);
645 index_xs = get_index_from_line(mds, buf);
649 ent = get_entry_from_line(buf);
651 /* Readings longer than 30 characters are ignored. */
652 if (index_xs->len < 31) {
653 ye = find_yomi_entry(&mds->yl, index_xs, 1);
654 push_back_word_entry_line(mds, ye, ent);
658 anthy_free_xstr(index_xs);
662 /* Get the word structure from the triple (reading, word type, word).
 * The reading is looked up without creating it; its entries are then
 * searched for one whose wt_name and word_utf8 both match exactly. */
663 static struct word_entry *
664 find_word_entry(struct yomi_entry_list *yl, xstr *yomi,
665 const char *wt, char *word)
667 struct yomi_entry *ye = find_yomi_entry(yl, yomi, 0);
672 for (i = 0; i < ye->nr_entries; i++) {
673 struct word_entry *we = &ye->entries[i];
674 if (!strcmp(we->wt_name, wt) &&
675 !strcmp(we->word_utf8, word)) {
682 /* Apply the frequency-adjust commands.
 * Each command after the list head is resolved to a word entry with
 * find_word_entry(); a command whose target cannot be found is reported
 * on stdout. */
684 apply_adjust_command(struct yomi_entry_list *yl,
685 struct adjust_command *ac_list)
687 struct adjust_command *cmd;
688 for (cmd = ac_list->next; cmd; cmd = cmd->next) {
689 struct word_entry *we = find_word_entry(yl, cmd->yomi,
/* The reading is converted to UTF-8 only for the diagnostic message. */
692 char *yomi = anthy_xstr_to_cstr(cmd->yomi, ANTHY_UTF8_ENCODING);
693 printf("failed to find target of adjust command (%s, %s, %s)\n",
694 yomi, cmd->wt, cmd->word);
/* Dispatch on the adjustment type; the actual arithmetic is not visible here. */
698 if (cmd->type == ADJUST_FREQ_UP) {
701 if (cmd->type == ADJUST_FREQ_DOWN) {
703 if (we->raw_freq == 0) {
707 if (cmd->type == ADJUST_FREQ_KILL) {
713 /* Comparison function for qsort.
 * The array elements are pointers to struct yomi_entry, so p1 and p2 are
 * pointers to such pointers; ordering is strcmp() on index_str. */
715 compare_yomi_entry(const void *p1, const void *p2)
717 const struct yomi_entry *const *y1 = p1;
718 const struct yomi_entry *const *y2 = p2;
719 return strcmp((*y1)->index_str, (*y2)->index_str);
722 /* Sort by yomi_entry.
 * Builds yl->ye_array from the readings that have words, gives each of
 * them an index_str, sorts the array and normalizes the words of every
 * reading while accumulating yl->nr_words. */
724 sort_word_dict(struct yomi_entry_list *yl)
727 struct yomi_entry *ye;
728 yl->nr_valid_entries = 0;
729 /* Pack only the readings that have words into yl->ye_array. */
730 yl->ye_array = malloc(sizeof(struct yomi_entry *) * yl->nr_entries);
731 for (i = 0, ye = yl->head; i < yl->nr_entries; i++, ye = ye->next) {
732 if (ye->nr_entries > 0) {
733 yl->ye_array[yl->nr_valid_entries] = ye;
734 yl->nr_valid_entries ++;
/* The C-string form of the index is the sort key used by the comparison. */
738 for (i = 0; i < yl->nr_valid_entries; i++) {
739 struct yomi_entry *ye = yl->ye_array[i];
740 ye->index_str = anthy_xstr_to_cstr(ye->index_xstr, yl->index_encoding);
743 qsort(yl->ye_array, yl->nr_valid_entries,
744 sizeof(struct yomi_entry *),
746 /* Remove the words that are not needed. */
748 for (i = 0; i < yl->nr_valid_entries; i++) {
749 struct yomi_entry *ye = yl->ye_array[i];
750 yl->nr_words += normalize_word_entry(ye);
754 /** Get the size of a file.
 * The value returned is ftell(fp) rounded up to the next multiple of
 * SECTION_ALIGNMENT.
 * NOTE(review): how fp is positioned before ftell() is not visible here. */
756 get_file_size(FILE *fp)
761 return (ftell (fp) + SECTION_ALIGNMENT - 1) & (-SECTION_ALIGNMENT);
/*
 * Append the contents of in to out, first padding out up to a multiple of
 * SECTION_ALIGNMENT. mds is used only for the file name in the error message.
 */
765 copy_file(struct mkdic_stat *mds, FILE *in, FILE *out)
771 /* Pad OUT to the next aligned offset. */
772 for (i = ftell (out); i & (SECTION_ALIGNMENT - 1); i++) {
776 /* Copy the contents. */
778 while ((nread = fread (buf, 1, sizeof buf, in)) > 0) {
779 if (fwrite (buf, 1, nread, out) < nread) {
780 /* Handle short write (maybe disk full). */
781 fprintf (stderr, "%s: %s: write error: %s\n",
782 progname, mds->output_fn, strerror (errno));
/*
 * Write the dictionary header to fp: NR_HEADER_SECTIONS integers, each
 * emitted in network byte order through write_nl().
 */
789 generate_header(FILE *fp)
791 int buf[NR_HEADER_SECTIONS];
793 struct file_section *fs;
797 for (i = 0; i < NR_HEADER_SECTIONS; i++) {
/* Slot 0 holds the size of the header itself in bytes. */
802 buf[0] = NR_HEADER_SECTIONS * sizeof(int);
805 /* Offset of each section. */
/* Section slots start at index 2; off advances by each section's aligned size. */
807 for (i = 2, fs = file_array; fs->fpp; fs ++, i++) {
809 off += get_file_size(*(fs->fpp));
812 /* Output to the file. */
813 for (i = 0; i < NR_HEADER_SECTIONS; i++) {
814 write_nl(fp, buf[i]);
818 /* Merge the files of the individual sections into one dictionary file. */
820 link_dics(struct mkdic_stat *mds)
823 struct file_section *fs;
/* The target is created (or truncated) under mds->output_fn. */
825 fp = fopen (mds->output_fn, "w");
827 fprintf (stderr, "%s: %s: cannot create: %s\n",
828 progname, mds->output_fn, strerror (errno));
832 /* Output the header. */
835 for (fs = file_array; fs->fpp; fs ++) {
836 /* Concatenate the file of each section. */
837 copy_file(mds, *(fs->fpp), fp);
844 fprintf (stderr, "%s: %s: write error: %s\n",
845 progname, mds->output_fn, strerror (errno));
/*
 * Read the dictionary source file fn into mds via parse_dict_file().
 * The outcome ("file = ..." or "failed file = ...") is printed on stdout.
 */
851 read_dict_file(struct mkdic_stat *mds, const char *fn)
854 /* A file name was specified, so read it. */
857 printf("file = %s\n", fn);
858 parse_dict_file(fp, mds);
861 printf("failed file = %s\n", fn);
/*
 * Finish the word dictionary after all sources have been read: apply the
 * adjust commands, sort, write the word dictionary and build the hash.
 */
866 complete_words(struct mkdic_stat *mds)
868 /* Apply the frequency corrections. */
869 apply_adjust_command(&mds->yl, &mds->ac_list);
874 /* Sort by reading. */
875 sort_word_dict(&mds->yl);
877 /* Prepare the files. */
879 /* Output the word dictionary. */
880 output_word_dict(&mds->yl);
882 /* Build the reading hash. */
883 mk_yomi_hash(yomi_hash_out, &mds->yl);
/*
 * Read the file fn into the use-case dictionary mds->ud with read_uc_file()
 * and print its name on stdout.
 * NOTE(review): the condition under which create_uc_dict() is called is not
 * visible in this excerpt.
 */
887 read_udict_file(struct mkdic_stat *mds, const char *fn)
890 mds->ud = create_uc_dict();
893 read_uc_file(mds->ud, fn);
894 printf("uc = %s\n", fn);
/*
 * Append the first n characters of src to xs. The buffer of xs is regrown
 * with realloc() to hold len + n + 1 characters.
 * NOTE(review): the update of xs->len is not visible in this excerpt.
 */
898 xstr_strncat(xstr* xs, xchar* src, int n)
901 xs->str = realloc(xs->str, sizeof(xchar) * (xs->len + n + 1));
/* Copy behind the current end of the string. */
903 for (i = 0; i < n; ++i) {
904 xs->str[xs->len + i] = src[i];
911 reverse_multi_segment_word(struct mkdic_stat *mds, struct word_entry *we)
914 「かなかんじへんかんえんじん #T35 #_2仮名_3漢字_4変換_4エンジン」
916 「仮名漢字変換エンジン #T35 #_2かな_2かんじ_2へんかん_4えんじん」
920 /* yomi is a mixture of kana and kanji; word consists of hiragana only.
 * The word of we is split at each '_' marker; for every segment the matching
 * run of the original reading is appended to word_xs (preceded by '_' and a
 * length digit) and the segment text is appended to yomi_xs. The result is
 * registered as a new entry under yomi_xs. */
921 int yomi_seg_start = 0;
922 int word_seg_start = 0;
923 int word_seg_len = 0;
924 xstr *yomibuf = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
925 xstr *wordbuf = we->ye->index_xstr;
926 xstr *yomi_xs = anthy_cstr_to_xstr("", 0);
927 xstr *word_xs = anthy_cstr_to_xstr("#", 0);
930 struct yomi_entry *target_ye;
/* j runs one past the end so that the last segment is flushed as well. */
932 for (j = 0; j <= yomibuf->len; ++j) {
933 if (j == yomibuf->len || yomibuf->str[j] == '_') {
934 if (yomi_seg_start != 0) {
935 anthy_xstrappend(word_xs, '_');
/* Only the first character of the hexadecimal segment length is used. */
936 snprintf(ch, 256, "%x", j - yomi_seg_start);
937 anthy_xstrappend(word_xs, (xchar)ch[0]);
938 xstr_strncat(word_xs, &wordbuf->str[word_seg_start], word_seg_len);
939 xstr_strncat(yomi_xs, &yomibuf->str[yomi_seg_start], j - yomi_seg_start);
941 if (j == yomibuf->len) {
/* Skip the '_' and the length character that follows it. */
944 yomi_seg_start = j + 2;
945 word_seg_start += word_seg_len;
946 word_seg_len = get_element_len(yomibuf->str[j + 1]);
950 target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
951 word = anthy_xstr_to_cstr(word_xs, mds->input_encoding);
953 /* Entries of the reverse-conversion dictionary have a negative freq. */
954 push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
955 word, we->source_order);
958 anthy_free_xstr(yomibuf);
959 anthy_free_xstr(yomi_xs);
960 anthy_free_xstr(word_xs);
963 /* Build the dictionary for reverse conversion. */
965 build_reverse_dict(struct mkdic_stat *mds)
967 struct yomi_entry *ye;
969 struct word_entry *we_array;
970 printf("building reverse index\n");
972 /* Count the number of words. */
974 for (ye = mds->yl.head; ye; ye = ye->next) {
975 for (i = 0; i < ye->nr_entries; i++) {
980 * (pointers into the original dictionary move on realloc, so a copy is needed)
/* Snapshot of all current word entries, copied by value into we_array. */
982 we_array = malloc(sizeof(struct word_entry )* n);
984 for (ye = mds->yl.head; ye; ye = ye->next) {
985 for (i = 0; i < ye->nr_entries; i++) {
986 we_array[n] = ye->entries[i];
991 /* Add the entries to the dictionary one after another. */
992 for (i = 0; i < n; i++) {
993 struct word_entry *we;
994 struct yomi_entry *target_ye;
/* Words starting with "#_" are multi-segment words and are handled separately. */
997 if (we->word_utf8[0] == '#') {
998 if (we->word_utf8[1] == '_') {
999 reverse_multi_segment_word(mds, we);
1002 /* yomi is a mixture of kana and kanji; word consists of hiragana only. */
/* The word becomes the new index and the original reading becomes the word. */
1006 yomi_xs = anthy_cstr_to_xstr(we->word_utf8, ANTHY_UTF8_ENCODING);
1007 target_ye = find_yomi_entry(&mds->yl, yomi_xs, 1);
1008 word = anthy_xstr_to_cstr(we->ye->index_xstr, mds->input_encoding);
1010 /* Entries of the reverse-conversion dictionary have a negative freq. */
1011 push_back_word_entry(mds, target_ye, we->wt_name, -we->raw_freq,
1012 word, we->source_order);
1014 anthy_free_xstr(yomi_xs);
/*
 * Free every excluded word-type name and the array that holds them, then
 * reset the list in mds to the empty state (NULL array, count 0).
 */
1023 clear_exclude_wtypes(struct mkdic_stat *mds)
1026 for (i = 0; i < mds->nr_excluded; i++) {
1027 free(mds->excluded_wtypes[i]);
1029 free(mds->excluded_wtypes);
1031 mds->excluded_wtypes = NULL;
1032 mds->nr_excluded = 0;
/*
 * Store tokens[1] .. tokens[nr - 1] as the list of excluded word types.
 * tokens[0] is skipped, so nr - 1 names are duplicated with strdup().
 */
1036 set_exclude_wtypes(struct mkdic_stat *mds, int nr, char **tokens)
1039 mds->nr_excluded = nr - 1;
1040 mds->excluded_wtypes = malloc(sizeof(char *) * (nr - 1));
1042 for (i = 1; i < nr; i++) {
1043 mds->excluded_wtypes[i - 1] = strdup(tokens[i]);
/*
 * Select the encoding of the dictionary body: the name "utf8" switches
 * mds->yl.body_encoding to ANTHY_UTF8_ENCODING.
 */
1048 set_dict_encoding(struct mkdic_stat *mds, const char *enc)
1050 if (!strcmp(enc, "utf8")) {
1051 mds->yl.body_encoding = ANTHY_UTF8_ENCODING;
/*
 * Select the encoding of the source dictionaries: "utf8" selects
 * ANTHY_UTF8_ENCODING and "eucjp" selects ANTHY_EUC_JP_ENCODING.
 */
1056 set_input_encoding(struct mkdic_stat *mds, const char *enc)
1058 if (!strcmp(enc, "utf8")) {
1059 mds->input_encoding = ANTHY_UTF8_ENCODING;
1061 if (!strcmp(enc, "eucjp")) {
1062 mds->input_encoding = ANTHY_EUC_JP_ENCODING;
/*
 * Produce the dictionary file from the data collected in mds: the use-case
 * dictionary is written and the output streams are flushed.
 * NOTE(review): the exact condition that triggers the "can not build"
 * message is not visible in this excerpt.
 */
1067 write_dict_file(struct mkdic_stat *mds)
1070 printf("can not build without use case dict\n");
1074 /* Build the use-case dictionary. */
1075 make_ucdict(uc_out, mds->ud);
1077 /* Gather everything into the dictionary file. */
1078 flush_output_files();
/* Print the nr tokens of a command on stdout, each preceded by a space. */
1083 show_command(char **tokens, int nr)
1087 for (i = 0; i < nr; i++) {
1088 printf(" %s", tokens[i]);
/*
 * Execute the command script fn. The script is read token-wise with
 * anthy_open_file() / anthy_read_line(); the first token of each line is the
 * command name. Commands that take one argument are only accepted when the
 * line has exactly two tokens. Unrecognised lines are reported on stdout.
 */
1094 execute_batch(struct mkdic_stat *mds, const char *fn)
1098 if (anthy_open_file(fn)) {
1099 printf("mkanthydic: failed to open %s\n", fn);
1102 while (!anthy_read_line(&tokens, &nr)) {
1103 char *cmd = tokens[0];
/* Echo the command before running it. */
1104 show_command(tokens, nr);
1105 if (!strcmp(cmd, "read") && nr == 2) {
1106 read_dict_file(mds, tokens[1]);
1107 } else if (!strcmp(cmd, "read_uc") && nr == 2) {
1108 read_udict_file(mds, tokens[1]);
1109 } else if (!strcmp(cmd, "build_reverse_dict")) {
1110 build_reverse_dict(mds);
1111 } else if (!strcmp(cmd, "write")) {
1112 write_dict_file(mds);
1113 } else if (!strcmp(cmd, "set_exclude_wtypes")) {
1114 set_exclude_wtypes(mds, nr, tokens);
1115 } else if (!strcmp(cmd, "clear_exclude_wtypes")) {
1116 clear_exclude_wtypes(mds);
1117 } else if (!strcmp(cmd, "set_dict_encoding") && nr == 2) {
1118 set_dict_encoding(mds, tokens[1]);
1119 } else if (!strcmp(cmd, "set_input_encoding") && nr == 2) {
1120 set_input_encoding(mds, tokens[1]);
1121 } else if (!strcmp(cmd, "done")) {
1125 printf("Unknown command(%s).\n", cmd);
1133 /* Initialize the variables used for generating the dictionary. */
1135 init_mds(struct mkdic_stat *mds)
1138 mds->output_fn = DEFAULT_FN;
1141 /* Initialize the word dictionary. */
1142 mds->yl.head = NULL;
1143 mds->yl.nr_entries = 0;
1144 for (i = 0; i < YOMI_HASH; i++) {
1145 mds->yl.hash[i] = NULL;
/* Defaults: index strings in UTF-8, dictionary body in EUC-JP. */
1147 mds->yl.index_encoding = ANTHY_UTF8_ENCODING;
1148 mds->yl.body_encoding = ANTHY_EUC_JP_ENCODING;
/* No adjust commands yet. */
1150 mds->ac_list.next = NULL;
/* Source dictionaries are assumed to be EUC-JP unless set_input_encoding() changes it. */
1152 mds->input_encoding = ANTHY_EUC_JP_ENCODING;
/* The exclusion list starts out empty. */
1154 mds->nr_excluded = 0;
1155 mds->excluded_wtypes = NULL;
1158 /* Initialize only the parts of libanthy that are used. */
1163 res = anthy_init_xstr();
1165 fprintf (stderr, "failed to init dic lib\n");
/*
 * Program entry point. Scans the command line for "--help" and for an
 * argument preceded by "-f", then runs the script through execute_batch()
 * and returns its result.
 * NOTE(review): what happens in help mode or without a script is not visible
 * in this excerpt.
 */
1172 main(int argc, char **argv)
1174 struct mkdic_stat mds;
1176 char *script_fn = NULL;
1179 anthy_init_wtypes();
/* Starting at 1 makes argv[i-1] always valid. */
1183 for (i = 1; i < argc; i++) {
1184 char *arg = argv[i];
1185 char *prev_arg = argv[i-1];
1186 if (!strcmp(arg, "--help")) {
1189 if (!strcmp(prev_arg, "-f")) {
1194 if (help_mode || !script_fn) {
1198 return execute_batch(&mds, script_fn);