2 * Create the transition matrix between phrase segments (bunsetsu)
4 * This command has two functions (selected with the -c option).
5 * (1) build a table of empirical probabilities in text form from the output of proccorpus
6 * (2) convert the text-form table into binary form
8 * The output of morphological-analyzer carries the marks listed below
11 * ^ second and subsequent elements of a compound phrase segment
13 * generate transition matrix
15 * Copyright (C) 2006 HANAOKA Toshiyuki
16 * Copyright (C) 2006-2007 TABATA Yusuke
20 This library is free software; you can redistribute it and/or
21 modify it under the terms of the GNU Lesser General Public
22 License as published by the Free Software Foundation; either
23 version 2 of the License, or (at your option) any later version.
25 This library is distributed in the hope that it will be useful,
26 but WITHOUT ANY WARRANTY; without even the implied warranty of
27 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28 Lesser General Public License for more details.
30 You should have received a copy of the GNU Lesser General Public
31 License along with this library; if not, write to the Free Software
32 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39 #include <anthy/anthy.h>
40 #include <anthy/xstr.h>
41 #include <anthy/feature_set.h>
42 #include <anthy/diclib.h>
43 #include "input_set.h"
44 #include <anthy/corpus.h>
46 #define FEATURE_SET_SIZE NR_EM_FEATURES
55 #define MAX_SEGMENT 64
62 struct sentence_info {
64 struct segment_info segs[MAX_SEGMENT];
70 struct input_set *cand_is;
72 struct input_set *seg_is;
73 /* Full-text search information for independent words */
74 struct corpus *indep_corpus;
77 struct array missed_cand_features;
82 /* Information about the amount of example sentences read as input */
87 static struct input_info *
91 m = malloc(sizeof(struct input_info));
92 m->seg_is = input_set_create();
93 m->cand_is = input_set_create();
94 m->indep_corpus = corpus_new();
95 m->missed_cand_features.len = 0;
96 m->nth_input_file = 0;
98 m->nr_connections = 0;
102 /* Parse a string of the form features=1,2,3,, */
104 parse_features(struct array *features, char *s)
107 tok = strtok(str, ",");
110 features->f[features->len] = atoi(tok);
112 tok = strtok(NULL, ",");
117 add_seg_struct_info(struct input_info *m,
118 struct array *features,
121 input_set_set_features(m->cand_is, features->f, features->len, weight);
125 set_hash(struct sentence_info *sinfo, int error_class,
129 sinfo->segs[sinfo->nr_segments].orig_hash = hash;
131 sinfo->segs[sinfo->nr_segments].hash = hash;
134 sinfo->nr_segments++;
139 compare_array(struct array *a1, struct array *a2)
142 if (a1->len != a2->len) {
145 for (i = 0; i < a1->len; i++) {
146 if (a1->f[i] != a2->f[i]) {
153 /* Parse an independent-word line */
155 parse_indep(struct input_info *m, struct sentence_info *sinfo,
156 char *line, char *buf, int error_class)
158 struct array features;
162 s = strstr(buf, "features=");
165 parse_features(&features, s);
166 m->nr_connections ++;
168 s = strstr(buf, "hash=");
171 set_hash(sinfo, error_class, line[0], atoi(s));
176 if (line[0] == '~') {
177 /* Save the structure of the incorrect candidate */
178 m->missed_cand_features = features;
180 if (line[0] == '!') {
182 input_set_set_features(m->seg_is, features.f, features.len, -weight);
186 input_set_set_features(m->seg_is, features.f, features.len, weight);
188 if (m->missed_cand_features.len != 0 &&
189 compare_array(&features, &m->missed_cand_features)) {
190 /* If the structure differs from the correct answer, add it to the denominator */
191 add_seg_struct_info(m, &m->missed_cand_features, -weight);
193 m->missed_cand_features.len = 0;
194 add_seg_struct_info(m, &features, weight);
199 init_sentence_info(struct sentence_info *sinfo)
202 sinfo->nr_segments = 0;
203 for (i = 0; i < MAX_SEGMENT; i++) {
204 sinfo->segs[i].orig_hash = 0;
205 sinfo->segs[i].hash = 0;
209 /* After one sentence has been read, build the data for full-text search
212 complete_sentence_info(struct input_info *m, struct sentence_info *sinfo)
215 if (m->nth_input_file > 0) {
216 /* Input files after the first one are not used */
219 for (i = 0; i < sinfo->nr_segments; i++) {
220 int flags = ELM_NONE;
227 buf[0] = sinfo->segs[i].hash;
228 if (sinfo->segs[i].orig_hash) {
230 buf[1] = sinfo->segs[i].orig_hash;
234 corpus_push_back(m->indep_corpus, buf, nr, flags);
239 do_read_file(struct input_info *m, FILE *fp)
242 struct sentence_info sinfo;
244 init_sentence_info(&sinfo);
246 while (fgets(line, 1024, fp)) {
249 if (!strncmp(buf, "eos", 3)) {
251 complete_sentence_info(m, &sinfo);
252 init_sentence_info(&sinfo);
254 if (line[0] == '~' || line[0] == '!' ||
259 if (!strncmp(buf, "indep_word", 10) ||
260 !strncmp(buf, "eos", 3)) {
261 parse_indep(m, &sinfo, line, buf, error_class);
267 read_file(struct input_info *m, char *fn)
270 ifp = fopen(fn, "r");
274 do_read_file(m, ifp);
279 write_nl(FILE *fp, int i)
281 i = anthy_dic_htonl(i);
282 fwrite(&i, sizeof(int), 1, fp);
286 dump_line(FILE *ofp, struct input_line *il)
289 for (i = 0; i < FEATURE_SET_SIZE || i < il->nr_features; i++) {
293 if (i < il->nr_features) {
294 fprintf(ofp, "%d", il->features[i]);
299 fprintf(ofp,",%d,%d\n", (int)il->negative_weight, (int)il->weight);
303 compare_line(const void *p1, const void *p2)
305 const struct input_line *const *il1 = p1;
306 const struct input_line *const *il2 = p2;
308 for (i = 0; i < (*il1)->nr_features &&
309 i < (*il2)->nr_features; i++) {
310 if ((*il1)->features[i] !=
311 (*il2)->features[i]) {
312 return (*il1)->features[i] - (*il2)->features[i];
315 return (*il1)->nr_features - (*il2)->nr_features;
319 dump_features(FILE *ofp, struct input_set *is)
321 struct input_line *il, **lines;
326 for (il = input_set_get_input_line(is); il; il = il->next_line) {
328 weight += (int)il->weight;
331 lines = malloc(sizeof(struct input_line *) * nr);
332 for (il = input_set_get_input_line(is), i = 0; i < nr;
333 i++, il = il->next_line) {
337 qsort(lines, nr, sizeof(struct input_line *), compare_line);
339 fprintf(ofp, "%d %d total_line_weight,count\n", weight, nr);
341 for (i = 0; i < nr; i++) {
342 dump_line(ofp, lines[i]);
349 dump_input_info(FILE *ofp, struct input_info *m)
351 fprintf(ofp, "section anthy.trans_info ");
352 dump_features(ofp, m->seg_is);
353 fprintf(ofp, "section anthy.cand_info ");
354 dump_features(ofp, m->cand_is);
355 fprintf(ofp, "section anthy.corpus_bucket ");
356 corpus_write_bucket(ofp, m->indep_corpus);
357 fprintf(ofp, "section anthy.corpus_array ");
358 corpus_write_array(ofp, m->indep_corpus);
360 fprintf(ofp, "section anthy.feature_info ");
361 input_set_output_feature_freq(ofp, m->seg_is);
365 convert_line(FILE *ofp, char *buf)
368 tok = strtok(buf, ",");
372 tok = strtok(NULL, ",");
377 convert_file(FILE *ifp)
381 while (fgets(buf, 1024, ifp)) {
386 if (!strncmp("section", buf, 7)) {
393 sscanf(buf, "section %s %d %d", fn, &w, &n);
394 ofp = fopen(fn, "w");
396 fprintf(stderr, "failed to open (%s)\n", fn);
401 for (i = 0; i < NR_EM_FEATURES; i++) {
405 convert_line(ofp, buf);
414 convert_data(int nr_fn, char **fns)
419 for (i = 0; i < nr_fn; i++) {
420 ifp = fopen(fns[i], "r");
422 fprintf(stderr, "failed to open (%s)\n", fns[i]);
431 #define STRING_HASH_SIZE 256
435 struct string_node *next_hash;
439 struct string_node hash[STRING_HASH_SIZE];
440 struct string_node **array;
446 struct extract_stat {
448 struct resize_info info[MAX_SEGMENT];
452 string_pool_init(struct string_pool *sp)
455 for (i = 0; i < STRING_HASH_SIZE; i++) {
456 sp->hash[i].next_hash = NULL;
462 compare_string_node(const void *p1, const void *p2)
464 const struct string_node *const *n1 = p1;
465 const struct string_node *const *n2 = p2;
466 return (*n1)->key -(*n2)->key;
470 string_pool_sort(struct string_pool *sp)
473 sp->array = malloc(sizeof(struct string_node *) * sp->nr);
474 for (idx = 0, h = 0; h < STRING_HASH_SIZE; h++) {
475 struct string_node *node;
476 for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
477 sp->array[idx] = node;
482 qsort(sp->array, sp->nr, sizeof(struct string_node *), compare_string_node);
486 string_pool_dump(FILE *ofp, struct string_pool *sp)
489 fprintf(ofp, "section anthy.weak_words 0 %d\n", sp->nr);
490 for (i = 0; i < sp->nr; i++) {
491 fprintf(ofp, "%d\n", sp->array[i]->key);
496 string_hash(const unsigned char *str)
504 return h % STRING_HASH_SIZE;
507 static struct string_node *
508 find_string_node(struct string_pool *sp, const char *str)
510 int h = (int)string_hash((const unsigned char *)str);
511 struct string_node *node;
512 for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
513 if (!strcmp(str, node->str)) {
518 node = malloc(sizeof(*node));
519 node->str = strdup(str);
521 node->next_hash = sp->hash[h].next_hash;
522 sp->hash[h].next_hash = node;
528 flush_extract_stat(struct extract_stat *es, struct string_pool *sp)
531 for (i = 0; i < es->nr; i++) {
532 if (es->info[i].valid) {
533 struct string_node *node;
534 node = find_string_node(sp, es->info[i].indep);
535 if (node->key == 0) {
536 xstr *xs = anthy_cstr_to_xstr(node->str, ANTHY_EUC_JP_ENCODING);
537 node->key = anthy_xstr_hash(xs);
540 /* printf("(%s)%d\n", es->info[i].indep, node->key); */
542 free(es->info[i].indep);
543 es->info[i].indep = NULL;
549 get_indep_part(char *buf)
552 char *c = strchr(buf, '#');
572 fixup_missed_word(struct extract_stat *es, char *buf)
575 char *c = get_indep_part(buf);
579 for (i = 0; i < es->nr; i++) {
580 if (!strcmp(es->info[i].indep, c)) {
581 es->info[i].valid = 0;
587 fill_missed_word(struct extract_stat *es, char *buf)
589 char *c = get_indep_part(buf);
593 es->info[es->nr].indep = strdup(c);
594 es->info[es->nr].valid = 1;
599 extract_word_from_file(FILE *ifp, struct string_pool *sp)
603 struct extract_stat es;
606 for (i = 0; i < MAX_SEGMENT; i++) {
607 es.info[i].indep = NULL;
610 while (fgets(buf, 1024, ifp)) {
614 if (buf[0] == '\n' ||
616 flush_extract_stat(&es, sp);
620 if (!strncmp("!indep_word ", buf, 12)) {
621 fill_missed_word(&es, buf);
623 if (!strncmp("indep_word", buf, 10)) {
624 fixup_missed_word(&es, buf);
627 flush_extract_stat(&es, sp);
631 extract_word(int nr_fn, char **fns, FILE *ofp)
633 struct string_pool sp;
637 string_pool_init(&sp);
639 for (i = 0; i < nr_fn; i++) {
640 ifp = fopen(fns[i], "r");
642 fprintf(stderr, "failed to open (%s)\n", fns[i]);
645 extract_word_from_file(ifp, &sp);
649 string_pool_sort(&sp);
650 string_pool_dump(ofp, &sp);
653 /* Build the probability table from the conversion results */
655 proc_corpus(int nr_fn, char **fns, FILE *ofp)
658 struct input_info *m;
660 m = init_input_info();
662 for (i = 0; i < nr_fn; i++) {
663 m->nth_input_file = i;
664 read_file(m, fns[i]);
667 corpus_build(m->indep_corpus);
669 dump_input_info(ofp, m);
671 fprintf(stderr, " %d sentences\n", m->nr_sentences);
672 fprintf(stderr, " %d connections\n", m->nr_connections);
673 fprintf(stderr, " %d segments\n", m->nr_connections - m->nr_sentences);
677 main(int argc, char **argv)
687 input_files = malloc(sizeof(char *) * argc);
689 for (i = 1; i < argc; i++) {
691 if (!strcmp(arg, "-o")) {
692 ofp = fopen(argv[i+1], "w");
694 fprintf(stderr, "failed to open (%s)\n", argv[i+1]);
697 } else if (!strcmp(arg, "-c") ||
698 !strcmp(arg, "--convert")) {
700 } else if (!strcmp(arg, "-e") ||
701 !strcmp(arg, "--extract")) {
704 input_files[nr_input] = arg;
709 printf(" -- extracting missed words\n");
713 extract_word(nr_input, input_files, ofp);
718 printf(" -- generating dictionary in text form\n");
719 proc_corpus(nr_input, input_files, ofp);
723 printf(" -- converting dictionary from text to binary form\n");
724 convert_data(nr_input, input_files);