2 * Îãʸ¤ÎÃæ¤ò¸¡º÷¤Ç¤¤ë¥Ç¡¼¥¿¹½Â¤¤òºî¤ë
3 * ¸½»þÅÀ¤Ç¤ÏÎãʸ¤ò¤¹¤Ù¤ÆÆþ¤ì¤Æ¤¤¤ë¤¬¡¢¤½¤Î¤¦¤Á¥Õ¥£¥ë¥¿¡¼¤¹¤ë¤³¤È¤â¹Í¤¨¤é¤ì¤ë
5 * Copyright (C) 2007 TABATA Yusuke
9 This library is free software; you can redistribute it and/or
10 modify it under the terms of the GNU Lesser General Public
11 License as published by the Free Software Foundation; either
12 version 2 of the License, or (at your option) any later version.
14 This library is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 Lesser General Public License for more details.
19 You should have received a copy of the GNU Lesser General Public
20 License along with this library; if not, write to the Free Software
21 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 #include <anthy/corpus.h>
30 #define BUCKET_SIZE 8192
31 #define MAX_COLLISION 8
40 /* word feature in corpus file */
44 /* ͸ú¤Ê(ELM_INVALID¤Î̵¤¤)¥¨¥ó¥È¥ê¤È¤·¤Æ¤Îindex */
46 /* ¤³¤ÎhashÃͤμ¡¤Î½Ð¸½¾ì½ê */
73 struct bucket *buckets;
80 corpus_setup_bucket(struct corpus *c, int nr)
86 c->buckets = malloc(sizeof(struct bucket) * nr);
87 for (i = 0; i < nr; i++) {
88 c->buckets[i].key = -1;
89 c->buckets[i].first_idx = -1;
90 c->buckets[i].last_idx = -1;
97 struct corpus *c = malloc(sizeof(*c));
104 c->bucket_collision = 0;
109 corpus_ensure_array(struct corpus *c, int nr)
112 if (c->array_size >= nr) {
116 c->array = realloc(c->array, sizeof(struct node) * size);
117 for (i = c->array_size; i < size; i++) {
124 corpus_dump(struct corpus *c)
127 for (i = 0; i < c->nr_values; i++) {
128 if (c->elms[i].flags & ELM_WORD_BORDER) {
131 printf("%d(%d) ", c->elms[i].val, c->elms[i].next_idx);
137 count_nr_valid_values(struct corpus *c)
141 for (i = 0; i < c->nr_node; i++) {
142 struct node *nd = &c->array[i];
143 if (nd->flags & ELM_INVALID) {
152 corpus_build_flatten(struct corpus *c)
156 int nr_valid_elms = count_nr_valid_values(c);
157 c->elms = malloc(sizeof(struct element) * nr_valid_elms);
158 for (i = 0; i < c->nr_node; i++) {
159 struct node *nd = &c->array[i];
160 if (nd->flags & ELM_INVALID) {
163 for (j = 0; j < nd->nr; j++) {
164 c->elms[idx].val = nd->val[j];
165 c->elms[idx].next_idx = -1;
166 c->elms[idx].flags = nd->flags;
168 c->elms[idx].flags |= ELM_WORD_BORDER;
170 c->elms[idx].idx = idx;
176 static struct bucket *
177 find_bucket(struct corpus *c, int val)
180 int h = val % c->nr_buckets;
181 for (i = 0; i < MAX_COLLISION; i++) {
182 struct bucket *bkt = &c->buckets[h];
183 if (bkt->key == val) {
186 if (bkt->key == -1) {
194 c->bucket_collision ++;
199 corpus_build_link(struct corpus *c)
202 for (i = 0; i < c->nr_values; i++) {
203 struct element *elm = &c->elms[i];
204 struct bucket *bkt = find_bucket(c, elm->val);
208 if (bkt->first_idx < 0) {
210 bkt->first_idx = c->elms[i].idx;
212 c->elms[bkt->last_idx].next_idx = c->elms[i].idx;
214 bkt->last_idx = c->elms[i].idx;
215 c->elms[i].next_idx = -1;
220 corpus_build(struct corpus *c)
222 corpus_setup_bucket(c, BUCKET_SIZE);
224 corpus_build_flatten(c);
225 corpus_build_link(c);
230 corpus_push_back(struct corpus *c, int *val, int nr, int flags)
234 for (i = 0; i < nr; i++) {
240 corpus_ensure_array(c, c->nr_node+1);
241 c->array[c->nr_node] = nd;
243 c->nr_values += nd.nr;
247 corpus_write_bucket(FILE *fp, struct corpus *c)
250 fprintf(fp, "0 %d\n", c->nr_buckets);
251 for (i = 0; i < c->nr_buckets; i++) {
252 fprintf(fp, "%d,%d\n", (c->buckets[i].key & CORPUS_KEY_MASK),
253 c->buckets[i].first_idx);
255 printf(" %d collisions in corpus bucket\n", c->bucket_collision);
259 corpus_write_array(FILE *fp, struct corpus *c)
262 fprintf(fp, "0 %d\n", c->nr_values);
263 for (i = 0; i < c->nr_values; i++) {
264 struct element *elm = &c->elms[i];
267 val &= CORPUS_KEY_MASK;
268 if (elm->flags & ELM_BOS) {
271 if (elm->flags & ELM_WORD_BORDER) {
272 val |= ELM_WORD_BORDER;
274 fprintf(fp, "%d,%d\n", val,
275 c->elms[i].next_idx);