/*
 * Splitter that splits a sentence into segments (bunsetsu)
 *
 *  anthy_init_split_context() creates the context used for splitting,
 *  anthy_mark_border() performs the split,
 *  anthy_release_split_context() releases the context.
 *
 *  anthy_commit_border() learns from the committed result.
 *
 * Funded by IPA Mitou (Exploratory) Software Creation Project 2001 9/22
 *
 * Copyright (C) 2004 YOSHIDA Yuichi
 * Copyright (C) 2000-2004 TABATA Yusuke
 * Copyright (C) 2000-2001 UGAWA Tomoharu
 *
 * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $
 */
/*
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <stdlib.h>
#include <string.h>

#include <anthy/alloc.h>
#include <anthy/record.h>
#include <anthy/splitter.h>
#include <anthy/logger.h>
#include "wordborder.h"
43 #define MAX_EXPAND_PAIR_ENTRY_COUNT 1000
45 static int splitter_debug_flags;
48 wtype_t anthy_wtype_noun;
49 wtype_t anthy_wtype_name_noun;
50 wtype_t anthy_wtype_num_noun;
51 wtype_t anthy_wtype_prefix;
52 wtype_t anthy_wtype_num_prefix;
53 wtype_t anthy_wtype_num_postfix;
54 wtype_t anthy_wtype_name_postfix;
55 wtype_t anthy_wtype_sv_postfix;
56 wtype_t anthy_wtype_a_tail_of_v_renyou;
57 wtype_t anthy_wtype_v_renyou;
58 wtype_t anthy_wtype_noun_tail;/* ¤¤¤ì¡Ö¤¿¤Æ¡×¤È¤« */
59 wtype_t anthy_wtype_n1;
60 wtype_t anthy_wtype_n10;
63 /** make_word_cache¤ÇºîÀ®¤·¤¿Ê¸Àá¾ðÊó¤ò²òÊü¤¹¤ë
66 release_info_cache(struct splitter_context *sc)
68 struct word_split_info_cache *info = sc->word_split_info;
70 anthy_free_allocator(info->MwAllocator);
71 anthy_free_allocator(info->WlAllocator);
74 free(info->rev_seq_len);
79 metaword_dtor(void *p)
81 struct meta_word *mw = (struct meta_word*)p;
82 if (mw->cand_hint.str) {
83 free(mw->cand_hint.str);
89 alloc_char_ent(xstr *xs, struct splitter_context *sc)
93 sc->char_count = xs->len;
94 sc->ce = (struct char_ent*)
95 malloc(sizeof(struct char_ent)*(xs->len + 1));
96 for (i = 0; i <= xs->len; i++) {
97 sc->ce[i].c = &xs->str[i];
98 sc->ce[i].seg_border = 0;
99 sc->ce[i].initial_seg_len = 0;
100 sc->ce[i].best_seg_class = SEG_HEAD;
101 sc->ce[i].best_mw = NULL;
104 /* º¸±¦Î¾Ã¼¤ÏʸÀá¤Î¶³¦¤Ç¤¢¤ë */
105 sc->ce[0].seg_border = 1;
106 sc->ce[xs->len].seg_border = 1;
109 /* ¤³¤³¤Ç³ÎÊݤ·¤¿ÆâÍƤÏrelease_info_cache¤Ç²òÊü¤µ¤ì¤ë
112 alloc_info_cache(struct splitter_context *sc)
115 struct word_split_info_cache *info;
117 /* ¥¥ã¥Ã¥·¥å¤Î¥Ç¡¼¥¿¤ò³ÎÊÝ */
118 sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
119 info = sc->word_split_info;
120 info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor);
121 info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
123 malloc(sizeof(struct char_node) * (sc->char_count + 1));
125 info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
126 info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));
128 /* ³Æʸ»ú¥¤¥ó¥Ç¥Ã¥¯¥¹¤ËÂФ·¤Æ½é´ü²½¤ò¹Ô¤¦ */
129 for (i = 0; i <= sc->char_count; i++) {
130 info->seq_len[i] = 0;
131 info->rev_seq_len[i] = 0;
132 info->cnode[i].wl = NULL;
133 info->cnode[i].mw = NULL;
134 info->cnode[i].max_len = 0;
138 /** ³°¤«¤é¸Æ¤Ó½Ð¤µ¤ì¤ëwordsplitter¤Î¥È¥Ã¥×¥ì¥Ù¥ë¤Î´Ø¿ô */
140 anthy_mark_border(struct splitter_context *sc,
141 int from, int from2, int to)
144 struct word_split_info_cache *info;
147 if ((to - from) <= 0) {
151 /* ¶³¦¥Þ¡¼¥¯ÍѤÈlattice¤Î¸¡º÷¤ÇÍѤ¤¤é¤ì¤ë¥¯¥é¥¹ÍѤÎÎΰè¤ò³ÎÊÝ */
152 info = sc->word_split_info;
153 info->seg_border = alloca(sizeof(int)*(sc->char_count + 1));
154 info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1));
155 info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1));
156 for (i = 0; i < sc->char_count + 1; ++i) {
157 info->seg_border[i] = sc->ce[i].seg_border;
158 info->best_seg_class[i] = sc->ce[i].best_seg_class;
159 info->best_mw[i] = sc->ce[i].best_mw;
163 anthy_eval_border(sc, from, from2, to);
165 for (i = from; i < to; ++i) {
166 sc->ce[i].seg_border = info->seg_border[i];
167 sc->ce[i].best_seg_class = info->best_seg_class[i];
168 sc->ce[i].best_mw = info->best_mw[i];
172 /* ʸÀ᤬³ÈÂ礵¤ì¤¿¤Î¤Ç¡¤¤½¤ì¤ò³Ø½¬¤¹¤ë */
174 proc_expanded_segment(struct splitter_context *sc,
177 int initial_len = sc->ce[from].initial_seg_len;
179 xstr from_xs, to_xs, *xs;
181 from_xs.str = sc->ce[from].c;
182 from_xs.len = initial_len;
183 to_xs.str = sc->ce[from].c;
185 if (anthy_select_section("EXPANDPAIR", 1) == -1) {
188 if (anthy_select_row(&from_xs, 1) == -1) {
191 nr = anthy_get_nr_values();
192 for (i = 0; i < nr; i ++) {
193 xs = anthy_get_nth_xstr(i);
194 if (!xs || !anthy_xstrcmp(xs, &to_xs)) {
199 anthy_set_nth_xstr(nr, &to_xs);
200 anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
203 /* ʸÀá¤Î¥Þ¡¼¥¸¤È¸ìÈø¤ò³Ø½¬¤¹¤ë */
205 anthy_commit_border(struct splitter_context *sc, int nr_segments,
206 struct meta_word **mw, int *seg_len)
211 for (i = 0; i < nr_segments; i++) {
212 /* ¤½¤ì¤¾¤ì¤ÎʸÀá¤ËÂФ·¤Æ */
214 int len = seg_len[i];
215 int initial_len = sc->ce[from].initial_seg_len;
219 if (!initial_len || from + initial_len == sc->char_count) {
220 /* ¤½¤³¤Ï¶³¦¤Ç¤Ï¤Ê¤¤ */
223 l2 = sc->ce[from + initial_len].initial_seg_len;
224 if (initial_len + l2 > len) {
225 /* ÎÙ¤ÎʸÀá¤ò´Þ¤à¤Û¤É³ÈÂ礵¤ì¤¿¤ï¤±¤Ç¤Ï¤Ê¤¤ */
229 real_len = mw[i]->len;
231 if (real_len <= initial_len) {
234 /* ±¦¤ÎʸÀá¤ò´Þ¤àŤµ¤Ë³ÈÄ¥¤µ¤ì¤¿Ê¸À᤬¥³¥ß¥Ã¥È¤µ¤ì¤¿ */
235 proc_expanded_segment(sc, from, real_len);
242 anthy_splitter_debug_flags(void)
244 return splitter_debug_flags;
248 anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse)
250 alloc_char_ent(xs, sc);
251 alloc_info_cache(sc);
252 sc->is_reverse = is_reverse;
253 /* Á´¤Æ¤ÎÉôʬʸ»úÎó¤ò¥Á¥§¥Ã¥¯¤·¤Æ¡¢Ê¸Àá¤Î¸õÊä¤òÎóµó¤¹¤ë
254 word_list¤ò¹½À®¤·¤Æ¤«¤émetaword¤ò¹½À®¤¹¤ë */
256 anthy_make_word_list_all(sc);
258 anthy_make_metaword_all(sc);
263 anthy_release_split_context(struct splitter_context *sc)
265 if (sc->word_split_info) {
266 release_info_cache(sc);
267 sc->word_split_info = 0;
275 /** splitterÁ´ÂΤνé´ü²½¤ò¹Ô¤¦ */
277 anthy_init_splitter(void)
279 /* ¥Ç¥Ð¥Ã¥°¥×¥ê¥ó¥È¤ÎÀßÄê */
280 char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT");
281 char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT");
282 splitter_debug_flags = SPLITTER_DEBUG_NONE;
283 if (!dis && en && strlen(en)) {
284 char *fs = getenv("ANTHY_SPLITTER_PRINT");
286 if (strchr(fs, 'w')) {
287 splitter_debug_flags |= SPLITTER_DEBUG_WL;
289 if (strchr(fs, 'm')) {
290 splitter_debug_flags |= SPLITTER_DEBUG_MW;
292 if (strchr(fs, 'l')) {
293 splitter_debug_flags |= SPLITTER_DEBUG_LN;
295 if (strchr(fs, 'i')) {
296 splitter_debug_flags |= SPLITTER_DEBUG_ID;
298 if (strchr(fs, 'c')) {
299 splitter_debug_flags |= SPLITTER_DEBUG_CAND;
303 /* ÉÕ°¸ì¥°¥é¥Õ¤Î½é´ü²½ */
304 if (anthy_init_depword_tab()) {
305 anthy_log(0, "Failed to init dependent word table.\n");
309 anthy_wtype_noun = anthy_init_wtype_by_name("̾»ì35");
310 anthy_wtype_name_noun = anthy_init_wtype_by_name("¿Í̾");
311 anthy_wtype_num_noun = anthy_init_wtype_by_name("¿ô»ì");
312 anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("·ÁÍƻ첽ÀÜÈø¸ì");
313 anthy_wtype_v_renyou = anthy_init_wtype_by_name("Æ°»ìÏ¢ÍÑ·Á");
314 anthy_wtype_noun_tail = anthy_init_wtype_by_name("̾»ì²½ÀÜÈø¸ì");
315 anthy_wtype_prefix = anthy_init_wtype_by_name("̾»ìÀÜƬ¼");
316 anthy_wtype_num_prefix = anthy_init_wtype_by_name("¿ôÀÜƬ¼");
317 anthy_wtype_num_postfix = anthy_init_wtype_by_name("¿ôÀÜÈø¼");
318 anthy_wtype_name_postfix = anthy_init_wtype_by_name("¿Í̾ÀÜÈø¼");
319 anthy_wtype_sv_postfix = anthy_init_wtype_by_name("¥µÊÑÀÜÈø¼");
320 anthy_wtype_n1 = anthy_init_wtype_by_name("¿ô»ì1");
321 anthy_wtype_n10 = anthy_init_wtype_by_name("¿ô»ì10");
/*
 * Shut the splitter down by calling anthy_quit_depword_tab(), the
 * counterpart of the anthy_init_depword_tab() call made in
 * anthy_init_splitter().
 */
void
anthy_quit_splitter(void)
{
  anthy_quit_depword_tab();
}