5 #include "novel_types.h"
6 #include "phrase_index.h"
7 #include "phrase_large_table2.h"
8 #include "tag_utility.h"
12 /* internal taglib structure */
/* NOTE(review): the struct header and the scalar members are not visible in
 * this excerpt; only the two string-vector members appear below. */
/* string vector of required tag names; elsewhere in this file it is filled
 * with g_strdupv and released with g_strfreev. */
17 char ** m_required_tags;
18 /* char ** m_optional_tags; */
19 /* int m_optional_count = 0; */
/* string vector of ignored tag names; handled the same way as
 * m_required_tags (g_strdupv / g_strfreev). */
20 char ** m_ignored_tags;
/* Build a tag_entry by value from the given fields.
 * The tag string is duplicated with g_strdup and both string vectors with
 * g_strdupv, so the resulting entry holds its own copies of them.
 * NOTE(review): the parameter line for num_of_values, the declaration of the
 * local `entry` and the return statement are not visible in this excerpt. */
23 tag_entry tag_entry_copy(int line_type, const char * line_tag,
25 char * required_tags[],
26 char * ignored_tags[]){
28 entry.m_line_type = line_type;
/* duplicate the strings so the entry does not alias the caller's buffers. */
29 entry.m_line_tag = g_strdup( line_tag );
30 entry.m_num_of_values = num_of_values;
31 entry.m_required_tags = g_strdupv( required_tags );
32 entry.m_ignored_tags = g_strdupv( ignored_tags );
/* Return a copy of *entry by forwarding each of its fields to
 * tag_entry_copy, which duplicates the tag string and both string vectors. */
36 tag_entry tag_entry_clone(tag_entry * entry){
37 return tag_entry_copy(entry->m_line_type, entry->m_line_tag,
38 entry->m_num_of_values,
39 entry->m_required_tags, entry->m_ignored_tags);
/* Release the heap data referenced by an entry: the tag string (g_free) and
 * both string vectors (g_strfreev). The lines visible here do not free the
 * tag_entry object itself. */
42 void tag_entry_reclaim(tag_entry * entry){
43 g_free( entry->m_line_tag );
44 g_strfreev( entry->m_required_tags );
45 g_strfreev(entry->m_ignored_tags);
/* Reclaim every tag_entry stored in tag_array, then free the GArray.
 * NOTE(review): the return statement is not visible in this excerpt. */
48 static bool taglib_free_tag_array(GArray * tag_array){
49 for ( size_t i = 0; i < tag_array->len; ++i) {
50 tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
/* free the strings owned by this element before the array goes away. */
51 tag_entry_reclaim(entry);
/* TRUE: also free the element storage, not only the GArray wrapper. */
53 g_array_free(tag_array, TRUE);
57 /* special unichar to be handled in split_line. */
/* both start as 0 and are assigned their real values in split_line_init. */
58 static gunichar backslash = 0;
59 static gunichar quote = 0;
/* Store the code points of the backslash and double-quote characters in the
 * file-level variables above, which split_line compares against.
 * NOTE(review): the return statement is not visible in this excerpt. */
61 static gboolean split_line_init(){
62 backslash = g_utf8_get_char("\\");
63 quote = g_utf8_get_char("\"");
67 /* Pointer Array of Array of tag_entry */
/* Each element is a GArray of tag_entry; the other functions in this file
 * always operate on the last element (index len - 1). */
68 static GPtrArray * g_tagutils_stack = NULL;
/* NOTE(review): the header of the function containing the statements below
 * is not visible in this excerpt (presumably the library init routine --
 * confirm). */
/* the stack must not already exist when this runs. */
71 assert( g_tagutils_stack == NULL);
72 g_tagutils_stack = g_ptr_array_new();
/* create the first tag array (zero-terminated, cleared on allocation) and
 * push it as the initial state. */
73 GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
74 g_ptr_array_add(g_tagutils_stack, tag_array);
76 /* init split_line. */
/* Register a new tag description in the tag array on top of the stack.
 *   line_type      numeric identifier stored in the new entry
 *   line_tag       tag string stored in the new entry
 *   num_of_values  stored in the entry; taglib_read copies that many tokens
 *                  after the tag into its values array
 *   required_tags  list of required tag names, separated by ',' or ':'
 *   ignored_tags   list of ignored tag names, separated by ',' or ':'
 * NOTE(review): the action taken on a duplicate, the remaining arguments of
 * the tag_entry_copy call and the return statements are not visible in this
 * excerpt. */
81 bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
82 const char * required_tags, const char * ignored_tags){
/* work on the current state, i.e. the last array on the stack. */
83 GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
84 g_tagutils_stack->len - 1);
86 /* some duplicate tagname or line_type check here. */
87 for ( size_t i = 0; i < tag_array->len; ++i) {
88 tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
/* a clash on either the numeric type or the tag string counts as a
 * duplicate. */
89 if ( entry->m_line_type == line_type ||
90 strcmp( entry->m_line_tag, line_tag ) == 0 )
/* split the delimiter-separated lists into string vectors; -1 means no
 * limit on the number of pieces. */
94 char ** required = g_strsplit_set(required_tags, ",:", -1);
95 char ** ignored = g_strsplit_set(ignored_tags, ",:", -1);
/* tag_entry_copy duplicates the vectors it is given, so the temporaries
 * created above are still owned by this function afterwards. */
97 tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values,
99 g_array_append_val(tag_array, entry);
101 g_strfreev(required);
/* Per-element callback that taglib_read passes to g_ptr_array_foreach when
 * it resets its values array.
 * NOTE(review): the body is not visible in this excerpt. */
106 static void ptr_array_entry_free(gpointer data, gpointer user_data){
/* Per-pair callback that taglib_read passes to g_hash_table_foreach_steal
 * when it resets its required table.
 * NOTE(review): the rest of the signature and the body are not visible in
 * this excerpt. */
110 static gboolean hash_table_key_value_free(gpointer key, gpointer value,
117 /* split the line into tokens. */
/* Walks the UTF-8 string one character at a time and collects newly
 * allocated token strings. Three cases are distinguished per character:
 * whitespace, a double-quoted token (with backslash escapes), and any other
 * token. The result is the token array's data returned as a gchar * vector;
 * the array is created zero-terminated, so the vector ends with NULL.
 * NOTE(review): the inner loop headers, break statements and closing braces
 * are not visible in this excerpt. */
118 static gchar ** split_line(const gchar * line){
119 /* array for tokens. */
120 GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
122 for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
123 gunichar unichar = g_utf8_get_char(cur);
/* begin marks where the current token starts. */
124 const gchar * begin = cur;
125 gchar * token = NULL;
127 if ( g_unichar_isspace (unichar) ) {
129 }else if ( unichar == quote ) {
131 /* skip the first '"'. */
132 begin = cur = g_utf8_next_char(cur);
134 unichar = g_utf8_get_char(cur);
135 if ( unichar == backslash ) {
/* step over the backslash so the character after it is consumed as
 * part of the token. */
136 cur = g_utf8_next_char(cur);
/* a backslash as the very last character makes the function return
 * NULL. NOTE(review): no release of `tokens` is visible on this path
 * -- confirm. */
137 g_return_val_if_fail(*cur, NULL);
138 } else if ( unichar == quote ){
141 cur = g_utf8_next_char(cur);
/* copy the bytes between begin and cur. */
143 gchar * tmp = g_strndup( begin, cur - begin);
144 /* TODO: switch to own strdup_escape implementation
145 for \"->" transforming. */
/* for now the token is a plain copy of tmp; escapes are not rewritten. */
146 token = g_strdup_printf("%s", tmp);
149 /* handles other tokens. */
151 unichar = g_utf8_get_char(cur);
152 if ( g_unichar_isgraph(unichar) ) {
154 cur = g_utf8_next_char(cur);
156 /* space and other characters handles. */
160 token = g_strndup( begin, cur - begin );
163 g_array_append_val(tokens, token);
/* FALSE keeps the element data alive; only the GArray wrapper is freed and
 * the data pointer is handed to the caller. */
168 return (gchar **)g_array_free(tokens, FALSE);
/* Parse one input line against the tag descriptions of the current state.
 *   input_line  the text to split into tokens
 *   line_type   output: set to the m_line_type of the matching entry
 *   values      pointer array, reset on entry; receives g_strdup copies of
 *               the m_num_of_values tokens that follow the line tag
 *   required    hash table, reset on entry; receives g_strdup'ed key and
 *               value strings taken from the remaining tokens
 * Returns false when a g_return_val_if_fail bounds check fails.
 * NOTE(review): several lines (selection of cur_entry, handling of ignored
 * tags, advancing to the value token, release of `tokens`, final return)
 * are not visible in this excerpt. */
171 bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
172 GHashTable * required){
173 /* reset values and required. */
174 g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
175 g_ptr_array_set_size(values, 0);
176 g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
178 /* use own version of split_line
179 instead of g_strsplit_set for special token.*/
180 char ** tokens = split_line(input_line);
181 int num_of_tokens = g_strv_length(tokens);
/* the first token is the line tag used to select the entry. */
183 char * line_tag = tokens[0];
/* entries are looked up in the current state (last array on the stack). */
184 GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
186 tag_entry * cur_entry = NULL;
187 /* find line type. */
188 for ( size_t i = 0; i < tag_array->len; ++i) {
189 tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
190 if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
199 line_type = cur_entry->m_line_type;
/* tokens 1 .. m_num_of_values are the positional values of this line. */
201 for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) {
202 g_return_val_if_fail(i < num_of_tokens, false);
203 char * value = g_strdup( tokens[i] );
204 g_ptr_array_add(values, value);
207 int ignored_len = g_strv_length( cur_entry->m_ignored_tags );
208 int required_len = g_strv_length( cur_entry->m_required_tags);
/* everything after the positional values is matched against the entry's
 * ignored and required tag lists. */
210 for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){
211 g_return_val_if_fail(i < num_of_tokens, false);
212 const char * tmp = tokens[i];
214 /* check ignored tags. */
215 bool tag_ignored = false;
216 for ( int m = 0; m < ignored_len; ++m) {
217 if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) {
228 /* check required tags. */
229 bool tag_required = false;
230 for ( int m = 0; m < required_len; ++m) {
231 if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) {
237 /* warning on the un-expected tags. */
238 if ( !tag_required ) {
239 g_warning("un-expected tags:%s.\n", tmp);
/* store the tag token as key. NOTE(review): presumably a hidden line
 * advances i to the following token before the value is read -- confirm. */
244 char * key = g_strdup(tokens[i]);
246 g_return_val_if_fail(i < num_of_tokens, false);
247 char * value = g_strdup(tokens[i]);
248 g_hash_table_insert(required, key, value);
251 /* check for all required tags. */
252 for ( int i = 0; i < required_len; ++i) {
253 const char * required_tag_str = cur_entry->m_required_tags[i];
/* only presence of the key matters; the stored key/value are not fetched. */
254 gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL);
256 g_warning("missed required tags: %s.\n", required_tag_str);
/* Remove the entry whose m_line_type equals line_type from the current
 * state (last array on the stack), releasing the entry's strings before the
 * element is removed from the array.
 * NOTE(review): the statement following the mismatch test and the return
 * statements are not visible in this excerpt. */
266 bool taglib_remove_tag(int line_type){
267 /* Note: duplicate entry check is in taglib_add_tag. */
268 GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
269 for ( size_t i = 0; i < tag_array->len; ++i) {
270 tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
271 if (entry->m_line_type != line_type)
/* free the strings first: g_array_remove_index only drops the element. */
273 tag_entry_reclaim(entry);
274 g_array_remove_index(tag_array, i);
/* Push a new state onto the stack: a fresh tag array holding clones of all
 * entries of the current top array, so later changes to the new state do
 * not touch the strings of the previous one.
 * NOTE(review): the return statement is not visible in this excerpt. */
280 bool taglib_push_state(){
/* there must be at least one existing state to copy from. */
281 assert(g_tagutils_stack->len >= 1);
282 GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
283 GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
284 for ( size_t i = 0; i < prev_tag_array->len; ++i) {
285 tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i);
/* clone duplicates the tag string and both string vectors. */
286 tag_entry new_entry = tag_entry_clone(entry);
287 g_array_append_val(next_tag_array, new_entry);
289 g_ptr_array_add(g_tagutils_stack, next_tag_array);
/* Pop the current state: detach the last tag array from the stack and free
 * it together with its entries.
 * NOTE(review): the return statement is not visible in this excerpt. */
293 bool taglib_pop_state(){
/* strictly more than one state is required, so the first state is never
 * popped here. */
294 assert(g_tagutils_stack->len > 1);
295 GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
296 g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1);
297 taglib_free_tag_array(tag_array);
/* NOTE(review): the header of the function containing the statements below
 * is not visible in this excerpt (presumably the library teardown routine
 * -- confirm). */
/* free every state's tag array, including the entries it holds. */
302 for ( size_t i = 0; i < g_tagutils_stack->len; ++i){
303 GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i);
304 taglib_free_tag_array(tag_array);
/* then free the stack itself and reset the pointer to NULL, which is the
 * value the initialisation assert in this file checks for. */
306 g_ptr_array_free(g_tagutils_stack, TRUE);
307 g_tagutils_stack = NULL;
/* Map a special string such as "<start>" to its token by scanning a static
 * table of token/string pairs; the scan stops at an element whose string
 * pointer is NULL.
 * NOTE(review): the declaration of the pair type, the remaining table rows,
 * the statement executed on a match and the return statements are not
 * visible in this excerpt. */
313 static phrase_token_t taglib_special_string_to_token(const char * string){
315 phrase_token_t token;
319 static const token_pair tokens [] = {
320 {sentence_start, "<start>"},
324 const token_pair * pair = tokens;
325 while (pair->string) {
326 if ( strcmp(string, pair->string ) == 0 )
/* report the lookup failure on stderr. */
331 fprintf(stderr, "error: unknown token:%s.\n", string);
/* Convert a UTF-8 string to a phrase token.
 * Strings starting with '<' are passed to taglib_special_string_to_token.
 * Any other string is converted to UCS-4 and searched in phrase_table; the
 * token is then taken from the search result via get_first_token.
 * NOTE(review): the declaration of `tokens`, the release of `phrase` and
 * the final return are not visible in this excerpt. */
335 phrase_token_t taglib_string_to_token(PhraseLargeTable2 * phrase_table,
336 FacadePhraseIndex * phrase_index,
337 const char * string){
338 phrase_token_t token = null_token;
339 if ( string[0] == '<' ) {
340 return taglib_special_string_to_token(string);
/* phrase_len is the number of characters, not bytes. */
343 glong phrase_len = g_utf8_strlen(string, -1);
344 ucs4_t * phrase = g_utf8_to_ucs4(string, -1, NULL, NULL, NULL);
/* zero the result container before the index prepares it for the search. */
347 memset(tokens, 0, sizeof(PhraseTokens));
348 phrase_index->prepare_tokens(tokens);
349 int result = phrase_table->search(phrase_len, phrase, tokens);
350 int num = get_first_token(tokens, token);
351 phrase_index->destroy_tokens(tokens);
/* the search result is a bit mask; a missing SEARCH_OK bit is reported. */
353 if ( !(result & SEARCH_OK) )
354 fprintf(stderr, "error: unknown token:%s.\n", string);
/* Map a special token to its string (e.g. sentence_start to "<start>") by
 * scanning a static table of token/string pairs; the scan stops at an
 * element whose token field is zero.
 * NOTE(review): the declaration of the pair type, the remaining table rows,
 * the statement executed on a match and the return statements are not
 * visible in this excerpt. */
362 static const char * taglib_special_token_to_string(phrase_token_t token){
364 phrase_token_t token;
368 static const token_pair tokens [] = {
369 {sentence_start, "<start>"},
373 const token_pair * pair = tokens;
374 while (pair->token) {
375 if ( token == pair->token )
/* report the lookup failure on stderr. */
380 fprintf(stderr, "error: unknown token:%d.\n", token);
/* Convert a phrase token to a newly allocated UTF-8 string.
 * Tokens whose library index is 0 are treated as special tokens and
 * returned as a g_strdup copy of the special string. Any other token is
 * looked up in phrase_index and its UCS-4 phrase is converted to UTF-8.
 * NOTE(review): the declarations of `item` and `phrase`, the statement
 * after the error message and the final return are not visible in this
 * excerpt. */
384 char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
385 phrase_token_t token) {
/* scratch buffer that receives the phrase characters from the item. */
387 ucs4_t buffer[MAX_PHRASE_LENGTH];
390 /* deal with the special phrase index, for "<start>..." */
391 if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
392 return g_strdup(taglib_special_token_to_string(token));
395 int result = phrase_index->get_phrase_item(token, item);
396 if (result != ERROR_OK) {
397 fprintf(stderr, "error: unknown token:%d.\n", token);
401 item.get_phrase_string(buffer);
402 guint8 length = item.get_phrase_length();
/* convert exactly `length` UCS-4 characters to a new UTF-8 string. */
403 phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
407 bool taglib_validate_token_with_string(FacadePhraseIndex * phrase_index,
408 phrase_token_t token,
409 const char * string){
412 char * str = taglib_token_to_string(phrase_index, token);
413 result = (0 == strcmp(str, string));