src/storage/tag_utility.cpp

   1 #include <glib.h>
   2 #include <stdio.h>
   3 #include <string.h>
   4 #include <assert.h>
   5 #include "novel_types.h"
   6 #include "phrase_index.h"
   7 #include "phrase_large_table2.h"
   8 #include "tag_utility.h"
   9
  10 namespace pinyin{
  11
  12 /* internal taglib structure */
  13 struct tag_entry{
  14     int m_line_type;
  15     char * m_line_tag;
  16     int m_num_of_values;
  17     char ** m_required_tags;
  18     /* char ** m_optional_tags; */
  19     /* int m_optional_count = 0; */
  20     char ** m_ignored_tags;
  21 };
  22
  23 tag_entry tag_entry_copy(int line_type, const char * line_tag,
  24                          int num_of_values,
  25                          char * required_tags[],
  26                          char * ignored_tags[]){
  27     tag_entry entry;
  28     entry.m_line_type = line_type;
  29     entry.m_line_tag = g_strdup( line_tag );
  30     entry.m_num_of_values = num_of_values;
  31     entry.m_required_tags = g_strdupv( required_tags );
  32     entry.m_ignored_tags = g_strdupv( ignored_tags );
  33     return entry;
  34 }
  35
  36 tag_entry tag_entry_clone(tag_entry * entry){
  37     return tag_entry_copy(entry->m_line_type, entry->m_line_tag,
  38                           entry->m_num_of_values,
  39                           entry->m_required_tags, entry->m_ignored_tags);
  40 }
  41
  42 void tag_entry_reclaim(tag_entry * entry){
  43     g_free( entry->m_line_tag );
  44     g_strfreev( entry->m_required_tags );
  45     g_strfreev(entry->m_ignored_tags);
  46 }
  47
  48 static bool taglib_free_tag_array(GArray * tag_array){
  49     for ( size_t i = 0; i < tag_array->len; ++i) {
  50         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
  51         tag_entry_reclaim(entry);
  52     }
  53     g_array_free(tag_array, TRUE);
  54     return true;
  55 }
  56
  57 /* special unichar to be handled in split_line. */
  58 static gunichar backslash = 0;
  59 static gunichar quote = 0;
  60
  61 static gboolean split_line_init(){
  62     backslash = g_utf8_get_char("\\");
  63     quote = g_utf8_get_char("\"");
  64     return TRUE;
  65 }
  66
  67 /* Pointer Array of Array of tag_entry */
  68 static GPtrArray * g_tagutils_stack = NULL;
  69
  70 bool taglib_init(){
  71     assert( g_tagutils_stack == NULL);
  72     g_tagutils_stack = g_ptr_array_new();
  73     GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
  74     g_ptr_array_add(g_tagutils_stack, tag_array);
  75
  76     /* init split_line. */
  77     split_line_init();
  78     return true;
  79 }
  80
  81 bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
  82                     const char * required_tags, const char * ignored_tags){
  83     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
  84                                      g_tagutils_stack->len - 1);
  85
  86     /* some duplicate tagname or line_type check here. */
  87     for ( size_t i = 0; i < tag_array->len; ++i) {
  88         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
  89         if ( entry->m_line_type == line_type ||
  90              strcmp( entry->m_line_tag, line_tag ) == 0 )
  91             return false;
  92     }
  93
  94     char ** required = g_strsplit_set(required_tags, ",:", -1);
  95     char ** ignored = g_strsplit_set(ignored_tags, ",:", -1);
  96
  97     tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values,
  98                                      required, ignored);
  99     g_array_append_val(tag_array, entry);
 100
 101     g_strfreev(required);
 102     g_strfreev(ignored);
 103     return true;
 104 }
 105
 106 static void ptr_array_entry_free(gpointer data, gpointer user_data){
 107     g_free(data);
 108 }
 109
 110 static gboolean hash_table_key_value_free(gpointer key, gpointer value,
 111                                           gpointer user_data){
 112     g_free(key);
 113     g_free(value);
 114     return TRUE;
 115 }
 116
 117 /* split the line into tokens. */
 118 static gchar ** split_line(const gchar * line){
 119     /* array for tokens. */
 120     GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
 121
 122     for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
 123         gunichar unichar = g_utf8_get_char(cur);
 124         const gchar * begin = cur;
 125         gchar * token = NULL;
 126
 127         if ( g_unichar_isspace (unichar) ) {
 128             continue;
 129         }else if ( unichar == quote ) {
 130             /* handles "\"". */
 131             /* skip the first '"'. */
 132             begin = cur = g_utf8_next_char(cur);
 133             while (*cur) {
 134                 unichar = g_utf8_get_char(cur);
 135                 if ( unichar == backslash ) {
 136                     cur = g_utf8_next_char(cur);
 137                     g_return_val_if_fail(*cur, NULL);
 138                 } else if ( unichar == quote ){
 139                     break;
 140                 }
 141                 cur = g_utf8_next_char(cur);
 142             }
 143             gchar * tmp = g_strndup( begin, cur - begin);
 144             /* TODO: switch to own strdup_escape implementation
 145                for \"->" transforming. */
 146             token = g_strdup_printf("%s", tmp);
 147             g_free(tmp);
 148         } else {
 149             /* handles other tokens. */
 150             while(*cur) {
 151                 unichar = g_utf8_get_char(cur);
 152                 if ( g_unichar_isgraph(unichar) ) {
 153                     /* next unichar */
 154                     cur = g_utf8_next_char(cur);
 155                 } else {
 156                     /* space and other characters handles. */
 157                     break;
 158                 }
 159             }
 160             token = g_strndup( begin, cur - begin );
 161         }
 162
 163         g_array_append_val(tokens, token);
 164         if ( !*cur )
 165             break;
 166     }
 167
 168     return (gchar **)g_array_free(tokens, FALSE);
 169 }
 170
 171 bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
 172                  GHashTable * required){
 173     /* reset values and required. */
 174     g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
 175     g_ptr_array_set_size(values, 0);
 176     g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
 177
 178     /* use own version of split_line
 179        instead of g_strsplit_set for special token.*/
 180     char ** tokens = split_line(input_line);
 181     int num_of_tokens = g_strv_length(tokens);
 182
 183     char * line_tag = tokens[0];
 184     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 185
 186     tag_entry * cur_entry = NULL;
 187     /* find line type. */
 188     for ( size_t i = 0; i < tag_array->len; ++i) {
 189         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
 190         if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
 191             cur_entry = entry;
 192             break;
 193         }
 194     }
 195
 196     if ( !cur_entry )
 197         return false;
 198
 199     line_type = cur_entry->m_line_type;
 200
 201     for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) {
 202         g_return_val_if_fail(i < num_of_tokens, false);
 203         char * value = g_strdup( tokens[i] );
 204         g_ptr_array_add(values, value);
 205     }
 206
 207     int ignored_len = g_strv_length( cur_entry->m_ignored_tags );
 208     int required_len = g_strv_length( cur_entry->m_required_tags);
 209
 210     for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){
 211         g_return_val_if_fail(i < num_of_tokens, false);
 212         const char * tmp = tokens[i];
 213
 214         /* check ignored tags. */
 215         bool tag_ignored = false;
 216         for ( int m = 0; m < ignored_len; ++m) {
 217             if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) {
 218                 tag_ignored = true;
 219                 break;
 220             }
 221         }
 222
 223         if ( tag_ignored ) {
 224             ++i;
 225             continue;
 226         }
 227
 228         /* check required tags. */
 229         bool tag_required = false;
 230         for ( int m = 0; m < required_len; ++m) {
 231             if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) {
 232                 tag_required = true;
 233                 break;
 234             }
 235         }
 236
 237         /* warning on the un-expected tags. */
 238         if ( !tag_required ) {
 239             g_warning("un-expected tags:%s.\n", tmp);
 240             ++i;
 241             continue;
 242         }
 243
 244         char * key = g_strdup(tokens[i]);
 245         ++i;
 246         g_return_val_if_fail(i < num_of_tokens, false);
 247         char * value = g_strdup(tokens[i]);
 248         g_hash_table_insert(required, key, value);
 249     }
 250
 251     /* check for all required tags. */
 252     for ( int i = 0; i < required_len; ++i) {
 253         const char * required_tag_str = cur_entry->m_required_tags[i];
 254         gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL);
 255         if ( !result ) {
 256             g_warning("missed required tags: %s.\n", required_tag_str);
 257             g_strfreev(tokens);
 258             return false;
 259         }
 260     }
 261
 262     g_strfreev(tokens);
 263     return true;
 264 }
 265
 266 bool taglib_remove_tag(int line_type){
 267     /* Note: duplicate entry check is in taglib_add_tag. */
 268     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 269     for ( size_t i = 0; i < tag_array->len; ++i) {
 270         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
 271         if (entry->m_line_type != line_type)
 272             continue;
 273         tag_entry_reclaim(entry);
 274         g_array_remove_index(tag_array, i);
 275         return true;
 276     }
 277     return false;
 278 }
 279
 280 bool taglib_push_state(){
 281     assert(g_tagutils_stack->len >= 1);
 282     GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
 283     GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 284     for ( size_t i = 0; i < prev_tag_array->len; ++i) {
 285         tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i);
 286         tag_entry new_entry = tag_entry_clone(entry);
 287         g_array_append_val(next_tag_array, new_entry);
 288     }
 289     g_ptr_array_add(g_tagutils_stack, next_tag_array);
 290     return true;
 291 }
 292
 293 bool taglib_pop_state(){
 294     assert(g_tagutils_stack->len > 1);
 295     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 296     g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 297     taglib_free_tag_array(tag_array);
 298     return true;
 299 }
 300
 301 bool taglib_fini(){
 302     for ( size_t i = 0; i < g_tagutils_stack->len; ++i){
 303         GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i);
 304         taglib_free_tag_array(tag_array);
 305     }
 306     g_ptr_array_free(g_tagutils_stack, TRUE);
 307     g_tagutils_stack = NULL;
 308     return true;
 309 }
 310
 311 #if 0
 312
 313 static phrase_token_t taglib_special_string_to_token(const char * string){
 314     struct token_pair{
 315         phrase_token_t token;
 316         const char * string;
 317     };
 318
 319     static const token_pair tokens [] = {
 320         {sentence_start, "<start>"},
 321         {0, NULL}
 322     };
 323
 324     const token_pair * pair = tokens;
 325     while (pair->string) {
 326         if ( strcmp(string, pair->string ) == 0 )
 327             return pair->token;
 328         pair++;
 329     }
 330
 331     fprintf(stderr, "error: unknown token:%s.\n", string);
 332     return 0;
 333 }
 334
 335 phrase_token_t taglib_string_to_token(PhraseLargeTable2 * phrase_table,
 336                                       FacadePhraseIndex * phrase_index,
 337                                       const char * string){
 338     phrase_token_t token = null_token;
 339     if ( string[0] == '<' ) {
 340         return taglib_special_string_to_token(string);
 341     }
 342
 343     glong phrase_len = g_utf8_strlen(string, -1);
 344     ucs4_t * phrase = g_utf8_to_ucs4(string, -1, NULL, NULL, NULL);
 345
 346     PhraseTokens tokens;
 347     memset(tokens, 0, sizeof(PhraseTokens));
 348     phrase_index->prepare_tokens(tokens);
 349     int result = phrase_table->search(phrase_len, phrase, tokens);
 350     int num = get_first_token(tokens, token);
 351     phrase_index->destroy_tokens(tokens);
 352
 353     if ( !(result & SEARCH_OK) )
 354         fprintf(stderr, "error: unknown token:%s.\n", string);
 355
 356     g_free(phrase);
 357     return token;
 358 }
 359
 360 #endif
 361
 362 static const char * taglib_special_token_to_string(phrase_token_t token){
 363     struct token_pair{
 364         phrase_token_t token;
 365         const char * string;
 366     };
 367
 368     static const token_pair tokens [] = {
 369         {sentence_start, "<start>"},
 370         {0, NULL}
 371     };
 372
 373     const token_pair * pair = tokens;
 374     while (pair->token) {
 375         if ( token == pair->token )
 376             return pair->string;
 377         pair++;
 378     }
 379
 380     fprintf(stderr, "error: unknown token:%d.\n", token);
 381     return NULL;
 382 }
 383
 384 char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
 385                               phrase_token_t token) {
 386     PhraseItem item;
 387     ucs4_t buffer[MAX_PHRASE_LENGTH];
 388
 389     gchar * phrase;
 390     /* deal with the special phrase index, for "<start>..." */
 391     if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
 392         return g_strdup(taglib_special_token_to_string(token));
 393     }
 394
 395     int result = phrase_index->get_phrase_item(token, item);
 396     if (result != ERROR_OK) {
 397         fprintf(stderr, "error: unknown token:%d.\n", token);
 398         return NULL;
 399     }
 400
 401     item.get_phrase_string(buffer);
 402     guint8 length = item.get_phrase_length();
 403     phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
 404     return phrase;
 405 }
 406
 407 bool taglib_validate_token_with_string(FacadePhraseIndex * phrase_index,
 408                                        phrase_token_t token,
 409                                        const char * string){
 410     bool result = false;
 411
 412     char * str = taglib_token_to_string(phrase_index, token);
 413     result = (0 == strcmp(str, string));
 414     g_free(str);
 415
 416     return result;
 417 }
 418
 419
 420 };