src/storage/tag_utility.cpp

   1 #include <glib.h>
   2 #include <stdio.h>
   3 #include <string.h>
   4 #include <assert.h>
   5 #include "novel_types.h"
   6 #include "phrase_index.h"
   7 #include "phrase_large_table.h"
   8 #include "tag_utility.h"
   9
  10 namespace pinyin{
  11
  12 /* internal taglib structure */
  13 struct tag_entry{
  14     int m_line_type;
  15     char * m_line_tag;
  16     int m_num_of_values;
  17     char ** m_required_tags;
  18     /* char ** m_optional_tags; */
  19     /* int m_optional_count = 0; */
  20     char ** m_ignored_tags;
  21 };
  22
  23 tag_entry tag_entry_copy(int line_type, const char * line_tag,
  24                          int num_of_values,
  25                          char * required_tags[],
  26                          char * ignored_tags[]){
  27     tag_entry entry;
  28     entry.m_line_type = line_type;
  29     entry.m_line_tag = g_strdup( line_tag );
  30     entry.m_num_of_values = num_of_values;
  31     entry.m_required_tags = g_strdupv( required_tags );
  32     entry.m_ignored_tags = g_strdupv( ignored_tags );
  33     return entry;
  34 }
  35
  36 tag_entry tag_entry_clone(tag_entry * entry){
  37     return tag_entry_copy(entry->m_line_type, entry->m_line_tag,
  38                           entry->m_num_of_values,
  39                           entry->m_required_tags, entry->m_ignored_tags);
  40 }
  41
  42 void tag_entry_reclaim(tag_entry * entry){
  43     g_free( entry->m_line_tag );
  44     g_strfreev( entry->m_required_tags );
  45     g_strfreev(entry->m_ignored_tags);
  46 }
  47
  48 static bool taglib_free_tag_array(GArray * tag_array){
  49     for ( size_t i = 0; i < tag_array->len; ++i) {
  50         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
  51         tag_entry_reclaim(entry);
  52     }
  53     g_array_free(tag_array, TRUE);
  54     return true;
  55 }
  56
  57 /* special unichar to be handled in split_line. */
  58 static gunichar backslash = 0;
  59 static gunichar quote = 0;
  60
  61 static gboolean split_line_init(){
  62     backslash = g_utf8_get_char("\\");
  63     quote = g_utf8_get_char("\"");
  64     return TRUE;
  65 }
  66
  67 /* Pointer Array of Array of tag_entry */
  68 static GPtrArray * g_tagutils_stack = NULL;
  69
  70 bool taglib_init(){
  71     assert( g_tagutils_stack == NULL);
  72     g_tagutils_stack = g_ptr_array_new();
  73     GArray * tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
  74     g_ptr_array_add(g_tagutils_stack, tag_array);
  75
  76     /* init split_line. */
  77     split_line_init();
  78     return true;
  79 }
  80
  81 bool taglib_add_tag(int line_type, const char * line_tag, int num_of_values,
  82                     const char * required_tags, const char * ignored_tags){
  83     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack,
  84                                      g_tagutils_stack->len - 1);
  85
  86     /* some duplicate tagname or line_type check here. */
  87     for ( size_t i = 0; i < tag_array->len; ++i) {
  88         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
  89         if ( entry->m_line_type == line_type ||
  90              strcmp( entry->m_line_tag, line_tag ) == 0 )
  91             return false;
  92     }
  93
  94     char ** required = g_strsplit_set(required_tags, ",:", -1);
  95     char ** ignored = g_strsplit_set(ignored_tags, ",:", -1);
  96
  97     tag_entry entry = tag_entry_copy(line_type, line_tag, num_of_values,
  98                                      required, ignored);
  99     g_array_append_val(tag_array, entry);
 100
 101     g_strfreev(required);
 102     g_strfreev(ignored);
 103     return true;
 104 }
 105
 106 static void ptr_array_entry_free(gpointer data, gpointer user_data){
 107     g_free(data);
 108 }
 109
 110 static gboolean hash_table_key_value_free(gpointer key, gpointer value,
 111                                           gpointer user_data){
 112     g_free(key);
 113     g_free(value);
 114     return TRUE;
 115 }
 116
 117 /* split the line into tokens. */
 118 static gchar ** split_line(const gchar * line){
 119     /* array for tokens. */
 120     GArray * tokens = g_array_new(TRUE, TRUE, sizeof(gchar *));
 121
 122     for ( const gchar * cur = line; *cur; cur = g_utf8_next_char(cur) ){
 123         gunichar unichar = g_utf8_get_char(cur);
 124         const gchar * begin = cur;
 125         gchar * token = NULL;
 126
 127         if ( g_unichar_isspace (unichar) ) {
 128             continue;
 129         }else if ( unichar == quote ) {
 130             /* handles "\"". */
 131             /* skip the first '"'. */
 132             begin = cur = g_utf8_next_char(cur);
 133             while (*cur) {
 134                 unichar = g_utf8_get_char(cur);
 135                 if ( unichar == backslash ) {
 136                     cur = g_utf8_next_char(cur);
 137                     g_return_val_if_fail(*cur, NULL);
 138                 } else if ( unichar == quote ){
 139                     break;
 140                 }
 141                 cur = g_utf8_next_char(cur);
 142             }
 143             gchar * tmp = g_strndup( begin, cur - begin);
 144             /* TODO: switch to own strdup_escape implementation
 145                for \"->" transforming. */
 146             token = g_strdup_printf("%s", tmp);
 147             g_free(tmp);
 148         } else {
 149             /* handles other tokens. */
 150             while(*cur) {
 151                 unichar = g_utf8_get_char(cur);
 152                 if ( g_unichar_isgraph(unichar) ) {
 153                     /* next unichar */
 154                     cur = g_utf8_next_char(cur);
 155                 } else {
 156                     /* space and other characters handles. */
 157                     break;
 158                 }
 159             }
 160             token = g_strndup( begin, cur - begin );
 161         }
 162
 163         g_array_append_val(tokens, token);
 164         if ( !*cur )
 165             break;
 166     }
 167
 168     return (gchar **)g_array_free(tokens, FALSE);
 169 }
 170
 171 bool taglib_read(const char * input_line, int & line_type, GPtrArray * values,
 172                  GHashTable * required){
 173     /* reset values and required. */
 174     g_ptr_array_foreach(values, ptr_array_entry_free, NULL);
 175     g_ptr_array_set_size(values, 0);
 176     g_hash_table_foreach_steal(required, hash_table_key_value_free, NULL);
 177
 178     /* use own version of split_line
 179        instead of g_strsplit_set for special token.*/
 180     char ** tokens = split_line(input_line);
 181     int num_of_tokens = g_strv_length(tokens);
 182
 183     char * line_tag = tokens[0];
 184     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 185
 186     tag_entry * cur_entry = NULL;
 187     /* find line type. */
 188     for ( size_t i = 0; i < tag_array->len; ++i) {
 189         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
 190         if ( strcmp( entry->m_line_tag, line_tag ) == 0 ) {
 191             cur_entry = entry;
 192             break;
 193         }
 194     }
 195
 196     if ( !cur_entry )
 197         return false;
 198
 199     line_type = cur_entry->m_line_type;
 200
 201     for ( int i = 1; i < cur_entry->m_num_of_values + 1; ++i) {
 202         g_return_val_if_fail(i < num_of_tokens, false);
 203         char * value = g_strdup( tokens[i] );
 204         g_ptr_array_add(values, value);
 205     }
 206
 207     int ignored_len = g_strv_length( cur_entry->m_ignored_tags );
 208     int required_len = g_strv_length( cur_entry->m_required_tags);
 209
 210     for ( int i = cur_entry->m_num_of_values + 1; i < num_of_tokens; ++i){
 211         g_return_val_if_fail(i < num_of_tokens, false);
 212         const char * tmp = tokens[i];
 213
 214         /* check ignored tags. */
 215         bool tag_ignored = false;
 216         for ( int m = 0; m < ignored_len; ++m) {
 217             if ( strcmp(tmp, cur_entry->m_ignored_tags[m]) == 0) {
 218                 tag_ignored = true;
 219                 break;
 220             }
 221         }
 222
 223         if ( tag_ignored ) {
 224             ++i;
 225             continue;
 226         }
 227
 228         /* check required tags. */
 229         bool tag_required = false;
 230         for ( int m = 0; m < required_len; ++m) {
 231             if ( strcmp(tmp, cur_entry->m_required_tags[m]) == 0) {
 232                 tag_required = true;
 233                 break;
 234             }
 235         }
 236
 237         /* warning on the un-expected tags. */
 238         if ( !tag_required ) {
 239             g_warning("un-expected tags:%s.\n", tmp);
 240             ++i;
 241             continue;
 242         }
 243
 244         char * key = g_strdup(tokens[i]);
 245         ++i;
 246         g_return_val_if_fail(i < num_of_tokens, false);
 247         char * value = g_strdup(tokens[i]);
 248         g_hash_table_insert(required, key, value);
 249     }
 250
 251     /* check for all required tags. */
 252     for ( int i = 0; i < required_len; ++i) {
 253         const char * required_tag_str = cur_entry->m_required_tags[i];
 254         gboolean result = g_hash_table_lookup_extended(required, required_tag_str, NULL, NULL);
 255         if ( !result ) {
 256             g_warning("missed required tags: %s.\n", required_tag_str);
 257             g_strfreev(tokens);
 258             return false;
 259         }
 260     }
 261
 262     g_strfreev(tokens);
 263     return true;
 264 }
 265
 266 bool taglib_remove_tag(int line_type){
 267     /* Note: duplicate entry check is in taglib_add_tag. */
 268     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 269     for ( size_t i = 0; i < tag_array->len; ++i) {
 270         tag_entry * entry = &g_array_index(tag_array, tag_entry, i);
 271         if (entry->m_line_type != line_type)
 272             continue;
 273         tag_entry_reclaim(entry);
 274         g_array_remove_index(tag_array, i);
 275         return true;
 276     }
 277     return false;
 278 }
 279
 280 bool taglib_push_state(){
 281     assert(g_tagutils_stack->len >= 1);
 282     GArray * next_tag_array = g_array_new(TRUE, TRUE, sizeof(tag_entry));
 283     GArray * prev_tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 284     for ( size_t i = 0; i < prev_tag_array->len; ++i) {
 285         tag_entry * entry = &g_array_index(prev_tag_array, tag_entry, i);
 286         tag_entry new_entry = tag_entry_clone(entry);
 287         g_array_append_val(next_tag_array, new_entry);
 288     }
 289     g_ptr_array_add(g_tagutils_stack, next_tag_array);
 290     return true;
 291 }
 292
 293 bool taglib_pop_state(){
 294     assert(g_tagutils_stack->len > 1);
 295     GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 296     g_ptr_array_remove_index(g_tagutils_stack, g_tagutils_stack->len - 1);
 297     taglib_free_tag_array(tag_array);
 298     return true;
 299 }
 300
 301 bool taglib_fini(){
 302     for ( size_t i = 0; i < g_tagutils_stack->len; ++i){
 303         GArray * tag_array = (GArray *) g_ptr_array_index(g_tagutils_stack, i);
 304         taglib_free_tag_array(tag_array);
 305     }
 306     g_ptr_array_free(g_tagutils_stack, TRUE);
 307     g_tagutils_stack = NULL;
 308     return true;
 309 }
 310
 311 static phrase_token_t taglib_special_string_to_token(const char * string){
 312     struct token_pair{
 313         phrase_token_t token;
 314         const char * string;
 315     };
 316
 317     static const token_pair tokens [] = {
 318         {sentence_start, "<start>"},
 319         {0, NULL}
 320     };
 321
 322     const token_pair * pair = tokens;
 323     while (pair->string) {
 324         if ( strcmp(string, pair->string ) == 0 )
 325             return pair->token;
 326         pair++;
 327     }
 328
 329     fprintf(stderr, "error: unknown token:%s.\n", string);
 330     return 0;
 331 }
 332
 333 phrase_token_t taglib_string_to_token(PhraseLargeTable * phrases, const char * string){
 334     phrase_token_t token = 0;
 335     if ( string[0] == '<' ) {
 336         return taglib_special_string_to_token(string);
 337     }
 338
 339     glong phrase_len = g_utf8_strlen(string, -1);
 340     ucs4_t * phrase = g_utf8_to_ucs4(string, -1, NULL, NULL, NULL);
 341     int result = phrases->search(phrase_len, phrase, token);
 342     if ( !(result & SEARCH_OK) )
 343         fprintf(stderr, "error: unknown token:%s.\n", string);
 344
 345     g_free(phrase);
 346     return token;
 347 }
 348
 349 static const char * taglib_special_token_to_string(phrase_token_t token){
 350     struct token_pair{
 351         phrase_token_t token;
 352         const char * string;
 353     };
 354
 355     static const token_pair tokens [] = {
 356         {sentence_start, "<start>"},
 357         {0, NULL}
 358     };
 359
 360     const token_pair * pair = tokens;
 361     while (pair->token) {
 362         if ( token == pair->token )
 363             return pair->string;
 364         pair++;
 365     }
 366
 367     fprintf(stderr, "error: unknown token:%d.\n", token);
 368     return NULL;
 369 }
 370
 371 char * taglib_token_to_string(FacadePhraseIndex * phrase_index,
 372                               phrase_token_t token) {
 373     PhraseItem item;
 374     ucs4_t buffer[MAX_PHRASE_LENGTH];
 375
 376     gchar * phrase;
 377     /* deal with the special phrase index, for "<start>..." */
 378     if ( PHRASE_INDEX_LIBRARY_INDEX(token) == 0 ) {
 379         return g_strdup(taglib_special_token_to_string(token));
 380     }
 381
 382     int result = phrase_index->get_phrase_item(token, item);
 383     if (result != ERROR_OK) {
 384         fprintf(stderr, "error: unknown token:%d.\n", token);
 385         return NULL;
 386     }
 387
 388     item.get_phrase_string(buffer);
 389     guint8 length = item.get_phrase_length();
 390     phrase = g_ucs4_to_utf8(buffer, length, NULL, NULL, NULL);
 391     return phrase;
 392 }
 393
 394 };