wearable/gst/subparse/samiparse.c

   1 /* GStreamer SAMI subtitle parser
   2  * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Library General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Library General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Library General Public
  15  * License along with this library; if not, write to the
  16  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  17  * Boston, MA 02110-1301, USA.
  18  */
  19
  20 #define _GNU_SOURCE
  21 #include "samiparse.h"
  22
  23 #include <glib.h>
  24 #include <string.h>
  25 #include <stdlib.h>
  26
  27 #define ITALIC_TAG 'i'
  28 #define SPAN_TAG   's'
  29 #define RUBY_TAG   'r'
  30 #define RT_TAG     't'
  31 #define CLEAR_TAG  '0'
  32
  33 typedef struct _HtmlParser HtmlParser;
  34 typedef struct _HtmlContext HtmlContext;
  35 typedef struct _GstSamiContext GstSamiContext;
  36 #ifdef SUBPARSE_MODIFICATION
  37 typedef struct _LanguageStruct  GstLangStruct;
  38 struct _LanguageStruct
  39 {
  40     gchar *language_code;
  41     gchar *language_key;
  42 };
  43 #endif
  44 struct _GstSamiContext
  45 {
  46   GString *buf;                 /* buffer to collect content */
  47   GString *rubybuf;             /* buffer to collect ruby content */
  48   GString *resultbuf;           /* when opening the next 'sync' tag, move
  49                                  * from 'buf' to avoid to append following
  50                                  * content */
  51   GString *state;               /* in many sami files there are tags that
  52                                  * are not closed, so for each open tag the
  53                                  * parser will append a tag flag here so
  54                                  * that tags can be closed properly on
  55                                  * 'sync' tags. See _context_push_state()
  56                                  * and _context_pop_state(). */
  57   HtmlContext *htmlctxt;        /* html parser context */
  58   gboolean has_result;          /* set when ready to push out result */
  59   gboolean in_sync;             /* flag to avoid appending anything except the
  60                                  * content of the sync elements to buf */
  61   guint64 time1;                /* previous start attribute in sync tag */
  62   guint64 time2;                /* current start attribute in sync tag  */
  63 #ifdef SUBPARSE_MODIFICATION
  64   guint64 time3;                /* To store the last current time when language is changed */
  65   GList *lang_list;             /* Language list for an external subtitle file */
  66   gboolean time_set;            /* If language is set already by user */
  67   gchar *current_language;      /* Current language parsed */
  68   gchar *desired_language;      /* Language set by user */
  69   gboolean language_changed;    /* language changed signal */
  70 #endif
  71 };
  72
  73 struct _HtmlParser
  74 {
  75   void (*start_element) (HtmlContext * ctx,
  76       const gchar * name, const gchar ** attr, gpointer user_data);
  77   void (*end_element) (HtmlContext * ctx,
  78       const gchar * name, gpointer user_data);
  79   void (*text) (HtmlContext * ctx,
  80       const gchar * text, gsize text_len, gpointer user_data);
  81 };
  82
  83 struct _HtmlContext
  84 {
  85   const HtmlParser *parser;
  86   gpointer user_data;
  87   GString *buf;
  88 };
  89
  90 static HtmlContext *
  91 html_context_new (HtmlParser * parser, gpointer user_data)
  92 {
  93   HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
  94   ctxt->parser = parser;
  95   ctxt->user_data = user_data;
  96   ctxt->buf = g_string_new (NULL);
  97   return ctxt;
  98 }
  99
 100 static void
 101 html_context_free (HtmlContext * ctxt)
 102 {
 103   g_string_free (ctxt->buf, TRUE);
 104   g_free (ctxt);
 105 }
 106
 107 struct EntityMap
 108 {
 109   const gunichar unescaped;
 110   const gchar *escaped;
 111 };
 112
 113 struct EntityMap XmlEntities[] = {
 114   {34, "quot;"},
 115   {38, "amp;"},
 116   {39, "apos;"},
 117   {60, "lt;"},
 118   {62, "gt;"},
 119   {0, NULL},
 120 };
 121
 122 struct EntityMap HtmlEntities[] = {
 123 /* nbsp will handle manually
 124 { 160,  "nbsp;" }, */
 125   {161, "iexcl;"},
 126   {162, "cent;"},
 127   {163, "pound;"},
 128   {164, "curren;"},
 129   {165, "yen;"},
 130   {166, "brvbar;"},
 131   {167, "sect;"},
 132   {168, "uml;"},
 133   {169, "copy;"},
 134   {170, "ordf;"},
 135   {171, "laquo;"},
 136   {172, "not;"},
 137   {173, "shy;"},
 138   {174, "reg;"},
 139   {175, "macr;"},
 140   {176, "deg;"},
 141   {177, "plusmn;"},
 142   {178, "sup2;"},
 143   {179, "sup3;"},
 144   {180, "acute;"},
 145   {181, "micro;"},
 146   {182, "para;"},
 147   {183, "middot;"},
 148   {184, "cedil;"},
 149   {185, "sup1;"},
 150   {186, "ordm;"},
 151   {187, "raquo;"},
 152   {188, "frac14;"},
 153   {189, "frac12;"},
 154   {190, "frac34;"},
 155   {191, "iquest;"},
 156   {192, "Agrave;"},
 157   {193, "Aacute;"},
 158   {194, "Acirc;"},
 159   {195, "Atilde;"},
 160   {196, "Auml;"},
 161   {197, "Aring;"},
 162   {198, "AElig;"},
 163   {199, "Ccedil;"},
 164   {200, "Egrave;"},
 165   {201, "Eacute;"},
 166   {202, "Ecirc;"},
 167   {203, "Euml;"},
 168   {204, "Igrave;"},
 169   {205, "Iacute;"},
 170   {206, "Icirc;"},
 171   {207, "Iuml;"},
 172   {208, "ETH;"},
 173   {209, "Ntilde;"},
 174   {210, "Ograve;"},
 175   {211, "Oacute;"},
 176   {212, "Ocirc;"},
 177   {213, "Otilde;"},
 178   {214, "Ouml;"},
 179   {215, "times;"},
 180   {216, "Oslash;"},
 181   {217, "Ugrave;"},
 182   {218, "Uacute;"},
 183   {219, "Ucirc;"},
 184   {220, "Uuml;"},
 185   {221, "Yacute;"},
 186   {222, "THORN;"},
 187   {223, "szlig;"},
 188   {224, "agrave;"},
 189   {225, "aacute;"},
 190   {226, "acirc;"},
 191   {227, "atilde;"},
 192   {228, "auml;"},
 193   {229, "aring;"},
 194   {230, "aelig;"},
 195   {231, "ccedil;"},
 196   {232, "egrave;"},
 197   {233, "eacute;"},
 198   {234, "ecirc;"},
 199   {235, "euml;"},
 200   {236, "igrave;"},
 201   {237, "iacute;"},
 202   {238, "icirc;"},
 203   {239, "iuml;"},
 204   {240, "eth;"},
 205   {241, "ntilde;"},
 206   {242, "ograve;"},
 207   {243, "oacute;"},
 208   {244, "ocirc;"},
 209   {245, "otilde;"},
 210   {246, "ouml;"},
 211   {247, "divide;"},
 212   {248, "oslash;"},
 213   {249, "ugrave;"},
 214   {250, "uacute;"},
 215   {251, "ucirc;"},
 216   {252, "uuml;"},
 217   {253, "yacute;"},
 218   {254, "thorn;"},
 219   {255, "yuml;"},
 220   {338, "OElig;"},
 221   {339, "oelig;"},
 222   {352, "Scaron;"},
 223   {353, "scaron;"},
 224   {376, "Yuml;"},
 225   {402, "fnof;"},
 226   {710, "circ;"},
 227   {732, "tilde;"},
 228   {913, "Alpha;"},
 229   {914, "Beta;"},
 230   {915, "Gamma;"},
 231   {916, "Delta;"},
 232   {917, "Epsilon;"},
 233   {918, "Zeta;"},
 234   {919, "Eta;"},
 235   {920, "Theta;"},
 236   {921, "Iota;"},
 237   {922, "Kappa;"},
 238   {923, "Lambda;"},
 239   {924, "Mu;"},
 240   {925, "Nu;"},
 241   {926, "Xi;"},
 242   {927, "Omicron;"},
 243   {928, "Pi;"},
 244   {929, "Rho;"},
 245   {931, "Sigma;"},
 246   {932, "Tau;"},
 247   {933, "Upsilon;"},
 248   {934, "Phi;"},
 249   {935, "Chi;"},
 250   {936, "Psi;"},
 251   {937, "Omega;"},
 252   {945, "alpha;"},
 253   {946, "beta;"},
 254   {947, "gamma;"},
 255   {948, "delta;"},
 256   {949, "epsilon;"},
 257   {950, "zeta;"},
 258   {951, "eta;"},
 259   {952, "theta;"},
 260   {953, "iota;"},
 261   {954, "kappa;"},
 262   {955, "lambda;"},
 263   {956, "mu;"},
 264   {957, "nu;"},
 265   {958, "xi;"},
 266   {959, "omicron;"},
 267   {960, "pi;"},
 268   {961, "rho;"},
 269   {962, "sigmaf;"},
 270   {963, "sigma;"},
 271   {964, "tau;"},
 272   {965, "upsilon;"},
 273   {966, "phi;"},
 274   {967, "chi;"},
 275   {968, "psi;"},
 276   {969, "omega;"},
 277   {977, "thetasym;"},
 278   {978, "upsih;"},
 279   {982, "piv;"},
 280   {8194, "ensp;"},
 281   {8195, "emsp;"},
 282   {8201, "thinsp;"},
 283   {8204, "zwnj;"},
 284   {8205, "zwj;"},
 285   {8206, "lrm;"},
 286   {8207, "rlm;"},
 287   {8211, "ndash;"},
 288   {8212, "mdash;"},
 289   {8216, "lsquo;"},
 290   {8217, "rsquo;"},
 291   {8218, "sbquo;"},
 292   {8220, "ldquo;"},
 293   {8221, "rdquo;"},
 294   {8222, "bdquo;"},
 295   {8224, "dagger;"},
 296   {8225, "Dagger;"},
 297   {8226, "bull;"},
 298   {8230, "hellip;"},
 299   {8240, "permil;"},
 300   {8242, "prime;"},
 301   {8243, "Prime;"},
 302   {8249, "lsaquo;"},
 303   {8250, "rsaquo;"},
 304   {8254, "oline;"},
 305   {8260, "frasl;"},
 306   {8364, "euro;"},
 307   {8465, "image;"},
 308   {8472, "weierp;"},
 309   {8476, "real;"},
 310   {8482, "trade;"},
 311   {8501, "alefsym;"},
 312   {8592, "larr;"},
 313   {8593, "uarr;"},
 314   {8594, "rarr;"},
 315   {8595, "darr;"},
 316   {8596, "harr;"},
 317   {8629, "crarr;"},
 318   {8656, "lArr;"},
 319   {8657, "uArr;"},
 320   {8658, "rArr;"},
 321   {8659, "dArr;"},
 322   {8660, "hArr;"},
 323   {8704, "forall;"},
 324   {8706, "part;"},
 325   {8707, "exist;"},
 326   {8709, "empty;"},
 327   {8711, "nabla;"},
 328   {8712, "isin;"},
 329   {8713, "notin;"},
 330   {8715, "ni;"},
 331   {8719, "prod;"},
 332   {8721, "sum;"},
 333   {8722, "minus;"},
 334   {8727, "lowast;"},
 335   {8730, "radic;"},
 336   {8733, "prop;"},
 337   {8734, "infin;"},
 338   {8736, "ang;"},
 339   {8743, "and;"},
 340   {8744, "or;"},
 341   {8745, "cap;"},
 342   {8746, "cup;"},
 343   {8747, "int;"},
 344   {8756, "there4;"},
 345   {8764, "sim;"},
 346   {8773, "cong;"},
 347   {8776, "asymp;"},
 348   {8800, "ne;"},
 349   {8801, "equiv;"},
 350   {8804, "le;"},
 351   {8805, "ge;"},
 352   {8834, "sub;"},
 353   {8835, "sup;"},
 354   {8836, "nsub;"},
 355   {8838, "sube;"},
 356   {8839, "supe;"},
 357   {8853, "oplus;"},
 358   {8855, "otimes;"},
 359   {8869, "perp;"},
 360   {8901, "sdot;"},
 361   {8968, "lceil;"},
 362   {8969, "rceil;"},
 363   {8970, "lfloor;"},
 364   {8971, "rfloor;"},
 365   {9001, "lang;"},
 366   {9002, "rang;"},
 367   {9674, "loz;"},
 368   {9824, "spades;"},
 369   {9827, "clubs;"},
 370   {9829, "hearts;"},
 371   {9830, "diams;"},
 372   {0, NULL},
 373 };
 374
 375 static gchar *
 376 unescape_string (const gchar * text)
 377 {
 378   gint i;
 379   GString *unescaped = g_string_new (NULL);
 380
 381   while (*text) {
 382     if (*text == '&') {
 383       text++;
 384
 385       /* unescape &nbsp and &nbsp; */
 386       if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
 387         unescaped = g_string_append_unichar (unescaped, 160);
 388         text += 4;
 389         if (*text == ';') {
 390           text++;
 391         }
 392         goto next;
 393       }
 394
 395       /* pass xml entities. these will be processed as pango markup */
 396       for (i = 0; XmlEntities[i].escaped; i++) {
 397         gssize len = strlen (XmlEntities[i].escaped);
 398         if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
 399           unescaped = g_string_append_c (unescaped, '&');
 400           unescaped =
 401               g_string_append_len (unescaped, XmlEntities[i].escaped, len);
 402           text += len;
 403           goto next;
 404         }
 405       }
 406
 407       /* convert html entities */
 408       for (i = 0; HtmlEntities[i].escaped; i++) {
 409         gssize len = strlen (HtmlEntities[i].escaped);
 410         if (!strncmp (text, HtmlEntities[i].escaped, len)) {
 411           unescaped =
 412               g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
 413           text += len;
 414           goto next;
 415         }
 416       }
 417
 418       if (*text == '#') {
 419         gboolean is_hex = FALSE;
 420         gunichar l;
 421         gchar *end = NULL;
 422
 423         text++;
 424         if (*text == 'x') {
 425           is_hex = TRUE;
 426           text++;
 427         }
 428         errno = 0;
 429         if (is_hex) {
 430           l = strtoul (text, &end, 16);
 431         } else {
 432           l = strtoul (text, &end, 10);
 433         }
 434
 435         if (text == end || errno != 0) {
 436           /* error occured. pass it */
 437           goto next;
 438         }
 439         unescaped = g_string_append_unichar (unescaped, l);
 440         text = end;
 441
 442         if (*text == ';') {
 443           text++;
 444         }
 445         goto next;
 446       }
 447
 448       /* escape & */
 449       unescaped = g_string_append (unescaped, "&amp;");
 450
 451     next:
 452       continue;
 453
 454     } else if (g_ascii_isspace (*text)) {
 455       unescaped = g_string_append_c (unescaped, ' ');
 456       /* strip whitespace */
 457       do {
 458         text++;
 459       } while ((*text) && g_ascii_isspace (*text));
 460     } else {
 461       unescaped = g_string_append_c (unescaped, *text);
 462       text++;
 463     }
 464   }
 465
 466   return g_string_free (unescaped, FALSE);
 467 }
 468
 469 static const gchar *
 470 string_token (const gchar * string, const gchar * delimiter, gchar ** first)
 471 {
 472   gchar *next = strstr (string, delimiter);
 473   if (next) {
 474     *first = g_strndup (string, next - string);
 475   } else {
 476     *first = g_strdup (string);
 477   }
 478   return next;
 479 }
 480
 481 static void
 482 html_context_handle_element (HtmlContext * ctxt,
 483     const gchar * string, gboolean must_close)
 484 {
 485   gchar *name = NULL;
 486   gint count = 0, i;
 487   gchar **attrs;
 488   const gchar *found, *next;
 489 #ifdef SUBPARSE_MODIFICATION
 490   const gchar *name_temp = NULL;
 491   gint j = 0;
 492 #endif
 493   /* split element name and attributes */
 494   next = string_token (string, " ", &name);
 495
 496   if (next) {
 497     /* count attributes */
 498     found = next + 1;
 499     while (TRUE) {
 500       found = strchr (found, '=');
 501       if (!found)
 502         break;
 503       found++;
 504       count++;
 505     }
 506   } else {
 507     count = 0;
 508   }
 509
 510   attrs = g_new0 (gchar *, (count + 1) * 2);
 511
 512   for (i = 0; i < count; i += 2) {
 513     gchar *attr_name = NULL, *attr_value = NULL;
 514     gsize length;
 515     next = string_token (next + 1, "=", &attr_name);
 516     next = string_token (next + 1, " ", &attr_value);
 517
 518     /* strip " or ' from attribute value */
 519     if (attr_value[0] == '"' || attr_value[0] == '\'') {
 520       gchar *tmp = g_strdup (attr_value + 1);
 521       g_free (attr_value);
 522       attr_value = tmp;
 523     }
 524
 525     length = strlen (attr_value);
 526     if (attr_value[length - 1] == '"' || attr_value[length - 1] == '\'') {
 527       attr_value[length - 1] = '\0';
 528     }
 529
 530     attrs[i] = attr_name;
 531     attrs[i + 1] = attr_value;
 532   }
 533 #ifdef SUBPARSE_MODIFICATION
 534   /* sometimes spaces can be there in between !-- and P
 535    * that also we have to take care */
 536   if (!g_ascii_strcasecmp("!--", name)) {
 537     gchar* tempchar = (gchar*)(string + 3);
 538     while (*tempchar == ' ') {
 539       tempchar++;
 540       if (*tempchar == 'P' || *tempchar == 'p') {
 541         *(name + 3) = *tempchar;
 542         *(name + 4) = '\0';
 543         next = tempchar + 1;
 544         break;
 545       }
 546     }
 547   }
 548   if (next && (!g_ascii_strcasecmp("!--P", name))) {
 549     gint attrindex = 0;
 550     count = 0;
 551     /* count attributes */
 552     found = next + 1;
 553     while (TRUE) {
 554       found = (gchar*)strcasestr (found, "lang");
 555       if (!found)
 556         break;
 557       found++;
 558       count++;
 559     }
 560     g_strfreev (attrs);
 561
 562     attrs = g_new0 (gchar *, count * 2);
 563
 564     for (i = 0; i < count; i++) {
 565       gchar *attr_name = NULL, *attr_value = NULL;
 566
 567       next = (gchar*)strcasestr (next, "lang:");
 568       attr_value = (gchar*)malloc (3);
 569       next = next + 5;
 570       strncpy (attr_value, next, 2);
 571       attr_value[2] = '\0';
 572       GST_LOG ("Language value comes as %s", attr_value);
 573       name_temp = next;
 574       while (TRUE) {
 575         if (*name_temp == '{') {
 576           int character_count = 0;
 577
 578           while (TRUE) {
 579             name_temp--;
 580
 581             if (*name_temp == '.') {
 582               attr_name = (gchar*) malloc (character_count + 1);
 583               break;
 584             }
 585             else if (*name_temp != ' ')
 586               character_count++;
 587           }
 588           break;
 589         }
 590         name_temp--;
 591       }
 592       name_temp++;
 593       for (j = 0; *(name_temp + j) != ' '; j++) {
 594         attr_name[j] = *(name_temp + j);
 595       }
 596       attr_name[j] = '\0';
 597       attrs[attrindex++] = attr_name;
 598       attrs[attrindex++] = attr_value;
 599     }
 600   } else {
 601     count = 0;
 602   }
 603 #endif
 604   ctxt->parser->start_element (ctxt, name,
 605       (const gchar **) attrs, ctxt->user_data);
 606   if (must_close) {
 607     ctxt->parser->end_element (ctxt, name, ctxt->user_data);
 608   }
 609   g_strfreev (attrs);
 610   g_free (name);
 611 }
 612
 613 static void
 614 html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
 615 {
 616   const gchar *next = NULL;
 617   ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
 618   next = ctxt->buf->str;
 619   while (TRUE) {
 620     if (next[0] == '<') {
 621       gchar *element = NULL;
 622       /* find <blahblah> */
 623       if (!strchr (next, '>')) {
 624         /* no tag end point. buffer will be process in next time */
 625         return;
 626       }
 627
 628       next = string_token (next, ">", &element);
 629       next++;
 630       if (g_str_has_suffix (next, "/")) {
 631         /* handle <blah/> */
 632         element[strlen (element) - 1] = '\0';
 633         html_context_handle_element (ctxt, element + 1, TRUE);
 634       } else if (element[1] == '/') {
 635         /* handle </blah> */
 636         ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
 637       } else {
 638         /* handle <blah> */
 639         html_context_handle_element (ctxt, element + 1, FALSE);
 640       }
 641       g_free (element);
 642     } else if (strchr (next, '<')) {
 643       gchar *text = NULL;
 644       gsize length;
 645       next = string_token (next, "<", &text);
 646       text = g_strstrip (text);
 647       length = strlen (text);
 648       ctxt->parser->text (ctxt, text, length, ctxt->user_data);
 649       g_free (text);
 650
 651     } else {
 652       gchar *text = (gchar *) next;
 653       gsize length;
 654       text = g_strstrip (text);
 655       length = strlen (text);
 656       ctxt->parser->text (ctxt, text, length, ctxt->user_data);
 657       ctxt->buf = g_string_assign (ctxt->buf, "");
 658       return;
 659     }
 660   }
 661
 662   ctxt->buf = g_string_assign (ctxt->buf, next);
 663 }
 664
 665 static gchar *
 666 has_tag (GString * str, const gchar tag)
 667 {
 668   return strrchr (str->str, tag);
 669 }
 670
 671 static void
 672 sami_context_push_state (GstSamiContext * sctx, char state)
 673 {
 674   GST_LOG ("state %c", state);
 675   g_string_append_c (sctx->state, state);
 676 }
 677
 678 static void
 679 sami_context_pop_state (GstSamiContext * sctx, char state)
 680 {
 681   GString *str = g_string_new ("");
 682   GString *context_state = sctx->state;
 683   int i;
 684
 685   GST_LOG ("state %c", state);
 686   for (i = context_state->len - 1; i >= 0; i--) {
 687     switch (context_state->str[i]) {
 688       case ITALIC_TAG:         /* <i> */
 689       {
 690         g_string_append (str, "</i>");
 691         break;
 692       }
 693       case SPAN_TAG:           /* <span foreground= > */
 694       {
 695         g_string_append (str, "</span>");
 696         break;
 697       }
 698       case RUBY_TAG:           /* <span size= >  -- ruby */
 699       {
 700         break;
 701       }
 702       case RT_TAG:             /*  ruby */
 703       {
 704         /* FIXME: support for furigana/ruby once implemented in pango */
 705         g_string_append (sctx->rubybuf, "</span>");
 706         if (has_tag (context_state, ITALIC_TAG)) {
 707           g_string_append (sctx->rubybuf, "</i>");
 708         }
 709
 710         break;
 711       }
 712       default:
 713         break;
 714     }
 715     if (context_state->str[i] == state) {
 716       g_string_append (sctx->buf, str->str);
 717       g_string_free (str, TRUE);
 718       g_string_truncate (context_state, i);
 719       return;
 720     }
 721   }
 722   if (state == CLEAR_TAG) {
 723     g_string_append (sctx->buf, str->str);
 724     g_string_truncate (context_state, 0);
 725   }
 726   g_string_free (str, TRUE);
 727 }
 728
 729 static void
 730 handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
 731 {
 732   int i;
 733
 734   sami_context_pop_state (sctx, CLEAR_TAG);
 735   if (atts != NULL) {
 736     for (i = 0; (atts[i] != NULL); i += 2) {
 737       const gchar *key, *value;
 738
 739       key = atts[i];
 740       value = atts[i + 1];
 741
 742       if (!value)
 743         continue;
 744       if (!g_ascii_strcasecmp ("start", key)) {
 745         /* Only set a new start time if we don't have text pending */
 746         if (sctx->resultbuf->len == 0)
 747           sctx->time1 = sctx->time2;
 748
 749         sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
 750 #ifdef SUBPARSE_MODIFICATION
 751         sctx->time3 = sctx->time2;
 752 #endif
 753         sctx->time2 = MAX (sctx->time2, sctx->time1);
 754         g_string_append (sctx->resultbuf, sctx->buf->str);
 755         sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
 756         g_string_truncate (sctx->buf, 0);
 757       }
 758     }
 759   }
 760 }
 761
 762 static void
 763 handle_start_font (GstSamiContext * sctx, const gchar ** atts)
 764 {
 765   int i;
 766
 767   sami_context_pop_state (sctx, SPAN_TAG);
 768   if (atts != NULL) {
 769     g_string_append (sctx->buf, "<span");
 770     for (i = 0; (atts[i] != NULL); i += 2) {
 771       const gchar *key, *value;
 772
 773       key = atts[i];
 774       value = atts[i + 1];
 775
 776       if (!value)
 777         continue;
 778       if (!g_ascii_strcasecmp ("color", key)) {
 779         /*
 780          * There are invalid color value in many
 781          * sami files.
 782          * It will fix hex color value that start without '#'
 783          */
 784         const gchar *sharp = "";
 785         int len = strlen (value);
 786
 787         if (!(*value == '#' && len == 7)) {
 788           gchar *r;
 789
 790           /* check if it looks like hex */
 791           if (strtol ((const char *) value, &r, 16) >= 0 &&
 792               ((gchar *) r == (value + 6) && len == 6)) {
 793             sharp = "#";
 794           }
 795         }
 796         /* some colours can be found in many sami files, but X RGB database
 797          * doesn't contain a colour by this name, so map explicitly */
 798         if (!g_ascii_strcasecmp ("aqua", value)) {
 799           value = "#00ffff";
 800         } else if (!g_ascii_strcasecmp ("crimson", value)) {
 801           value = "#dc143c";
 802         } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
 803           value = "#ff00ff";
 804         } else if (!g_ascii_strcasecmp ("indigo", value)) {
 805           value = "#4b0082";
 806         } else if (!g_ascii_strcasecmp ("lime", value)) {
 807           value = "#00ff00";
 808         } else if (!g_ascii_strcasecmp ("olive", value)) {
 809           value = "#808000";
 810         } else if (!g_ascii_strcasecmp ("silver", value)) {
 811           value = "#c0c0c0";
 812         } else if (!g_ascii_strcasecmp ("teal", value)) {
 813           value = "#008080";
 814         }
 815         g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
 816             value);
 817       } else if (!g_ascii_strcasecmp ("face", key)) {
 818         g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
 819       }
 820     }
 821     g_string_append_c (sctx->buf, '>');
 822     sami_context_push_state (sctx, SPAN_TAG);
 823   }
 824 }
 825
 826 #ifdef SUBPARSE_MODIFICATION
 827 static void
 828 handle_p (GstSamiContext * sctx, const gchar ** atts)
 829 {
 830   int i;
 831
 832   if (atts != NULL) {
 833     for (i = 0; (atts[i] != NULL); i += 2) {
 834       const gchar *key, *value;
 835
 836       key = atts[i];
 837       value = atts[i + 1];
 838
 839       if (sctx->current_language && value && strcmp(sctx->current_language, value))
 840         sctx->language_changed = TRUE;
 841
 842       else if (!sctx->current_language)
 843         sctx->current_language = (gchar*) malloc (128);
 844
 845       if (key && !g_ascii_strcasecmp ("class", key) && value) {
 846         strcpy (sctx->current_language, value);
 847       }
 848       if (sctx->language_changed)
 849       {
 850          sctx->time1 = sctx->time3;
 851          sctx->time2 = sctx->time1;
 852          sctx->time_set = FALSE;
 853          sctx->language_changed = FALSE;
 854       }
 855       if (!value)
 856         continue;
 857     }
 858   }
 859 }
 860
 861 static void
 862 handle_start_language_list (GstSamiContext * sctx, const gchar ** atts)
 863 {
 864   int i = 0;
 865   int attrIndex = 0;
 866   GstLangStruct *new = NULL;
 867   GstLangStruct *temp = NULL;
 868
 869   if (atts != NULL) {
 870     for (i = 0; (atts[attrIndex] != NULL); i++) {
 871       const gchar *key, *value;
 872
 873       key = atts[attrIndex++];
 874       value = atts[attrIndex++];
 875
 876       GST_LOG ("Inside handle_start_language_list key: %s, value: %s", key, value);
 877
 878       if (!value)
 879         continue;
 880
 881       new = g_new0 (GstLangStruct, 1);
 882       new->language_code = (gchar*) malloc (strlen(value) + 1);
 883       if (new->language_code && value)
 884         strcpy (new->language_code, value);
 885       new->language_key = (gchar*) malloc (strlen(key) + 1);
 886       if (new->language_key && key)
 887         strcpy (new->language_key, key);
 888       sctx->lang_list = g_list_append (sctx->lang_list, new);
 889       temp = g_list_nth_data (sctx->lang_list, i);
 890       if (sctx->desired_language == NULL && key){
 891         sctx->desired_language = (gchar*) malloc (strlen(key) + 1);
 892         strcpy(sctx->desired_language, key);
 893       }
 894
 895       if (temp)
 896         GST_LOG ("Inside handle_start_language_list of glist key: %s, value: %s",
 897                     temp->language_key, temp->language_code);
 898     }
 899   }
 900 }
 901 #endif
 902
 903 static void
 904 handle_start_element (HtmlContext * ctx, const gchar * name,
 905     const char **atts, gpointer user_data)
 906 {
 907   GstSamiContext *sctx = (GstSamiContext *) user_data;
 908
 909   GST_LOG ("name:%s", name);
 910
 911   if (!g_ascii_strcasecmp ("sync", name)) {
 912     handle_start_sync (sctx, atts);
 913     sctx->in_sync = TRUE;
 914   } else if (!g_ascii_strcasecmp ("font", name)) {
 915     handle_start_font (sctx, atts);
 916   } else if (!g_ascii_strcasecmp ("ruby", name)) {
 917     sami_context_push_state (sctx, RUBY_TAG);
 918   } else if (!g_ascii_strcasecmp ("br", name)) {
 919 #ifdef SUBPARSE_MODIFICATION
 920     if (sctx->current_language && sctx->desired_language &&
 921         !strcmp(sctx->current_language, sctx->desired_language))
 922 #endif
 923       g_string_append_c (sctx->buf, '\n');
 924     /* FIXME: support for furigana/ruby once implemented in pango */
 925   } else if (!g_ascii_strcasecmp ("rt", name)) {
 926     if (has_tag (sctx->state, ITALIC_TAG)) {
 927       g_string_append (sctx->rubybuf, "<i>");
 928     }
 929     g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
 930     sami_context_push_state (sctx, RT_TAG);
 931   } else if (!g_ascii_strcasecmp ("i", name)) {
 932 #ifdef SUBPARSE_MODIFICATION
 933     if (sctx->current_language && sctx->desired_language &&
 934         !strcmp(sctx->current_language, sctx->desired_language))
 935 #endif
 936       g_string_append (sctx->buf, "<i>");
 937     sami_context_push_state (sctx, ITALIC_TAG);
 938   } else if (!g_ascii_strcasecmp ("p", name)) {
 939 #ifdef SUBPARSE_MODIFICATION
 940     handle_p (sctx, atts);
 941   } else if (!g_ascii_strcasecmp ("!--P", name)) {
 942     handle_start_language_list (sctx, atts);
 943 #endif
 944   }
 945 }
 946
 947 static void
 948 handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
 949 {
 950   GstSamiContext *sctx = (GstSamiContext *) user_data;
 951
 952   GST_LOG ("name:%s", name);
 953
 954   if (!g_ascii_strcasecmp ("sync", name)) {
 955     sctx->in_sync = FALSE;
 956   } else if ((!g_ascii_strcasecmp ("body", name)) ||
 957       (!g_ascii_strcasecmp ("sami", name))) {
 958     /* We will usually have one buffer left when the body is closed
 959      * as we need the next sync to actually send it */
 960     if (sctx->buf->len != 0) {
 961       /* Only set a new start time if we don't have text pending */
 962       if (sctx->resultbuf->len == 0)
 963         sctx->time1 = sctx->time2;
 964
 965       sctx->time2 = GST_CLOCK_TIME_NONE;
 966       g_string_append (sctx->resultbuf, sctx->buf->str);
 967       sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
 968       g_string_truncate (sctx->buf, 0);
 969     }
 970   } else if (!g_ascii_strcasecmp ("font", name)) {
 971     sami_context_pop_state (sctx, SPAN_TAG);
 972   } else if (!g_ascii_strcasecmp ("ruby", name)) {
 973     sami_context_pop_state (sctx, RUBY_TAG);
 974   } else if (!g_ascii_strcasecmp ("i", name)) {
 975     sami_context_pop_state (sctx, ITALIC_TAG);
 976   }
 977 }
 978
 979 static void
 980 handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
 981     gpointer user_data)
 982 {
 983   GstSamiContext *sctx = (GstSamiContext *) user_data;
 984
 985   /* Skip everything except content of the sync elements */
 986   if (!sctx->in_sync)
 987     return;
 988 #ifdef SUBPARSE_MODIFICATION
 989   if (has_tag (sctx->state, RT_TAG) && (sctx->current_language && sctx->desired_language &&
 990        !strcmp(sctx->current_language, sctx->desired_language))) {
 991 #else
 992   if (has_tag (sctx->state, RT_TAG)) {
 993 #endif
 994     g_string_append_c (sctx->rubybuf, ' ');
 995     g_string_append (sctx->rubybuf, text);
 996     g_string_append_c (sctx->rubybuf, ' ');
 997   } else {
 998 #ifdef SUBPARSE_MODIFICATION
 999     if (sctx->current_language && sctx->desired_language &&
1000         !strcmp(sctx->current_language, sctx->desired_language))
1001 #endif
1002       g_string_append (sctx->buf, text);
1003   }
1004 }
1005
1006 static HtmlParser samiParser = {
1007   handle_start_element,         /* start_element */
1008   handle_end_element,           /* end_element */
1009   handle_text,                  /* text */
1010 };
1011
1012 void
1013 sami_context_init (ParserState * state)
1014 {
1015   GstSamiContext *context;
1016
1017   g_assert (state->user_data == NULL);
1018
1019   context = g_new0 (GstSamiContext, 1);
1020
1021   context->htmlctxt = html_context_new (&samiParser, context);
1022   context->buf = g_string_new ("");
1023   context->rubybuf = g_string_new ("");
1024   context->resultbuf = g_string_new ("");
1025   context->state = g_string_new ("");
1026 #ifdef SUBPARSE_MODIFICATION
1027   context->current_language = NULL;
1028   context->desired_language = NULL;
1029   context->time_set = FALSE;
1030   context->lang_list = NULL;
1031   context->language_changed = FALSE;
1032 #endif
1033   state->user_data = context;
1034 }
1035
1036 void
1037 sami_context_deinit (ParserState * state)
1038 {
1039   GstSamiContext *context = (GstSamiContext *) state->user_data;
1040 #ifdef SUBPARSE_MODIFICATION
1041   GstLangStruct *temp = NULL;
1042   int i = 0;
1043 #endif
1044   if (context) {
1045     html_context_free (context->htmlctxt);
1046     context->htmlctxt = NULL;
1047     g_string_free (context->buf, TRUE);
1048     g_string_free (context->rubybuf, TRUE);
1049     g_string_free (context->resultbuf, TRUE);
1050     g_string_free (context->state, TRUE);
1051 #ifdef SUBPARSE_MODIFICATION
1052     if (context->lang_list) {
1053       while ((temp = g_list_nth_data (context->lang_list, i))) {
1054         if (temp->language_code)
1055           free (temp->language_code);
1056         temp->language_code = NULL;
1057         if (temp->language_key)
1058           free (temp->language_key);
1059         temp->language_key = NULL;
1060         g_free (temp);
1061         i++;
1062       }
1063       g_list_free (context->lang_list);
1064     }
1065     context->lang_list = NULL;
1066
1067     if (context->current_language)
1068       free (context->current_language);
1069     context->current_language = NULL;
1070
1071     context->desired_language = NULL;
1072 #endif
1073     g_free (context);
1074     state->user_data = NULL;
1075   }
1076 }
1077
1078 void
1079 sami_context_reset (ParserState * state)
1080 {
1081   GstSamiContext *context = (GstSamiContext *) state->user_data;
1082
1083   if (context) {
1084     g_string_truncate (context->buf, 0);
1085     g_string_truncate (context->rubybuf, 0);
1086     g_string_truncate (context->resultbuf, 0);
1087     g_string_truncate (context->state, 0);
1088     context->has_result = FALSE;
1089     context->in_sync = FALSE;
1090     context->time1 = 0;
1091     context->time2 = 0;
1092   }
1093 }
1094
#ifdef SUBPARSE_MODIFICATION
/* Make state->current_language the desired language of the SAMI context.
 *
 * The previously desired language string is freed and replaced by the
 * pointer held in state->current_language (the pointer is taken over, not
 * copied), and time_set is raised.  Does nothing if state has no context.
 */
void
sami_context_change_language (ParserState * state)
{
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  if (context == NULL)
    return;

  GST_LOG ("**********desired language was %s**************", context->desired_language);

  /* parse_sami() copies desired_language into state->current_language, so
   * both may refer to the very same string.  Freeing it in that case would
   * leave a dangling pointer behind; only release the old string when a
   * different one is taking its place. */
  if (context->desired_language != state->current_language) {
    free (context->desired_language);
    context->desired_language = state->current_language;
  }

  context->time_set = TRUE;
  GST_LOG ("desired language changed to %s", context->desired_language);
}
#endif
1107
1108 gchar *
1109 parse_sami (ParserState * state, const gchar * line)
1110 {
1111   gchar *ret = NULL;
1112   GstSamiContext *context = (GstSamiContext *) state->user_data;
1113
1114   gchar *unescaped = unescape_string (line);
1115   html_context_parse (context->htmlctxt, (gchar *) unescaped,
1116       strlen (unescaped));
1117 #ifdef SUBPARSE_MODIFICATION
1118   if (context->lang_list)
1119     state->language_list = context->lang_list;
1120
1121   if (context->desired_language)
1122     state->current_language = context->desired_language;
1123 #endif
1124   g_free (unescaped);
1125 #ifdef SUBPARSE_MODIFICATION
1126   if (context->desired_language && context->current_language) {
1127     if (!strcmp(context->current_language, context->desired_language)) {
1128 #endif
1129       if (context->has_result) {
1130         if (context->rubybuf->len) {
1131           context->rubybuf = g_string_append_c (context->rubybuf, '\n');
1132           g_string_prepend (context->resultbuf, context->rubybuf->str);
1133           context->rubybuf = g_string_truncate (context->rubybuf, 0);
1134         }
1135
1136         ret = g_string_free (context->resultbuf, FALSE);
1137         context->resultbuf = g_string_new ("");
1138         state->start_time = context->time1;
1139         state->duration = context->time2 - context->time1;
1140         context->has_result = FALSE;
1141       }
1142 #ifdef SUBPARSE_MODIFICATION
1143     }
1144   }
1145 #endif
1146   return ret;
1147 }