gst/subparse/samiparse.c

   1 /* GStreamer SAMI subtitle parser
   2  * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Library General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Library General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Library General Public
  15  * License along with this library; if not, write to the
  16  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  17  * Boston, MA 02110-1301, USA.
  18  */
  19
  20 #define _GNU_SOURCE
  21 #include "samiparse.h"
  22
  23 #include <glib.h>
  24 #include <string.h>
  25 #include <stdlib.h>
  26
  27 #define ITALIC_TAG 'i'
  28 #define SPAN_TAG   's'
  29 #define RUBY_TAG   'r'
  30 #define RT_TAG     't'
  31 #define CLEAR_TAG  '0'
  32
  33 typedef struct _HtmlParser HtmlParser;
  34 typedef struct _HtmlContext HtmlContext;
  35 typedef struct _GstSamiContext GstSamiContext;
  36 #ifdef SUBPARSE_MODIFICATION
  37 typedef struct _LanguageStruct  GstLangStruct;
  38 struct _LanguageStruct
  39 {
  40     gchar *language_code;
  41     gchar *language_key;
  42 };
  43 #define MAX_LANGUAGE 10
  44 #endif
  45 struct _GstSamiContext
  46 {
  47   GString *buf;                 /* buffer to collect content */
  48   GString *rubybuf;             /* buffer to collect ruby content */
  49   GString *resultbuf;           /* when opening the next 'sync' tag, move
  50                                  * from 'buf' to avoid to append following
  51                                  * content */
  52   GString *state;               /* in many sami files there are tags that
  53                                  * are not closed, so for each open tag the
  54                                  * parser will append a tag flag here so
  55                                  * that tags can be closed properly on
  56                                  * 'sync' tags. See _context_push_state()
  57                                  * and _context_pop_state(). */
  58   HtmlContext *htmlctxt;        /* html parser context */
  59   gboolean has_result;          /* set when ready to push out result */
  60   gboolean in_sync;             /* flag to avoid appending anything except the
  61                                  * content of the sync elements to buf */
  62   guint64 time1;                /* previous start attribute in sync tag */
  63   guint64 time2;                /* current start attribute in sync tag  */
  64 #ifdef SUBPARSE_MODIFICATION
  65   guint64 time3;                /* To store the last current time when language is changed */
  66   GList *lang_list;             /* Language list for an external subtitle file */
  67   gchar *current_language;      /* Current language parsed */
  68   gchar *desired_language;      /* Language set by user */
  69   gboolean language_changed;    /* language changed signal */
  70   gboolean end_body;            /* </BODY> reached */
  71 #endif
  72 };
  73
  74 struct _HtmlParser
  75 {
  76   void (*start_element) (HtmlContext * ctx,
  77       const gchar * name, const gchar ** attr, gpointer user_data);
  78   void (*end_element) (HtmlContext * ctx,
  79       const gchar * name, gpointer user_data);
  80   void (*text) (HtmlContext * ctx,
  81       const gchar * text, gsize text_len, gpointer user_data);
  82 };
  83
  84 struct _HtmlContext
  85 {
  86   const HtmlParser *parser;
  87   gpointer user_data;
  88   GString *buf;
  89 };
  90
  91 static HtmlContext *
  92 html_context_new (HtmlParser * parser, gpointer user_data)
  93 {
  94   HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
  95   ctxt->parser = parser;
  96   ctxt->user_data = user_data;
  97   ctxt->buf = g_string_new (NULL);
  98   return ctxt;
  99 }
 100
 101 static void
 102 html_context_free (HtmlContext * ctxt)
 103 {
 104   g_string_free (ctxt->buf, TRUE);
 105   g_free (ctxt);
 106 }
 107
 108 struct EntityMap
 109 {
 110   const gunichar unescaped;
 111   const gchar *escaped;
 112 };
 113
 114 struct EntityMap XmlEntities[] = {
 115   {34, "quot;"},
 116   {38, "amp;"},
 117   {39, "apos;"},
 118   {60, "lt;"},
 119   {62, "gt;"},
 120   {0, NULL},
 121 };
 122
 123 struct EntityMap HtmlEntities[] = {
 124 /* nbsp will handle manually
 125 { 160,  "nbsp;" }, */
 126   {161, "iexcl;"},
 127   {162, "cent;"},
 128   {163, "pound;"},
 129   {164, "curren;"},
 130   {165, "yen;"},
 131   {166, "brvbar;"},
 132   {167, "sect;"},
 133   {168, "uml;"},
 134   {169, "copy;"},
 135   {170, "ordf;"},
 136   {171, "laquo;"},
 137   {172, "not;"},
 138   {173, "shy;"},
 139   {174, "reg;"},
 140   {175, "macr;"},
 141   {176, "deg;"},
 142   {177, "plusmn;"},
 143   {178, "sup2;"},
 144   {179, "sup3;"},
 145   {180, "acute;"},
 146   {181, "micro;"},
 147   {182, "para;"},
 148   {183, "middot;"},
 149   {184, "cedil;"},
 150   {185, "sup1;"},
 151   {186, "ordm;"},
 152   {187, "raquo;"},
 153   {188, "frac14;"},
 154   {189, "frac12;"},
 155   {190, "frac34;"},
 156   {191, "iquest;"},
 157   {192, "Agrave;"},
 158   {193, "Aacute;"},
 159   {194, "Acirc;"},
 160   {195, "Atilde;"},
 161   {196, "Auml;"},
 162   {197, "Aring;"},
 163   {198, "AElig;"},
 164   {199, "Ccedil;"},
 165   {200, "Egrave;"},
 166   {201, "Eacute;"},
 167   {202, "Ecirc;"},
 168   {203, "Euml;"},
 169   {204, "Igrave;"},
 170   {205, "Iacute;"},
 171   {206, "Icirc;"},
 172   {207, "Iuml;"},
 173   {208, "ETH;"},
 174   {209, "Ntilde;"},
 175   {210, "Ograve;"},
 176   {211, "Oacute;"},
 177   {212, "Ocirc;"},
 178   {213, "Otilde;"},
 179   {214, "Ouml;"},
 180   {215, "times;"},
 181   {216, "Oslash;"},
 182   {217, "Ugrave;"},
 183   {218, "Uacute;"},
 184   {219, "Ucirc;"},
 185   {220, "Uuml;"},
 186   {221, "Yacute;"},
 187   {222, "THORN;"},
 188   {223, "szlig;"},
 189   {224, "agrave;"},
 190   {225, "aacute;"},
 191   {226, "acirc;"},
 192   {227, "atilde;"},
 193   {228, "auml;"},
 194   {229, "aring;"},
 195   {230, "aelig;"},
 196   {231, "ccedil;"},
 197   {232, "egrave;"},
 198   {233, "eacute;"},
 199   {234, "ecirc;"},
 200   {235, "euml;"},
 201   {236, "igrave;"},
 202   {237, "iacute;"},
 203   {238, "icirc;"},
 204   {239, "iuml;"},
 205   {240, "eth;"},
 206   {241, "ntilde;"},
 207   {242, "ograve;"},
 208   {243, "oacute;"},
 209   {244, "ocirc;"},
 210   {245, "otilde;"},
 211   {246, "ouml;"},
 212   {247, "divide;"},
 213   {248, "oslash;"},
 214   {249, "ugrave;"},
 215   {250, "uacute;"},
 216   {251, "ucirc;"},
 217   {252, "uuml;"},
 218   {253, "yacute;"},
 219   {254, "thorn;"},
 220   {255, "yuml;"},
 221   {338, "OElig;"},
 222   {339, "oelig;"},
 223   {352, "Scaron;"},
 224   {353, "scaron;"},
 225   {376, "Yuml;"},
 226   {402, "fnof;"},
 227   {710, "circ;"},
 228   {732, "tilde;"},
 229   {913, "Alpha;"},
 230   {914, "Beta;"},
 231   {915, "Gamma;"},
 232   {916, "Delta;"},
 233   {917, "Epsilon;"},
 234   {918, "Zeta;"},
 235   {919, "Eta;"},
 236   {920, "Theta;"},
 237   {921, "Iota;"},
 238   {922, "Kappa;"},
 239   {923, "Lambda;"},
 240   {924, "Mu;"},
 241   {925, "Nu;"},
 242   {926, "Xi;"},
 243   {927, "Omicron;"},
 244   {928, "Pi;"},
 245   {929, "Rho;"},
 246   {931, "Sigma;"},
 247   {932, "Tau;"},
 248   {933, "Upsilon;"},
 249   {934, "Phi;"},
 250   {935, "Chi;"},
 251   {936, "Psi;"},
 252   {937, "Omega;"},
 253   {945, "alpha;"},
 254   {946, "beta;"},
 255   {947, "gamma;"},
 256   {948, "delta;"},
 257   {949, "epsilon;"},
 258   {950, "zeta;"},
 259   {951, "eta;"},
 260   {952, "theta;"},
 261   {953, "iota;"},
 262   {954, "kappa;"},
 263   {955, "lambda;"},
 264   {956, "mu;"},
 265   {957, "nu;"},
 266   {958, "xi;"},
 267   {959, "omicron;"},
 268   {960, "pi;"},
 269   {961, "rho;"},
 270   {962, "sigmaf;"},
 271   {963, "sigma;"},
 272   {964, "tau;"},
 273   {965, "upsilon;"},
 274   {966, "phi;"},
 275   {967, "chi;"},
 276   {968, "psi;"},
 277   {969, "omega;"},
 278   {977, "thetasym;"},
 279   {978, "upsih;"},
 280   {982, "piv;"},
 281   {8194, "ensp;"},
 282   {8195, "emsp;"},
 283   {8201, "thinsp;"},
 284   {8204, "zwnj;"},
 285   {8205, "zwj;"},
 286   {8206, "lrm;"},
 287   {8207, "rlm;"},
 288   {8211, "ndash;"},
 289   {8212, "mdash;"},
 290   {8216, "lsquo;"},
 291   {8217, "rsquo;"},
 292   {8218, "sbquo;"},
 293   {8220, "ldquo;"},
 294   {8221, "rdquo;"},
 295   {8222, "bdquo;"},
 296   {8224, "dagger;"},
 297   {8225, "Dagger;"},
 298   {8226, "bull;"},
 299   {8230, "hellip;"},
 300   {8240, "permil;"},
 301   {8242, "prime;"},
 302   {8243, "Prime;"},
 303   {8249, "lsaquo;"},
 304   {8250, "rsaquo;"},
 305   {8254, "oline;"},
 306   {8260, "frasl;"},
 307   {8364, "euro;"},
 308   {8465, "image;"},
 309   {8472, "weierp;"},
 310   {8476, "real;"},
 311   {8482, "trade;"},
 312   {8501, "alefsym;"},
 313   {8592, "larr;"},
 314   {8593, "uarr;"},
 315   {8594, "rarr;"},
 316   {8595, "darr;"},
 317   {8596, "harr;"},
 318   {8629, "crarr;"},
 319   {8656, "lArr;"},
 320   {8657, "uArr;"},
 321   {8658, "rArr;"},
 322   {8659, "dArr;"},
 323   {8660, "hArr;"},
 324   {8704, "forall;"},
 325   {8706, "part;"},
 326   {8707, "exist;"},
 327   {8709, "empty;"},
 328   {8711, "nabla;"},
 329   {8712, "isin;"},
 330   {8713, "notin;"},
 331   {8715, "ni;"},
 332   {8719, "prod;"},
 333   {8721, "sum;"},
 334   {8722, "minus;"},
 335   {8727, "lowast;"},
 336   {8730, "radic;"},
 337   {8733, "prop;"},
 338   {8734, "infin;"},
 339   {8736, "ang;"},
 340   {8743, "and;"},
 341   {8744, "or;"},
 342   {8745, "cap;"},
 343   {8746, "cup;"},
 344   {8747, "int;"},
 345   {8756, "there4;"},
 346   {8764, "sim;"},
 347   {8773, "cong;"},
 348   {8776, "asymp;"},
 349   {8800, "ne;"},
 350   {8801, "equiv;"},
 351   {8804, "le;"},
 352   {8805, "ge;"},
 353   {8834, "sub;"},
 354   {8835, "sup;"},
 355   {8836, "nsub;"},
 356   {8838, "sube;"},
 357   {8839, "supe;"},
 358   {8853, "oplus;"},
 359   {8855, "otimes;"},
 360   {8869, "perp;"},
 361   {8901, "sdot;"},
 362   {8968, "lceil;"},
 363   {8969, "rceil;"},
 364   {8970, "lfloor;"},
 365   {8971, "rfloor;"},
 366   {9001, "lang;"},
 367   {9002, "rang;"},
 368   {9674, "loz;"},
 369   {9824, "spades;"},
 370   {9827, "clubs;"},
 371   {9829, "hearts;"},
 372   {9830, "diams;"},
 373   {0, NULL},
 374 };
 375
 376 static gchar *
 377 unescape_string (const gchar * text)
 378 {
 379   gint i;
 380   GString *unescaped = g_string_new (NULL);
 381
 382   while (*text) {
 383     if (*text == '&') {
 384       text++;
 385
 386       /* unescape &nbsp and &nbsp; */
 387       if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
 388         unescaped = g_string_append_unichar (unescaped, 160);
 389         text += 4;
 390         if (*text == ';') {
 391           text++;
 392         }
 393         goto next;
 394       }
 395
 396       /* pass xml entities. these will be processed as pango markup */
 397       for (i = 0; XmlEntities[i].escaped; i++) {
 398         gssize len = strlen (XmlEntities[i].escaped);
 399         if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
 400           unescaped = g_string_append_c (unescaped, '&');
 401           unescaped =
 402               g_string_append_len (unescaped, XmlEntities[i].escaped, len);
 403           text += len;
 404           goto next;
 405         }
 406       }
 407
 408       /* convert html entities */
 409       for (i = 0; HtmlEntities[i].escaped; i++) {
 410         gssize len = strlen (HtmlEntities[i].escaped);
 411         if (!strncmp (text, HtmlEntities[i].escaped, len)) {
 412           unescaped =
 413               g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
 414           text += len;
 415           goto next;
 416         }
 417       }
 418
 419       if (*text == '#') {
 420         gboolean is_hex = FALSE;
 421         gunichar l;
 422         gchar *end = NULL;
 423
 424         text++;
 425         if (*text == 'x') {
 426           is_hex = TRUE;
 427           text++;
 428         }
 429         errno = 0;
 430         if (is_hex) {
 431           l = strtoul (text, &end, 16);
 432         } else {
 433           l = strtoul (text, &end, 10);
 434         }
 435
 436         if (text == end || errno != 0) {
 437           /* error occured. pass it */
 438           goto next;
 439         }
 440         unescaped = g_string_append_unichar (unescaped, l);
 441         text = end;
 442
 443         if (*text == ';') {
 444           text++;
 445         }
 446         goto next;
 447       }
 448
 449       /* escape & */
 450       unescaped = g_string_append (unescaped, "&amp;");
 451
 452     next:
 453       continue;
 454
 455     } else if (g_ascii_isspace (*text)) {
 456       unescaped = g_string_append_c (unescaped, ' ');
 457       /* strip whitespace */
 458       do {
 459         text++;
 460       } while ((*text) && g_ascii_isspace (*text));
 461     } else {
 462       unescaped = g_string_append_c (unescaped, *text);
 463       text++;
 464     }
 465   }
 466
 467   return g_string_free (unescaped, FALSE);
 468 }
 469
 470 static const gchar *
 471 string_token (const gchar * string, const gchar * delimiter, gchar ** first)
 472 {
 473   gchar *next = strstr (string, delimiter);
 474   if (next) {
 475     *first = g_strndup (string, next - string);
 476   } else {
 477     *first = g_strdup (string);
 478   }
 479   return next;
 480 }
 481
 482 static void
 483 html_context_handle_element (HtmlContext * ctxt,
 484     const gchar * string, gboolean must_close)
 485 {
 486   gchar *name = NULL;
 487   gint count = 0, i;
 488   gchar **attrs;
 489   const gchar *found, *next;
 490 #ifdef SUBPARSE_MODIFICATION
 491   const gchar *name_temp = NULL;
 492   gint j = 0;
 493 #endif
 494   /* split element name and attributes */
 495   next = string_token (string, " ", &name);
 496
 497   if (next) {
 498     /* count attributes */
 499     found = next + 1;
 500     while (TRUE) {
 501       found = strchr (found, '=');
 502       if (!found)
 503         break;
 504       found++;
 505       count++;
 506     }
 507   } else {
 508     count = 0;
 509   }
 510
 511   attrs = g_new0 (gchar *, (count + 1) * 2);
 512
 513   for (i = 0; i < count; i += 2) {
 514     gchar *attr_name = NULL, *attr_value = NULL;
 515     gsize length;
 516
 517 #ifdef SUBPARSE_MODIFICATION
 518     /* sometimes count can unnecessarily be high value, because of unrequired "=" in subtitle file.
 519      * In that case it should not crash */
 520     if (!next)
 521       break;
 522 #endif
 523
 524     next = string_token (next + 1, "=", &attr_name);
 525
 526 #ifdef SUBPARSE_MODIFICATION
 527     /* sometimes count can unnecessarily be high value, because of unrequired "=" in subtitle file.
 528      * In that case it should not crash */
 529     if (!next)
 530       break;
 531 #endif
 532
 533     next = string_token (next + 1, " ", &attr_value);
 534
 535     /* strip " or ' from attribute value */
 536     if (attr_value[0] == '"' || attr_value[0] == '\'') {
 537       gchar *tmp = g_strdup (attr_value + 1);
 538       g_free (attr_value);
 539       attr_value = tmp;
 540     }
 541
 542     length = strlen (attr_value);
 543     if (attr_value[length - 1] == '"' || attr_value[length - 1] == '\'') {
 544       attr_value[length - 1] = '\0';
 545     }
 546
 547     attrs[i] = attr_name;
 548     attrs[i + 1] = attr_value;
 549   }
 550 #ifdef SUBPARSE_MODIFICATION
 551   /* sometimes spaces can be there in between !-- and P
 552    * that also we have to take care */
 553   if (!g_ascii_strcasecmp("!--", name)) {
 554     gchar* tempchar = (gchar*)(string + 3);
 555     while (*tempchar == ' ') {
 556       tempchar++;
 557       if (*tempchar == 'P' || *tempchar == 'p') {
 558         *(name + 3) = *tempchar;
 559         *(name + 4) = '\0';
 560         next = tempchar + 1;
 561         break;
 562       }
 563     }
 564   }
 565   if (next && (!g_ascii_strcasecmp("!--P", name))) {
 566     gint attrindex = 0;
 567     count = 0;
 568     /* count attributes */
 569     found = next + 1;
 570     while (TRUE) {
 571       found = (gchar*)strcasestr (found, "lang");
 572       if (!found)
 573         break;
 574       found++;
 575       count++;
 576     }
 577     g_strfreev (attrs);
 578
 579     attrs = g_new0 (gchar *, count * 2);
 580
 581     for (i = 0; i < count; i++) {
 582       gchar *attr_name = NULL, *attr_value = NULL;
 583
 584       next = (gchar*)strcasestr (next, "lang:");
 585       attr_value = (gchar*)malloc (3);
 586       next = next + 5;
 587       strncpy (attr_value, next, 2);
 588       attr_value[2] = '\0';
 589       GST_LOG ("Language value comes as %s", attr_value);
 590       name_temp = next;
 591       while (TRUE) {
 592         if (*name_temp == '{') {
 593           int character_count = 0;
 594
 595           while (TRUE) {
 596             name_temp--;
 597
 598             if (*name_temp == '.') {
 599               attr_name = (gchar*) malloc (character_count + 1);
 600               break;
 601             }
 602             else if (*name_temp != ' ')
 603               character_count++;
 604           }
 605           break;
 606         }
 607         name_temp--;
 608       }
 609       name_temp++;
 610       for (j = 0; *(name_temp + j) != ' '; j++) {
 611         attr_name[j] = *(name_temp + j);
 612       }
 613       attr_name[j] = '\0';
 614       attrs[attrindex++] = attr_name;
 615       attrs[attrindex++] = attr_value;
 616     }
 617   } else {
 618     count = 0;
 619   }
 620 #endif
 621   ctxt->parser->start_element (ctxt, name,
 622       (const gchar **) attrs, ctxt->user_data);
 623   if (must_close) {
 624     ctxt->parser->end_element (ctxt, name, ctxt->user_data);
 625   }
 626   g_strfreev (attrs);
 627   g_free (name);
 628 }
 629
 630 static void
 631 html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
 632 {
 633   const gchar *next = NULL;
 634   ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
 635   next = ctxt->buf->str;
 636   if (!next) {
 637     GST_ERROR ("ctxt->buf->str is NULL");
 638     return;
 639   }
 640   while (TRUE) {
 641     if (next[0] == '<') {
 642       gchar *element = NULL;
 643       /* find <blahblah> */
 644       if (!strchr (next, '>')) {
 645         /* no tag end point. buffer will be process in next time */
 646         return;
 647       }
 648
 649       next = string_token (next, ">", &element);
 650       next++;
 651       if (g_str_has_suffix (next, "/")) {
 652         /* handle <blah/> */
 653         element[strlen (element) - 1] = '\0';
 654         html_context_handle_element (ctxt, element + 1, TRUE);
 655       } else if (element[1] == '/') {
 656         /* handle </blah> */
 657         ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
 658       } else {
 659         /* handle <blah> */
 660         html_context_handle_element (ctxt, element + 1, FALSE);
 661       }
 662       g_free (element);
 663     } else if (strchr (next, '<')) {
 664       gchar *text = NULL;
 665       gsize length;
 666       next = string_token (next, "<", &text);
 667       text = g_strstrip (text);
 668       length = strlen (text);
 669       ctxt->parser->text (ctxt, text, length, ctxt->user_data);
 670       g_free (text);
 671
 672     } else {
 673       gchar *text = (gchar *) next;
 674       gsize length;
 675       text = g_strstrip (text);
 676       length = strlen (text);
 677       ctxt->parser->text (ctxt, text, length, ctxt->user_data);
 678       ctxt->buf = g_string_assign (ctxt->buf, "");
 679       return;
 680     }
 681   }
 682
 683   ctxt->buf = g_string_assign (ctxt->buf, next);
 684 }
 685
 686 static gchar *
 687 has_tag (GString * str, const gchar tag)
 688 {
 689   return strrchr (str->str, tag);
 690 }
 691
 692 static void
 693 sami_context_push_state (GstSamiContext * sctx, char state)
 694 {
 695   GST_LOG ("state %c", state);
 696   g_string_append_c (sctx->state, state);
 697 }
 698
 699 static void
 700 sami_context_pop_state (GstSamiContext * sctx, char state)
 701 {
 702   GString *str = g_string_new ("");
 703   GString *context_state = sctx->state;
 704   int i;
 705
 706   GST_LOG ("state %c", state);
 707   for (i = context_state->len - 1; i >= 0; i--) {
 708     switch (context_state->str[i]) {
 709       case ITALIC_TAG:         /* <i> */
 710       {
 711         g_string_append (str, "</i>");
 712         break;
 713       }
 714       case SPAN_TAG:           /* <span foreground= > */
 715       {
 716         g_string_append (str, "</span>");
 717         break;
 718       }
 719       case RUBY_TAG:           /* <span size= >  -- ruby */
 720       {
 721         break;
 722       }
 723       case RT_TAG:             /*  ruby */
 724       {
 725         /* FIXME: support for furigana/ruby once implemented in pango */
 726         g_string_append (sctx->rubybuf, "</span>");
 727         if (has_tag (context_state, ITALIC_TAG)) {
 728           g_string_append (sctx->rubybuf, "</i>");
 729         }
 730
 731         break;
 732       }
 733       default:
 734         break;
 735     }
 736     if (context_state->str[i] == state) {
 737       g_string_append (sctx->buf, str->str);
 738       g_string_free (str, TRUE);
 739       g_string_truncate (context_state, i);
 740       return;
 741     }
 742   }
 743   if (state == CLEAR_TAG) {
 744     g_string_append (sctx->buf, str->str);
 745     g_string_truncate (context_state, 0);
 746   }
 747   g_string_free (str, TRUE);
 748 }
 749
 750 static void
 751 handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
 752 {
 753   int i;
 754
 755   sami_context_pop_state (sctx, CLEAR_TAG);
 756   if (atts != NULL) {
 757     for (i = 0; (atts[i] != NULL); i += 2) {
 758       const gchar *key, *value;
 759
 760       key = atts[i];
 761       value = atts[i + 1];
 762
 763       if (!value)
 764         continue;
 765       if (!g_ascii_strcasecmp ("start", key)) {
 766         /* Only set a new start time if we don't have text pending */
 767         if (sctx->resultbuf->len == 0)
 768           sctx->time1 = sctx->time2;
 769
 770         sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
 771 #ifdef SUBPARSE_MODIFICATION
 772         sctx->time3 = sctx->time2;
 773 #endif
 774         sctx->time2 = MAX (sctx->time2, sctx->time1);
 775         g_string_append (sctx->resultbuf, sctx->buf->str);
 776         sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
 777         g_string_truncate (sctx->buf, 0);
 778       }
 779     }
 780   }
 781 }
 782
 783 static void
 784 handle_start_font (GstSamiContext * sctx, const gchar ** atts)
 785 {
 786   int i;
 787
 788   sami_context_pop_state (sctx, SPAN_TAG);
 789   if (atts != NULL) {
 790     g_string_append (sctx->buf, "<span");
 791     for (i = 0; (atts[i] != NULL); i += 2) {
 792       const gchar *key, *value;
 793
 794       key = atts[i];
 795       value = atts[i + 1];
 796
 797       if (!value)
 798         continue;
 799       if (!g_ascii_strcasecmp ("color", key)) {
 800         /*
 801          * There are invalid color value in many
 802          * sami files.
 803          * It will fix hex color value that start without '#'
 804          */
 805         const gchar *sharp = "";
 806         int len = strlen (value);
 807
 808         if (!(*value == '#' && len == 7)) {
 809           gchar *r;
 810
 811           /* check if it looks like hex */
 812           if (strtol ((const char *) value, &r, 16) >= 0 &&
 813               ((gchar *) r == (value + 6) && len == 6)) {
 814             sharp = "#";
 815           }
 816         }
 817         /* some colours can be found in many sami files, but X RGB database
 818          * doesn't contain a colour by this name, so map explicitly */
 819         if (!g_ascii_strcasecmp ("aqua", value)) {
 820           value = "#00ffff";
 821         } else if (!g_ascii_strcasecmp ("crimson", value)) {
 822           value = "#dc143c";
 823         } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
 824           value = "#ff00ff";
 825         } else if (!g_ascii_strcasecmp ("indigo", value)) {
 826           value = "#4b0082";
 827         } else if (!g_ascii_strcasecmp ("lime", value)) {
 828           value = "#00ff00";
 829         } else if (!g_ascii_strcasecmp ("olive", value)) {
 830           value = "#808000";
 831         } else if (!g_ascii_strcasecmp ("silver", value)) {
 832           value = "#c0c0c0";
 833         } else if (!g_ascii_strcasecmp ("teal", value)) {
 834           value = "#008080";
 835         }
 836         g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
 837             value);
 838       } else if (!g_ascii_strcasecmp ("face", key)) {
 839         g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
 840       }
 841     }
 842     g_string_append_c (sctx->buf, '>');
 843     sami_context_push_state (sctx, SPAN_TAG);
 844   }
 845 }
 846
 847 #ifdef SUBPARSE_MODIFICATION
 848 static void
 849 handle_p (GstSamiContext * sctx, const gchar ** atts)
 850 {
 851   int i;
 852
 853   if (atts != NULL) {
 854     for (i = 0; (atts[i] != NULL); i += 2) {
 855       const gchar *key, *value;
 856
 857       key = atts[i];
 858       value = atts[i + 1];
 859
 860       if (sctx->current_language && value && strcmp(sctx->current_language, value)
 861           && (sctx->time1 == sctx->time2))
 862         sctx->language_changed = TRUE;
 863
 864       else if (!sctx->current_language)
 865         sctx->current_language = (gchar*) malloc (128);
 866
 867       if (key && !g_ascii_strcasecmp ("class", key) && value) {
 868         strcpy (sctx->current_language, value);
 869         if (sctx->desired_language == NULL && key) {
 870           sctx->desired_language = g_strdup(value);
 871           GST_LOG("no language list was found and desired lang was set to %s",sctx->desired_language);
 872         }
 873       }
 874       if (sctx->language_changed)
 875       {
 876          sctx->time1 = 0;
 877          sctx->time2 = sctx->time3;
 878          sctx->language_changed = FALSE;
 879       }
 880       if (!value)
 881         continue;
 882     }
 883   }
 884 }
 885
 886 static void
 887 handle_start_language_list (GstSamiContext * sctx, const gchar ** atts)
 888 {
 889   int i = 0;
 890   int attrIndex = 0;
 891   GstLangStruct *new = NULL;
 892   GstLangStruct *temp = NULL;
 893
 894   if (atts != NULL) {
 895     if (g_list_length (sctx->lang_list)) {
 896       GST_LOG ("We already got the language list");
 897       return;
 898     }
 899     for (i = 0; (atts[attrIndex] != NULL); i++) {
 900       const gchar *key, *value;
 901
 902       key = atts[attrIndex++];
 903       value = atts[attrIndex++];
 904
 905       GST_LOG ("Inside handle_start_language_list key: %s, value: %s", key, value);
 906
 907       if (!value)
 908         continue;
 909
 910       new = g_new0 (GstLangStruct, 1);
 911       new->language_code = (gchar*) malloc (strlen(value) + 1);
 912       if (new->language_code && value)
 913         strcpy (new->language_code, value);
 914       new->language_key = (gchar*) malloc (strlen(key) + 1);
 915       if (new->language_key && key)
 916         strcpy (new->language_key, key);
 917       sctx->lang_list = g_list_append (sctx->lang_list, new);
 918       temp = g_list_nth_data (sctx->lang_list, i);
 919       if (sctx->desired_language == NULL && key){
 920         sctx->desired_language = g_strdup(key);
 921       }
 922
 923       if (temp)
 924         GST_LOG ("Inside handle_start_language_list of glist key: %s, value: %s",
 925                     temp->language_key, temp->language_code);
 926     }
 927   }
 928 }
 929 #endif
 930
 931 static void
 932 handle_start_element (HtmlContext * ctx, const gchar * name,
 933     const char **atts, gpointer user_data)
 934 {
 935   GstSamiContext *sctx = (GstSamiContext *) user_data;
 936
 937   GST_LOG ("name:%s", name);
 938
 939   if (!g_ascii_strcasecmp ("sync", name)) {
 940     handle_start_sync (sctx, atts);
 941     sctx->in_sync = TRUE;
 942   } else if (!g_ascii_strcasecmp ("font", name)) {
 943     handle_start_font (sctx, atts);
 944   } else if (!g_ascii_strcasecmp ("ruby", name)) {
 945     sami_context_push_state (sctx, RUBY_TAG);
 946   } else if (!g_ascii_strcasecmp ("br", name)) {
 947 #ifdef SUBPARSE_MODIFICATION
 948     if (sctx->current_language && sctx->desired_language &&
 949         !strcmp(sctx->current_language, sctx->desired_language))
 950 #endif
 951       g_string_append_c (sctx->buf, '\n');
 952     /* FIXME: support for furigana/ruby once implemented in pango */
 953   } else if (!g_ascii_strcasecmp ("rt", name)) {
 954     if (has_tag (sctx->state, ITALIC_TAG)) {
 955       g_string_append (sctx->rubybuf, "<i>");
 956     }
 957     g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
 958     sami_context_push_state (sctx, RT_TAG);
 959   } else if (!g_ascii_strcasecmp ("i", name)) {
 960 #ifdef SUBPARSE_MODIFICATION
 961     if (sctx->current_language && sctx->desired_language &&
 962         !strcmp(sctx->current_language, sctx->desired_language))
 963 #endif
 964       g_string_append (sctx->buf, "<i>");
 965     sami_context_push_state (sctx, ITALIC_TAG);
 966   } else if (!g_ascii_strcasecmp ("p", name)) {
 967 #ifdef SUBPARSE_MODIFICATION
 968     handle_p (sctx, atts);
 969   } else if (!g_ascii_strcasecmp ("!--P", name)) {
 970     handle_start_language_list (sctx, atts);
 971 #endif
 972   }
 973 }
 974
 975 static void
 976 handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
 977 {
 978   GstSamiContext *sctx = (GstSamiContext *) user_data;
 979
 980   GST_LOG ("name:%s", name);
 981
 982   if (!g_ascii_strcasecmp ("sync", name)) {
 983     sctx->in_sync = FALSE;
 984   } else if ((!g_ascii_strcasecmp ("body", name)) ||
 985       (!g_ascii_strcasecmp ("sami", name))) {
 986     /* We will usually have one buffer left when the body is closed
 987      * as we need the next sync to actually send it */
 988
 989 #ifdef SUBPARSE_MODIFICATION
 990     sctx->end_body = TRUE;
 991 #endif
 992
 993     if (sctx->buf->len != 0) {
 994       /* Only set a new start time if we don't have text pending */
 995       if (sctx->resultbuf->len == 0)
 996         sctx->time1 = sctx->time2;
 997
 998       sctx->time2 = GST_CLOCK_TIME_NONE;
 999       g_string_append (sctx->resultbuf, sctx->buf->str);
1000       sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
1001       g_string_truncate (sctx->buf, 0);
1002     }
1003   } else if (!g_ascii_strcasecmp ("font", name)) {
1004     sami_context_pop_state (sctx, SPAN_TAG);
1005   } else if (!g_ascii_strcasecmp ("ruby", name)) {
1006     sami_context_pop_state (sctx, RUBY_TAG);
1007   } else if (!g_ascii_strcasecmp ("i", name)) {
1008     sami_context_pop_state (sctx, ITALIC_TAG);
1009   }
1010 }
1011
1012 static void
1013 handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
1014     gpointer user_data)
1015 {
1016   GstSamiContext *sctx = (GstSamiContext *) user_data;
1017
1018   /* Skip everything except content of the sync elements */
1019   if (!sctx->in_sync)
1020     return;
1021 #ifdef SUBPARSE_MODIFICATION
1022   if (has_tag (sctx->state, RT_TAG) && (sctx->current_language && sctx->desired_language &&
1023        !strcmp(sctx->current_language, sctx->desired_language))) {
1024 #else
1025   if (has_tag (sctx->state, RT_TAG)) {
1026 #endif
1027     g_string_append_c (sctx->rubybuf, ' ');
1028     g_string_append (sctx->rubybuf, text);
1029     g_string_append_c (sctx->rubybuf, ' ');
1030   } else {
1031 #ifdef SUBPARSE_MODIFICATION
1032     if (sctx->current_language && sctx->desired_language &&
1033         !strcmp(sctx->current_language, sctx->desired_language))
1034 #endif
1035       g_string_append (sctx->buf, text);
1036   }
1037 }
1038
1039 static HtmlParser samiParser = {
1040   handle_start_element,         /* start_element */
1041   handle_end_element,           /* end_element */
1042   handle_text,                  /* text */
1043 };
1044
1045 void
1046 sami_context_init (ParserState * state)
1047 {
1048   GstSamiContext *context;
1049
1050   g_assert (state->user_data == NULL);
1051
1052   context = g_new0 (GstSamiContext, 1);
1053
1054   context->htmlctxt = html_context_new (&samiParser, context);
1055   context->buf = g_string_new ("");
1056   context->rubybuf = g_string_new ("");
1057   context->resultbuf = g_string_new ("");
1058   context->state = g_string_new ("");
1059 #ifdef SUBPARSE_MODIFICATION
1060   context->current_language = NULL;
1061   context->desired_language = NULL;
1062   context->lang_list = NULL;
1063   context->language_changed = FALSE;
1064   context->end_body = FALSE;
1065 #endif
1066   state->user_data = context;
1067 }
1068
1069 void
1070 sami_context_deinit (ParserState * state)
1071 {
1072   GstSamiContext *context = (GstSamiContext *) state->user_data;
1073 #ifdef SUBPARSE_MODIFICATION
1074   GstLangStruct *temp = NULL;
1075   int i = 0;
1076 #endif
1077   if (context) {
1078     html_context_free (context->htmlctxt);
1079     context->htmlctxt = NULL;
1080     g_string_free (context->buf, TRUE);
1081     g_string_free (context->rubybuf, TRUE);
1082     g_string_free (context->resultbuf, TRUE);
1083     g_string_free (context->state, TRUE);
1084 #ifdef SUBPARSE_MODIFICATION
1085     if (context->lang_list) {
1086       while ((temp = g_list_nth_data (context->lang_list, i))) {
1087         if (temp->language_code)
1088           free (temp->language_code);
1089         temp->language_code = NULL;
1090         if (temp->language_key)
1091           free (temp->language_key);
1092         temp->language_key = NULL;
1093         g_free (temp);
1094         i++;
1095       }
1096       g_list_free (context->lang_list);
1097     }
1098     context->lang_list = NULL;
1099
1100     if (context->current_language)
1101       free (context->current_language);
1102     context->current_language = NULL;
1103
1104     context->desired_language = NULL;
1105 #endif
1106     g_free (context);
1107     state->user_data = NULL;
1108   }
1109 }
1110
1111 void
1112 sami_context_reset (ParserState * state)
1113 {
1114   GstSamiContext *context = (GstSamiContext *) state->user_data;
1115
1116   if (context) {
1117     g_string_truncate (context->buf, 0);
1118     g_string_truncate (context->rubybuf, 0);
1119     g_string_truncate (context->resultbuf, 0);
1120     g_string_truncate (context->state, 0);
1121     context->has_result = FALSE;
1122     context->in_sync = FALSE;
1123     context->time1 = 0;
1124     context->time2 = 0;
1125   }
1126 }
1127
1128 #ifdef SUBPARSE_MODIFICATION
1129 void
1130 sami_context_change_language (ParserState * state)
1131 {
1132   GstSamiContext *context = (GstSamiContext *) state->user_data;
1133   GST_LOG ("**********desired language was %s**************", context->desired_language);
1134   free (context->desired_language);
1135   if(state->current_language) {
1136     context->desired_language = state->current_language;
1137   } else {
1138     context->desired_language = state->msl_language;
1139   }
1140   GST_LOG ("desired language changed to %s", context->desired_language);
1141 }
1142
1143 gchar *
1144 sami_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding,
1145     gsize * consumed, GError ** err, GstSubParse * self)
1146 {
1147   gchar *ret = NULL;
1148
1149   /* The char cast is necessary in glib < 2.24 */
1150   ret =
1151       g_convert_with_fallback (str, len, "UTF-8", encoding, (char *) "*",
1152       consumed, NULL, err);
1153
1154   if (ret == NULL)
1155   {
1156     GST_DEBUG_OBJECT (self, "g_convert_with_fallback returns NULL");
1157     return ret;
1158   }
1159
1160   /* + 3 to skip UTF-8 BOM if it was added */
1161   len = strlen (ret);
1162   if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB
1163       && (guint8) ret[2] == 0xBF)
1164     g_memmove (ret, ret + 3, len + 1 - 3);
1165
1166   return ret;
1167 }
1168
1169 gboolean
1170 sami_validate_langlist_body(GList * lang_list, GstSubParse * self){
1171   gchar * file_path_type = NULL;
1172   gchar * file_path = NULL;
1173   gchar   line[1024];
1174   FILE  * fp = NULL;
1175   guint i = 0, found_count = 0;
1176   const guint list_len = g_list_length(lang_list);
1177   gboolean counter[MAX_LANGUAGE];
1178   struct LangStruct
1179   {
1180       gchar *language_code;
1181       gchar *language_key;
1182   } * lang;
1183
1184   GstQuery *cquery;
1185   GstStructure *structure;
1186   const GValue *value;
1187   structure = gst_structure_new ("FileSrcURI",
1188                                  "file-uri", G_TYPE_STRING, NULL, NULL);
1189
1190   cquery = gst_query_new_application (GST_QUERY_CUSTOM, structure);
1191
1192   if (!gst_pad_peer_query (self->sinkpad, cquery))
1193   {
1194     GST_DEBUG_OBJECT (self, "failed to query SMI file path");
1195     gst_query_unref (cquery);
1196     return FALSE;
1197   }
1198   structure = gst_query_get_structure (cquery);
1199   value = gst_structure_get_value (structure, "file-uri");
1200   file_path = g_strdup (g_value_get_string (value));
1201
1202   if (file_path == NULL){
1203     GST_DEBUG_OBJECT (self, "could not parse the SMI file path");
1204     gst_query_unref (cquery);
1205     return FALSE;
1206   }
1207   gst_query_unref (cquery);
1208
1209   GST_INFO_OBJECT (self, "file path comes as %s", file_path);
1210
1211   file_path_type = g_strndup ((gchar *) file_path, 4);
1212   GST_INFO_OBJECT (self, "received file path by query = %s,%s", file_path,file_path_type);
1213   if (!g_strcmp0(file_path_type, "file")){
1214     file_path += 7;
1215     GST_INFO_OBJECT (self, "file path comes as %s", file_path);
1216
1217     fp = fopen (file_path, "r");
1218     if (!fp){
1219       GST_DEBUG_OBJECT (self, "failed to open file");
1220       return FALSE;
1221     }
1222
1223     for(i=0;i<list_len;i++){
1224       counter[i] = FALSE;
1225     }
1226
1227     while(!feof(fp) && found_count < list_len){
1228       GError *err = NULL;
1229       gsize * consumed = NULL;
1230       gint gap = 0;
1231       guint charCount = 0;
1232       gchar* result = NULL;
1233       gchar* temp = NULL;
1234       gchar* temp_lang = NULL;
1235       gchar * temp1 = NULL;
1236       gchar *con_temp_lang = NULL;
1237       gchar *con_temp = NULL;
1238       gboolean conversion = TRUE;
1239       charCount = fread (line, sizeof(char), 1024, fp);
1240       if (!charCount) {
1241         GST_WARNING_OBJECT (self, "fread returned zero bytes");
1242         continue;
1243       }
1244       GST_DEBUG("value of detected encoding is %s and self encoding is %s",self->detected_encoding,self->encoding);
1245       if (self->detected_encoding && strcmp (self->detected_encoding, "UTF-8") && conversion){
1246         result = sami_convert_to_utf8 (line, charCount, self->detected_encoding, consumed, &err, self);
1247       }
1248       if(result == NULL) {
1249          result = line;
1250          conversion =  FALSE;
1251       }
1252       con_temp = g_utf8_strdown (result,strlen(result));
1253       temp = con_temp;
1254       while(con_temp) {
1255         con_temp = g_strstr_len(con_temp, strlen(con_temp),"class=");
1256         if(con_temp) {
1257           temp1 = g_strstr_len(con_temp+1, strlen(con_temp),"class=");
1258         }
1259         if(temp1 && con_temp){
1260           gap = strlen(con_temp)-strlen(temp1);
1261         }else if(con_temp) {
1262           gap = strlen(con_temp);
1263         } else {
1264           continue;
1265         }
1266         if(con_temp){
1267           for(i=0;i<list_len;i++){
1268             if(counter[i]==TRUE){
1269               con_temp=con_temp+1;
1270               continue;
1271             }
1272             lang = (struct LangStruct *) g_list_nth_data(lang_list,i);
1273             if(lang) {
1274               temp_lang = g_strdup(lang->language_key);
1275               con_temp_lang = g_utf8_strdown (temp_lang,strlen(temp_lang));
1276               if(g_strstr_len(con_temp,gap,con_temp_lang)){
1277                 found_count++;
1278                 counter[i]=TRUE;
1279                 GST_INFO_OBJECT (self, " valid Language in list : [%s]", lang->language_key);
1280                 con_temp=con_temp+1;
1281               }
1282               g_free(temp_lang);
1283               g_free(con_temp_lang);
1284             }
1285           }
1286         }
1287       }
1288       if(conversion)
1289        g_free (result);
1290       if(temp)
1291        g_free(temp);
1292
1293     }
1294
1295     if(found_count < list_len){
1296       for(i=0;i<list_len;i++){
1297         if(counter[i]==FALSE)
1298           lang_list = g_list_delete_link(lang_list,g_list_nth(lang_list,i));
1299       }
1300     }
1301   }
1302   fclose(fp);
1303   return TRUE;
1304 }
1305 #endif
1306
1307 gchar *
1308 parse_sami (ParserState * state, const gchar * line)
1309 {
1310   gchar *ret = NULL;
1311 #ifdef SUBPARSE_MODIFICATION
1312   gint64 clip_start = 0, clip_stop = 0;
1313   gboolean in_seg = FALSE;
1314 #endif
1315   GstSamiContext *context = (GstSamiContext *) state->user_data;
1316
1317   gchar *unescaped = unescape_string (line);
1318   html_context_parse (context->htmlctxt, (gchar *) unescaped,
1319       strlen (unescaped));
1320 #ifdef SUBPARSE_MODIFICATION
1321   if (context->lang_list)
1322     state->language_list = context->lang_list;
1323
1324   if (context->desired_language)
1325     state->current_language = context->desired_language;
1326 #endif
1327   g_free (unescaped);
1328 #ifdef SUBPARSE_MODIFICATION
1329   if (context->desired_language && context->current_language) {
1330     if ((!strcmp(context->current_language, context->desired_language)) || context->end_body) {
1331 #endif
1332       if (context->has_result) {
1333         if (context->rubybuf->len) {
1334           context->rubybuf = g_string_append_c (context->rubybuf, '\n');
1335           g_string_prepend (context->resultbuf, context->rubybuf->str);
1336           context->rubybuf = g_string_truncate (context->rubybuf, 0);
1337         }
1338
1339         ret = g_string_free (context->resultbuf, FALSE);
1340         context->resultbuf = g_string_new ("");
1341         state->start_time = context->time1;
1342         state->duration = context->time2 - context->time1;
1343         context->has_result = FALSE;
1344       }
1345 #ifdef SUBPARSE_MODIFICATION
1346       context->end_body = FALSE;
1347     }
1348   }
1349   /* Check our segment start/stop */
1350   in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
1351              state->start_time, state->start_time + state->duration, &clip_start,
1352              &clip_stop);
1353
1354   /* No need to send that text if it's out of segment */
1355   if (in_seg) {
1356     state->start_time = clip_start;
1357     state->duration = clip_stop - clip_start;
1358   } else {
1359     return NULL;
1360   }
1361 #endif
1362   return ret;
1363 }