gst/subparse/samiparse.c

   1 /* GStreamer SAMI subtitle parser
   2  * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Library General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Library General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Library General Public
  15  * License along with this library; if not, write to the
  16  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  17  * Boston, MA 02110-1301, USA.
  18  */
  19
  20 #define _GNU_SOURCE
  21 #include "samiparse.h"
  22
  23 #include <glib.h>
  24 #include <string.h>
  25 #include <stdlib.h>
  26
/* Single-character flags appended to GstSamiContext.state, one per tag that
 * is currently open (see sami_context_push_state() and
 * sami_context_pop_state()). */
#define ITALIC_TAG 'i'          /* <i> is open */
#define SPAN_TAG   's'          /* <font>, emitted as a <span>, is open */
#define RUBY_TAG   'r'          /* <ruby> is open */
#define RT_TAG     't'          /* <rt> is open */
#define CLEAR_TAG  '0'          /* passed to sami_context_pop_state() to
                                 * close every open tag at once */
  32
/* Forward declarations for the tokenizer callback table, the tokenizer
 * state and the SAMI conversion state. */
typedef struct _HtmlParser HtmlParser;
typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;
#ifdef SUBPARSE_MODIFICATION
/* One language declared in the SAMI style comment; entries are collected in
 * GstSamiContext.lang_list by handle_start_language_list(). */
typedef struct _LanguageStruct  GstLangStruct;
struct _LanguageStruct
{
    gchar *language_code;       /* two characters taken after "lang:" */
    gchar *language_key;        /* style class name of the language */
};
/* NOTE(review): not referenced in the visible part of this file; presumably
 * an upper bound on the number of languages - confirm against callers. */
#define MAX_LANGUAGE 10
#endif
/* State of one SAMI conversion: the text collected for the current cue, the
 * stack of open tags and the timestamps of the surrounding 'sync' tags. */
struct _GstSamiContext
{
  GString *buf;                 /* buffer to collect content */
  GString *rubybuf;             /* buffer to collect ruby content */
  GString *resultbuf;           /* when the next 'sync' tag is opened, the
                                 * content of 'buf' is moved here so that
                                 * following content is not appended to it */
  GString *state;               /* in many sami files there are tags that
                                 * are not closed, so for each open tag the
                                 * parser will append a tag flag here so
                                 * that tags can be closed properly on
                                 * 'sync' tags. See sami_context_push_state()
                                 * and sami_context_pop_state(). */
  HtmlContext *htmlctxt;        /* html parser context */
  gboolean has_result;          /* set when ready to push out result */
  gboolean in_sync;             /* flag to avoid appending anything except the
                                 * content of the sync elements to buf */
  guint64 time1;                /* previous start attribute in sync tag */
  guint64 time2;                /* current start attribute in sync tag  */
#ifdef SUBPARSE_MODIFICATION
  guint64 time3;                /* last start time parsed from a sync tag,
                                 * restored into time2 when the language
                                 * changes */
  GList *lang_list;             /* Language list for an external subtitle file */
  gchar *current_language;      /* Language of the text currently being parsed */
  gchar *desired_language;      /* Language set by user */
  gboolean language_changed;    /* set while a language change is handled */
  gboolean end_body;            /* </BODY> reached */
#endif
};
  73
/* Callback table used by the minimal HTML tokenizer. 'user_data' is the
 * pointer that was given to html_context_new(). */
struct _HtmlParser
{
  /* called for every start tag; 'attr' is a NULL-terminated array of
   * alternating attribute names and values */
  void (*start_element) (HtmlContext * ctx,
      const gchar * name, const gchar ** attr, gpointer user_data);
  /* called for every end tag and right after the start callback of a
   * self-closing tag */
  void (*end_element) (HtmlContext * ctx,
      const gchar * name, gpointer user_data);
  /* called for text found between tags, with surrounding whitespace removed */
  void (*text) (HtmlContext * ctx,
      const gchar * text, gsize text_len, gpointer user_data);
};
  83
/* State of the minimal HTML tokenizer. */
struct _HtmlContext
{
  const HtmlParser *parser;     /* callbacks to invoke while tokenizing */
  gpointer user_data;           /* passed unchanged to every callback */
  GString *buf;                 /* input that has not been tokenized yet */
};
  90
  91 static HtmlContext *
  92 html_context_new (HtmlParser * parser, gpointer user_data)
  93 {
  94   HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
  95   ctxt->parser = parser;
  96   ctxt->user_data = user_data;
  97   ctxt->buf = g_string_new (NULL);
  98   return ctxt;
  99 }
 100
 101 static void
 102 html_context_free (HtmlContext * ctxt)
 103 {
 104   g_string_free (ctxt->buf, TRUE);
 105   g_free (ctxt);
 106 }
 107
/* One entry of an entity table: the code point and the entity name as it
 * appears after the '&', including the trailing ';'. The tables end with
 * an entry whose 'escaped' member is NULL. */
struct EntityMap
{
  const gunichar unescaped;     /* Unicode code point */
  const gchar *escaped;         /* entity name without '&', with ';' */
};
 113
/* Predefined XML entities. unescape_string() does not decode these; it
 * passes them through in escaped form because the result is consumed as
 * pango markup. */
struct EntityMap XmlEntities[] = {
  {34, "quot;"},
  {38, "amp;"},
  {39, "apos;"},
  {60, "lt;"},
  {62, "gt;"},
  {0, NULL},
};
 122
/* Named HTML entities that unescape_string() replaces by their Unicode
 * character. Names are matched case-sensitively because several entries
 * differ only in case (e.g. "Agrave;" and "agrave;"). */
struct EntityMap HtmlEntities[] = {
/* nbsp is handled separately in unescape_string(), so it is not listed here
{ 160,  "nbsp;" }, */
  {161, "iexcl;"},
  {162, "cent;"},
  {163, "pound;"},
  {164, "curren;"},
  {165, "yen;"},
  {166, "brvbar;"},
  {167, "sect;"},
  {168, "uml;"},
  {169, "copy;"},
  {170, "ordf;"},
  {171, "laquo;"},
  {172, "not;"},
  {173, "shy;"},
  {174, "reg;"},
  {175, "macr;"},
  {176, "deg;"},
  {177, "plusmn;"},
  {178, "sup2;"},
  {179, "sup3;"},
  {180, "acute;"},
  {181, "micro;"},
  {182, "para;"},
  {183, "middot;"},
  {184, "cedil;"},
  {185, "sup1;"},
  {186, "ordm;"},
  {187, "raquo;"},
  {188, "frac14;"},
  {189, "frac12;"},
  {190, "frac34;"},
  {191, "iquest;"},
  {192, "Agrave;"},
  {193, "Aacute;"},
  {194, "Acirc;"},
  {195, "Atilde;"},
  {196, "Auml;"},
  {197, "Aring;"},
  {198, "AElig;"},
  {199, "Ccedil;"},
  {200, "Egrave;"},
  {201, "Eacute;"},
  {202, "Ecirc;"},
  {203, "Euml;"},
  {204, "Igrave;"},
  {205, "Iacute;"},
  {206, "Icirc;"},
  {207, "Iuml;"},
  {208, "ETH;"},
  {209, "Ntilde;"},
  {210, "Ograve;"},
  {211, "Oacute;"},
  {212, "Ocirc;"},
  {213, "Otilde;"},
  {214, "Ouml;"},
  {215, "times;"},
  {216, "Oslash;"},
  {217, "Ugrave;"},
  {218, "Uacute;"},
  {219, "Ucirc;"},
  {220, "Uuml;"},
  {221, "Yacute;"},
  {222, "THORN;"},
  {223, "szlig;"},
  {224, "agrave;"},
  {225, "aacute;"},
  {226, "acirc;"},
  {227, "atilde;"},
  {228, "auml;"},
  {229, "aring;"},
  {230, "aelig;"},
  {231, "ccedil;"},
  {232, "egrave;"},
  {233, "eacute;"},
  {234, "ecirc;"},
  {235, "euml;"},
  {236, "igrave;"},
  {237, "iacute;"},
  {238, "icirc;"},
  {239, "iuml;"},
  {240, "eth;"},
  {241, "ntilde;"},
  {242, "ograve;"},
  {243, "oacute;"},
  {244, "ocirc;"},
  {245, "otilde;"},
  {246, "ouml;"},
  {247, "divide;"},
  {248, "oslash;"},
  {249, "ugrave;"},
  {250, "uacute;"},
  {251, "ucirc;"},
  {252, "uuml;"},
  {253, "yacute;"},
  {254, "thorn;"},
  {255, "yuml;"},
  {338, "OElig;"},
  {339, "oelig;"},
  {352, "Scaron;"},
  {353, "scaron;"},
  {376, "Yuml;"},
  {402, "fnof;"},
  {710, "circ;"},
  {732, "tilde;"},
  {913, "Alpha;"},
  {914, "Beta;"},
  {915, "Gamma;"},
  {916, "Delta;"},
  {917, "Epsilon;"},
  {918, "Zeta;"},
  {919, "Eta;"},
  {920, "Theta;"},
  {921, "Iota;"},
  {922, "Kappa;"},
  {923, "Lambda;"},
  {924, "Mu;"},
  {925, "Nu;"},
  {926, "Xi;"},
  {927, "Omicron;"},
  {928, "Pi;"},
  {929, "Rho;"},
  {931, "Sigma;"},
  {932, "Tau;"},
  {933, "Upsilon;"},
  {934, "Phi;"},
  {935, "Chi;"},
  {936, "Psi;"},
  {937, "Omega;"},
  {945, "alpha;"},
  {946, "beta;"},
  {947, "gamma;"},
  {948, "delta;"},
  {949, "epsilon;"},
  {950, "zeta;"},
  {951, "eta;"},
  {952, "theta;"},
  {953, "iota;"},
  {954, "kappa;"},
  {955, "lambda;"},
  {956, "mu;"},
  {957, "nu;"},
  {958, "xi;"},
  {959, "omicron;"},
  {960, "pi;"},
  {961, "rho;"},
  {962, "sigmaf;"},
  {963, "sigma;"},
  {964, "tau;"},
  {965, "upsilon;"},
  {966, "phi;"},
  {967, "chi;"},
  {968, "psi;"},
  {969, "omega;"},
  {977, "thetasym;"},
  {978, "upsih;"},
  {982, "piv;"},
  {8194, "ensp;"},
  {8195, "emsp;"},
  {8201, "thinsp;"},
  {8204, "zwnj;"},
  {8205, "zwj;"},
  {8206, "lrm;"},
  {8207, "rlm;"},
  {8211, "ndash;"},
  {8212, "mdash;"},
  {8216, "lsquo;"},
  {8217, "rsquo;"},
  {8218, "sbquo;"},
  {8220, "ldquo;"},
  {8221, "rdquo;"},
  {8222, "bdquo;"},
  {8224, "dagger;"},
  {8225, "Dagger;"},
  {8226, "bull;"},
  {8230, "hellip;"},
  {8240, "permil;"},
  {8242, "prime;"},
  {8243, "Prime;"},
  {8249, "lsaquo;"},
  {8250, "rsaquo;"},
  {8254, "oline;"},
  {8260, "frasl;"},
  {8364, "euro;"},
  {8465, "image;"},
  {8472, "weierp;"},
  {8476, "real;"},
  {8482, "trade;"},
  {8501, "alefsym;"},
  {8592, "larr;"},
  {8593, "uarr;"},
  {8594, "rarr;"},
  {8595, "darr;"},
  {8596, "harr;"},
  {8629, "crarr;"},
  {8656, "lArr;"},
  {8657, "uArr;"},
  {8658, "rArr;"},
  {8659, "dArr;"},
  {8660, "hArr;"},
  {8704, "forall;"},
  {8706, "part;"},
  {8707, "exist;"},
  {8709, "empty;"},
  {8711, "nabla;"},
  {8712, "isin;"},
  {8713, "notin;"},
  {8715, "ni;"},
  {8719, "prod;"},
  {8721, "sum;"},
  {8722, "minus;"},
  {8727, "lowast;"},
  {8730, "radic;"},
  {8733, "prop;"},
  {8734, "infin;"},
  {8736, "ang;"},
  {8743, "and;"},
  {8744, "or;"},
  {8745, "cap;"},
  {8746, "cup;"},
  {8747, "int;"},
  {8756, "there4;"},
  {8764, "sim;"},
  {8773, "cong;"},
  {8776, "asymp;"},
  {8800, "ne;"},
  {8801, "equiv;"},
  {8804, "le;"},
  {8805, "ge;"},
  {8834, "sub;"},
  {8835, "sup;"},
  {8836, "nsub;"},
  {8838, "sube;"},
  {8839, "supe;"},
  {8853, "oplus;"},
  {8855, "otimes;"},
  {8869, "perp;"},
  {8901, "sdot;"},
  {8968, "lceil;"},
  {8969, "rceil;"},
  {8970, "lfloor;"},
  {8971, "rfloor;"},
  {9001, "lang;"},
  {9002, "rang;"},
  {9674, "loz;"},
  {9824, "spades;"},
  {9827, "clubs;"},
  {9829, "hearts;"},
  {9830, "diams;"},
  {0, NULL},
};
 375
 376 static gchar *
 377 unescape_string (const gchar * text)
 378 {
 379   gint i;
 380   GString *unescaped = g_string_new (NULL);
 381
 382   while (*text) {
 383     if (*text == '&') {
 384       text++;
 385
 386       /* unescape &nbsp and &nbsp; */
 387       if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
 388         unescaped = g_string_append_unichar (unescaped, 160);
 389         text += 4;
 390         if (*text == ';') {
 391           text++;
 392         }
 393         goto next;
 394       }
 395
 396       /* pass xml entities. these will be processed as pango markup */
 397       for (i = 0; XmlEntities[i].escaped; i++) {
 398         gssize len = strlen (XmlEntities[i].escaped);
 399         if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
 400           unescaped = g_string_append_c (unescaped, '&');
 401           unescaped =
 402               g_string_append_len (unescaped, XmlEntities[i].escaped, len);
 403           text += len;
 404           goto next;
 405         }
 406       }
 407
 408       /* convert html entities */
 409       for (i = 0; HtmlEntities[i].escaped; i++) {
 410         gssize len = strlen (HtmlEntities[i].escaped);
 411         if (!strncmp (text, HtmlEntities[i].escaped, len)) {
 412           unescaped =
 413               g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
 414           text += len;
 415           goto next;
 416         }
 417       }
 418
 419       if (*text == '#') {
 420         gboolean is_hex = FALSE;
 421         gunichar l;
 422         gchar *end = NULL;
 423
 424         text++;
 425         if (*text == 'x') {
 426           is_hex = TRUE;
 427           text++;
 428         }
 429         errno = 0;
 430         if (is_hex) {
 431           l = strtoul (text, &end, 16);
 432         } else {
 433           l = strtoul (text, &end, 10);
 434         }
 435
 436         if (text == end || errno != 0) {
 437           /* error occured. pass it */
 438           goto next;
 439         }
 440         unescaped = g_string_append_unichar (unescaped, l);
 441         text = end;
 442
 443         if (*text == ';') {
 444           text++;
 445         }
 446         goto next;
 447       }
 448
 449       /* escape & */
 450       unescaped = g_string_append (unescaped, "&amp;");
 451
 452     next:
 453       continue;
 454
 455     } else if (g_ascii_isspace (*text)) {
 456       unescaped = g_string_append_c (unescaped, ' ');
 457       /* strip whitespace */
 458       do {
 459         text++;
 460       } while ((*text) && g_ascii_isspace (*text));
 461     } else {
 462       unescaped = g_string_append_c (unescaped, *text);
 463       text++;
 464     }
 465   }
 466
 467   return g_string_free (unescaped, FALSE);
 468 }
 469
 470 static const gchar *
 471 string_token (const gchar * string, const gchar * delimiter, gchar ** first)
 472 {
 473   gchar *next = strstr (string, delimiter);
 474   if (next) {
 475     *first = g_strndup (string, next - string);
 476   } else {
 477     *first = g_strdup (string);
 478   }
 479   return next;
 480 }
 481
 482 static void
 483 html_context_handle_element (HtmlContext * ctxt,
 484     const gchar * string, gboolean must_close)
 485 {
 486   gchar *name = NULL;
 487   gint count = 0, i;
 488   gchar **attrs;
 489   const gchar *found, *next;
 490 #ifdef SUBPARSE_MODIFICATION
 491   const gchar *name_temp = NULL;
 492   gint j = 0;
 493 #endif
 494   /* split element name and attributes */
 495   next = string_token (string, " ", &name);
 496
 497   if (next) {
 498     /* count attributes */
 499     found = next + 1;
 500     while (TRUE) {
 501       found = strchr (found, '=');
 502       if (!found)
 503         break;
 504       found++;
 505       count++;
 506     }
 507   } else {
 508     count = 0;
 509   }
 510
 511   attrs = g_new0 (gchar *, (count + 1) * 2);
 512
 513   for (i = 0; i < count; i += 2) {
 514     gchar *attr_name = NULL, *attr_value = NULL;
 515     gsize length;
 516
 517 #ifdef SUBPARSE_MODIFICATION
 518     /* sometimes count can unnecessarily be high value, because of unrequired "=" in subtitle file.
 519      * In that case it should not crash */
 520     if (!next)
 521       break;
 522 #endif
 523
 524     next = string_token (next + 1, "=", &attr_name);
 525
 526 #ifdef SUBPARSE_MODIFICATION
 527     /* sometimes count can unnecessarily be high value, because of unrequired "=" in subtitle file.
 528      * In that case it should not crash */
 529     if (!next)
 530       break;
 531 #endif
 532
 533     next = string_token (next + 1, " ", &attr_value);
 534
 535     /* strip " or ' from attribute value */
 536     if (attr_value[0] == '"' || attr_value[0] == '\'') {
 537       gchar *tmp = g_strdup (attr_value + 1);
 538       g_free (attr_value);
 539       attr_value = tmp;
 540     }
 541
 542     length = strlen (attr_value);
 543     if (attr_value[length - 1] == '"' || attr_value[length - 1] == '\'') {
 544       attr_value[length - 1] = '\0';
 545     }
 546
 547     attrs[i] = attr_name;
 548     attrs[i + 1] = attr_value;
 549   }
 550 #ifdef SUBPARSE_MODIFICATION
 551   /* sometimes spaces can be there in between !-- and P
 552    * that also we have to take care */
 553   if (!g_ascii_strcasecmp("!--", name)) {
 554     gchar* tempchar = (gchar*)(string + 3);
 555     while (*tempchar == ' ') {
 556       tempchar++;
 557       if (*tempchar == 'P' || *tempchar == 'p') {
 558         *(name + 3) = *tempchar;
 559         *(name + 4) = '\0';
 560         next = tempchar + 1;
 561         break;
 562       }
 563     }
 564   }
 565   if (next && (!g_ascii_strcasecmp("!--P", name))) {
 566     gint attrindex = 0;
 567     count = 0;
 568     /* count attributes */
 569     found = next + 1;
 570     while (TRUE) {
 571       found = (gchar*)strcasestr (found, "lang");
 572       if (!found)
 573         break;
 574       found++;
 575       count++;
 576     }
 577     g_strfreev (attrs);
 578
 579     attrs = g_new0 (gchar *, count * 2);
 580
 581     for (i = 0; i < count; i++) {
 582       gchar *attr_name = NULL, *attr_value = NULL;
 583
 584       next = (gchar*)strcasestr (next, "lang:");
 585       attr_value = (gchar*)malloc (3);
 586       next = next + 5;
 587       strncpy (attr_value, next, 2);
 588       attr_value[2] = '\0';
 589       GST_LOG ("Language value comes as %s", attr_value);
 590       name_temp = next;
 591       while (TRUE) {
 592         if (*name_temp == '{') {
 593           int character_count = 0;
 594
 595           while (TRUE) {
 596             name_temp--;
 597
 598             if (*name_temp == '.') {
 599               attr_name = (gchar*) malloc (character_count + 1);
 600               break;
 601             }
 602             else if (*name_temp != ' ')
 603               character_count++;
 604           }
 605           break;
 606         }
 607         name_temp--;
 608       }
 609       name_temp++;
 610       for (j = 0; *(name_temp + j) != ' '; j++) {
 611         attr_name[j] = *(name_temp + j);
 612       }
 613       attr_name[j] = '\0';
 614       attrs[attrindex++] = attr_name;
 615       attrs[attrindex++] = attr_value;
 616     }
 617   } else {
 618     count = 0;
 619   }
 620 #endif
 621   ctxt->parser->start_element (ctxt, name,
 622       (const gchar **) attrs, ctxt->user_data);
 623   if (must_close) {
 624     ctxt->parser->end_element (ctxt, name, ctxt->user_data);
 625   }
 626   g_strfreev (attrs);
 627   g_free (name);
 628 }
 629
 630 static void
 631 html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
 632 {
 633   const gchar *next = NULL;
 634   ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
 635   next = ctxt->buf->str;
 636   while (TRUE) {
 637     if (next[0] == '<') {
 638       gchar *element = NULL;
 639       /* find <blahblah> */
 640       if (!strchr (next, '>')) {
 641         /* no tag end point. buffer will be process in next time */
 642         return;
 643       }
 644
 645       next = string_token (next, ">", &element);
 646       next++;
 647       if (g_str_has_suffix (next, "/")) {
 648         /* handle <blah/> */
 649         element[strlen (element) - 1] = '\0';
 650         html_context_handle_element (ctxt, element + 1, TRUE);
 651       } else if (element[1] == '/') {
 652         /* handle </blah> */
 653         ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
 654       } else {
 655         /* handle <blah> */
 656         html_context_handle_element (ctxt, element + 1, FALSE);
 657       }
 658       g_free (element);
 659     } else if (strchr (next, '<')) {
 660       gchar *text = NULL;
 661       gsize length;
 662       next = string_token (next, "<", &text);
 663       text = g_strstrip (text);
 664       length = strlen (text);
 665       ctxt->parser->text (ctxt, text, length, ctxt->user_data);
 666       g_free (text);
 667
 668     } else {
 669       gchar *text = (gchar *) next;
 670       gsize length;
 671       text = g_strstrip (text);
 672       length = strlen (text);
 673       ctxt->parser->text (ctxt, text, length, ctxt->user_data);
 674       ctxt->buf = g_string_assign (ctxt->buf, "");
 675       return;
 676     }
 677   }
 678
 679   ctxt->buf = g_string_assign (ctxt->buf, next);
 680 }
 681
 682 static gchar *
 683 has_tag (GString * str, const gchar tag)
 684 {
 685   return strrchr (str->str, tag);
 686 }
 687
 688 static void
 689 sami_context_push_state (GstSamiContext * sctx, char state)
 690 {
 691   GST_LOG ("state %c", state);
 692   g_string_append_c (sctx->state, state);
 693 }
 694
 695 static void
 696 sami_context_pop_state (GstSamiContext * sctx, char state)
 697 {
 698   GString *str = g_string_new ("");
 699   GString *context_state = sctx->state;
 700   int i;
 701
 702   GST_LOG ("state %c", state);
 703   for (i = context_state->len - 1; i >= 0; i--) {
 704     switch (context_state->str[i]) {
 705       case ITALIC_TAG:         /* <i> */
 706       {
 707         g_string_append (str, "</i>");
 708         break;
 709       }
 710       case SPAN_TAG:           /* <span foreground= > */
 711       {
 712         g_string_append (str, "</span>");
 713         break;
 714       }
 715       case RUBY_TAG:           /* <span size= >  -- ruby */
 716       {
 717         break;
 718       }
 719       case RT_TAG:             /*  ruby */
 720       {
 721         /* FIXME: support for furigana/ruby once implemented in pango */
 722         g_string_append (sctx->rubybuf, "</span>");
 723         if (has_tag (context_state, ITALIC_TAG)) {
 724           g_string_append (sctx->rubybuf, "</i>");
 725         }
 726
 727         break;
 728       }
 729       default:
 730         break;
 731     }
 732     if (context_state->str[i] == state) {
 733       g_string_append (sctx->buf, str->str);
 734       g_string_free (str, TRUE);
 735       g_string_truncate (context_state, i);
 736       return;
 737     }
 738   }
 739   if (state == CLEAR_TAG) {
 740     g_string_append (sctx->buf, str->str);
 741     g_string_truncate (context_state, 0);
 742   }
 743   g_string_free (str, TRUE);
 744 }
 745
 746 static void
 747 handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
 748 {
 749   int i;
 750
 751   sami_context_pop_state (sctx, CLEAR_TAG);
 752   if (atts != NULL) {
 753     for (i = 0; (atts[i] != NULL); i += 2) {
 754       const gchar *key, *value;
 755
 756       key = atts[i];
 757       value = atts[i + 1];
 758
 759       if (!value)
 760         continue;
 761       if (!g_ascii_strcasecmp ("start", key)) {
 762         /* Only set a new start time if we don't have text pending */
 763         if (sctx->resultbuf->len == 0)
 764           sctx->time1 = sctx->time2;
 765
 766         sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
 767 #ifdef SUBPARSE_MODIFICATION
 768         sctx->time3 = sctx->time2;
 769 #endif
 770         sctx->time2 = MAX (sctx->time2, sctx->time1);
 771         g_string_append (sctx->resultbuf, sctx->buf->str);
 772         sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
 773         g_string_truncate (sctx->buf, 0);
 774       }
 775     }
 776   }
 777 }
 778
 779 static void
 780 handle_start_font (GstSamiContext * sctx, const gchar ** atts)
 781 {
 782   int i;
 783
 784   sami_context_pop_state (sctx, SPAN_TAG);
 785   if (atts != NULL) {
 786     g_string_append (sctx->buf, "<span");
 787     for (i = 0; (atts[i] != NULL); i += 2) {
 788       const gchar *key, *value;
 789
 790       key = atts[i];
 791       value = atts[i + 1];
 792
 793       if (!value)
 794         continue;
 795       if (!g_ascii_strcasecmp ("color", key)) {
 796         /*
 797          * There are invalid color value in many
 798          * sami files.
 799          * It will fix hex color value that start without '#'
 800          */
 801         const gchar *sharp = "";
 802         int len = strlen (value);
 803
 804         if (!(*value == '#' && len == 7)) {
 805           gchar *r;
 806
 807           /* check if it looks like hex */
 808           if (strtol ((const char *) value, &r, 16) >= 0 &&
 809               ((gchar *) r == (value + 6) && len == 6)) {
 810             sharp = "#";
 811           }
 812         }
 813         /* some colours can be found in many sami files, but X RGB database
 814          * doesn't contain a colour by this name, so map explicitly */
 815         if (!g_ascii_strcasecmp ("aqua", value)) {
 816           value = "#00ffff";
 817         } else if (!g_ascii_strcasecmp ("crimson", value)) {
 818           value = "#dc143c";
 819         } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
 820           value = "#ff00ff";
 821         } else if (!g_ascii_strcasecmp ("indigo", value)) {
 822           value = "#4b0082";
 823         } else if (!g_ascii_strcasecmp ("lime", value)) {
 824           value = "#00ff00";
 825         } else if (!g_ascii_strcasecmp ("olive", value)) {
 826           value = "#808000";
 827         } else if (!g_ascii_strcasecmp ("silver", value)) {
 828           value = "#c0c0c0";
 829         } else if (!g_ascii_strcasecmp ("teal", value)) {
 830           value = "#008080";
 831         }
 832         g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
 833             value);
 834       } else if (!g_ascii_strcasecmp ("face", key)) {
 835         g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
 836       }
 837     }
 838     g_string_append_c (sctx->buf, '>');
 839     sami_context_push_state (sctx, SPAN_TAG);
 840   }
 841 }
 842
 843 #ifdef SUBPARSE_MODIFICATION
 844 static void
 845 handle_p (GstSamiContext * sctx, const gchar ** atts)
 846 {
 847   int i;
 848
 849   if (atts != NULL) {
 850     for (i = 0; (atts[i] != NULL); i += 2) {
 851       const gchar *key, *value;
 852
 853       key = atts[i];
 854       value = atts[i + 1];
 855
 856       if (sctx->current_language && value && strcmp(sctx->current_language, value)
 857           && (sctx->time1 == sctx->time2))
 858         sctx->language_changed = TRUE;
 859
 860       else if (!sctx->current_language)
 861         sctx->current_language = (gchar*) malloc (128);
 862
 863       if (key && !g_ascii_strcasecmp ("class", key) && value) {
 864         strcpy (sctx->current_language, value);
 865         if (sctx->desired_language == NULL && key) {
 866           sctx->desired_language = (gchar*) malloc (strlen(value) + 1);
 867           strcpy(sctx->desired_language, value);
 868           GST_LOG("no language list was found and desired lang was set to %s",sctx->desired_language);
 869         }
 870       }
 871       if (sctx->language_changed)
 872       {
 873          sctx->time1 = 0;
 874          sctx->time2 = sctx->time3;
 875          sctx->language_changed = FALSE;
 876       }
 877       if (!value)
 878         continue;
 879     }
 880   }
 881 }
 882
 883 static void
 884 handle_start_language_list (GstSamiContext * sctx, const gchar ** atts)
 885 {
 886   int i = 0;
 887   int attrIndex = 0;
 888   GstLangStruct *new = NULL;
 889   GstLangStruct *temp = NULL;
 890
 891   if (atts != NULL) {
 892     if (g_list_length (sctx->lang_list)) {
 893       GST_LOG ("We already got the language list");
 894       return;
 895     }
 896     for (i = 0; (atts[attrIndex] != NULL); i++) {
 897       const gchar *key, *value;
 898
 899       key = atts[attrIndex++];
 900       value = atts[attrIndex++];
 901
 902       GST_LOG ("Inside handle_start_language_list key: %s, value: %s", key, value);
 903
 904       if (!value)
 905         continue;
 906
 907       new = g_new0 (GstLangStruct, 1);
 908       new->language_code = (gchar*) malloc (strlen(value) + 1);
 909       if (new->language_code && value)
 910         strcpy (new->language_code, value);
 911       new->language_key = (gchar*) malloc (strlen(key) + 1);
 912       if (new->language_key && key)
 913         strcpy (new->language_key, key);
 914       sctx->lang_list = g_list_append (sctx->lang_list, new);
 915       temp = g_list_nth_data (sctx->lang_list, i);
 916       if (sctx->desired_language == NULL && key){
 917         sctx->desired_language = (gchar*) malloc (strlen(key) + 1);
 918         strcpy(sctx->desired_language, key);
 919       }
 920
 921       if (temp)
 922         GST_LOG ("Inside handle_start_language_list of glist key: %s, value: %s",
 923                     temp->language_key, temp->language_code);
 924     }
 925   }
 926 }
 927 #endif
 928
 929 static void
 930 handle_start_element (HtmlContext * ctx, const gchar * name,
 931     const char **atts, gpointer user_data)
 932 {
 933   GstSamiContext *sctx = (GstSamiContext *) user_data;
 934
 935   GST_LOG ("name:%s", name);
 936
 937   if (!g_ascii_strcasecmp ("sync", name)) {
 938     handle_start_sync (sctx, atts);
 939     sctx->in_sync = TRUE;
 940   } else if (!g_ascii_strcasecmp ("font", name)) {
 941     handle_start_font (sctx, atts);
 942   } else if (!g_ascii_strcasecmp ("ruby", name)) {
 943     sami_context_push_state (sctx, RUBY_TAG);
 944   } else if (!g_ascii_strcasecmp ("br", name)) {
 945 #ifdef SUBPARSE_MODIFICATION
 946     if (sctx->current_language && sctx->desired_language &&
 947         !strcmp(sctx->current_language, sctx->desired_language))
 948 #endif
 949       g_string_append_c (sctx->buf, '\n');
 950     /* FIXME: support for furigana/ruby once implemented in pango */
 951   } else if (!g_ascii_strcasecmp ("rt", name)) {
 952     if (has_tag (sctx->state, ITALIC_TAG)) {
 953       g_string_append (sctx->rubybuf, "<i>");
 954     }
 955     g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
 956     sami_context_push_state (sctx, RT_TAG);
 957   } else if (!g_ascii_strcasecmp ("i", name)) {
 958 #ifdef SUBPARSE_MODIFICATION
 959     if (sctx->current_language && sctx->desired_language &&
 960         !strcmp(sctx->current_language, sctx->desired_language))
 961 #endif
 962       g_string_append (sctx->buf, "<i>");
 963     sami_context_push_state (sctx, ITALIC_TAG);
 964   } else if (!g_ascii_strcasecmp ("p", name)) {
 965 #ifdef SUBPARSE_MODIFICATION
 966     handle_p (sctx, atts);
 967   } else if (!g_ascii_strcasecmp ("!--P", name)) {
 968     handle_start_language_list (sctx, atts);
 969 #endif
 970   }
 971 }
 972
 973 static void
 974 handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
 975 {
 976   GstSamiContext *sctx = (GstSamiContext *) user_data;
 977
 978   GST_LOG ("name:%s", name);
 979
 980   if (!g_ascii_strcasecmp ("sync", name)) {
 981     sctx->in_sync = FALSE;
 982   } else if ((!g_ascii_strcasecmp ("body", name)) ||
 983       (!g_ascii_strcasecmp ("sami", name))) {
 984     /* We will usually have one buffer left when the body is closed
 985      * as we need the next sync to actually send it */
 986
 987 #ifdef SUBPARSE_MODIFICATION
 988     sctx->end_body = TRUE;
 989 #endif
 990
 991     if (sctx->buf->len != 0) {
 992       /* Only set a new start time if we don't have text pending */
 993       if (sctx->resultbuf->len == 0)
 994         sctx->time1 = sctx->time2;
 995
 996       sctx->time2 = GST_CLOCK_TIME_NONE;
 997       g_string_append (sctx->resultbuf, sctx->buf->str);
 998       sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
 999       g_string_truncate (sctx->buf, 0);
1000     }
1001   } else if (!g_ascii_strcasecmp ("font", name)) {
1002     sami_context_pop_state (sctx, SPAN_TAG);
1003   } else if (!g_ascii_strcasecmp ("ruby", name)) {
1004     sami_context_pop_state (sctx, RUBY_TAG);
1005   } else if (!g_ascii_strcasecmp ("i", name)) {
1006     sami_context_pop_state (sctx, ITALIC_TAG);
1007   }
1008 }
1009
1010 static void
1011 handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
1012     gpointer user_data)
1013 {
1014   GstSamiContext *sctx = (GstSamiContext *) user_data;
1015
1016   /* Skip everything except content of the sync elements */
1017   if (!sctx->in_sync)
1018     return;
1019 #ifdef SUBPARSE_MODIFICATION
1020   if (has_tag (sctx->state, RT_TAG) && (sctx->current_language && sctx->desired_language &&
1021        !strcmp(sctx->current_language, sctx->desired_language))) {
1022 #else
1023   if (has_tag (sctx->state, RT_TAG)) {
1024 #endif
1025     g_string_append_c (sctx->rubybuf, ' ');
1026     g_string_append (sctx->rubybuf, text);
1027     g_string_append_c (sctx->rubybuf, ' ');
1028   } else {
1029 #ifdef SUBPARSE_MODIFICATION
1030     if (sctx->current_language && sctx->desired_language &&
1031         !strcmp(sctx->current_language, sctx->desired_language))
1032 #endif
1033       g_string_append (sctx->buf, text);
1034   }
1035 }
1036
/* Callback table for the SAMI parser; sami_context_init() passes its address
 * to html_context_new() together with the GstSamiContext.  The initializer
 * is positional, so the order of the entries must match the member order of
 * HtmlParser. */
static HtmlParser samiParser = {
  handle_start_element,         /* start_element */
  handle_end_element,           /* end_element */
  handle_text,                  /* text */
};
1042
1043 void
1044 sami_context_init (ParserState * state)
1045 {
1046   GstSamiContext *context;
1047
1048   g_assert (state->user_data == NULL);
1049
1050   context = g_new0 (GstSamiContext, 1);
1051
1052   context->htmlctxt = html_context_new (&samiParser, context);
1053   context->buf = g_string_new ("");
1054   context->rubybuf = g_string_new ("");
1055   context->resultbuf = g_string_new ("");
1056   context->state = g_string_new ("");
1057 #ifdef SUBPARSE_MODIFICATION
1058   context->current_language = NULL;
1059   context->desired_language = NULL;
1060   context->lang_list = NULL;
1061   context->language_changed = FALSE;
1062   context->end_body = FALSE;
1063 #endif
1064   state->user_data = context;
1065 }
1066
1067 void
1068 sami_context_deinit (ParserState * state)
1069 {
1070   GstSamiContext *context = (GstSamiContext *) state->user_data;
1071 #ifdef SUBPARSE_MODIFICATION
1072   GstLangStruct *temp = NULL;
1073   int i = 0;
1074 #endif
1075   if (context) {
1076     html_context_free (context->htmlctxt);
1077     context->htmlctxt = NULL;
1078     g_string_free (context->buf, TRUE);
1079     g_string_free (context->rubybuf, TRUE);
1080     g_string_free (context->resultbuf, TRUE);
1081     g_string_free (context->state, TRUE);
1082 #ifdef SUBPARSE_MODIFICATION
1083     if (context->lang_list) {
1084       while ((temp = g_list_nth_data (context->lang_list, i))) {
1085         if (temp->language_code)
1086           free (temp->language_code);
1087         temp->language_code = NULL;
1088         if (temp->language_key)
1089           free (temp->language_key);
1090         temp->language_key = NULL;
1091         g_free (temp);
1092         i++;
1093       }
1094       g_list_free (context->lang_list);
1095     }
1096     context->lang_list = NULL;
1097
1098     if (context->current_language)
1099       free (context->current_language);
1100     context->current_language = NULL;
1101
1102     context->desired_language = NULL;
1103 #endif
1104     g_free (context);
1105     state->user_data = NULL;
1106   }
1107 }
1108
1109 void
1110 sami_context_reset (ParserState * state)
1111 {
1112   GstSamiContext *context = (GstSamiContext *) state->user_data;
1113
1114   if (context) {
1115     g_string_truncate (context->buf, 0);
1116     g_string_truncate (context->rubybuf, 0);
1117     g_string_truncate (context->resultbuf, 0);
1118     g_string_truncate (context->state, 0);
1119     context->has_result = FALSE;
1120     context->in_sync = FALSE;
1121     context->time1 = 0;
1122     context->time2 = 0;
1123   }
1124 }
1125
1126 #ifdef SUBPARSE_MODIFICATION
1127 void
1128 sami_context_change_language (ParserState * state)
1129 {
1130   GstSamiContext *context = (GstSamiContext *) state->user_data;
1131   GST_LOG ("**********desired language was %s**************", context->desired_language);
1132   free (context->desired_language);
1133   if(state->current_language) {
1134     context->desired_language = state->current_language;
1135   } else {
1136     context->desired_language = state->msl_language;
1137   }
1138   GST_LOG ("desired language changed to %s", context->desired_language);
1139 }
1140
1141 gchar *
1142 sami_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding,
1143     gsize * consumed, GError ** err, GstSubParse * self)
1144 {
1145   gchar *ret = NULL;
1146
1147   /* The char cast is necessary in glib < 2.24 */
1148   ret =
1149       g_convert_with_fallback (str, len, "UTF-8", encoding, (char *) "*",
1150       consumed, NULL, err);
1151
1152   if (ret == NULL)
1153   {
1154     GST_DEBUG_OBJECT (self, "g_convert_with_fallback returns NULL");
1155     return ret;
1156   }
1157
1158   /* + 3 to skip UTF-8 BOM if it was added */
1159   len = strlen (ret);
1160   if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB
1161       && (guint8) ret[2] == 0xBF)
1162     g_memmove (ret, ret + 3, len + 1 - 3);
1163
1164   return ret;
1165 }
1166
1167 gboolean
1168 sami_validate_langlist_body(GList * lang_list, GstSubParse * self){
1169   gchar * file_path_type = NULL;
1170   gchar * file_path = NULL;
1171   gchar   line[1024];
1172   FILE  * fp = NULL;
1173   guint i = 0, found_count = 0;
1174   const guint list_len = g_list_length(lang_list);
1175   gboolean counter[MAX_LANGUAGE];
1176   struct LangStruct
1177   {
1178       gchar *language_code;
1179       gchar *language_key;
1180   } * lang;
1181
1182   GstQuery *cquery;
1183   GstStructure *structure;
1184   const GValue *value;
1185   structure = gst_structure_new ("FileSrcURI",
1186                                  "file-uri", G_TYPE_STRING, NULL, NULL);
1187
1188   cquery = gst_query_new_application (GST_QUERY_CUSTOM, structure);
1189
1190   if (!gst_pad_peer_query (self->sinkpad, cquery))
1191   {
1192     GST_DEBUG_OBJECT (self, "failed to query SMI file path");
1193     gst_query_unref (cquery);
1194     return FALSE;
1195   }
1196   structure = gst_query_get_structure (cquery);
1197   value = gst_structure_get_value (structure, "file-uri");
1198   file_path = g_strdup (g_value_get_string (value));
1199
1200   if (file_path == NULL){
1201     GST_DEBUG_OBJECT (self, "could not parse the SMI file path");
1202     gst_query_unref (cquery);
1203     return FALSE;
1204   }
1205   gst_query_unref (cquery);
1206
1207   GST_INFO_OBJECT (self, "file path comes as %s", file_path);
1208
1209   file_path_type = g_strndup ((gchar *) file_path, 4);
1210   GST_INFO_OBJECT (self, "received file path by query = %s,%s", file_path,file_path_type);
1211   if (!g_strcmp0(file_path_type, "file")){
1212     file_path += 7;
1213     GST_INFO_OBJECT (self, "file path comes as %s", file_path);
1214
1215     fp = fopen (file_path, "r");
1216     if (!fp){
1217       GST_DEBUG_OBJECT (self, "failed to open file");
1218       return FALSE;
1219     }
1220
1221     for(i=0;i<list_len;i++){
1222       counter[i] = FALSE;
1223     }
1224
1225     while(!feof(fp) && found_count < list_len){
1226       GError *err = NULL;
1227       gsize * consumed = NULL;
1228       gint gap = 0;
1229       guint charCount = 0;
1230       gchar* result = NULL;
1231       gchar* temp = NULL;
1232       gchar* temp_lang = NULL;
1233       gchar * temp1 = NULL;
1234       gchar *con_temp_lang = NULL;
1235       gchar *con_temp = NULL;
1236       gboolean conversion = TRUE;
1237       charCount = fread (line, sizeof(char), 1024, fp);
1238       if (!charCount) {
1239         GST_WARNING_OBJECT (self, "fread returned zero bytes");
1240         continue;
1241       }
1242       GST_DEBUG("value of detected encoding is %s and self encoding is %s",self->detected_encoding,self->encoding);
1243       if (self->detected_encoding && strcmp (self->detected_encoding, "UTF-8") && conversion){
1244         result = sami_convert_to_utf8 (line, charCount, self->detected_encoding, consumed, &err, self);
1245       }
1246       if(result == NULL) {
1247          result = line;
1248          conversion =  FALSE;
1249       }
1250       con_temp = g_utf8_strdown (result,strlen(result));
1251       temp = con_temp;
1252       while(con_temp) {
1253         con_temp = g_strstr_len(con_temp, strlen(con_temp),"class=");
1254         if(con_temp) {
1255           temp1 = g_strstr_len(con_temp+1, strlen(con_temp),"class=");
1256         }
1257         if(temp1 && con_temp){
1258           gap = strlen(con_temp)-strlen(temp1);
1259         }else if(con_temp) {
1260           gap = strlen(con_temp);
1261         } else {
1262           continue;
1263         }
1264         if(con_temp){
1265           for(i=0;i<list_len;i++){
1266             if(counter[i]==TRUE){
1267               con_temp=con_temp+1;
1268               continue;
1269             }
1270             lang = (struct LangStruct *) g_list_nth_data(lang_list,i);
1271             if(lang) {
1272               temp_lang = (gchar*)g_malloc(strlen(lang->language_key)+1);
1273               strcpy(temp_lang,lang->language_key);
1274               con_temp_lang = g_utf8_strdown (temp_lang,strlen(temp_lang));
1275               if(g_strstr_len(con_temp,gap,con_temp_lang)){
1276                 found_count++;
1277                 counter[i]=TRUE;
1278                 GST_INFO_OBJECT (self, " valid Language in list : [%s]", lang->language_key);
1279                 con_temp=con_temp+1;
1280               }
1281               g_free(temp_lang);
1282               g_free(con_temp_lang);
1283             }
1284           }
1285         }
1286       }
1287       if(conversion)
1288        g_free (result);
1289       if(temp)
1290        g_free(temp);
1291
1292     }
1293
1294     if(found_count < list_len){
1295       for(i=0;i<list_len;i++){
1296         if(counter[i]==FALSE)
1297           lang_list = g_list_delete_link(lang_list,g_list_nth(lang_list,i));
1298       }
1299     }
1300   }
1301   return TRUE;
1302 }
1303 #endif
1304
1305 gchar *
1306 parse_sami (ParserState * state, const gchar * line)
1307 {
1308   gchar *ret = NULL;
1309 #ifdef SUBPARSE_MODIFICATION
1310   gint64 clip_start = 0, clip_stop = 0;
1311   gboolean in_seg = FALSE;
1312 #endif
1313   GstSamiContext *context = (GstSamiContext *) state->user_data;
1314
1315   gchar *unescaped = unescape_string (line);
1316   html_context_parse (context->htmlctxt, (gchar *) unescaped,
1317       strlen (unescaped));
1318 #ifdef SUBPARSE_MODIFICATION
1319   if (context->lang_list)
1320     state->language_list = context->lang_list;
1321
1322   if (context->desired_language)
1323     state->current_language = context->desired_language;
1324 #endif
1325   g_free (unescaped);
1326 #ifdef SUBPARSE_MODIFICATION
1327   if (context->desired_language && context->current_language) {
1328     if ((!strcmp(context->current_language, context->desired_language)) || context->end_body) {
1329 #endif
1330       if (context->has_result) {
1331         if (context->rubybuf->len) {
1332           context->rubybuf = g_string_append_c (context->rubybuf, '\n');
1333           g_string_prepend (context->resultbuf, context->rubybuf->str);
1334           context->rubybuf = g_string_truncate (context->rubybuf, 0);
1335         }
1336
1337         ret = g_string_free (context->resultbuf, FALSE);
1338         context->resultbuf = g_string_new ("");
1339         state->start_time = context->time1;
1340         state->duration = context->time2 - context->time1;
1341         context->has_result = FALSE;
1342       }
1343 #ifdef SUBPARSE_MODIFICATION
1344       context->end_body = FALSE;
1345     }
1346   }
1347   /* Check our segment start/stop */
1348   in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
1349              state->start_time, state->start_time + state->duration, &clip_start,
1350              &clip_stop);
1351
1352   /* No need to send that text if it's out of segment */
1353   if (in_seg) {
1354     state->start_time = clip_start;
1355     state->duration = clip_stop - clip_start;
1356   } else {
1357     return NULL;
1358   }
1359 #endif
1360   return ret;
1361 }