/* GStreamer SAMI subtitle parser
- * Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
+ * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
*/
+#define _GNU_SOURCE
#include "samiparse.h"
-#include <libxml/HTMLparser.h>
+#include <glib.h>
+#include <errno.h>
#include <string.h>
+#include <stdlib.h>
#define ITALIC_TAG 'i'
#define SPAN_TAG 's'
#define RT_TAG 't'
#define CLEAR_TAG '0'
+typedef struct _HtmlParser HtmlParser;
+typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;
-
+#ifdef SUBPARSE_MODIFICATION
+typedef struct _LanguageStruct GstLangStruct;
+/* One language entry of the SAMI style comment. Entries are created by
+ * handle_start_language_list() and stored in GstSamiContext.lang_list. */
+struct _LanguageStruct
+{
+ /* copy of the attribute value (the characters that follow "lang:") */
+ gchar *language_code;
+ /* copy of the attribute name (the style class name) */
+ gchar *language_key;
+};
+#define MAX_LANGUAGE 10
+#endif
struct _GstSamiContext
{
GString *buf; /* buffer to collect content */
* that tags can be closed properly on
* 'sync' tags. See _context_push_state()
* and _context_pop_state(). */
- htmlParserCtxtPtr htmlctxt; /* html parser context */
+ HtmlContext *htmlctxt; /* html parser context */
gboolean has_result; /* set when ready to push out result */
gboolean in_sync; /* flag to avoid appending anything except the
* content of the sync elements to buf */
guint64 time1; /* previous start attribute in sync tag */
guint64 time2; /* current start attribute in sync tag */
+#ifdef SUBPARSE_MODIFICATION
+ guint64 time3; /* To store the last current time when language is changed */
+ GList *lang_list; /* Language list for an external subtitle file */
+ gchar *current_language; /* Current language parsed */
+ gchar *desired_language; /* Language set by user */
+ gboolean language_changed; /* language changed signal */
+ gboolean end_body; /* </BODY> reached */
+#endif
+};
+
+/* Callback table of the built-in HTML tokenizer. Every callback receives the
+ * HtmlContext and the user_data that was given to html_context_new(). */
+struct _HtmlParser
+{
+ /* called for an opening tag; attr holds alternating attribute names and
+ * values, and the handlers in this file walk it until a NULL name */
+ void (*start_element) (HtmlContext * ctx,
+ const gchar * name, const gchar ** attr, gpointer user_data);
+ /* called for a closing tag, and right after start_element for <tag/> */
+ void (*end_element) (HtmlContext * ctx,
+ const gchar * name, gpointer user_data);
+ /* called for the text found between two tags */
+ void (*text) (HtmlContext * ctx,
+ const gchar * text, gsize text_len, gpointer user_data);
+};
+
+/* State of one tokenizer instance. */
+struct _HtmlContext
+{
+ /* callbacks invoked while parsing */
+ const HtmlParser *parser;
+ /* opaque pointer handed back to every callback */
+ gpointer user_data;
+ /* input accumulated by html_context_parse() */
+ GString *buf;
+};
+
+/* Creates a tokenizer context that reports to the callbacks in @parser.
+ * @user_data is stored and passed back to every callback. The context
+ * starts with an empty input buffer; release it with html_context_free(). */
+static HtmlContext *
+html_context_new (HtmlParser * parser, gpointer user_data)
+{
+  HtmlContext *context;
+
+  context = g_new0 (HtmlContext, 1);
+  context->buf = g_string_new (NULL);
+  context->user_data = user_data;
+  context->parser = parser;
+
+  return context;
+}
+
+/* Releases @ctxt together with its input buffer. A NULL @ctxt is ignored so
+ * that the function can be called unconditionally from cleanup paths. */
+static void
+html_context_free (HtmlContext * ctxt)
+{
+  if (ctxt == NULL)
+    return;
+
+  g_string_free (ctxt->buf, TRUE);
+  g_free (ctxt);
+}
+
+/* Associates an entity name with the Unicode code point it stands for. The
+ * name is stored without the leading '&' but with the trailing ';'. */
+struct EntityMap
+{
+ /* Unicode code point */
+ const gunichar unescaped;
+ /* entity name, e.g. "amp;" */
+ const gchar *escaped;
+};
+
+/* Entities that unescape_string() keeps in escaped form so that they are
+ * processed as pango markup later. The table ends with a {0, NULL} entry. */
+struct EntityMap XmlEntities[] = {
+ {34, "quot;"},
+ {38, "amp;"},
+ {39, "apos;"},
+ {60, "lt;"},
+ {62, "gt;"},
+ {0, NULL},
};
+/* Named HTML entities that unescape_string() replaces by their code point.
+ * Names are compared case-sensitively there ("Agrave;" and "agrave;" are
+ * different entries). The table ends with a {0, NULL} entry. */
+struct EntityMap HtmlEntities[] = {
+/* nbsp is not listed here because unescape_string() handles it separately
+{ 160, "nbsp;" }, */
+ {161, "iexcl;"},
+ {162, "cent;"},
+ {163, "pound;"},
+ {164, "curren;"},
+ {165, "yen;"},
+ {166, "brvbar;"},
+ {167, "sect;"},
+ {168, "uml;"},
+ {169, "copy;"},
+ {170, "ordf;"},
+ {171, "laquo;"},
+ {172, "not;"},
+ {173, "shy;"},
+ {174, "reg;"},
+ {175, "macr;"},
+ {176, "deg;"},
+ {177, "plusmn;"},
+ {178, "sup2;"},
+ {179, "sup3;"},
+ {180, "acute;"},
+ {181, "micro;"},
+ {182, "para;"},
+ {183, "middot;"},
+ {184, "cedil;"},
+ {185, "sup1;"},
+ {186, "ordm;"},
+ {187, "raquo;"},
+ {188, "frac14;"},
+ {189, "frac12;"},
+ {190, "frac34;"},
+ {191, "iquest;"},
+ {192, "Agrave;"},
+ {193, "Aacute;"},
+ {194, "Acirc;"},
+ {195, "Atilde;"},
+ {196, "Auml;"},
+ {197, "Aring;"},
+ {198, "AElig;"},
+ {199, "Ccedil;"},
+ {200, "Egrave;"},
+ {201, "Eacute;"},
+ {202, "Ecirc;"},
+ {203, "Euml;"},
+ {204, "Igrave;"},
+ {205, "Iacute;"},
+ {206, "Icirc;"},
+ {207, "Iuml;"},
+ {208, "ETH;"},
+ {209, "Ntilde;"},
+ {210, "Ograve;"},
+ {211, "Oacute;"},
+ {212, "Ocirc;"},
+ {213, "Otilde;"},
+ {214, "Ouml;"},
+ {215, "times;"},
+ {216, "Oslash;"},
+ {217, "Ugrave;"},
+ {218, "Uacute;"},
+ {219, "Ucirc;"},
+ {220, "Uuml;"},
+ {221, "Yacute;"},
+ {222, "THORN;"},
+ {223, "szlig;"},
+ {224, "agrave;"},
+ {225, "aacute;"},
+ {226, "acirc;"},
+ {227, "atilde;"},
+ {228, "auml;"},
+ {229, "aring;"},
+ {230, "aelig;"},
+ {231, "ccedil;"},
+ {232, "egrave;"},
+ {233, "eacute;"},
+ {234, "ecirc;"},
+ {235, "euml;"},
+ {236, "igrave;"},
+ {237, "iacute;"},
+ {238, "icirc;"},
+ {239, "iuml;"},
+ {240, "eth;"},
+ {241, "ntilde;"},
+ {242, "ograve;"},
+ {243, "oacute;"},
+ {244, "ocirc;"},
+ {245, "otilde;"},
+ {246, "ouml;"},
+ {247, "divide;"},
+ {248, "oslash;"},
+ {249, "ugrave;"},
+ {250, "uacute;"},
+ {251, "ucirc;"},
+ {252, "uuml;"},
+ {253, "yacute;"},
+ {254, "thorn;"},
+ {255, "yuml;"},
+ {338, "OElig;"},
+ {339, "oelig;"},
+ {352, "Scaron;"},
+ {353, "scaron;"},
+ {376, "Yuml;"},
+ {402, "fnof;"},
+ {710, "circ;"},
+ {732, "tilde;"},
+ {913, "Alpha;"},
+ {914, "Beta;"},
+ {915, "Gamma;"},
+ {916, "Delta;"},
+ {917, "Epsilon;"},
+ {918, "Zeta;"},
+ {919, "Eta;"},
+ {920, "Theta;"},
+ {921, "Iota;"},
+ {922, "Kappa;"},
+ {923, "Lambda;"},
+ {924, "Mu;"},
+ {925, "Nu;"},
+ {926, "Xi;"},
+ {927, "Omicron;"},
+ {928, "Pi;"},
+ {929, "Rho;"},
+ {931, "Sigma;"},
+ {932, "Tau;"},
+ {933, "Upsilon;"},
+ {934, "Phi;"},
+ {935, "Chi;"},
+ {936, "Psi;"},
+ {937, "Omega;"},
+ {945, "alpha;"},
+ {946, "beta;"},
+ {947, "gamma;"},
+ {948, "delta;"},
+ {949, "epsilon;"},
+ {950, "zeta;"},
+ {951, "eta;"},
+ {952, "theta;"},
+ {953, "iota;"},
+ {954, "kappa;"},
+ {955, "lambda;"},
+ {956, "mu;"},
+ {957, "nu;"},
+ {958, "xi;"},
+ {959, "omicron;"},
+ {960, "pi;"},
+ {961, "rho;"},
+ {962, "sigmaf;"},
+ {963, "sigma;"},
+ {964, "tau;"},
+ {965, "upsilon;"},
+ {966, "phi;"},
+ {967, "chi;"},
+ {968, "psi;"},
+ {969, "omega;"},
+ {977, "thetasym;"},
+ {978, "upsih;"},
+ {982, "piv;"},
+ {8194, "ensp;"},
+ {8195, "emsp;"},
+ {8201, "thinsp;"},
+ {8204, "zwnj;"},
+ {8205, "zwj;"},
+ {8206, "lrm;"},
+ {8207, "rlm;"},
+ {8211, "ndash;"},
+ {8212, "mdash;"},
+ {8216, "lsquo;"},
+ {8217, "rsquo;"},
+ {8218, "sbquo;"},
+ {8220, "ldquo;"},
+ {8221, "rdquo;"},
+ {8222, "bdquo;"},
+ {8224, "dagger;"},
+ {8225, "Dagger;"},
+ {8226, "bull;"},
+ {8230, "hellip;"},
+ {8240, "permil;"},
+ {8242, "prime;"},
+ {8243, "Prime;"},
+ {8249, "lsaquo;"},
+ {8250, "rsaquo;"},
+ {8254, "oline;"},
+ {8260, "frasl;"},
+ {8364, "euro;"},
+ {8465, "image;"},
+ {8472, "weierp;"},
+ {8476, "real;"},
+ {8482, "trade;"},
+ {8501, "alefsym;"},
+ {8592, "larr;"},
+ {8593, "uarr;"},
+ {8594, "rarr;"},
+ {8595, "darr;"},
+ {8596, "harr;"},
+ {8629, "crarr;"},
+ {8656, "lArr;"},
+ {8657, "uArr;"},
+ {8658, "rArr;"},
+ {8659, "dArr;"},
+ {8660, "hArr;"},
+ {8704, "forall;"},
+ {8706, "part;"},
+ {8707, "exist;"},
+ {8709, "empty;"},
+ {8711, "nabla;"},
+ {8712, "isin;"},
+ {8713, "notin;"},
+ {8715, "ni;"},
+ {8719, "prod;"},
+ {8721, "sum;"},
+ {8722, "minus;"},
+ {8727, "lowast;"},
+ {8730, "radic;"},
+ {8733, "prop;"},
+ {8734, "infin;"},
+ {8736, "ang;"},
+ {8743, "and;"},
+ {8744, "or;"},
+ {8745, "cap;"},
+ {8746, "cup;"},
+ {8747, "int;"},
+ {8756, "there4;"},
+ {8764, "sim;"},
+ {8773, "cong;"},
+ {8776, "asymp;"},
+ {8800, "ne;"},
+ {8801, "equiv;"},
+ {8804, "le;"},
+ {8805, "ge;"},
+ {8834, "sub;"},
+ {8835, "sup;"},
+ {8836, "nsub;"},
+ {8838, "sube;"},
+ {8839, "supe;"},
+ {8853, "oplus;"},
+ {8855, "otimes;"},
+ {8869, "perp;"},
+ {8901, "sdot;"},
+ {8968, "lceil;"},
+ {8969, "rceil;"},
+ {8970, "lfloor;"},
+ {8971, "rfloor;"},
+ {9001, "lang;"},
+ {9002, "rang;"},
+ {9674, "loz;"},
+ {9824, "spades;"},
+ {9827, "clubs;"},
+ {9829, "hearts;"},
+ {9830, "diams;"},
+ {0, NULL},
+};
+
+/* Rewrites the character references of @text so that the result can be
+ * used as pango markup.
+ *
+ *  - "&nbsp" (any case, ';' optional) becomes U+00A0.
+ *  - The XML entities of XmlEntities stay escaped (normalized to lower case).
+ *  - The named entities of HtmlEntities become their code point.
+ *  - Numeric references ("&#NNN" / "&#xHHH", ';' optional) become their code
+ *    point; NUL, surrogates and values outside Unicode become U+FFFD instead
+ *    of truncating or corrupting the result.
+ *  - Any other '&' is escaped as "&amp;".
+ *  - A run of ASCII whitespace is collapsed into a single space.
+ *
+ * Returns: a newly allocated string, to be released with g_free().
+ */
+static gchar *
+unescape_string (const gchar * text)
+{
+  GString *unescaped = g_string_new (NULL);
+
+  while (*text) {
+    if (*text == '&') {
+      gboolean handled = FALSE;
+      gint i;
+
+      text++;
+
+      /* unescape "&nbsp" and "&nbsp;" */
+      if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
+        g_string_append_unichar (unescaped, 160);
+        text += 4;
+        if (*text == ';')
+          text++;
+        continue;
+      }
+
+      /* pass xml entities. these will be processed as pango markup */
+      for (i = 0; !handled && XmlEntities[i].escaped; i++) {
+        gssize len = strlen (XmlEntities[i].escaped);
+
+        if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
+          g_string_append_c (unescaped, '&');
+          g_string_append_len (unescaped, XmlEntities[i].escaped, len);
+          text += len;
+          handled = TRUE;
+        }
+      }
+
+      /* convert html entities (case matters: "Agrave;" != "agrave;") */
+      for (i = 0; !handled && HtmlEntities[i].escaped; i++) {
+        gssize len = strlen (HtmlEntities[i].escaped);
+
+        if (!strncmp (text, HtmlEntities[i].escaped, len)) {
+          g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
+          text += len;
+          handled = TRUE;
+        }
+      }
+
+      if (handled)
+        continue;
+
+      if (*text == '#') {
+        gchar *end = NULL;
+        gulong code;
+        gint base = 10;
+
+        text++;
+        if (*text == 'x' || *text == 'X') {
+          base = 16;
+          text++;
+        }
+
+        errno = 0;
+        code = strtoul (text, &end, base);
+        if (text == end || errno != 0) {
+          /* not a number: the "&#" prefix is dropped, the rest is parsed
+           * as ordinary text */
+          continue;
+        }
+
+        /* never store NUL or an invalid code point in the result */
+        if (code == 0 || code > 0x10FFFF
+            || !g_unichar_validate ((gunichar) code))
+          code = 0xFFFD;
+
+        g_string_append_unichar (unescaped, (gunichar) code);
+        text = end;
+        if (*text == ';')
+          text++;
+        continue;
+      }
+
+      /* a lone '&' is not valid markup: escape it */
+      g_string_append (unescaped, "&amp;");
+    } else if (g_ascii_isspace (*text)) {
+      g_string_append_c (unescaped, ' ');
+      /* strip the rest of the whitespace run */
+      do {
+        text++;
+      } while ((*text) && g_ascii_isspace (*text));
+    } else {
+      g_string_append_c (unescaped, *text);
+      text++;
+    }
+  }
+
+  return g_string_free (unescaped, FALSE);
+}
+
+/* Splits @string at the first occurrence of @delimiter.
+ *
+ * *@first receives a newly allocated copy of the text before the delimiter,
+ * or of the whole string when the delimiter does not occur; the caller
+ * releases it with g_free().
+ *
+ * Returns: pointer to the delimiter inside @string, or NULL if not found.
+ */
+static const gchar *
+string_token (const gchar * string, const gchar * delimiter, gchar ** first)
+{
+  gchar *hit = strstr (string, delimiter);
+
+  *first = (hit != NULL) ? g_strndup (string, hit - string)
+      : g_strdup (string);
+
+  return hit;
+}
+
+#ifdef SUBPARSE_MODIFICATION
+/* Extracts the language table from the text of a SAMI style comment such as
+ *   "!-- P {...} .KRCC {Name:Korean; lang:ko-KR; ...} --"
+ *
+ * @start is the beginning of the element text (lower bound of all backward
+ * scans) and @cursor the position where the search for "lang:" starts.
+ * For every "lang:" that sits inside a ".CLASS {" rule one pair is produced:
+ * the class name followed by the first two characters after "lang:".
+ *
+ * Returns: a NULL-terminated array of alternating names and values, to be
+ * released with g_strfreev().
+ */
+static gchar **
+html_context_parse_language_list (const gchar * start, const gchar * cursor)
+{
+  GPtrArray *pairs = g_ptr_array_new ();
+
+  while ((cursor = strcasestr (cursor, "lang:")) != NULL) {
+    const gchar *code = cursor + 5;
+    const gchar *brace = code;
+    const gchar *dot;
+    const gchar *name_end;
+
+    cursor = code;
+
+    /* walk back to the '{' that opens the rule, never leaving the element */
+    while (brace > start && *brace != '{')
+      brace--;
+    if (*brace != '{')
+      continue;
+
+    /* walk back to the '.' that starts the class name */
+    dot = brace;
+    while (dot > start && *dot != '.')
+      dot--;
+    if (*dot != '.')
+      continue;
+
+    /* the class name ends at the first space, at the latest at the '{' */
+    name_end = dot + 1;
+    while (name_end < brace && *name_end != ' ')
+      name_end++;
+
+    g_ptr_array_add (pairs, g_strndup (dot + 1, name_end - (dot + 1)));
+    g_ptr_array_add (pairs, g_strndup (code, 2));
+  }
+
+  g_ptr_array_add (pairs, NULL);
+
+  return (gchar **) g_ptr_array_free (pairs, FALSE);
+}
+#endif
+
+/* Parses the text of one tag (without '<' and '>') and reports it.
+ *
+ * @string is split at the first space into the element name and the
+ * attribute text. Attributes are expected as name=value separated by single
+ * spaces; one pair of quotes around a value is removed (values that contain
+ * spaces are not supported). start_element is invoked with the name and a
+ * NULL-terminated array of alternating attribute names and values; when
+ * @must_close is TRUE, end_element is invoked right afterwards.
+ *
+ * With SUBPARSE_MODIFICATION the "<!-- P ..." style comment is reported as
+ * element "!--P" whose attributes are the language table of the file.
+ */
+static void
+html_context_handle_element (HtmlContext * ctxt,
+    const gchar * string, gboolean must_close)
+{
+  gchar *name = NULL;
+  gchar **attrs;
+  const gchar *found, *next;
+  gint count = 0, i;
+
+  /* split element name and attributes */
+  next = string_token (string, " ", &name);
+
+  /* every '=' may introduce one attribute; this is an upper bound because
+   * a '=' can also occur inside a value */
+  if (next) {
+    for (found = strchr (next + 1, '='); found;
+        found = strchr (found + 1, '='))
+      count++;
+  }
+
+  /* two slots per attribute; the zeroed surplus terminates the array */
+  attrs = g_new0 (gchar *, (count + 1) * 2);
+
+  for (i = 0; i < count && next != NULL; i++) {
+    gchar *attr_name = NULL, *attr_value = NULL;
+    gsize length;
+
+    next = string_token (next + 1, "=", &attr_name);
+    if (!next) {
+      /* count was too high because of a stray '=': nothing left to read */
+      g_free (attr_name);
+      break;
+    }
+    g_strstrip (attr_name);
+
+    next = string_token (next + 1, " ", &attr_value);
+
+    /* strip " or ' from attribute value */
+    if (attr_value[0] == '"' || attr_value[0] == '\'') {
+      /* strlen() bytes starting at index 1 include the terminating NUL */
+      memmove (attr_value, attr_value + 1, strlen (attr_value));
+    }
+
+    length = strlen (attr_value);
+    if (length > 0
+        && (attr_value[length - 1] == '"' || attr_value[length - 1] == '\'')) {
+      attr_value[length - 1] = '\0';
+    }
+
+    attrs[2 * i] = attr_name;
+    attrs[2 * i + 1] = attr_value;
+  }
+
+#ifdef SUBPARSE_MODIFICATION
+  /* there may be spaces between "!--" and "P": fold "!--  P" into "!--P" */
+  if (!g_ascii_strcasecmp ("!--", name)) {
+    const gchar *p = string + 3;
+
+    while (*p == ' ')
+      p++;
+
+    if (p != string + 3 && (*p == 'P' || *p == 'p')) {
+      g_free (name);
+      name = g_strdup_printf ("!--%c", *p);
+      next = p + 1;
+    }
+  }
+
+  /* the style comment carries the language table instead of attributes */
+  if (next && (!g_ascii_strcasecmp ("!--P", name))) {
+    g_strfreev (attrs);
+    attrs = html_context_parse_language_list (string, next);
+  }
+#endif
+
+  ctxt->parser->start_element (ctxt, name,
+      (const gchar **) attrs, ctxt->user_data);
+  if (must_close) {
+    ctxt->parser->end_element (ctxt, name, ctxt->user_data);
+  }
+
+  g_strfreev (attrs);
+  g_free (name);
+}
+
+/* Feeds @text_len bytes of @text to the tokenizer.
+ *
+ * The input is appended to the context buffer and the buffer is consumed
+ * from the front: "<name ...>" and "<name .../>" are reported through
+ * html_context_handle_element(), "</name>" through end_element, and the
+ * (whitespace-stripped) text between tags through the text callback.
+ * Trailing text that is not followed by a tag is reported immediately.
+ * When the buffer ends inside a tag, only that unfinished tag is kept for
+ * the next call, so that input which was already reported is not reported
+ * a second time.
+ */
+static void
+html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
+{
+  const gchar *next = NULL;
+
+  ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
+  next = ctxt->buf->str;
+  if (!next) {
+    GST_ERROR ("ctxt->buf->str is NULL");
+    return;
+  }
+
+  for (;;) {
+    if (next[0] == '<') {
+      gchar *element = NULL;
+
+      /* find <blahblah> */
+      if (!strchr (next, '>')) {
+        /* no tag end point yet: drop what has been consumed and keep the
+         * unfinished tag for the next call */
+        g_string_erase (ctxt->buf, 0, next - ctxt->buf->str);
+        return;
+      }
+
+      next = string_token (next, ">", &element);
+      next++;
+
+      if (element[1] == '/') {
+        /* handle </blah> */
+        ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
+      } else if (g_str_has_suffix (element, "/")) {
+        /* handle <blah/>: the '/' belongs to the tag, not to its name */
+        element[strlen (element) - 1] = '\0';
+        html_context_handle_element (ctxt, element + 1, TRUE);
+      } else {
+        /* handle <blah> */
+        html_context_handle_element (ctxt, element + 1, FALSE);
+      }
+      g_free (element);
+    } else if (strchr (next, '<')) {
+      /* text followed by another tag */
+      gchar *chunk = NULL;
+
+      next = string_token (next, "<", &chunk);
+      chunk = g_strstrip (chunk);
+      ctxt->parser->text (ctxt, chunk, strlen (chunk), ctxt->user_data);
+      g_free (chunk);
+    } else {
+      /* trailing text: report it (stripped in place) and empty the buffer */
+      gchar *rest = g_strstrip ((gchar *) next);
+
+      ctxt->parser->text (ctxt, rest, strlen (rest), ctxt->user_data);
+      ctxt->buf = g_string_assign (ctxt->buf, "");
+      return;
+    }
+  }
+}
+
static gchar *
has_tag (GString * str, const gchar tag)
{
}
static void
-handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
+handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
{
int i;
sami_context_pop_state (sctx, CLEAR_TAG);
if (atts != NULL) {
for (i = 0; (atts[i] != NULL); i += 2) {
- const xmlChar *key, *value;
+ const gchar *key, *value;
key = atts[i];
value = atts[i + 1];
if (!value)
continue;
- if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
+ if (!g_ascii_strcasecmp ("start", key)) {
/* Only set a new start time if we don't have text pending */
if (sctx->resultbuf->len == 0)
sctx->time1 = sctx->time2;
sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
+#ifdef SUBPARSE_MODIFICATION
+ sctx->time3 = sctx->time2;
+#endif
+ sctx->time2 = MAX (sctx->time2, sctx->time1);
g_string_append (sctx->resultbuf, sctx->buf->str);
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
g_string_truncate (sctx->buf, 0);
}
static void
-handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
+handle_start_font (GstSamiContext * sctx, const gchar ** atts)
{
int i;
if (atts != NULL) {
g_string_append (sctx->buf, "<span");
for (i = 0; (atts[i] != NULL); i += 2) {
- const xmlChar *key, *value;
+ const gchar *key, *value;
key = atts[i];
value = atts[i + 1];
if (!value)
continue;
- if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
+ if (!g_ascii_strcasecmp ("color", key)) {
/*
* There are invalid color value in many
* sami files.
* It will fix hex color value that start without '#'
*/
const gchar *sharp = "";
- int len = xmlStrlen (value);
+ int len = strlen (value);
if (!(*value == '#' && len == 7)) {
gchar *r;
/* check if it looks like hex */
if (strtol ((const char *) value, &r, 16) >= 0 &&
- ((xmlChar *) r == (value + 6) && len == 6)) {
+ ((gchar *) r == (value + 6) && len == 6)) {
sharp = "#";
}
}
/* some colours can be found in many sami files, but X RGB database
* doesn't contain a colour by this name, so map explicitly */
- if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) {
- value = (const xmlChar *) "#00ffff";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) {
- value = (const xmlChar *) "#dc143c";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) {
- value = (const xmlChar *) "#ff00ff";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) {
- value = (const xmlChar *) "#4b0082";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) {
- value = (const xmlChar *) "#00ff00";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) {
- value = (const xmlChar *) "#808000";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) {
- value = (const xmlChar *) "#c0c0c0";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) {
- value = (const xmlChar *) "#008080";
+ if (!g_ascii_strcasecmp ("aqua", value)) {
+ value = "#00ffff";
+ } else if (!g_ascii_strcasecmp ("crimson", value)) {
+ value = "#dc143c";
+ } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
+ value = "#ff00ff";
+ } else if (!g_ascii_strcasecmp ("indigo", value)) {
+ value = "#4b0082";
+ } else if (!g_ascii_strcasecmp ("lime", value)) {
+ value = "#00ff00";
+ } else if (!g_ascii_strcasecmp ("olive", value)) {
+ value = "#808000";
+ } else if (!g_ascii_strcasecmp ("silver", value)) {
+ value = "#c0c0c0";
+ } else if (!g_ascii_strcasecmp ("teal", value)) {
+ value = "#008080";
}
g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
value);
- } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
+ } else if (!g_ascii_strcasecmp ("face", key)) {
g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
}
}
}
}
+#ifdef SUBPARSE_MODIFICATION
+/* Handles the attributes of a <P> tag: the value of its "class" attribute
+ * selects the language of the text that follows.
+ *
+ * The class is stored in sctx->current_language (a fixed-size buffer that is
+ * allocated with calloc() on first use, and truncated to fit), and becomes
+ * sctx->desired_language as well if no language has been chosen yet. When
+ * the class differs from the previous one while time1 == time2, the
+ * timestamps are reset (time1 = 0, time2 = time3).
+ * Attributes other than "class" are ignored.
+ */
+static void
+handle_p (GstSamiContext * sctx, const gchar ** atts)
+{
+  enum
+  { LANGUAGE_BUFFER_SIZE = 128 };
+  int i;
+
+  if (atts == NULL)
+    return;
+
+  for (i = 0; (atts[i] != NULL); i += 2) {
+    const gchar *key = atts[i];
+    const gchar *value = atts[i + 1];
+
+    /* a name without value is the end of the array */
+    if (!value)
+      break;
+
+    if (g_ascii_strcasecmp ("class", key))
+      continue;
+
+    if (!sctx->current_language) {
+      /* zero-filled, so the buffer always holds a valid string */
+      sctx->current_language = (gchar *) calloc (1, LANGUAGE_BUFFER_SIZE);
+      if (!sctx->current_language)
+        return;
+    } else if (strcmp (sctx->current_language, value)
+        && (sctx->time1 == sctx->time2)) {
+      sctx->language_changed = TRUE;
+    }
+
+    /* bounded copy: class names come from the subtitle file */
+    g_strlcpy (sctx->current_language, value, LANGUAGE_BUFFER_SIZE);
+
+    if (sctx->desired_language == NULL) {
+      sctx->desired_language = g_strdup (value);
+      GST_LOG("no language list was found and desired lang was set to %s",sctx->desired_language);
+    }
+
+    if (sctx->language_changed) {
+      sctx->time1 = 0;
+      sctx->time2 = sctx->time3;
+      sctx->language_changed = FALSE;
+    }
+  }
+}
+
static void
-start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
+handle_start_language_list (GstSamiContext * sctx, const gchar ** atts)
{
- GstSamiContext *sctx = (GstSamiContext *) ctx;
+ int i = 0;
+ int attrIndex = 0;
+ GstLangStruct *new = NULL;
+ GstLangStruct *temp = NULL;
+
+ if (atts != NULL) {
+ if (g_list_length (sctx->lang_list)) {
+ GST_LOG ("We already got the language list");
+ return;
+ }
+ for (i = 0; (atts[attrIndex] != NULL); i++) {
+ const gchar *key, *value;
+
+ key = atts[attrIndex++];
+ value = atts[attrIndex++];
+
+ GST_LOG ("Inside handle_start_language_list key: %s, value: %s", key, value);
+
+ if (!value)
+ continue;
+
+ new = g_new0 (GstLangStruct, 1);
+ new->language_code = (gchar*) malloc (strlen(value) + 1);
+ if (new->language_code && value)
+ strcpy (new->language_code, value);
+ new->language_key = (gchar*) malloc (strlen(key) + 1);
+ if (new->language_key && key)
+ strcpy (new->language_key, key);
+ sctx->lang_list = g_list_append (sctx->lang_list, new);
+ temp = g_list_nth_data (sctx->lang_list, i);
+ if (sctx->desired_language == NULL && key){
+ sctx->desired_language = g_strdup(key);
+ }
+
+ if (temp)
+ GST_LOG ("Inside handle_start_language_list of glist key: %s, value: %s",
+ temp->language_key, temp->language_code);
+ }
+ }
+}
+#endif
+
+static void
+handle_start_element (HtmlContext * ctx, const gchar * name,
+ const char **atts, gpointer user_data)
+{
+ GstSamiContext *sctx = (GstSamiContext *) user_data;
GST_LOG ("name:%s", name);
- if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
+ if (!g_ascii_strcasecmp ("sync", name)) {
handle_start_sync (sctx, atts);
sctx->in_sync = TRUE;
- } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("font", name)) {
handle_start_font (sctx, atts);
- } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("ruby", name)) {
sami_context_push_state (sctx, RUBY_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
- g_string_append_c (sctx->buf, '\n');
+ } else if (!g_ascii_strcasecmp ("br", name)) {
+#ifdef SUBPARSE_MODIFICATION
+ if (sctx->current_language && sctx->desired_language &&
+ !strcmp(sctx->current_language, sctx->desired_language))
+#endif
+ g_string_append_c (sctx->buf, '\n');
/* FIXME: support for furigana/ruby once implemented in pango */
- } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
+ } else if (!g_ascii_strcasecmp ("rt", name)) {
if (has_tag (sctx->state, ITALIC_TAG)) {
g_string_append (sctx->rubybuf, "<i>");
}
g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
sami_context_push_state (sctx, RT_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
- } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
- g_string_append (sctx->buf, "<i>");
+ } else if (!g_ascii_strcasecmp ("i", name)) {
+#ifdef SUBPARSE_MODIFICATION
+ if (sctx->current_language && sctx->desired_language &&
+ !strcmp(sctx->current_language, sctx->desired_language))
+#endif
+ g_string_append (sctx->buf, "<i>");
sami_context_push_state (sctx, ITALIC_TAG);
+ } else if (!g_ascii_strcasecmp ("p", name)) {
+#ifdef SUBPARSE_MODIFICATION
+ handle_p (sctx, atts);
+ } else if (!g_ascii_strcasecmp ("!--P", name)) {
+ handle_start_language_list (sctx, atts);
+#endif
}
}
static void
-end_sami_element (void *ctx, const xmlChar * name)
+handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
{
- GstSamiContext *sctx = (GstSamiContext *) ctx;
+ GstSamiContext *sctx = (GstSamiContext *) user_data;
GST_LOG ("name:%s", name);
- if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
+ if (!g_ascii_strcasecmp ("sync", name)) {
sctx->in_sync = FALSE;
- } else if ((!xmlStrncmp ((const xmlChar *) "body", name, 4)) ||
- (!xmlStrncmp ((const xmlChar *) "sami", name, 4))) {
+ } else if ((!g_ascii_strcasecmp ("body", name)) ||
+ (!g_ascii_strcasecmp ("sami", name))) {
/* We will usually have one buffer left when the body is closed
* as we need the next sync to actually send it */
+
+#ifdef SUBPARSE_MODIFICATION
+ sctx->end_body = TRUE;
+#endif
+
if (sctx->buf->len != 0) {
/* Only set a new start time if we don't have text pending */
if (sctx->resultbuf->len == 0)
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
g_string_truncate (sctx->buf, 0);
}
- } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("font", name)) {
sami_context_pop_state (sctx, SPAN_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("ruby", name)) {
sami_context_pop_state (sctx, RUBY_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
+ } else if (!g_ascii_strcasecmp ("i", name)) {
sami_context_pop_state (sctx, ITALIC_TAG);
}
}
static void
-characters_sami (void *ctx, const xmlChar * ch, int len)
+handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
+ gpointer user_data)
{
- GstSamiContext *sctx = (GstSamiContext *) ctx;
- gchar *escaped;
- gchar *tmp;
- gint i;
+ GstSamiContext *sctx = (GstSamiContext *) user_data;
/* Skip everything except content of the sync elements */
if (!sctx->in_sync)
return;
-
- escaped = g_markup_escape_text ((const gchar *) ch, len);
- g_strstrip (escaped);
-
- /* Remove double spaces forom the string as those are
- * usually added by newlines and indention */
- tmp = escaped;
- for (i = 0; i <= strlen (escaped); i++) {
- escaped[i] = *tmp;
- if (*tmp != ' ') {
- tmp++;
- continue;
- }
- while (*tmp == ' ')
- tmp++;
- }
-
+#ifdef SUBPARSE_MODIFICATION
+ if (has_tag (sctx->state, RT_TAG) && (sctx->current_language && sctx->desired_language &&
+ !strcmp(sctx->current_language, sctx->desired_language))) {
+#else
if (has_tag (sctx->state, RT_TAG)) {
+#endif
g_string_append_c (sctx->rubybuf, ' ');
- g_string_append (sctx->rubybuf, escaped);
+ g_string_append (sctx->rubybuf, text);
g_string_append_c (sctx->rubybuf, ' ');
} else {
- g_string_append (sctx->buf, escaped);
+#ifdef SUBPARSE_MODIFICATION
+ if (sctx->current_language && sctx->desired_language &&
+ !strcmp(sctx->current_language, sctx->desired_language))
+#endif
+ g_string_append (sctx->buf, text);
}
- g_free (escaped);
}
-static xmlSAXHandler samiSAXHandlerStruct = {
- NULL, /* internalSubset */
- NULL, /* isStandalone */
- NULL, /* hasInternalSubset */
- NULL, /* hasExternalSubset */
- NULL, /* resolveEntity */
- NULL, /* getEntity */
- NULL, /* entityDecl */
- NULL, /* notationDecl */
- NULL, /* attributeDecl */
- NULL, /* elementDecl */
- NULL, /* unparsedEntityDecl */
- NULL, /* setDocumentLocator */
- NULL, /* startDocument */
- NULL, /* endDocument */
- start_sami_element, /* startElement */
- end_sami_element, /* endElement */
- NULL, /* reference */
- characters_sami, /* characters */
- NULL, /* ignorableWhitespace */
- NULL, /* processingInstruction */
- NULL, /* comment */
- NULL, /* xmlParserWarning */
- NULL, /* xmlParserError */
- NULL, /* xmlParserError */
- NULL, /* getParameterEntity */
- NULL, /* cdataBlock */
- NULL, /* externalSubset */
- 1, /* initialized */
- NULL, /* private */
- NULL, /* startElementNsSAX2Func */
- NULL, /* endElementNsSAX2Func */
- NULL /* xmlStructuredErrorFunc */
+static HtmlParser samiParser = {
+ handle_start_element, /* start_element */
+ handle_end_element, /* end_element */
+ handle_text, /* text */
};
-static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;
-
void
sami_context_init (ParserState * state)
{
GstSamiContext *context;
g_assert (state->user_data == NULL);
- state->user_data = (gpointer) g_new0 (GstSamiContext, 1);
- context = (GstSamiContext *) state->user_data;
- context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
- "", 0, NULL, XML_CHAR_ENCODING_UTF8);
+ context = g_new0 (GstSamiContext, 1);
+
+ context->htmlctxt = html_context_new (&samiParser, context);
context->buf = g_string_new ("");
context->rubybuf = g_string_new ("");
context->resultbuf = g_string_new ("");
context->state = g_string_new ("");
+#ifdef SUBPARSE_MODIFICATION
+ context->current_language = NULL;
+ context->desired_language = NULL;
+ context->lang_list = NULL;
+ context->language_changed = FALSE;
+ context->end_body = FALSE;
+#endif
+ state->user_data = context;
}
void
sami_context_deinit (ParserState * state)
{
GstSamiContext *context = (GstSamiContext *) state->user_data;
-
+#ifdef SUBPARSE_MODIFICATION
+ GstLangStruct *temp = NULL;
+ int i = 0;
+#endif
if (context) {
- htmlParserCtxtPtr htmlctxt = context->htmlctxt;
-
- /* destroy sax context */
- htmlDocPtr doc;
-
- htmlParseChunk (htmlctxt, "", 0, 1);
- doc = htmlctxt->myDoc;
- htmlFreeParserCtxt (htmlctxt);
+ html_context_free (context->htmlctxt);
context->htmlctxt = NULL;
- if (doc)
- xmlFreeDoc (doc);
g_string_free (context->buf, TRUE);
g_string_free (context->rubybuf, TRUE);
g_string_free (context->resultbuf, TRUE);
g_string_free (context->state, TRUE);
+#ifdef SUBPARSE_MODIFICATION
+ if (context->lang_list) {
+ while ((temp = g_list_nth_data (context->lang_list, i))) {
+ if (temp->language_code)
+ free (temp->language_code);
+ temp->language_code = NULL;
+ if (temp->language_key)
+ free (temp->language_key);
+ temp->language_key = NULL;
+ g_free (temp);
+ i++;
+ }
+ g_list_free (context->lang_list);
+ }
+ context->lang_list = NULL;
+
+ if (context->current_language)
+ free (context->current_language);
+ context->current_language = NULL;
+
+ context->desired_language = NULL;
+#endif
g_free (context);
state->user_data = NULL;
}
}
}
-static gchar *
-fix_invalid_entities (const gchar * line)
+#ifdef SUBPARSE_MODIFICATION
+void
+sami_context_change_language (ParserState * state)
{
- const gchar *cp, *pp; /* current pointer, previous pointer */
- gssize size;
- GString *ret = g_string_new (NULL);
-
- pp = line;
- cp = strchr (line, '&');
- while (cp) {
- size = cp - pp;
- ret = g_string_append_len (ret, pp, size);
- cp++;
- if (g_ascii_strncasecmp (cp, "nbsp;", 5)
- && (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
- /* translate " " to " " */
- ret = g_string_append_len (ret, " ", 6);
- cp += 4;
- } else if (g_ascii_strncasecmp (cp, "quot;", 5)
- && g_ascii_strncasecmp (cp, "amp;", 4)
- && g_ascii_strncasecmp (cp, "apos;", 5)
- && g_ascii_strncasecmp (cp, "lt;", 3)
- && g_ascii_strncasecmp (cp, "gt;", 3)
- && g_ascii_strncasecmp (cp, "nbsp;", 5)
- && cp[0] != '#') {
- /* translate "&" to "&" */
- ret = g_string_append_len (ret, "&", 5);
- } else {
- /* do not translate */
- ret = g_string_append_c (ret, '&');
+ GstSamiContext *context = (GstSamiContext *) state->user_data;
+ GST_LOG ("**********desired language was %s**************", context->desired_language);
+ free (context->desired_language);
+ if(state->current_language) {
+ context->desired_language = state->current_language;
+ } else {
+ context->desired_language = state->msl_language;
+ }
+ GST_LOG ("desired language changed to %s", context->desired_language);
+}
+
+/* Converts @len bytes of @str from @encoding to UTF-8, using "*" as the
+ * fallback for characters that cannot be converted, and removes a leading
+ * UTF-8 byte order mark from the result. @consumed and @err are forwarded
+ * to g_convert_with_fallback().
+ *
+ * Returns: the converted string, or NULL if the conversion failed.
+ */
+gchar *
+sami_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding,
+    gsize * consumed, GError ** err, GstSubParse * self)
+{
+  static const guint8 utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
+  gchar *converted;
+  gsize converted_len;
+
+  /* The char cast is necessary in glib < 2.24 */
+  converted = g_convert_with_fallback (str, len, "UTF-8", encoding,
+      (char *) "*", consumed, NULL, err);
+
+  if (converted == NULL) {
+    GST_DEBUG_OBJECT (self, "g_convert_with_fallback returns NULL");
+    return NULL;
+  }
+
+  /* skip the UTF-8 BOM if there is one; the terminating NUL moves along */
+  converted_len = strlen (converted);
+  if (converted_len >= 3 && memcmp (converted, utf8_bom, 3) == 0)
+    g_memmove (converted, converted + 3, converted_len + 1 - 3);
+
+  return converted;
+}
+
+/*
+ * sami_validate_langlist_body:
+ * @lang_list: list whose element data are GstLangStruct entries
+ * @self: subtitle parser; its sink pad is queried for the file URI
+ *
+ * Asks the upstream peer for the "file-uri" of the SMI file, reads that file
+ * in chunks of up to 1024 bytes and searches, case-insensitively, for each
+ * entry's language_key in the text between one "class=" token and the next.
+ * Entries whose key is never found are unlinked from the list.
+ *
+ * A "class=" token or key that straddles two chunks is not detected.
+ *
+ * NOTE(review): @lang_list is passed by value, so a changed list head cannot
+ * be reported back.  If the first entry is unlinked the caller is left with
+ * a pointer to a released link, and the data of unlinked entries is not
+ * released here -- confirm against the caller.
+ *
+ * Returns: FALSE when the URI query fails, no URI is delivered or the file
+ * cannot be opened; TRUE otherwise (for a URI that does not start with
+ * "file://" the list is left untouched).
+ */
+gboolean
+sami_validate_langlist_body (GList * lang_list, GstSubParse * self)
+{
+  gchar *file_path_type = NULL;
+  gchar *file_path = NULL;
+  gchar line[1024 + 1];         /* one spare byte for the terminating NUL */
+  FILE *fp = NULL;
+  guint i = 0, found_count = 0;
+  const guint list_len = g_list_length (lang_list);
+  gboolean *counter = NULL;     /* counter[i]: i-th entry was seen in the file */
+  GstLangStruct *lang;
+  GstQuery *cquery;
+  GstStructure *structure;
+  const GValue *value;
+
+  structure = gst_structure_new ("FileSrcURI",
+      "file-uri", G_TYPE_STRING, NULL, NULL);
+
+  cquery = gst_query_new_application (GST_QUERY_CUSTOM, structure);
+
+  if (!gst_pad_peer_query (self->sinkpad, cquery)) {
+    GST_DEBUG_OBJECT (self, "failed to query SMI file path");
+    gst_query_unref (cquery);
+    return FALSE;
+  }
+  structure = gst_query_get_structure (cquery);
+  value = gst_structure_get_value (structure, "file-uri");
+  /* Copy the string: it belongs to the query, which is released below. */
+  file_path = g_strdup (value ? g_value_get_string (value) : NULL);
+
+  if (file_path == NULL) {
+    GST_DEBUG_OBJECT (self, "could not parse the SMI file path");
+    gst_query_unref (cquery);
+    return FALSE;
+  }
+  gst_query_unref (cquery);
+
+  GST_INFO_OBJECT (self, "file path comes as %s", file_path);
+
+  file_path_type = g_strndup ((gchar *) file_path, 4);
+  GST_INFO_OBJECT (self, "received file path by query = %s,%s", file_path,
+      file_path_type);
+  /* Require the whole "file://" prefix before skipping it, and keep
+   * file_path pointing at the allocation so it can be released. */
+  if (g_str_has_prefix (file_path, "file://")) {
+    const gchar *local_path = file_path + strlen ("file://");
+
+    GST_INFO_OBJECT (self, "file path comes as %s", local_path);
+
+    fp = fopen (local_path, "r");
+    if (!fp) {
+      GST_DEBUG_OBJECT (self, "failed to open file");
+      g_free (file_path_type);
+      g_free (file_path);
+      return FALSE;
}
- pp = cp;
- cp = strchr (pp, '&');
+
+    /* Sized from the list, so more than MAX_LANGUAGE entries are safe. */
+    counter = g_new0 (gboolean, list_len);
+
+    while (!feof (fp) && found_count < list_len) {
+      GError *err = NULL;
+      gsize bytes_read;
+      gchar *result = NULL;
+      gchar *lowered;
+      const gchar *cursor;
+      gboolean converted;
+
+      bytes_read = fread (line, sizeof (gchar), sizeof (line) - 1, fp);
+      if (bytes_read == 0) {
+        /* End of file or read error: stop, retrying could spin forever. */
+        GST_WARNING_OBJECT (self, "fread returned zero bytes");
+        break;
+      }
+      line[bytes_read] = '\0';
+
+      GST_DEBUG("value of detected encoding is %s and self encoding is %s",self->detected_encoding,self->encoding);
+      if (self->detected_encoding && strcmp (self->detected_encoding, "UTF-8")) {
+        result = sami_convert_to_utf8 (line, bytes_read,
+            self->detected_encoding, NULL, &err, self);
+        g_clear_error (&err);
+      }
+      /* Fall back to the raw chunk when no conversion was done or it failed. */
+      converted = (result != NULL);
+      if (!converted)
+        result = line;
+
+      lowered = g_utf8_strdown (result, -1);
+      cursor = lowered;
+      while ((cursor = g_strstr_len (cursor, -1, "class=")) != NULL) {
+        /* The searched span runs up to the next "class=" or the chunk end. */
+        const gchar *next = g_strstr_len (cursor + 1, -1, "class=");
+        gssize span = next ? (gssize) (next - cursor) : (gssize) strlen (cursor);
+
+        for (i = 0; i < list_len; i++) {
+          gchar *key;
+
+          if (counter[i])
+            continue;
+          lang = (GstLangStruct *) g_list_nth_data (lang_list, i);
+          if (lang == NULL || lang->language_key == NULL)
+            continue;
+          key = g_utf8_strdown (lang->language_key, -1);
+          if (g_strstr_len (cursor, span, key)) {
+            found_count++;
+            counter[i] = TRUE;
+            GST_INFO_OBJECT (self, " valid Language in list : [%s]", lang->language_key);
+          }
+          g_free (key);
+        }
+        /* Always step past this token so the scan makes progress. */
+        cursor++;
+      }
+
+      if (converted)
+        g_free (result);
+      g_free (lowered);
+    }
+
+    if (found_count < list_len) {
+      /* Walk backwards so unlinking never shifts an index still to visit. */
+      for (i = list_len; i-- > 0;) {
+        if (!counter[i])
+          lang_list = g_list_delete_link (lang_list, g_list_nth (lang_list, i));
+      }
+    }
+    g_free (counter);
+    fclose (fp);
}
- ret = g_string_append (ret, pp);
- return g_string_free (ret, FALSE);
+  g_free (file_path_type);
+  g_free (file_path);
+  return TRUE;
}
+#endif
+/*
+ * parse_sami:
+ * @state: parser state; its user_data is the GstSamiContext
+ * @line: one line of SAMI input
+ *
+ * Unescapes @line and feeds it to the HTML parser of the context.  When the
+ * context reports a finished result, the pending ruby text (followed by a
+ * newline) is put in front of it, state->start_time and state->duration are
+ * set from the two most recent sync times, and the text is handed out.
+ *
+ * With SUBPARSE_MODIFICATION the language list and desired language are
+ * copied into @state, a result is only taken when both languages are set and
+ * either match or the body has ended, and the times are clipped against
+ * state->segment.
+ *
+ * Returns: a newly allocated string, or NULL when there is no result yet or
+ * (with SUBPARSE_MODIFICATION) the cue lies outside the segment.
+ */
gchar *
parse_sami (ParserState * state, const gchar * line)
{
- gchar *fixed_line;
+  gchar *ret = NULL;
+#ifdef SUBPARSE_MODIFICATION
+  gint64 clip_start = 0, clip_stop = 0;
+  gboolean in_seg = FALSE;
+#endif
GstSamiContext *context = (GstSamiContext *) state->user_data;
- fixed_line = fix_invalid_entities (line);
- htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
- g_free (fixed_line);
+  gchar *unescaped = unescape_string (line);
+  html_context_parse (context->htmlctxt, (gchar *) unescaped,
+      strlen (unescaped));
+#ifdef SUBPARSE_MODIFICATION
+  if (context->lang_list)
+    state->language_list = context->lang_list;
- if (context->has_result) {
- gchar *r;
+  if (context->desired_language)
+    state->current_language = context->desired_language;
+#endif
+  g_free (unescaped);
+#ifdef SUBPARSE_MODIFICATION
+  if (context->desired_language && context->current_language) {
+    if ((!strcmp(context->current_language, context->desired_language)) || context->end_body) {
+#endif
+      if (context->has_result) {
+        if (context->rubybuf->len) {
+          context->rubybuf = g_string_append_c (context->rubybuf, '\n');
+          g_string_prepend (context->resultbuf, context->rubybuf->str);
+          context->rubybuf = g_string_truncate (context->rubybuf, 0);
+        }
- if (context->rubybuf->len) {
- context->rubybuf = g_string_append_c (context->rubybuf, '\n');
- g_string_prepend (context->resultbuf, context->rubybuf->str);
- context->rubybuf = g_string_truncate (context->rubybuf, 0);
+        /* Detach the character data; a fresh buffer collects the next cue. */
+        ret = g_string_free (context->resultbuf, FALSE);
+        context->resultbuf = g_string_new ("");
+        state->start_time = context->time1;
+        state->duration = context->time2 - context->time1;
+        context->has_result = FALSE;
+      }
+#ifdef SUBPARSE_MODIFICATION
+      context->end_body = FALSE;
}
+  }
+  /* Check our segment start/stop */
+  in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
+      state->start_time, state->start_time + state->duration, &clip_start,
+      &clip_stop);
- r = g_string_free (context->resultbuf, FALSE);
- context->resultbuf = g_string_new ("");
- state->start_time = context->time1;
- state->duration = context->time2 - context->time1;
- context->has_result = FALSE;
- return r;
+  /* No need to send that text if it's out of segment */
+  if (in_seg) {
+    state->start_time = clip_start;
+    state->duration = clip_stop - clip_start;
+  } else {
+    /* The detached text is dropped here, so release it. */
+    g_free (ret);
+    return NULL;
}
- return NULL;
+#endif
+  return ret;
}