/* GStreamer SAMI subtitle parser
- * Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
+ * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
#include "samiparse.h"
-#include <libxml/HTMLparser.h>
+#include <glib.h>
#include <string.h>
+#include <stdlib.h>
#define ITALIC_TAG 'i'
#define SPAN_TAG 's'
#define RT_TAG 't'
#define CLEAR_TAG '0'
+typedef struct _HtmlParser HtmlParser;
+typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;
struct _GstSamiContext
* that tags can be closed properly on
* 'sync' tags. See _context_push_state()
* and _context_pop_state(). */
- htmlParserCtxtPtr htmlctxt; /* html parser context */
+ HtmlContext *htmlctxt; /* html parser context */
gboolean has_result; /* set when ready to push out result */
gboolean in_sync; /* flag to avoid appending anything except the
* content of the sync elements to buf */
guint64 time2; /* current start attribute in sync tag */
};
+struct _HtmlParser
+{
+ void (*start_element) (HtmlContext * ctx,
+ const gchar * name, const gchar ** attr, gpointer user_data);
+ void (*end_element) (HtmlContext * ctx,
+ const gchar * name, gpointer user_data);
+ void (*text) (HtmlContext * ctx,
+ const gchar * text, gsize text_len, gpointer user_data);
+};
+
+struct _HtmlContext
+{
+ const HtmlParser *parser;
+ gpointer user_data;
+ GString *buf;
+};
+
+/* Allocates a parser context that reports events through the callbacks in
+ * @parser and hands @user_data back to each of them. The context starts with
+ * an empty pending-input buffer; release it with html_context_free(). */
+static HtmlContext *
+html_context_new (HtmlParser * parser, gpointer user_data)
+{
+  HtmlContext *self = g_new0 (HtmlContext, 1);
+
+  self->buf = g_string_new (NULL);
+  self->user_data = user_data;
+  self->parser = parser;
+
+  return self;
+}
+
+/* Releases @ctxt together with its pending-input buffer. The parser table and
+ * the user data are not owned by the context and are left alone. Passing
+ * NULL is a no-op, like g_free(). */
+static void
+html_context_free (HtmlContext * ctxt)
+{
+  if (ctxt == NULL)
+    return;
+
+  g_string_free (ctxt->buf, TRUE);
+  g_free (ctxt);
+}
+
+/* One row of an entity lookup table. The tables built from this type are
+ * terminated by a row whose name is NULL. */
+struct EntityMap
+{
+ const gunichar unescaped;      /* code point the entity stands for */
+ const gchar *escaped;          /* entity name as written after the '&', including the closing ';' */
+};
+
+/* The five predefined XML entities. unescape_string() does not decode them:
+ * it copies them to its output in exactly this spelling so that they are
+ * resolved later, when the text is interpreted as Pango markup.
+ * The table is private to this file and never modified, hence static const.
+ * Terminated by a row with a NULL name. */
+static const struct EntityMap XmlEntities[] = {
+  {34, "quot;"},
+  {38, "amp;"},
+  {39, "apos;"},
+  {60, "lt;"},
+  {62, "gt;"},
+  {0, NULL},
+};
+
+/* Named HTML character entities that unescape_string() replaces by the
+ * corresponding code point. Names are matched case-sensitively (compare
+ * "Agrave;" and "agrave;") and must include the closing ';'.
+ *
+ * nbsp (160) is deliberately absent: unescape_string() handles it on its own
+ * so that it is also recognised when the closing ';' is missing.
+ *
+ * The table is private to this file and never modified, hence static const.
+ * Terminated by a row with a NULL name. */
+static const struct EntityMap HtmlEntities[] = {
+  {161, "iexcl;"},
+  {162, "cent;"},
+  {163, "pound;"},
+  {164, "curren;"},
+  {165, "yen;"},
+  {166, "brvbar;"},
+  {167, "sect;"},
+  {168, "uml;"},
+  {169, "copy;"},
+  {170, "ordf;"},
+  {171, "laquo;"},
+  {172, "not;"},
+  {173, "shy;"},
+  {174, "reg;"},
+  {175, "macr;"},
+  {176, "deg;"},
+  {177, "plusmn;"},
+  {178, "sup2;"},
+  {179, "sup3;"},
+  {180, "acute;"},
+  {181, "micro;"},
+  {182, "para;"},
+  {183, "middot;"},
+  {184, "cedil;"},
+  {185, "sup1;"},
+  {186, "ordm;"},
+  {187, "raquo;"},
+  {188, "frac14;"},
+  {189, "frac12;"},
+  {190, "frac34;"},
+  {191, "iquest;"},
+  {192, "Agrave;"},
+  {193, "Aacute;"},
+  {194, "Acirc;"},
+  {195, "Atilde;"},
+  {196, "Auml;"},
+  {197, "Aring;"},
+  {198, "AElig;"},
+  {199, "Ccedil;"},
+  {200, "Egrave;"},
+  {201, "Eacute;"},
+  {202, "Ecirc;"},
+  {203, "Euml;"},
+  {204, "Igrave;"},
+  {205, "Iacute;"},
+  {206, "Icirc;"},
+  {207, "Iuml;"},
+  {208, "ETH;"},
+  {209, "Ntilde;"},
+  {210, "Ograve;"},
+  {211, "Oacute;"},
+  {212, "Ocirc;"},
+  {213, "Otilde;"},
+  {214, "Ouml;"},
+  {215, "times;"},
+  {216, "Oslash;"},
+  {217, "Ugrave;"},
+  {218, "Uacute;"},
+  {219, "Ucirc;"},
+  {220, "Uuml;"},
+  {221, "Yacute;"},
+  {222, "THORN;"},
+  {223, "szlig;"},
+  {224, "agrave;"},
+  {225, "aacute;"},
+  {226, "acirc;"},
+  {227, "atilde;"},
+  {228, "auml;"},
+  {229, "aring;"},
+  {230, "aelig;"},
+  {231, "ccedil;"},
+  {232, "egrave;"},
+  {233, "eacute;"},
+  {234, "ecirc;"},
+  {235, "euml;"},
+  {236, "igrave;"},
+  {237, "iacute;"},
+  {238, "icirc;"},
+  {239, "iuml;"},
+  {240, "eth;"},
+  {241, "ntilde;"},
+  {242, "ograve;"},
+  {243, "oacute;"},
+  {244, "ocirc;"},
+  {245, "otilde;"},
+  {246, "ouml;"},
+  {247, "divide;"},
+  {248, "oslash;"},
+  {249, "ugrave;"},
+  {250, "uacute;"},
+  {251, "ucirc;"},
+  {252, "uuml;"},
+  {253, "yacute;"},
+  {254, "thorn;"},
+  {255, "yuml;"},
+  {338, "OElig;"},
+  {339, "oelig;"},
+  {352, "Scaron;"},
+  {353, "scaron;"},
+  {376, "Yuml;"},
+  {402, "fnof;"},
+  {710, "circ;"},
+  {732, "tilde;"},
+  {913, "Alpha;"},
+  {914, "Beta;"},
+  {915, "Gamma;"},
+  {916, "Delta;"},
+  {917, "Epsilon;"},
+  {918, "Zeta;"},
+  {919, "Eta;"},
+  {920, "Theta;"},
+  {921, "Iota;"},
+  {922, "Kappa;"},
+  {923, "Lambda;"},
+  {924, "Mu;"},
+  {925, "Nu;"},
+  {926, "Xi;"},
+  {927, "Omicron;"},
+  {928, "Pi;"},
+  {929, "Rho;"},
+  {931, "Sigma;"},
+  {932, "Tau;"},
+  {933, "Upsilon;"},
+  {934, "Phi;"},
+  {935, "Chi;"},
+  {936, "Psi;"},
+  {937, "Omega;"},
+  {945, "alpha;"},
+  {946, "beta;"},
+  {947, "gamma;"},
+  {948, "delta;"},
+  {949, "epsilon;"},
+  {950, "zeta;"},
+  {951, "eta;"},
+  {952, "theta;"},
+  {953, "iota;"},
+  {954, "kappa;"},
+  {955, "lambda;"},
+  {956, "mu;"},
+  {957, "nu;"},
+  {958, "xi;"},
+  {959, "omicron;"},
+  {960, "pi;"},
+  {961, "rho;"},
+  {962, "sigmaf;"},
+  {963, "sigma;"},
+  {964, "tau;"},
+  {965, "upsilon;"},
+  {966, "phi;"},
+  {967, "chi;"},
+  {968, "psi;"},
+  {969, "omega;"},
+  {977, "thetasym;"},
+  {978, "upsih;"},
+  {982, "piv;"},
+  {8194, "ensp;"},
+  {8195, "emsp;"},
+  {8201, "thinsp;"},
+  {8204, "zwnj;"},
+  {8205, "zwj;"},
+  {8206, "lrm;"},
+  {8207, "rlm;"},
+  {8211, "ndash;"},
+  {8212, "mdash;"},
+  {8216, "lsquo;"},
+  {8217, "rsquo;"},
+  {8218, "sbquo;"},
+  {8220, "ldquo;"},
+  {8221, "rdquo;"},
+  {8222, "bdquo;"},
+  {8224, "dagger;"},
+  {8225, "Dagger;"},
+  {8226, "bull;"},
+  {8230, "hellip;"},
+  {8240, "permil;"},
+  {8242, "prime;"},
+  {8243, "Prime;"},
+  {8249, "lsaquo;"},
+  {8250, "rsaquo;"},
+  {8254, "oline;"},
+  {8260, "frasl;"},
+  {8364, "euro;"},
+  {8465, "image;"},
+  {8472, "weierp;"},
+  {8476, "real;"},
+  {8482, "trade;"},
+  {8501, "alefsym;"},
+  {8592, "larr;"},
+  {8593, "uarr;"},
+  {8594, "rarr;"},
+  {8595, "darr;"},
+  {8596, "harr;"},
+  {8629, "crarr;"},
+  {8656, "lArr;"},
+  {8657, "uArr;"},
+  {8658, "rArr;"},
+  {8659, "dArr;"},
+  {8660, "hArr;"},
+  {8704, "forall;"},
+  {8706, "part;"},
+  {8707, "exist;"},
+  {8709, "empty;"},
+  {8711, "nabla;"},
+  {8712, "isin;"},
+  {8713, "notin;"},
+  {8715, "ni;"},
+  {8719, "prod;"},
+  {8721, "sum;"},
+  {8722, "minus;"},
+  {8727, "lowast;"},
+  {8730, "radic;"},
+  {8733, "prop;"},
+  {8734, "infin;"},
+  {8736, "ang;"},
+  {8743, "and;"},
+  {8744, "or;"},
+  {8745, "cap;"},
+  {8746, "cup;"},
+  {8747, "int;"},
+  {8756, "there4;"},
+  {8764, "sim;"},
+  {8773, "cong;"},
+  {8776, "asymp;"},
+  {8800, "ne;"},
+  {8801, "equiv;"},
+  {8804, "le;"},
+  {8805, "ge;"},
+  {8834, "sub;"},
+  {8835, "sup;"},
+  {8836, "nsub;"},
+  {8838, "sube;"},
+  {8839, "supe;"},
+  {8853, "oplus;"},
+  {8855, "otimes;"},
+  {8869, "perp;"},
+  {8901, "sdot;"},
+  {8968, "lceil;"},
+  {8969, "rceil;"},
+  {8970, "lfloor;"},
+  {8971, "rfloor;"},
+  {9001, "lang;"},
+  {9002, "rang;"},
+  {9674, "loz;"},
+  {9824, "spades;"},
+  {9827, "clubs;"},
+  {9829, "hearts;"},
+  {9830, "diams;"},
+  {0, NULL},
+};
+
+/* Appends the code point @ch to @out. The characters that are significant to
+ * the tag scanner and to Pango markup ('&', '<', '>') are written as entities,
+ * so that a numeric reference such as "&#60;" cannot open a bogus tag. */
+static void
+unescape_append_char (GString * out, gunichar ch)
+{
+  switch (ch) {
+    case '&':
+      g_string_append (out, "&amp;");
+      break;
+    case '<':
+      g_string_append (out, "&lt;");
+      break;
+    case '>':
+      g_string_append (out, "&gt;");
+      break;
+    default:
+      g_string_append_unichar (out, ch);
+      break;
+  }
+}
+
+/* Decodes a numeric character reference. @text points at the '#' following
+ * the ampersand. Both "#123" and "#x7b" / "#X7B" forms are understood and the
+ * closing ';' is optional.
+ *
+ * Returns: a pointer just past the reference, or NULL if no digits follow
+ * (in which case nothing was appended and the input is not a reference).
+ * References to code points that are not valid Unicode characters (0,
+ * surrogates, anything above U+10FFFF) are consumed but produce no output. */
+static const gchar *
+unescape_numeric_reference (GString * out, const gchar * text)
+{
+  const gchar *digits = text + 1;
+  gboolean is_hex = FALSE;
+  gchar *end = NULL;
+  gulong code;
+
+  if (*digits == 'x' || *digits == 'X') {
+    is_hex = TRUE;
+    digits++;
+  }
+
+  /* strtoul() on its own would also accept leading blanks or a sign */
+  if (is_hex ? !g_ascii_isxdigit (*digits) : !g_ascii_isdigit (*digits))
+    return NULL;
+
+  code = strtoul (digits, &end, is_hex ? 16 : 10);
+
+  /* On overflow strtoul() returns ULONG_MAX, which the range check rejects
+   * too, so errno does not need to be consulted. */
+  if (code != 0 && code <= 0x10FFFF && g_unichar_validate ((gunichar) code))
+    unescape_append_char (out, (gunichar) code);
+
+  if (*end == ';')
+    end++;
+
+  return end;
+}
+
+/* Decodes the entity starting at @text, which points just past an '&'.
+ *
+ * Returns: a pointer just past the entity if one was recognised and its
+ * replacement appended to @out, NULL otherwise. */
+static const gchar *
+unescape_entity (GString * out, const gchar * text)
+{
+  gint i;
+
+  /* unescape &nbsp and &nbsp; */
+  if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
+    g_string_append_unichar (out, 160);
+    text += 4;
+    if (*text == ';')
+      text++;
+    return text;
+  }
+
+  /* pass xml entities. these will be processed as pango markup */
+  for (i = 0; XmlEntities[i].escaped; i++) {
+    gsize len = strlen (XmlEntities[i].escaped);
+
+    if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
+      g_string_append_c (out, '&');
+      g_string_append_len (out, XmlEntities[i].escaped, len);
+      return text + len;
+    }
+  }
+
+  /* convert html entities; their names are case-sensitive */
+  for (i = 0; HtmlEntities[i].escaped; i++) {
+    gsize len = strlen (HtmlEntities[i].escaped);
+
+    if (!strncmp (text, HtmlEntities[i].escaped, len)) {
+      g_string_append_unichar (out, HtmlEntities[i].unescaped);
+      return text + len;
+    }
+  }
+
+  if (*text == '#')
+    return unescape_numeric_reference (out, text);
+
+  return NULL;
+}
+
+/* Prepares one chunk of SAMI input for the tag scanner.
+ *
+ * - "&nbsp" (with or without ';'), the named HTML entities and numeric
+ *   character references are replaced by the character they denote;
+ * - the XML entities (&quot; &amp; &apos; &lt; &gt;) are kept as entities,
+ *   in lower case, because the result is later used as Pango markup;
+ * - an '&' that does not start a recognised entity is written as "&amp;" and
+ *   the text after it is kept unchanged;
+ * - every run of ASCII whitespace is collapsed into a single space.
+ *
+ * Returns: a newly allocated string, to be released with g_free(). */
+static gchar *
+unescape_string (const gchar * text)
+{
+  GString *unescaped = g_string_new (NULL);
+
+  while (*text) {
+    if (*text == '&') {
+      const gchar *rest;
+
+      text++;
+      rest = unescape_entity (unescaped, text);
+      if (rest != NULL) {
+        text = rest;
+      } else {
+        /* escape & */
+        g_string_append (unescaped, "&amp;");
+      }
+    } else if (g_ascii_isspace (*text)) {
+      g_string_append_c (unescaped, ' ');
+      /* strip whitespace */
+      do {
+        text++;
+      } while ((*text) && g_ascii_isspace (*text));
+    } else {
+      g_string_append_c (unescaped, *text);
+      text++;
+    }
+  }
+
+  return g_string_free (unescaped, FALSE);
+}
+
+/* Splits @string at the first occurrence of @delimiter.
+ *
+ * @first receives a newly allocated copy of everything before the delimiter,
+ * or of the whole string when the delimiter does not occur. The copy is made
+ * with the GLib allocator because every caller releases it with g_free().
+ *
+ * Returns: a pointer to the delimiter inside @string, or NULL if @string
+ * does not contain it. */
+static const gchar *
+string_token (const gchar * string, const gchar * delimiter, gchar ** first)
+{
+  const gchar *next = strstr (string, delimiter);
+
+  if (next) {
+    *first = g_strndup (string, next - string);
+  } else {
+    *first = g_strdup (string);
+  }
+  return next;
+}
+
+/* Reports one opening tag to the parser callbacks.
+ *
+ * @string is the tag content without the surrounding '<' and '>' (and without
+ * a trailing '/'), e.g. "font color=\"red\" face='Times New Roman'".
+ * The element name is everything up to the first blank; the remainder is
+ * split into attributes. Values may be double-quoted, single-quoted or bare;
+ * quoted values may contain blanks. An attribute without "=value" is reported
+ * with an empty value.
+ *
+ * start_element() receives the attributes as a NULL-terminated array of
+ * alternating names and values that is only valid during the call. When
+ * @must_close is TRUE (self-closing tag) end_element() is called right
+ * afterwards. */
+static void
+html_context_handle_element (HtmlContext * ctxt,
+    const gchar * string, gboolean must_close)
+{
+  GPtrArray *pairs = g_ptr_array_new ();
+  const gchar *cur = string;
+  gchar **attrs;
+  gchar *name;
+
+  /* split element name and attributes */
+  while (*cur && !g_ascii_isspace (*cur))
+    cur++;
+  name = g_strndup (string, cur - string);
+
+  while (TRUE) {
+    const gchar *start;
+    gchar *attr_name, *attr_value;
+
+    while (g_ascii_isspace (*cur))
+      cur++;
+    if (*cur == '\0')
+      break;
+
+    /* attribute name: up to '=' or a blank */
+    start = cur;
+    while (*cur && *cur != '=' && !g_ascii_isspace (*cur))
+      cur++;
+    attr_name = g_strndup (start, cur - start);
+
+    while (g_ascii_isspace (*cur))
+      cur++;
+
+    if (*cur != '=') {
+      /* no value given; cur already points at the next attribute */
+      attr_value = g_strdup ("");
+    } else {
+      cur++;
+      while (g_ascii_isspace (*cur))
+        cur++;
+
+      if (*cur == '"' || *cur == '\'') {
+        /* quoted value: runs to the matching quote, blanks included */
+        const gchar quote = *cur++;
+
+        start = cur;
+        while (*cur && *cur != quote)
+          cur++;
+        attr_value = g_strndup (start, cur - start);
+        if (*cur == quote)
+          cur++;
+      } else {
+        gsize length;
+
+        /* bare value: runs to the next blank */
+        start = cur;
+        while (*cur && !g_ascii_isspace (*cur))
+          cur++;
+        length = cur - start;
+        /* tolerate a stray closing quote, as found in sloppy files */
+        if (length > 0 && (start[length - 1] == '"'
+                || start[length - 1] == '\''))
+          length--;
+        attr_value = g_strndup (start, length);
+      }
+    }
+
+    if (*attr_name == '\0') {
+      /* stray '=' without a name: nothing useful to report */
+      g_free (attr_name);
+      g_free (attr_value);
+      continue;
+    }
+
+    g_ptr_array_add (pairs, attr_name);
+    g_ptr_array_add (pairs, attr_value);
+  }
+
+  /* NULL terminator expected by the callbacks and by g_strfreev() */
+  g_ptr_array_add (pairs, NULL);
+  attrs = (gchar **) g_ptr_array_free (pairs, FALSE);
+
+  ctxt->parser->start_element (ctxt, name,
+      (const gchar **) attrs, ctxt->user_data);
+  if (must_close) {
+    ctxt->parser->end_element (ctxt, name, ctxt->user_data);
+  }
+  g_strfreev (attrs);
+  g_free (name);
+}
+
+/* Feeds @text_len bytes of @text to the parser.
+ *
+ * The data is appended to the context's pending buffer, which is then scanned
+ * from the start: every complete "<...>" tag is reported through
+ * start_element()/end_element(), and the text between tags is reported,
+ * stripped of leading and trailing whitespace, through text().
+ *
+ * If the buffer ends inside a tag, everything already reported is dropped
+ * and the unfinished tag is kept so that the next call can complete it.
+ * Text at the end of the buffer is reported immediately. */
+static void
+html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
+{
+  const gchar *next;
+
+  g_string_append_len (ctxt->buf, text, text_len);
+  next = ctxt->buf->str;
+
+  while (TRUE) {
+    if (next[0] == '<') {
+      gchar *element = NULL;
+      gsize element_len;
+
+      /* find <blahblah> */
+      if (!strchr (next, '>')) {
+        /* No end of tag yet: keep only the unfinished tag for the next
+         * call, otherwise the part handled above would be parsed again. */
+        g_string_erase (ctxt->buf, 0, next - ctxt->buf->str);
+        return;
+      }
+
+      next = string_token (next, ">", &element);
+      next++;
+      /* element holds "<..." without the closing '>' */
+      element_len = strlen (element);
+
+      if (element[1] == '/') {
+        /* handle </blah> */
+        ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
+      } else if (element_len > 1 && element[element_len - 1] == '/') {
+        /* handle <blah/> */
+        element[element_len - 1] = '\0';
+        html_context_handle_element (ctxt, element + 1, TRUE);
+      } else {
+        /* handle <blah> */
+        html_context_handle_element (ctxt, element + 1, FALSE);
+      }
+      g_free (element);
+    } else if (strchr (next, '<')) {
+      /* text followed by another tag */
+      gchar *chunk = NULL;
+
+      next = string_token (next, "<", &chunk);
+      chunk = g_strstrip (chunk);
+      ctxt->parser->text (ctxt, chunk, strlen (chunk), ctxt->user_data);
+      g_free (chunk);
+    } else {
+      /* trailing text: report it and start over with an empty buffer */
+      gchar *chunk = g_strstrip (g_strdup (next));
+
+      ctxt->parser->text (ctxt, chunk, strlen (chunk), ctxt->user_data);
+      g_free (chunk);
+      g_string_truncate (ctxt->buf, 0);
+      return;
+    }
+  }
+}
+
static gchar *
has_tag (GString * str, const gchar tag)
{
}
static void
-handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
+handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
{
int i;
sami_context_pop_state (sctx, CLEAR_TAG);
if (atts != NULL) {
for (i = 0; (atts[i] != NULL); i += 2) {
- const xmlChar *key, *value;
+ const gchar *key, *value;
key = atts[i];
value = atts[i + 1];
if (!value)
continue;
- if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
+ if (!g_ascii_strcasecmp ("start", key)) {
/* Only set a new start time if we don't have text pending */
if (sctx->resultbuf->len == 0)
sctx->time1 = sctx->time2;
sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
+ sctx->time2 = MAX (sctx->time2, sctx->time1);
g_string_append (sctx->resultbuf, sctx->buf->str);
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
g_string_truncate (sctx->buf, 0);
}
static void
-handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
+handle_start_font (GstSamiContext * sctx, const gchar ** atts)
{
int i;
if (atts != NULL) {
g_string_append (sctx->buf, "<span");
for (i = 0; (atts[i] != NULL); i += 2) {
- const xmlChar *key, *value;
+ const gchar *key, *value;
key = atts[i];
value = atts[i + 1];
if (!value)
continue;
- if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
+ if (!g_ascii_strcasecmp ("color", key)) {
/*
* There are invalid color value in many
* sami files.
* It will fix hex color value that start without '#'
*/
const gchar *sharp = "";
- int len = xmlStrlen (value);
+ int len = strlen (value);
if (!(*value == '#' && len == 7)) {
gchar *r;
/* check if it looks like hex */
if (strtol ((const char *) value, &r, 16) >= 0 &&
- ((xmlChar *) r == (value + 6) && len == 6)) {
+ ((gchar *) r == (value + 6) && len == 6)) {
sharp = "#";
}
}
/* some colours can be found in many sami files, but X RGB database
* doesn't contain a colour by this name, so map explicitly */
- if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) {
- value = (const xmlChar *) "#00ffff";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) {
- value = (const xmlChar *) "#dc143c";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) {
- value = (const xmlChar *) "#ff00ff";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) {
- value = (const xmlChar *) "#4b0082";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) {
- value = (const xmlChar *) "#00ff00";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) {
- value = (const xmlChar *) "#808000";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) {
- value = (const xmlChar *) "#c0c0c0";
- } else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) {
- value = (const xmlChar *) "#008080";
+ if (!g_ascii_strcasecmp ("aqua", value)) {
+ value = "#00ffff";
+ } else if (!g_ascii_strcasecmp ("crimson", value)) {
+ value = "#dc143c";
+ } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
+ value = "#ff00ff";
+ } else if (!g_ascii_strcasecmp ("indigo", value)) {
+ value = "#4b0082";
+ } else if (!g_ascii_strcasecmp ("lime", value)) {
+ value = "#00ff00";
+ } else if (!g_ascii_strcasecmp ("olive", value)) {
+ value = "#808000";
+ } else if (!g_ascii_strcasecmp ("silver", value)) {
+ value = "#c0c0c0";
+ } else if (!g_ascii_strcasecmp ("teal", value)) {
+ value = "#008080";
}
g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
value);
- } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
+ } else if (!g_ascii_strcasecmp ("face", key)) {
g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
}
}
}
static void
-start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
+handle_start_element (HtmlContext * ctx, const gchar * name,
+ const char **atts, gpointer user_data)
{
- GstSamiContext *sctx = (GstSamiContext *) ctx;
+ GstSamiContext *sctx = (GstSamiContext *) user_data;
GST_LOG ("name:%s", name);
- if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
+ if (!g_ascii_strcasecmp ("sync", name)) {
handle_start_sync (sctx, atts);
sctx->in_sync = TRUE;
- } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("font", name)) {
handle_start_font (sctx, atts);
- } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("ruby", name)) {
sami_context_push_state (sctx, RUBY_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
+ } else if (!g_ascii_strcasecmp ("br", name)) {
g_string_append_c (sctx->buf, '\n');
/* FIXME: support for furigana/ruby once implemented in pango */
- } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
+ } else if (!g_ascii_strcasecmp ("rt", name)) {
if (has_tag (sctx->state, ITALIC_TAG)) {
g_string_append (sctx->rubybuf, "<i>");
}
g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
sami_context_push_state (sctx, RT_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
- } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
+ } else if (!g_ascii_strcasecmp ("i", name)) {
g_string_append (sctx->buf, "<i>");
sami_context_push_state (sctx, ITALIC_TAG);
+ } else if (!g_ascii_strcasecmp ("p", name)) {
}
}
static void
-end_sami_element (void *ctx, const xmlChar * name)
+handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
{
- GstSamiContext *sctx = (GstSamiContext *) ctx;
+ GstSamiContext *sctx = (GstSamiContext *) user_data;
GST_LOG ("name:%s", name);
- if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
+ if (!g_ascii_strcasecmp ("sync", name)) {
sctx->in_sync = FALSE;
- } else if ((!xmlStrncmp ((const xmlChar *) "body", name, 4)) ||
- (!xmlStrncmp ((const xmlChar *) "sami", name, 4))) {
+ } else if ((!g_ascii_strcasecmp ("body", name)) ||
+ (!g_ascii_strcasecmp ("sami", name))) {
/* We will usually have one buffer left when the body is closed
* as we need the next sync to actually send it */
if (sctx->buf->len != 0) {
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
g_string_truncate (sctx->buf, 0);
}
- } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("font", name)) {
sami_context_pop_state (sctx, SPAN_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
+ } else if (!g_ascii_strcasecmp ("ruby", name)) {
sami_context_pop_state (sctx, RUBY_TAG);
- } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
+ } else if (!g_ascii_strcasecmp ("i", name)) {
sami_context_pop_state (sctx, ITALIC_TAG);
}
}
static void
-characters_sami (void *ctx, const xmlChar * ch, int len)
+handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
+ gpointer user_data)
{
- GstSamiContext *sctx = (GstSamiContext *) ctx;
- gchar *escaped;
- gchar *tmp;
- gint i;
+ GstSamiContext *sctx = (GstSamiContext *) user_data;
/* Skip everything except content of the sync elements */
if (!sctx->in_sync)
return;
- escaped = g_markup_escape_text ((const gchar *) ch, len);
- g_strstrip (escaped);
-
- /* Remove double spaces forom the string as those are
- * usually added by newlines and indention */
- tmp = escaped;
- for (i = 0; i <= strlen (escaped); i++) {
- escaped[i] = *tmp;
- if (*tmp != ' ') {
- tmp++;
- continue;
- }
- while (*tmp == ' ')
- tmp++;
- }
-
if (has_tag (sctx->state, RT_TAG)) {
g_string_append_c (sctx->rubybuf, ' ');
- g_string_append (sctx->rubybuf, escaped);
+ g_string_append (sctx->rubybuf, text);
g_string_append_c (sctx->rubybuf, ' ');
} else {
- g_string_append (sctx->buf, escaped);
+ g_string_append (sctx->buf, text);
}
- g_free (escaped);
}
-static xmlSAXHandler samiSAXHandlerStruct = {
- NULL, /* internalSubset */
- NULL, /* isStandalone */
- NULL, /* hasInternalSubset */
- NULL, /* hasExternalSubset */
- NULL, /* resolveEntity */
- NULL, /* getEntity */
- NULL, /* entityDecl */
- NULL, /* notationDecl */
- NULL, /* attributeDecl */
- NULL, /* elementDecl */
- NULL, /* unparsedEntityDecl */
- NULL, /* setDocumentLocator */
- NULL, /* startDocument */
- NULL, /* endDocument */
- start_sami_element, /* startElement */
- end_sami_element, /* endElement */
- NULL, /* reference */
- characters_sami, /* characters */
- NULL, /* ignorableWhitespace */
- NULL, /* processingInstruction */
- NULL, /* comment */
- NULL, /* xmlParserWarning */
- NULL, /* xmlParserError */
- NULL, /* xmlParserError */
- NULL, /* getParameterEntity */
- NULL, /* cdataBlock */
- NULL, /* externalSubset */
- 1, /* initialized */
- NULL, /* private */
- NULL, /* startElementNsSAX2Func */
- NULL, /* endElementNsSAX2Func */
- NULL /* xmlStructuredErrorFunc */
+static HtmlParser samiParser = {
+ handle_start_element, /* start_element */
+ handle_end_element, /* end_element */
+ handle_text, /* text */
};
-static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;
-
void
sami_context_init (ParserState * state)
{
state->user_data = (gpointer) g_new0 (GstSamiContext, 1);
context = (GstSamiContext *) state->user_data;
- context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
- "", 0, NULL, XML_CHAR_ENCODING_UTF8);
+ context->htmlctxt = html_context_new (&samiParser, context);
context->buf = g_string_new ("");
context->rubybuf = g_string_new ("");
context->resultbuf = g_string_new ("");
GstSamiContext *context = (GstSamiContext *) state->user_data;
if (context) {
- htmlParserCtxtPtr htmlctxt = context->htmlctxt;
-
- /* destroy sax context */
- htmlDocPtr doc;
-
- htmlParseChunk (htmlctxt, "", 0, 1);
- doc = htmlctxt->myDoc;
- htmlFreeParserCtxt (htmlctxt);
+ html_context_free (context->htmlctxt);
context->htmlctxt = NULL;
- if (doc)
- xmlFreeDoc (doc);
g_string_free (context->buf, TRUE);
g_string_free (context->rubybuf, TRUE);
g_string_free (context->resultbuf, TRUE);
}
}
-static gchar *
-fix_invalid_entities (const gchar * line)
-{
- const gchar *cp, *pp; /* current pointer, previous pointer */
- gssize size;
- GString *ret = g_string_new (NULL);
-
- pp = line;
- cp = strchr (line, '&');
- while (cp) {
- size = cp - pp;
- ret = g_string_append_len (ret, pp, size);
- cp++;
- if (g_ascii_strncasecmp (cp, "nbsp;", 5)
- && (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
- /* translate " " to " " */
- ret = g_string_append_len (ret, " ", 6);
- cp += 4;
- } else if (g_ascii_strncasecmp (cp, "quot;", 5)
- && g_ascii_strncasecmp (cp, "amp;", 4)
- && g_ascii_strncasecmp (cp, "apos;", 5)
- && g_ascii_strncasecmp (cp, "lt;", 3)
- && g_ascii_strncasecmp (cp, "gt;", 3)
- && g_ascii_strncasecmp (cp, "nbsp;", 5)
- && cp[0] != '#') {
- /* translate "&" to "&" */
- ret = g_string_append_len (ret, "&", 5);
- } else {
- /* do not translate */
- ret = g_string_append_c (ret, '&');
- }
-
- pp = cp;
- cp = strchr (pp, '&');
- }
- ret = g_string_append (ret, pp);
- return g_string_free (ret, FALSE);
-}
-
gchar *
parse_sami (ParserState * state, const gchar * line)
{
- gchar *fixed_line;
+ gchar *ret = NULL;
GstSamiContext *context = (GstSamiContext *) state->user_data;
- fixed_line = fix_invalid_entities (line);
- htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
- g_free (fixed_line);
+ gchar *unescaped = unescape_string (line);
+ html_context_parse (context->htmlctxt, (gchar *) unescaped,
+ strlen (unescaped));
+ g_free (unescaped);
if (context->has_result) {
- gchar *r;
-
if (context->rubybuf->len) {
context->rubybuf = g_string_append_c (context->rubybuf, '\n');
g_string_prepend (context->resultbuf, context->rubybuf->str);
context->rubybuf = g_string_truncate (context->rubybuf, 0);
}
- r = g_string_free (context->resultbuf, FALSE);
+ ret = g_string_free (context->resultbuf, FALSE);
context->resultbuf = g_string_new ("");
state->start_time = context->time1;
state->duration = context->time2 - context->time1;
context->has_result = FALSE;
- return r;
}
- return NULL;
+ return ret;
}