/* GStreamer SAMI subtitle parser
 * Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "samiparse.h"

#include <libxml/HTMLparser.h>
#include <stdlib.h>
#include <string.h>

/* One-character markers recorded in GstSamiContext::state, one per tag that
 * is currently open. CLEAR_TAG is never pushed; passing it to
 * sami_context_pop_state() closes every open tag.
 *
 * NOTE(review): only ITALIC_TAG survived in the damaged listing. The other
 * four are used throughout this file; their values are restored here and only
 * need to be distinct characters - confirm against the project history. */
#define ITALIC_TAG 'i'
#define SPAN_TAG   's'
#define RUBY_TAG   'r'
#define RT_TAG     't'
#define CLEAR_TAG  '0'

31 typedef struct _GstSamiContext GstSamiContext;
33 struct _GstSamiContext
35 GString *buf; /* buffer to collect content */
36 GString *rubybuf; /* buffer to collect ruby content */
37 GString *resultbuf; /* when opening the next 'sync' tag, move
38 * from 'buf' to avoid to append following
40 GString *state; /* in many sami files there are tags that
41 * are not closed, so for each open tag the
42 * parser will append a tag flag here so
43 * that tags can be closed properly on
44 * 'sync' tags. See _context_push_state()
45 * and _context_pop_state(). */
46 htmlParserCtxtPtr htmlctxt; /* html parser context */
47 gboolean has_result; /* set when ready to push out result */
48 gboolean in_sync; /* flag to avoid appending anything except the
49 * content of the sync elements to buf */
50 guint64 time1; /* previous start attribute in sync tag */
51 guint64 time2; /* current start attribute in sync tag */
55 has_tag (GString * str, const gchar tag)
57 return strrchr (str->str, tag);
61 sami_context_push_state (GstSamiContext * sctx, char state)
63 GST_LOG ("state %c", state);
64 g_string_append_c (sctx->state, state);
68 sami_context_pop_state (GstSamiContext * sctx, char state)
70 GString *str = g_string_new ("");
71 GString *context_state = sctx->state;
74 GST_LOG ("state %c", state);
75 for (i = context_state->len - 1; i >= 0; i--) {
76 switch (context_state->str[i]) {
77 case ITALIC_TAG: /* <i> */
79 g_string_append (str, "</i>");
82 case SPAN_TAG: /* <span foreground= > */
84 g_string_append (str, "</span>");
87 case RUBY_TAG: /* <span size= > -- ruby */
91 case RT_TAG: /* ruby */
93 /* FIXME: support for furigana/ruby once implemented in pango */
94 g_string_append (sctx->rubybuf, "</span>");
95 if (has_tag (context_state, ITALIC_TAG)) {
96 g_string_append (sctx->rubybuf, "</i>");
104 if (context_state->str[i] == state) {
105 g_string_append (sctx->buf, str->str);
106 g_string_free (str, TRUE);
107 g_string_truncate (context_state, i);
111 if (state == CLEAR_TAG) {
112 g_string_append (sctx->buf, str->str);
113 g_string_truncate (context_state, 0);
115 g_string_free (str, TRUE);
119 handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
123 sami_context_pop_state (sctx, CLEAR_TAG);
125 for (i = 0; (atts[i] != NULL); i += 2) {
126 const xmlChar *key, *value;
133 if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
134 /* Only set a new start time if we don't have text pending */
135 if (sctx->resultbuf->len == 0)
136 sctx->time1 = sctx->time2;
138 sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
139 g_string_append (sctx->resultbuf, sctx->buf->str);
140 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
141 g_string_truncate (sctx->buf, 0);
148 handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
152 sami_context_pop_state (sctx, SPAN_TAG);
154 g_string_append (sctx->buf, "<span");
155 for (i = 0; (atts[i] != NULL); i += 2) {
156 const xmlChar *key, *value;
163 if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
165 * There are invalid color value in many
167 * It will fix hex color value that start without '#'
169 const gchar *sharp = "";
170 int len = xmlStrlen (value);
172 if (!(*value == '#' && len == 7)) {
175 /* check if it looks like hex */
176 if (strtol ((const char *) value, &r, 16) >= 0 &&
177 ((xmlChar *) r == (value + 6) && len == 6)) {
181 /* some colours can be found in many sami files, but X RGB database
182 * doesn't contain a colour by this name, so map explicitly */
183 if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) {
184 value = (const xmlChar *) "#00ffff";
185 } else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) {
186 value = (const xmlChar *) "#dc143c";
187 } else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) {
188 value = (const xmlChar *) "#ff00ff";
189 } else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) {
190 value = (const xmlChar *) "#4b0082";
191 } else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) {
192 value = (const xmlChar *) "#00ff00";
193 } else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) {
194 value = (const xmlChar *) "#808000";
195 } else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) {
196 value = (const xmlChar *) "#c0c0c0";
197 } else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) {
198 value = (const xmlChar *) "#008080";
200 g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
202 } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
203 g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
206 g_string_append_c (sctx->buf, '>');
207 sami_context_push_state (sctx, SPAN_TAG);
212 start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
214 GstSamiContext *sctx = (GstSamiContext *) ctx;
216 GST_LOG ("name:%s", name);
218 if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
219 handle_start_sync (sctx, atts);
220 sctx->in_sync = TRUE;
221 } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
222 handle_start_font (sctx, atts);
223 } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
224 sami_context_push_state (sctx, RUBY_TAG);
225 } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
226 g_string_append_c (sctx->buf, '\n');
227 /* FIXME: support for furigana/ruby once implemented in pango */
228 } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
229 if (has_tag (sctx->state, ITALIC_TAG)) {
230 g_string_append (sctx->rubybuf, "<i>");
232 g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
233 sami_context_push_state (sctx, RT_TAG);
234 } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
235 } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
236 g_string_append (sctx->buf, "<i>");
237 sami_context_push_state (sctx, ITALIC_TAG);
242 end_sami_element (void *ctx, const xmlChar * name)
244 GstSamiContext *sctx = (GstSamiContext *) ctx;
246 GST_LOG ("name:%s", name);
248 if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
249 sctx->in_sync = FALSE;
250 } else if ((!xmlStrncmp ((const xmlChar *) "body", name, 4)) ||
251 (!xmlStrncmp ((const xmlChar *) "sami", name, 4))) {
252 /* We will usually have one buffer left when the body is closed
253 * as we need the next sync to actually send it */
254 if (sctx->buf->len != 0) {
255 /* Only set a new start time if we don't have text pending */
256 if (sctx->resultbuf->len == 0)
257 sctx->time1 = sctx->time2;
259 sctx->time2 = GST_CLOCK_TIME_NONE;
260 g_string_append (sctx->resultbuf, sctx->buf->str);
261 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
262 g_string_truncate (sctx->buf, 0);
264 } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
265 sami_context_pop_state (sctx, SPAN_TAG);
266 } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
267 sami_context_pop_state (sctx, RUBY_TAG);
268 } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
269 sami_context_pop_state (sctx, ITALIC_TAG);
274 characters_sami (void *ctx, const xmlChar * ch, int len)
276 GstSamiContext *sctx = (GstSamiContext *) ctx;
281 /* Skip everything except content of the sync elements */
285 escaped = g_markup_escape_text ((const gchar *) ch, len);
286 g_strstrip (escaped);
288 /* Remove double spaces forom the string as those are
289 * usually added by newlines and indention */
291 for (i = 0; i <= strlen (escaped); i++) {
301 if (has_tag (sctx->state, RT_TAG)) {
302 g_string_append_c (sctx->rubybuf, ' ');
303 g_string_append (sctx->rubybuf, escaped);
304 g_string_append_c (sctx->rubybuf, ' ');
306 g_string_append (sctx->buf, escaped);
311 static xmlSAXHandler samiSAXHandlerStruct = {
312 NULL, /* internalSubset */
313 NULL, /* isStandalone */
314 NULL, /* hasInternalSubset */
315 NULL, /* hasExternalSubset */
316 NULL, /* resolveEntity */
317 NULL, /* getEntity */
318 NULL, /* entityDecl */
319 NULL, /* notationDecl */
320 NULL, /* attributeDecl */
321 NULL, /* elementDecl */
322 NULL, /* unparsedEntityDecl */
323 NULL, /* setDocumentLocator */
324 NULL, /* startDocument */
325 NULL, /* endDocument */
326 start_sami_element, /* startElement */
327 end_sami_element, /* endElement */
328 NULL, /* reference */
329 characters_sami, /* characters */
330 NULL, /* ignorableWhitespace */
331 NULL, /* processingInstruction */
333 NULL, /* xmlParserWarning */
334 NULL, /* xmlParserError */
335 NULL, /* xmlParserError */
336 NULL, /* getParameterEntity */
337 NULL, /* cdataBlock */
338 NULL, /* externalSubset */
341 NULL, /* startElementNsSAX2Func */
342 NULL, /* endElementNsSAX2Func */
343 NULL /* xmlStructuredErrorFunc */
346 static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;
349 sami_context_init (ParserState * state)
351 GstSamiContext *context;
353 g_assert (state->user_data == NULL);
354 state->user_data = (gpointer) g_new0 (GstSamiContext, 1);
355 context = (GstSamiContext *) state->user_data;
357 context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
358 "", 0, NULL, XML_CHAR_ENCODING_UTF8);
359 context->buf = g_string_new ("");
360 context->rubybuf = g_string_new ("");
361 context->resultbuf = g_string_new ("");
362 context->state = g_string_new ("");
366 sami_context_deinit (ParserState * state)
368 GstSamiContext *context = (GstSamiContext *) state->user_data;
371 htmlParserCtxtPtr htmlctxt = context->htmlctxt;
373 /* destroy sax context */
376 htmlParseChunk (htmlctxt, "", 0, 1);
377 doc = htmlctxt->myDoc;
378 htmlFreeParserCtxt (htmlctxt);
379 context->htmlctxt = NULL;
382 g_string_free (context->buf, TRUE);
383 g_string_free (context->rubybuf, TRUE);
384 g_string_free (context->resultbuf, TRUE);
385 g_string_free (context->state, TRUE);
387 state->user_data = NULL;
392 sami_context_reset (ParserState * state)
394 GstSamiContext *context = (GstSamiContext *) state->user_data;
397 g_string_truncate (context->buf, 0);
398 g_string_truncate (context->rubybuf, 0);
399 g_string_truncate (context->resultbuf, 0);
400 g_string_truncate (context->state, 0);
401 context->has_result = FALSE;
402 context->in_sync = FALSE;
409 fix_invalid_entities (const gchar * line)
411 const gchar *cp, *pp; /* current pointer, previous pointer */
413 GString *ret = g_string_new (NULL);
416 cp = strchr (line, '&');
419 ret = g_string_append_len (ret, pp, size);
421 if (g_ascii_strncasecmp (cp, "nbsp;", 5)
422 && (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
423 /* translate " " to " " */
424 ret = g_string_append_len (ret, " ", 6);
426 } else if (g_ascii_strncasecmp (cp, "quot;", 5)
427 && g_ascii_strncasecmp (cp, "amp;", 4)
428 && g_ascii_strncasecmp (cp, "apos;", 5)
429 && g_ascii_strncasecmp (cp, "lt;", 3)
430 && g_ascii_strncasecmp (cp, "gt;", 3)
431 && g_ascii_strncasecmp (cp, "nbsp;", 5)
433 /* translate "&" to "&" */
434 ret = g_string_append_len (ret, "&", 5);
436 /* do not translate */
437 ret = g_string_append_c (ret, '&');
441 cp = strchr (pp, '&');
443 ret = g_string_append (ret, pp);
444 return g_string_free (ret, FALSE);
/* Feeds one line of SAMI text to the HTML push parser and, once the SAX
 * callbacks have flagged a finished cue (has_result), takes the cue text out
 * of resultbuf and stores its timing in @state.
 *
 * @state: parser state whose user_data holds the GstSamiContext
 * @line:  NUL-terminated input line
 *
 * NOTE(review): the end of this function (including what it returns) lies
 * outside this excerpt, and the listing has lost short lines such as
 * declarations and braces. */
448 parse_sami (ParserState * state, const gchar * line)
451 GstSamiContext *context = (GstSamiContext *) state->user_data;
/* Repair invalid '&' sequences first. fixed_line is a newly allocated string;
 * NOTE(review): no release of it is visible in this excerpt - confirm. */
453 fixed_line = fix_invalid_entities (line);
454 htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
457 if (context->has_result) {
/* Ruby text, if any, goes on its own line in front of the cue text. */
460 if (context->rubybuf->len) {
461 context->rubybuf = g_string_append_c (context->rubybuf, '\n');
462 g_string_prepend (context->resultbuf, context->rubybuf->str);
463 context->rubybuf = g_string_truncate (context->rubybuf, 0);
/* Detach the character data from resultbuf (FALSE keeps it allocated) and
 * start a fresh, empty buffer for the next cue. */
466 r = g_string_free (context->resultbuf, FALSE);
467 context->resultbuf = g_string_new ("");
/* The cue runs from the previous sync time to the current one. */
468 state->start_time = context->time1;
469 state->duration = context->time2 - context->time1;
470 context->has_result = FALSE;