1 /* GStreamer SAMI subtitle parser
2 * Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
20 #include "samiparse.h"
22 #include <libxml/HTMLparser.h>
/* Flag character appended to the context's state string while an <i> tag is
 * open (pushed in start_sami_element, popped in end_sami_element). */
25 #define ITALIC_TAG 'i'
/* Per-parser state of the SAMI subtitle parser: the text buffers being
 * filled by the SAX callbacks, the record of currently open tags, the
 * libxml2 HTML push-parser context and the timing of the cue being built. */
31 typedef struct _GstSamiContext GstSamiContext;
33 struct _GstSamiContext
35 GString *buf; /* buffer to collect content of the current sync element */
36 GString *rubybuf; /* buffer to collect ruby (rt) content and its markup */
37 GString *resultbuf; /* when opening the next 'sync' tag, move
38 * from 'buf' to avoid to append following
40 GString *state; /* in many sami files there are tags that
41 * are not closed, so for each open tag the
42 * parser will append a tag flag here so
43 * that tags can be closed properly on
44 * 'sync' tags. See _context_push_state()
45 * and _context_pop_state(). */
46 htmlParserCtxtPtr htmlctxt; /* html parser context (libxml2 push parser) */
47 gboolean has_result; /* set when ready to push out result, i.e. when
 * resultbuf is non-empty after a sync/body boundary */
48 gboolean in_sync; /* flag to avoid appending anything except the
49 * content of the sync elements to buf */
50 guint64 time1; /* previous start attribute in sync tag; copied from
 * time2 when no text is pending in resultbuf */
51 guint64 time2; /* current start attribute in sync tag, stored as
 * atoi (value) * GST_MSECOND */
/* Look for the tag flag `tag` in `str`.
 *
 * Returns the result of strrchr () on str->str: a pointer to the last
 * occurrence of `tag`, or NULL when the flag is not present. The callers in
 * this file only test the result for truth. */
55 has_tag (GString * str, const gchar tag)
57 return strrchr (str->str, tag);
/* Record that a tag has been opened by appending its flag character
 * (`state`, e.g. ITALIC_TAG or SPAN_TAG) to sctx->state. */
61 sami_context_push_state (GstSamiContext * sctx, char state)
63 g_string_append_c (sctx->state, state);
/* Close open tags recorded in sctx->state, newest first.
 *
 * The state string is walked from its last flag backwards. For every flag
 * passed, the matching closing markup is generated: "</i>" and "</span>" are
 * collected in a temporary string, while the closing markup for RT_TAG is
 * written directly to sctx->rubybuf. When a flag equal to `state` is found,
 * the collected markup is appended to sctx->buf and the state string is cut
 * off at that position, which drops the matched flag and everything pushed
 * after it.
 *
 * If the walk finishes and `state` is CLEAR_TAG, the collected markup is
 * appended to sctx->buf and the state string is emptied.
 *
 * NOTE(review): this listing omits several lines (declaration of i, the
 * statements ending each case, whatever follows the truncate in the match
 * branch). Presumably the match branch returns, since `str` has just been
 * freed there and is freed again after the loop; confirm against the full
 * source. */
67 sami_context_pop_state (GstSamiContext * sctx, char state)
/* closing markup destined for sctx->buf is collected here first */
69 GString *str = g_string_new ("");
70 GString *context_state = sctx->state;
/* visit the open-tag flags starting with the most recently pushed one */
73 for (i = context_state->len - 1; i >= 0; i--) {
74 switch (context_state->str[i]) {
75 case ITALIC_TAG: /* <i> */
77 g_string_append (str, "</i>");
80 case SPAN_TAG: /* <span foreground= > */
82 g_string_append (str, "</span>");
85 case RUBY_TAG: /* <span size= > -- ruby */
89 case RT_TAG: /* ruby */
91 /* FIXME: support for furigana/ruby once implemented in pango */
/* ruby text lives in its own buffer, so its closing markup goes there */
92 g_string_append (sctx->rubybuf, "</span>");
93 if (has_tag (context_state, ITALIC_TAG)) {
94 g_string_append (sctx->rubybuf, "</i>");
/* reached the requested flag: emit the collected closers and drop this flag
 * together with everything pushed after it */
102 if (context_state->str[i] == state) {
103 g_string_append (sctx->buf, str->str);
104 g_string_free (str, TRUE);
105 g_string_truncate (context_state, i);
/* CLEAR_TAG: close everything that is still open */
109 if (state == CLEAR_TAG) {
110 g_string_append (sctx->buf, str->str);
111 g_string_truncate (context_state, 0);
113 g_string_free (str, TRUE);
/* Handle an opening <sync> tag.
 *
 * Every tag still recorded as open is closed first (pop with CLEAR_TAG).
 * Then `atts` is walked as NULL-terminated name/value pairs (stride 2). For
 * an attribute whose name begins with "start", the start time is updated and
 * the text gathered so far in sctx->buf is moved to sctx->resultbuf.
 *
 * NOTE(review): atts[i] is read without a visible NULL check on `atts`
 * itself; confirm whether the SAX parser can call this with atts == NULL for
 * an element without attributes.
 * NOTE(review): the assignments of key/value from atts are not visible in
 * this listing. */
117 handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
121 sami_context_pop_state (sctx, CLEAR_TAG);
123 for (i = 0; (atts[i] != NULL); i += 2) {
124 const xmlChar *key, *value;
/* only the first 5 characters are compared, so any attribute name that
 * begins with "start" matches */
131 if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
132 /* Only set a new start time if we don't have text pending */
133 if (sctx->resultbuf->len == 0)
134 sctx->time1 = sctx->time2;
/* atoi () gives no error indication; its int result is scaled by
 * GST_MSECOND before being stored */
136 sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
/* hand the text collected for the previous sync over to resultbuf and start
 * collecting afresh */
137 g_string_append (sctx->resultbuf, sctx->buf->str);
138 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
139 g_string_truncate (sctx->buf, 0);
/* Handle an opening <font> tag by emitting a <span ...> into sctx->buf.
 *
 * A previously opened span is closed first (pop with SPAN_TAG). The
 * attributes are walked as NULL-terminated name/value pairs: a name beginning
 * with "color" becomes a foreground="..." attribute, a name beginning with
 * "face" (case-insensitive) becomes font_family="...". Finally the span is
 * closed with '>' and SPAN_TAG is pushed on the state string.
 *
 * NOTE(review): atts[i] is read without a visible NULL check on `atts`.
 * NOTE(review): the colour-name comparisons below are bounded by `len`, the
 * length of the attribute value, not of the colour name. A value that is a
 * prefix of a name (e.g. "li" vs "lime") compares equal, and an empty value
 * (len == 0) compares equal to "aqua". Confirm whether that is intended.
 * NOTE(review): `value` is written into the markup without escaping;
 * presumably the input is trusted. Verify.
 * NOTE(review): several lines (declaration of `sharp` and `r`, the key/value
 * assignments, the body of the hex check, the last printf argument) are not
 * visible in this listing. */
146 handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
150 sami_context_pop_state (sctx, SPAN_TAG);
152 g_string_append (sctx->buf, "<span");
153 for (i = 0; (atts[i] != NULL); i += 2) {
154 const xmlChar *key, *value;
161 if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
163 * There are invalid color value in many
165 * It will fix hex color value that start without '#'
168 int len = xmlStrlen (value);
/* anything that is not already of the form "#rrggbb" is inspected */
170 if (!(*value == '#' && len == 7)) {
173 /* check if it looks like hex */
/* exactly six characters, all consumed by strtol () in base 16 */
174 if (strtol ((const char *) value, &r, 16) >= 0 &&
175 ((xmlChar *) r == (value + 6) && len == 6)) {
179 /* some colours can be found in many sami files, but X RGB database
180 * doesn't contain a colour by this name, so map explicitly */
181 if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) {
182 value = (const xmlChar *) "#00ffff";
183 } else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) {
184 value = (const xmlChar *) "#dc143c";
185 } else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) {
186 value = (const xmlChar *) "#ff00ff";
187 } else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) {
188 value = (const xmlChar *) "#4b0082";
189 } else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) {
190 value = (const xmlChar *) "#00ff00";
191 } else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) {
192 value = (const xmlChar *) "#808000";
193 } else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) {
194 value = (const xmlChar *) "#c0c0c0";
195 } else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) {
196 value = (const xmlChar *) "#008080";
/* `sharp` is prepended to the value in the emitted attribute */
198 g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
200 } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
201 g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
204 g_string_append_c (sctx->buf, '>');
205 sami_context_push_state (sctx, SPAN_TAG);
/* SAX startElement callback (installed in samiSAXHandlerStruct).
 *
 * `ctx` is the GstSamiContext passed as user data when the parser was
 * created. The element name is dispatched on by prefix comparison:
 *   sync -> handle_start_sync () and in_sync is set
 *   font -> handle_start_font ()
 *   ruby -> RUBY_TAG is pushed
 *   br   -> a newline is appended to buf
 *   rt   -> ruby markup is opened in rubybuf and RT_TAG is pushed
 *   p    -> nothing visible in this listing
 *   i    -> "<i>" is appended to buf and ITALIC_TAG is pushed
 *
 * NOTE(review): xmlStrncmp () is bounded by the length of the literal, so the
 * tests are prefix matches; e.g. the "p" and "i" branches also match any
 * other element name starting with that letter that was not caught by an
 * earlier branch. Confirm this is acceptable. */
210 start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
212 GstSamiContext *sctx = (GstSamiContext *) ctx;
214 if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
215 handle_start_sync (sctx, atts);
216 sctx->in_sync = TRUE;
217 } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
218 handle_start_font (sctx, atts);
219 } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
220 sami_context_push_state (sctx, RUBY_TAG);
221 } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
222 g_string_append_c (sctx->buf, '\n');
223 /* FIXME: support for furigana/ruby once implemented in pango */
224 } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
/* ruby text is collected separately in rubybuf; carry an open italic over */
225 if (has_tag (sctx->state, ITALIC_TAG)) {
226 g_string_append (sctx->rubybuf, "<i>");
228 g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
229 sami_context_push_state (sctx, RT_TAG);
230 } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
231 } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
232 g_string_append (sctx->buf, "<i>");
233 sami_context_push_state (sctx, ITALIC_TAG);
/* SAX endElement callback (installed in samiSAXHandlerStruct).
 *
 * `ctx` is the GstSamiContext. Closing tags are dispatched on by prefix
 * comparison of the element name:
 *   sync -> in_sync is cleared
 *   body -> text still pending in buf is moved to resultbuf, with an
 *           open-ended stop time
 *   font / ruby / i -> the matching flag is popped from the state string,
 *           which also emits the closing markup
 *
 * NOTE(review): as in start_sami_element, the "i" test matches any element
 * name beginning with 'i' that was not caught by an earlier branch. */
238 end_sami_element (void *ctx, const xmlChar * name)
240 GstSamiContext *sctx = (GstSamiContext *) ctx;
242 if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
243 sctx->in_sync = FALSE;
244 } else if (!xmlStrncmp ((const xmlChar *) "body", name, 4)) {
245 /* We will usually have one buffer left when the body is closed
246 * as we need the next sync to actually send it */
247 if (sctx->buf->len != 0) {
248 /* Only set a new start time if we don't have text pending */
249 if (sctx->resultbuf->len == 0)
250 sctx->time1 = sctx->time2;
/* no following sync exists, so there is no known end time */
252 sctx->time2 = GST_CLOCK_TIME_NONE;
253 g_string_append (sctx->resultbuf, sctx->buf->str);
254 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
255 g_string_truncate (sctx->buf, 0);
257 } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
258 sami_context_pop_state (sctx, SPAN_TAG);
259 } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
260 sami_context_pop_state (sctx, RUBY_TAG);
261 } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
262 sami_context_pop_state (sctx, ITALIC_TAG);
/* SAX characters callback (installed in samiSAXHandlerStruct).
 *
 * `ch` holds `len` bytes of character data. The text is escaped for markup
 * with g_markup_escape_text (), stripped of leading and trailing whitespace
 * with g_strstrip (), and then appended to the ruby buffer (padded with a
 * space on both sides) when an RT_TAG is open; the listing also shows an
 * append to sctx->buf, presumably the alternative branch.
 *
 * NOTE(review): the in_sync guard, the body of the space-collapsing loop and
 * the release of `escaped` are not visible in this listing. */
267 characters_sami (void *ctx, const xmlChar * ch, int len)
269 GstSamiContext *sctx = (GstSamiContext *) ctx;
274 /* Skip everything except content of the sync elements */
278 escaped = g_markup_escape_text ((const gchar *) ch, len);
279 g_strstrip (escaped);
281 /* Remove double spaces from the string as those are
282 * usually added by newlines and indentation */
/* NOTE(review): strlen (escaped) is re-evaluated on every iteration and the
 * bound is inclusive (the terminating NUL index is visited). The loop body is
 * not visible here, so check the full source before hoisting the call. */
284 for (i = 0; i <= strlen (escaped); i++) {
294 if (has_tag (sctx->state, RT_TAG)) {
295 g_string_append_c (sctx->rubybuf, ' ');
296 g_string_append (sctx->rubybuf, escaped);
297 g_string_append_c (sctx->rubybuf, ' ');
299 g_string_append (sctx->buf, escaped);
/* SAX handler table for the HTML push parser. Only three callbacks are
 * installed: startElement, endElement and characters; every other visible
 * slot is NULL.
 * NOTE(review): this listing skips a few initializer lines, so the slot
 * comments should be checked against libxml2's xmlSAXHandler field order. */
304 static xmlSAXHandler samiSAXHandlerStruct = {
305 NULL, /* internalSubset */
306 NULL, /* isStandalone */
307 NULL, /* hasInternalSubset */
308 NULL, /* hasExternalSubset */
309 NULL, /* resolveEntity */
310 NULL, /* getEntity */
311 NULL, /* entityDecl */
312 NULL, /* notationDecl */
313 NULL, /* attributeDecl */
314 NULL, /* elementDecl */
315 NULL, /* unparsedEntityDecl */
316 NULL, /* setDocumentLocator */
317 NULL, /* startDocument */
318 NULL, /* endDocument */
319 start_sami_element, /* startElement */
320 end_sami_element, /* endElement */
321 NULL, /* reference */
322 characters_sami, /* characters */
323 NULL, /* ignorableWhitespace */
324 NULL, /* processingInstruction */
326 NULL, /* xmlParserWarning (warning) */
327 NULL, /* xmlParserError (error) */
328 NULL, /* xmlParserError (fatalError) */
329 NULL, /* getParameterEntity */
330 NULL, /* cdataBlock */
331 NULL, /* externalSubset */
334 NULL, /* startElementNsSAX2Func */
335 NULL, /* endElementNsSAX2Func */
336 NULL /* xmlStructuredErrorFunc */
/* pointer form of the table, as passed to htmlCreatePushParserCtxt () */
338 static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;
/* Create the SAMI parsing context for `state`.
 *
 * state->user_data must be NULL on entry (asserted). A zero-initialised
 * GstSamiContext is allocated and stored there, an HTML push parser is
 * created with samiSAXHandler and the context as its user data (empty
 * initial chunk, no filename, UTF-8 encoding), and the four string buffers
 * are created empty. */
341 sami_context_init (ParserState * state)
343 GstSamiContext *context;
345 g_assert (state->user_data == NULL);
346 state->user_data = (gpointer) g_new0 (GstSamiContext, 1);
347 context = (GstSamiContext *) state->user_data;
349 context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
350 "", 0, NULL, XML_CHAR_ENCODING_UTF8);
351 context->buf = g_string_new ("");
352 context->rubybuf = g_string_new ("");
353 context->resultbuf = g_string_new ("");
354 context->state = g_string_new ("");
/* Tear down the SAMI parsing context stored in state->user_data.
 *
 * The push parser is given a final, terminating chunk, its document pointer
 * is fetched and the parser context is freed; then the four string buffers
 * are freed and state->user_data is cleared.
 *
 * NOTE(review): the guards around these statements, the disposal of `doc`
 * and the release of the context structure itself are not visible in this
 * listing. Confirm against the full source. */
358 sami_context_deinit (ParserState * state)
360 GstSamiContext *context = (GstSamiContext *) state->user_data;
363 htmlParserCtxtPtr htmlctxt = context->htmlctxt;
365 /* destroy sax context */
/* last argument 1 = terminate: tells the push parser the input is complete */
368 htmlParseChunk (htmlctxt, "", 0, 1);
/* remember the document before the parser context goes away */
369 doc = htmlctxt->myDoc;
370 htmlFreeParserCtxt (htmlctxt);
371 context->htmlctxt = NULL;
374 g_string_free (context->buf, TRUE);
375 g_string_free (context->rubybuf, TRUE);
376 g_string_free (context->resultbuf, TRUE);
377 g_string_free (context->state, TRUE);
379 state->user_data = NULL;
/* Reset the SAMI parsing context in state->user_data without freeing it:
 * all four string buffers are emptied and the has_result / in_sync flags are
 * cleared.
 * NOTE(review): some lines of this function are not visible in this listing
 * (e.g. any guard on `context`, any reset of the time fields). */
384 sami_context_reset (ParserState * state)
386 GstSamiContext *context = (GstSamiContext *) state->user_data;
389 g_string_truncate (context->buf, 0);
390 g_string_truncate (context->rubybuf, 0);
391 g_string_truncate (context->resultbuf, 0);
392 g_string_truncate (context->state, 0);
393 context->has_result = FALSE;
394 context->in_sync = FALSE;
/* Repair entity references in `line` before it is fed to the HTML parser.
 *
 * The text is copied into a new string while scanning for '&':
 *   - "&nbsp" written without its terminating ';' is completed to "&nbsp;"
 *   - an '&' that does not introduce one of the recognised entities
 *     (quot, amp, apos, lt, gt, nbsp) is escaped as "&amp;"
 *   - any other '&' is copied through unchanged
 *
 * Returns the character data of the GString (g_string_free (ret, FALSE));
 * the caller owns the returned string.
 *
 * NOTE(review): the replacement literals had been entity-decoded, leaving
 * one-character literals paired with lengths 6 and 5, so
 * g_string_append_len () read past the end of the literal. They are restored
 * to the entity text those lengths describe.
 * NOTE(review): some statements of this function (loop header, pointer
 * advancing, the tail of the second condition) are not visible in this
 * listing; only the visible lines are reproduced. */
401 fix_invalid_entities (const gchar * line)
403 const gchar *cp, *pp; /* current pointer, previous pointer */
405 GString *ret = g_string_new (NULL);
408 cp = strchr (line, '&');
/* copy the text between the previous position and this '&' */
411 ret = g_string_append_len (ret, pp, size);
/* starts with "nbsp" but not with "nbsp;": the semicolon is missing */
413 if (g_ascii_strncasecmp (cp, "nbsp;", 5)
414 && (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
415 /* translate "&nbsp" (missing ';') to "&nbsp;" */
416 ret = g_string_append_len (ret, "&nbsp;", 6);
/* none of the recognised entity names follows the '&' */
418 } else if (g_ascii_strncasecmp (cp, "quot;", 5)
419 && g_ascii_strncasecmp (cp, "amp;", 4)
420 && g_ascii_strncasecmp (cp, "apos;", 5)
421 && g_ascii_strncasecmp (cp, "lt;", 3)
422 && g_ascii_strncasecmp (cp, "gt;", 3)
423 && g_ascii_strncasecmp (cp, "nbsp;", 5)
425 /* translate "&" to "&amp;" */
426 ret = g_string_append_len (ret, "&amp;", 5);
428 /* do not translate */
429 ret = g_string_append_c (ret, '&');
433 cp = strchr (pp, '&');
/* copy whatever follows the last '&' */
435 ret = g_string_append (ret, pp);
436 return g_string_free (ret, FALSE);
440 parse_sami (ParserState * state, const gchar * line)
443 GstSamiContext *context = (GstSamiContext *) state->user_data;
445 fixed_line = fix_invalid_entities (line);
446 htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
449 if (context->has_result) {
452 if (context->rubybuf->len) {
453 context->rubybuf = g_string_append_c (context->rubybuf, '\n');
454 g_string_prepend (context->resultbuf, context->rubybuf->str);
455 context->rubybuf = g_string_truncate (context->rubybuf, 0);
458 r = g_string_free (context->resultbuf, FALSE);
459 context->resultbuf = g_string_new ("");
460 state->start_time = context->time1;
461 state->duration = context->time2 - context->time1;
462 context->has_result = FALSE;