1 /* GStreamer SAMI subtitle parser
2 * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
20 #include "samiparse.h"
/* State flag pushed onto GstSamiContext.state while an <i> element is open
 * (see handle_start_element / sami_context_pop_state). */
26 #define ITALIC_TAG 'i'
/* Forward typedefs for the HTML callback table, the HTML tokenizer context
 * and the SAMI parsing state used throughout this file. */
32 typedef struct _HtmlParser HtmlParser;
33 typedef struct _HtmlContext HtmlContext;
34 typedef struct _GstSamiContext GstSamiContext;
/* Per-stream SAMI parsing state.
 *
 * Text collected inside the current sync element accumulates in 'buf'; when
 * the next sync tag (or the end of body/sami) is seen, handle_start_sync /
 * handle_end_element append it to 'resultbuf' and empty 'buf', so that text
 * following the new tag is not mixed into the pending result. Ruby text is
 * collected separately in 'rubybuf'. 'time1' and 'time2' hold the sync start
 * value multiplied by GST_MSECOND. */
36 struct _GstSamiContext
38 GString *buf; /* buffer to collect content */
39 GString *rubybuf; /* buffer to collect ruby content */
40 GString *resultbuf; /* when opening the next 'sync' tag, move
41 * from 'buf' to avoid to append following
43 GString *state; /* in many sami files there are tags that
44 * are not closed, so for each open tag the
45 * parser will append a tag flag here so
46 * that tags can be closed properly on
47 * 'sync' tags. See _context_push_state()
48 * and _context_pop_state(). */
49 HtmlContext *htmlctxt; /* html parser context */
50 gboolean has_result; /* set when ready to push out result */
51 gboolean in_sync; /* flag to avoid appending anything except the
52 * content of the sync elements to buf */
53 guint64 time1; /* previous start attribute in sync tag */
54 guint64 time2; /* current start attribute in sync tag */
/* Callback table driven by html_context_parse():
 *  - start_element: called for an opening tag with its name and a
 *    NULL-terminated array of alternating attribute names and values;
 *  - end_element: called for a closing (or self-closing) tag;
 *  - text: called with the whitespace-stripped text found between tags.
 * user_data is the pointer given to html_context_new(). */
59 void (*start_element) (HtmlContext * ctx,
60 const gchar * name, const gchar ** attr, gpointer user_data);
61 void (*end_element) (HtmlContext * ctx,
62 const gchar * name, gpointer user_data);
63 void (*text) (HtmlContext * ctx,
64 const gchar * text, gsize text_len, gpointer user_data);
/* Callback table invoked by this tokenizer context; set once in
 * html_context_new() and never modified through this pointer. */
69 const HtmlParser *parser;
/* Allocate a zero-initialised HtmlContext bound to the given callback table
 * and user_data, with an empty accumulation buffer. Release it with
 * html_context_free(). */
75 html_context_new (HtmlParser * parser, gpointer user_data)
77 HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
78 ctxt->parser = parser;
79 ctxt->user_data = user_data;
/* g_string_new (NULL) yields an empty string that can be appended to. */
80 ctxt->buf = g_string_new (NULL);
/* Release a context created by html_context_new(). The accumulation buffer
 * is destroyed together with its character data. */
85 html_context_free (HtmlContext * ctxt)
87 g_string_free (ctxt->buf, TRUE);
/* Entity lookup tables used by unescape_string(). Each entry pairs a Unicode
 * code point ('unescaped') with the entity text ('escaped'); both tables are
 * scanned until an entry whose 'escaped' member is NULL. XmlEntities are
 * passed through unchanged, HtmlEntities are replaced by their code point. */
93 const gunichar unescaped;
97 struct EntityMap XmlEntities[] = {
106 struct EntityMap HtmlEntities[] = {
107 /* nbsp will handle manually
/* Convert one line of SAMI text into a form suitable for the HTML tokenizer.
 *
 *  - the nbsp entity becomes U+00A0;
 *  - entities listed in XmlEntities are kept escaped (prefixed with '&')
 *    because the result is later consumed as pango markup;
 *  - entities listed in HtmlEntities are replaced by their code point;
 *  - numeric character references are decoded with strtoul (base 16 or 10);
 *  - each run of ASCII whitespace collapses to a single space;
 *  - every other character is copied unchanged.
 *
 * Returns a newly allocated string: g_string_free (..., FALSE) hands the
 * character data to the caller, who must free it with g_free(). */
360 unescape_string (const gchar * text)
363 GString *unescaped = g_string_new (NULL);
369 /* unescape the nbsp entity (matched case-insensitively) to U+00A0 */
370 if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
371 unescaped = g_string_append_unichar (unescaped, 160);
379 /* pass xml entities. these will be processed as pango markup */
380 for (i = 0; XmlEntities[i].escaped; i++) {
381 gssize len = strlen (XmlEntities[i].escaped);
382 if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
383 unescaped = g_string_append_c (unescaped, '&');
385 g_string_append_len (unescaped, XmlEntities[i].escaped, len);
391 /* convert html entities */
/* unlike the XML entities above, HTML entity names are matched
 * case-sensitively (strncmp) */
392 for (i = 0; HtmlEntities[i].escaped; i++) {
393 gssize len = strlen (HtmlEntities[i].escaped);
394 if (!strncmp (text, HtmlEntities[i].escaped, len)) {
396 g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
403 gboolean is_hex = FALSE;
414 l = strtoul (text, &end, 16);
416 l = strtoul (text, &end, 10);
/* text == end means no digits were consumed; errno reports overflow.
 * NOTE(review): presumably errno is reset to 0 before strtoul - confirm. */
419 if (text == end || errno != 0) {
420 /* error occurred. pass it */
423 unescaped = g_string_append_unichar (unescaped, l);
/* NOTE(review): a bare "&" is appended here; since the output is treated as
 * pango markup, this literal was presumably the escaped ampersand entity
 * before the file was extracted - confirm against the pristine source. */
433 unescaped = g_string_append (unescaped, "&");
438 } else if (g_ascii_isspace (*text)) {
439 unescaped = g_string_append_c (unescaped, ' ');
440 /* strip whitespace */
443 } while ((*text) && g_ascii_isspace (*text));
445 unescaped = g_string_append_c (unescaped, *text);
450 return g_string_free (unescaped, FALSE);
/* Split 'string' at the first occurrence of 'delimiter'.
 *
 * *first receives a newly allocated copy of the text before the delimiter,
 * or of the whole string when the delimiter is absent; the caller frees it
 * with g_free(). 'next' points at the delimiter inside 'string', or is NULL
 * when it was not found. */
454 string_token (const gchar * string, const gchar * delimiter, gchar ** first)
456 gchar *next = strstr (string, delimiter);
458 *first = g_strndup (string, next - string);
460 *first = g_strdup (string);
466 html_context_handle_element (HtmlContext * ctxt,
467 const gchar * string, gboolean must_close)
472 const gchar *found, *next;
474 /* split element name and attributes */
475 next = string_token (string, " ", &name);
478 /* count attributes */
481 found = strchr (found, '=');
491 attrs = g_new0 (gchar *, (count + 1) * 2);
493 for (i = 0; i < count && next != NULL; i += 2) {
494 gchar *attr_name = NULL, *attr_value = NULL;
496 next = string_token (next + 1, "=", &attr_name);
501 next = string_token (next + 1, " ", &attr_value);
503 /* strip " or ' from attribute value */
504 if (attr_value[0] == '"' || attr_value[0] == '\'') {
505 gchar *tmp = g_strdup (attr_value + 1);
510 length = strlen (attr_value);
511 if (length > 0 && (attr_value[length - 1] == '"'
512 || attr_value[length - 1] == '\'')) {
513 attr_value[length - 1] = '\0';
516 attrs[i] = attr_name;
517 attrs[i + 1] = attr_value;
520 ctxt->parser->start_element (ctxt, name,
521 (const gchar **) attrs, ctxt->user_data);
523 ctxt->parser->end_element (ctxt, name, ctxt->user_data);
530 html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
532 const gchar *next = NULL;
533 ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
534 next = ctxt->buf->str;
536 if (next[0] == '<') {
537 gchar *element = NULL;
538 /* find <blahblah> */
539 if (!strchr (next, '>')) {
540 /* no tag end point. buffer will be process in next time */
544 next = string_token (next, ">", &element);
546 if (g_str_has_suffix (next, "/")) {
548 element[strlen (element) - 1] = '\0';
549 html_context_handle_element (ctxt, element + 1, TRUE);
550 } else if (element[1] == '/') {
552 ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
555 html_context_handle_element (ctxt, element + 1, FALSE);
558 } else if (strchr (next, '<')) {
561 next = string_token (next, "<", &text);
562 text = g_strstrip (text);
563 length = strlen (text);
564 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
568 gchar *text = (gchar *) next;
570 text = g_strstrip (text);
571 length = strlen (text);
572 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
573 ctxt->buf = g_string_assign (ctxt->buf, "");
578 ctxt->buf = g_string_assign (ctxt->buf, next);
/* Look for the state flag 'tag' in the open-tag stack 'str'.
 * Returns a pointer to its last occurrence, or NULL when no such tag is
 * open; callers only use the result as a truth value. */
582 has_tag (GString * str, const gchar tag)
584 return strrchr (str->str, tag);
/* Record that a tag of kind 'state' (one of the *_TAG flags) has been
 * opened by appending its flag character to the open-tag stack. */
588 sami_context_push_state (GstSamiContext * sctx, char state)
590 GST_LOG ("state %c", state);
591 g_string_append_c (sctx->state, state);
/* Close open tags down to and including the most recent tag of kind 'state'.
 *
 * The open-tag stack is walked from the top; for every entry the matching
 * closing markup is collected in a temporary string (ruby text closings go
 * straight to rubybuf). When an entry equal to 'state' is reached, the
 * collected closings are appended to buf and the stack is cut at that entry.
 * If no such entry exists, nothing is emitted to buf unless 'state' is
 * CLEAR_TAG, in which case every open tag is closed and the stack emptied. */
595 sami_context_pop_state (GstSamiContext * sctx, char state)
597 GString *str = g_string_new ("");
598 GString *context_state = sctx->state;
601 GST_LOG ("state %c", state);
/* walk from the most recently opened tag towards the oldest */
602 for (i = context_state->len - 1; i >= 0; i--) {
603 switch (context_state->str[i]) {
604 case ITALIC_TAG: /* <i> */
606 g_string_append (str, "</i>");
609 case SPAN_TAG: /* <span foreground= > */
611 g_string_append (str, "</span>");
614 case RUBY_TAG: /* <span size= > -- ruby */
618 case RT_TAG: /* ruby */
620 /* FIXME: support for furigana/ruby once implemented in pango */
621 g_string_append (sctx->rubybuf, "</span>");
622 if (has_tag (context_state, ITALIC_TAG)) {
623 g_string_append (sctx->rubybuf, "</i>");
/* found the requested tag: flush the closings and drop it and everything
 * above it from the stack */
631 if (context_state->str[i] == state) {
632 g_string_append (sctx->buf, str->str);
633 g_string_free (str, TRUE);
634 g_string_truncate (context_state, i);
/* CLEAR_TAG never matches a stack entry, so it ends up here and closes all */
638 if (state == CLEAR_TAG) {
639 g_string_append (sctx->buf, str->str);
640 g_string_truncate (context_state, 0);
642 g_string_free (str, TRUE);
/* Handle an opening sync tag.
 *
 * All still-open tags are closed first. For a "start" attribute the previous
 * start time is shifted into time1 (unless text is still pending in
 * resultbuf), time2 is set from the attribute value, and the text collected
 * so far is moved from buf to resultbuf; has_result is set when resultbuf is
 * non-empty.
 *
 * atts is a NULL-terminated array of alternating attribute names and values. */
646 handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
650 sami_context_pop_state (sctx, CLEAR_TAG);
652 for (i = 0; (atts[i] != NULL); i += 2) {
653 const gchar *key, *value;
660 if (!g_ascii_strcasecmp ("start", key)) {
661 /* Only set a new start time if we don't have text pending */
662 if (sctx->resultbuf->len == 0)
663 sctx->time1 = sctx->time2;
/* NOTE(review): atoi() reports no errors and returns int, so a negative or
 * out-of-range value would end up as a bogus guint64 timestamp - consider a
 * checked conversion. */
665 sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
/* never let the new start time precede the previous one */
666 sctx->time2 = MAX (sctx->time2, sctx->time1);
667 g_string_append (sctx->resultbuf, sctx->buf->str);
668 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
669 g_string_truncate (sctx->buf, 0);
/* Handle an opening font tag by emitting a <span ...> into buf.
 *
 * Any span that is still open is closed first. A "color" attribute becomes
 * foreground="..." and a "face" attribute becomes font_family="...".
 * Colour values get special treatment: per the in-code comments, a 6-digit
 * hex value written without '#' is repaired, and a few colour names that
 * are missing from the X RGB database are mapped explicitly. Finally the
 * span is recorded on the open-tag stack.
 *
 * atts is a NULL-terminated array of alternating attribute names and values. */
676 handle_start_font (GstSamiContext * sctx, const gchar ** atts)
680 sami_context_pop_state (sctx, SPAN_TAG);
682 g_string_append (sctx->buf, "<span");
683 for (i = 0; (atts[i] != NULL); i += 2) {
684 const gchar *key, *value;
691 if (!g_ascii_strcasecmp ("color", key)) {
693 * There are invalid color value in many
695 * It will fix hex color value that start without '#'
697 const gchar *sharp = "";
698 int len = strlen (value);
/* anything other than a well-formed "#rrggbb" needs a closer look */
700 if (!(*value == '#' && len == 7)) {
703 /* check if it looks like hex */
/* exactly six characters, all consumed by strtol as hexadecimal digits */
704 if (strtol ((const char *) value, &r, 16) >= 0 &&
705 ((gchar *) r == (value + 6) && len == 6)) {
709 /* some colours can be found in many sami files, but X RGB database
710 * doesn't contain a colour by this name, so map explicitly */
711 if (!g_ascii_strcasecmp ("aqua", value)) {
713 } else if (!g_ascii_strcasecmp ("crimson", value)) {
715 } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
717 } else if (!g_ascii_strcasecmp ("indigo", value)) {
719 } else if (!g_ascii_strcasecmp ("lime", value)) {
721 } else if (!g_ascii_strcasecmp ("olive", value)) {
723 } else if (!g_ascii_strcasecmp ("silver", value)) {
725 } else if (!g_ascii_strcasecmp ("teal", value)) {
728 g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
730 } else if (!g_ascii_strcasecmp ("face", key)) {
/* NOTE(review): the attribute value is inserted into the markup verbatim;
 * presumably a value containing a quote would break the span - confirm
 * whether escaping is needed. */
731 g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
734 g_string_append_c (sctx->buf, '>');
735 sami_context_push_state (sctx, SPAN_TAG);
/* start_element callback of the SAMI parser; user_data is the
 * GstSamiContext. Element names are matched case-insensitively:
 *  - sync: finish the previous cue (handle_start_sync) and enter sync mode
 *  - font: open a span (handle_start_font)
 *  - ruby: record the ruby state
 *  - br:   append a newline to buf
 *  - rt:   open the small raised span for ruby text in rubybuf, preceded by
 *          <i> when italics are currently open
 *  - i:    open italics in buf
 *  - p:    handling not visible in this excerpt */
740 handle_start_element (HtmlContext * ctx, const gchar * name,
741 const char **atts, gpointer user_data)
743 GstSamiContext *sctx = (GstSamiContext *) user_data;
745 GST_LOG ("name:%s", name);
747 if (!g_ascii_strcasecmp ("sync", name)) {
748 handle_start_sync (sctx, atts);
749 sctx->in_sync = TRUE;
750 } else if (!g_ascii_strcasecmp ("font", name)) {
751 handle_start_font (sctx, atts);
752 } else if (!g_ascii_strcasecmp ("ruby", name)) {
753 sami_context_push_state (sctx, RUBY_TAG);
754 } else if (!g_ascii_strcasecmp ("br", name)) {
755 g_string_append_c (sctx->buf, '\n');
756 /* FIXME: support for furigana/ruby once implemented in pango */
757 } else if (!g_ascii_strcasecmp ("rt", name)) {
/* ruby text lives in its own buffer, so an enclosing <i> must be reopened
 * there */
758 if (has_tag (sctx->state, ITALIC_TAG)) {
759 g_string_append (sctx->rubybuf, "<i>");
761 g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
762 sami_context_push_state (sctx, RT_TAG);
763 } else if (!g_ascii_strcasecmp ("i", name)) {
764 g_string_append (sctx->buf, "<i>");
765 sami_context_push_state (sctx, ITALIC_TAG);
766 } else if (!g_ascii_strcasecmp ("p", name)) {
/* end_element callback of the SAMI parser; user_data is the GstSamiContext.
 * Element names are matched case-insensitively:
 *  - sync:        leave sync mode
 *  - body / sami: flush the last pending cue, which has no following sync
 *                 tag to trigger it; its end time is left open
 *  - font, ruby, i: close the corresponding open tag */
771 handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
773 GstSamiContext *sctx = (GstSamiContext *) user_data;
775 GST_LOG ("name:%s", name);
777 if (!g_ascii_strcasecmp ("sync", name)) {
778 sctx->in_sync = FALSE;
779 } else if ((!g_ascii_strcasecmp ("body", name)) ||
780 (!g_ascii_strcasecmp ("sami", name))) {
781 /* We will usually have one buffer left when the body is closed
782 * as we need the next sync to actually send it */
783 if (sctx->buf->len != 0) {
784 /* Only set a new start time if we don't have text pending */
785 if (sctx->resultbuf->len == 0)
786 sctx->time1 = sctx->time2;
/* there is no following sync tag, so the end time is unknown */
788 sctx->time2 = GST_CLOCK_TIME_NONE;
789 g_string_append (sctx->resultbuf, sctx->buf->str);
790 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
791 g_string_truncate (sctx->buf, 0);
793 } else if (!g_ascii_strcasecmp ("font", name)) {
794 sami_context_pop_state (sctx, SPAN_TAG);
795 } else if (!g_ascii_strcasecmp ("ruby", name)) {
796 sami_context_pop_state (sctx, RUBY_TAG);
797 } else if (!g_ascii_strcasecmp ("i", name)) {
798 sami_context_pop_state (sctx, ITALIC_TAG);
/* text callback of the SAMI parser; user_data is the GstSamiContext.
 * Text inside an open rt element is appended to rubybuf, padded with a
 * space on each side; all other text is appended to buf. */
803 handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
806 GstSamiContext *sctx = (GstSamiContext *) user_data;
808 /* Skip everything except content of the sync elements */
/* NOTE(review): the check itself is not visible in this excerpt; presumably
 * it tests sctx->in_sync - confirm. */
812 if (has_tag (sctx->state, RT_TAG)) {
813 g_string_append_c (sctx->rubybuf, ' ');
814 g_string_append (sctx->rubybuf, text);
815 g_string_append_c (sctx->rubybuf, ' ');
817 g_string_append (sctx->buf, text);
/* Callback table that connects the generic HTML tokenizer to the SAMI
 * handlers above; passed to html_context_new() by sami_context_init(). */
821 static HtmlParser samiParser = {
822 handle_start_element, /* start_element */
823 handle_end_element, /* end_element */
824 handle_text, /* text */
/* Create the SAMI parsing context and attach it to state->user_data.
 * state->user_data must be NULL on entry. All buffers start empty; release
 * the context with sami_context_deinit(). */
828 sami_context_init (ParserState * state)
830 GstSamiContext *context;
832 g_assert (state->user_data == NULL);
834 context = g_new0 (GstSamiContext, 1);
/* the SAMI context is handed back to the callbacks as user_data */
836 context->htmlctxt = html_context_new (&samiParser, context);
837 context->buf = g_string_new ("");
838 context->rubybuf = g_string_new ("");
839 context->resultbuf = g_string_new ("");
840 context->state = g_string_new ("");
842 state->user_data = context;
/* Destroy the SAMI parsing context stored in state->user_data: the HTML
 * tokenizer context and all four buffers are released, and
 * state->user_data is reset to NULL. */
846 sami_context_deinit (ParserState * state)
848 GstSamiContext *context = (GstSamiContext *) state->user_data;
851 html_context_free (context->htmlctxt);
852 context->htmlctxt = NULL;
853 g_string_free (context->buf, TRUE);
854 g_string_free (context->rubybuf, TRUE);
855 g_string_free (context->resultbuf, TRUE);
856 g_string_free (context->state, TRUE);
858 state->user_data = NULL;
/* Return the SAMI parsing context to its initial state without freeing it:
 * all buffers and the open-tag stack are emptied and the result/sync flags
 * are cleared. */
863 sami_context_reset (ParserState * state)
865 GstSamiContext *context = (GstSamiContext *) state->user_data;
868 g_string_truncate (context->buf, 0);
869 g_string_truncate (context->rubybuf, 0);
870 g_string_truncate (context->resultbuf, 0);
871 g_string_truncate (context->state, 0);
872 context->has_result = FALSE;
873 context->in_sync = FALSE;
/* Feed one line of SAMI input to the parser.
 *
 * The line is unescaped and run through the HTML tokenizer, which updates
 * the SAMI context through its callbacks. When a cue has been completed
 * (has_result), any collected ruby text is placed on its own line in front
 * of the cue text, the text is detached from resultbuf into 'ret' (the
 * caller presumably owns and frees it - the return statement is not visible
 * here), and state->start_time / state->duration are set from time1 and
 * time2. */
880 parse_sami (ParserState * state, const gchar * line)
883 GstSamiContext *context = (GstSamiContext *) state->user_data;
885 gchar *unescaped = unescape_string (line);
886 html_context_parse (context->htmlctxt, (gchar *) unescaped,
890 if (context->has_result) {
891 if (context->rubybuf->len) {
892 context->rubybuf = g_string_append_c (context->rubybuf, '\n');
893 g_string_prepend (context->resultbuf, context->rubybuf->str);
894 context->rubybuf = g_string_truncate (context->rubybuf, 0);
/* hand the character data over and start a fresh, empty result buffer */
897 ret = g_string_free (context->resultbuf, FALSE);
898 context->resultbuf = g_string_new ("");
899 state->start_time = context->time1;
900 state->duration = context->time2 - context->time1;
901 context->has_result = FALSE;