1 /* GStreamer SAMI subtitle parser
2 * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
21 #include "samiparse.h"
27 #define ITALIC_TAG 'i'
33 typedef struct _HtmlParser HtmlParser;
34 typedef struct _HtmlContext HtmlContext;
35 typedef struct _GstSamiContext GstSamiContext;
36 #ifdef SUBPARSE_MODIFICATION
37 typedef struct _LanguageStruct GstLangStruct;
38 struct _LanguageStruct
/* Parsing state kept for one SAMI stream: the text collected from the
 * HTML-like callbacks, the list of still-open inline tags, and the
 * timestamps taken from the two most recent 'sync' tags. The fields under
 * SUBPARSE_MODIFICATION track which language class is being parsed and
 * which one the user asked for. */
44 struct _GstSamiContext
46 GString *buf; /* buffer to collect content */
47 GString *rubybuf; /* buffer to collect ruby content */
48 GString *resultbuf; /* when opening the next 'sync' tag, move
49 * from 'buf' to avoid to append following
51 GString *state; /* in many sami files there are tags that
52 * are not closed, so for each open tag the
53 * parser will append a tag flag here so
54 * that tags can be closed properly on
55 * 'sync' tags. See _context_push_state()
56 * and _context_pop_state(). */
57 HtmlContext *htmlctxt; /* html parser context */
58 gboolean has_result; /* set when ready to push out result */
59 gboolean in_sync; /* flag to avoid appending anything except the
60 * content of the sync elements to buf */
61 guint64 time1; /* previous start attribute in sync tag */
62 guint64 time2; /* current start attribute in sync tag */
63 #ifdef SUBPARSE_MODIFICATION
64 guint64 time3; /* To store the last current time when language is changed */
65 GList *lang_list; /* Language list for an external subtitle file */
66 gboolean time_set; /* If language is set already by user */
67 gchar *current_language; /* Current language parsed */
68 gchar *desired_language; /* Language set by user */
69 gboolean language_changed; /* language changed signal */
/* Callback table driven by html_context_parse() and
 * html_context_handle_element(). user_data is the pointer that was handed
 * to html_context_new(). */
/* Called for an opening tag; attr is passed as an array of name/value
 * string pointers (consumers in this file walk it in steps of two until
 * they meet NULL). */
75 void (*start_element) (HtmlContext * ctx,
76 const gchar * name, const gchar ** attr, gpointer user_data);
/* Called for a closing tag, and after start_element for an element that
 * must be closed immediately. */
77 void (*end_element) (HtmlContext * ctx,
78 const gchar * name, gpointer user_data);
/* Called with the whitespace-stripped text found between tags. */
79 void (*text) (HtmlContext * ctx,
80 const gchar * text, gsize text_len, gpointer user_data);
85 const HtmlParser *parser;
91 html_context_new (HtmlParser * parser, gpointer user_data)
93 HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
94 ctxt->parser = parser;
95 ctxt->user_data = user_data;
96 ctxt->buf = g_string_new (NULL);
101 html_context_free (HtmlContext * ctxt)
103 g_string_free (ctxt->buf, TRUE);
/* One entity mapping: the code point it stands for and the entity text
 * that unescape_string() compares against the input following '&'. */
109 const gunichar unescaped;
110 const gchar *escaped;
/* Entities that unescape_string() leaves escaped: it re-emits '&' followed
 * by the entity text instead of substituting a character. */
113 struct EntityMap XmlEntities[] = {
/* Entities that unescape_string() replaces by their code point. The table
 * is scanned until an entry whose escaped member is NULL. */
122 struct EntityMap HtmlEntities[] = {
123 /* nbsp will handle manually
/*
 * unescape_string:
 * @text: NUL-terminated input line
 *
 * Builds a new string from @text in which
 *  - the "nbsp" entity becomes U+00A0,
 *  - entities listed in XmlEntities are passed through still escaped,
 *  - entities listed in HtmlEntities become the character they name,
 *  - numeric references (decimal or hexadecimal) become that code point,
 *  - every run of ASCII whitespace collapses to a single space.
 *
 * Returns: the character data of the result string; the GString wrapper is
 * released here, so the caller owns the returned buffer.
 */
376 unescape_string (const gchar * text)
379 GString *unescaped = g_string_new (NULL);
385 /* unescape the "nbsp" entity into U+00A0 (160) */
386 if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
387 unescaped = g_string_append_unichar (unescaped, 160);
395 /* pass xml entities. these will be processed as pango markup */
396 for (i = 0; XmlEntities[i].escaped; i++) {
397 gssize len = strlen (XmlEntities[i].escaped);
398 if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
399 unescaped = g_string_append_c (unescaped, '&');
401 g_string_append_len (unescaped, XmlEntities[i].escaped, len);
407 /* convert html entities */
/* Unlike the XML table above, this comparison is case-sensitive. */
408 for (i = 0; HtmlEntities[i].escaped; i++) {
409 gssize len = strlen (HtmlEntities[i].escaped);
410 if (!strncmp (text, HtmlEntities[i].escaped, len)) {
412 g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
/* Numeric character reference: base 16 when is_hex is set, base 10
 * otherwise. */
419 gboolean is_hex = FALSE;
430 l = strtoul (text, &end, 16);
432 l = strtoul (text, &end, 10);
/* NOTE(review): this test is only meaningful if errno was cleared before
 * the strtoul() calls; the reset is not visible here, confirm. */
435 if (text == end || errno != 0) {
436 /* error occurred. pass it */
439 unescaped = g_string_append_unichar (unescaped, l);
449 unescaped = g_string_append (unescaped, "&");
454 } else if (g_ascii_isspace (*text)) {
455 unescaped = g_string_append_c (unescaped, ' ');
456 /* strip whitespace */
459 } while ((*text) && g_ascii_isspace (*text));
461 unescaped = g_string_append_c (unescaped, *text);
466 return g_string_free (unescaped, FALSE);
470 string_token (const gchar * string, const gchar * delimiter, gchar ** first)
472 gchar *next = strstr (string, delimiter);
474 *first = g_strndup (string, next - string);
476 *first = g_strdup (string);
/*
 * html_context_handle_element:
 * @ctxt: parser context supplying the callback table and user data
 * @string: element text without the surrounding '<' and '>'
 * @must_close: presumably selects whether end_element is reported right
 *   after start_element (the test guarding that call is not visible here)
 *
 * Splits @string into an element name and name/value attribute pairs and
 * reports them through the start_element callback. With
 * SUBPARSE_MODIFICATION, a comment element that begins with 'P' is renamed
 * to "!--P" and scanned for "lang" entries, which are reported as the
 * attribute list instead.
 */
482 html_context_handle_element (HtmlContext * ctxt,
483 const gchar * string, gboolean must_close)
488 const gchar *found, *next;
489 #ifdef SUBPARSE_MODIFICATION
490 const gchar *name_temp = NULL;
493 /* split element name and attributes */
494 next = string_token (string, " ", &name);
497 /* count attributes */
500 found = strchr (found, '=');
510 attrs = g_new0 (gchar *, (count + 1) * 2);
/* NOTE(review): count is the number of '=' signs, yet this loop advances i
 * by two and stores one name/value pair per iteration, so it appears to
 * stop after about half of the attributes; confirm. */
512 for (i = 0; i < count; i += 2) {
513 gchar *attr_name = NULL, *attr_value = NULL;
515 next = string_token (next + 1, "=", &attr_name);
/* NOTE(review): string_token() returns NULL when the delimiter is absent;
 * next + 1 is then computed from NULL. Confirm that a guard exists on the
 * lines not shown. */
516 next = string_token (next + 1, " ", &attr_value);
518 /* strip " or ' from attribute value */
519 if (attr_value[0] == '"' || attr_value[0] == '\'') {
520 gchar *tmp = g_strdup (attr_value + 1);
525 length = strlen (attr_value);
/* NOTE(review): for an empty attribute value, length is 0 and the index
 * below is length - 1; confirm it cannot be reached in that case. */
526 if (attr_value[length - 1] == '"' || attr_value[length - 1] == '\'') {
527 attr_value[length - 1] = '\0';
530 attrs[i] = attr_name;
531 attrs[i + 1] = attr_value;
533 #ifdef SUBPARSE_MODIFICATION
534 /* sometimes spaces can be there in between !-- and P
535 * that also we have to take care */
536 if (!g_ascii_strcasecmp("!--", name)) {
537 gchar* tempchar = (gchar*)(string + 3);
538 while (*tempchar == ' ') {
540 if (*tempchar == 'P' || *tempchar == 'p') {
/* NOTE(review): name holds exactly "!--" here, so index 3 is its NUL
 * terminator; overwriting it leaves the string unterminated unless a line
 * not shown restores a terminator or reallocates name. Confirm. */
541 *(name + 3) = *tempchar;
548 if (next && (!g_ascii_strcasecmp("!--P", name))) {
551 /* count attributes */
554 found = (gchar*)strcasestr (found, "lang");
/* NOTE(review): no spare slot is allocated for a terminating NULL, while
 * handle_start_language_list() walks the array until it meets NULL.
 * Confirm the array is terminated some other way. */
562 attrs = g_new0 (gchar *, count * 2);
564 for (i = 0; i < count; i++) {
565 gchar *attr_name = NULL, *attr_value = NULL;
567 next = (gchar*)strcasestr (next, "lang:");
/* Two-character language code plus terminator. NOTE(review): neither the
 * strcasestr() result nor the malloc() result is checked on the visible
 * lines before use. */
568 attr_value = (gchar*)malloc (3);
570 strncpy (attr_value, next, 2);
571 attr_value[2] = '\0';
572 GST_LOG ("Language value comes as %s", attr_value);
575 if (*name_temp == '{') {
576 int character_count = 0;
581 if (*name_temp == '.') {
582 attr_name = (gchar*) malloc (character_count + 1);
585 else if (*name_temp != ' ')
/* Copies characters up to the next space; NOTE(review): there is no check
 * for the end of the string in this loop condition. */
593 for (j = 0; *(name_temp + j) != ' '; j++) {
594 attr_name[j] = *(name_temp + j);
597 attrs[attrindex++] = attr_name;
598 attrs[attrindex++] = attr_value;
604 ctxt->parser->start_element (ctxt, name,
605 (const gchar **) attrs, ctxt->user_data);
607 ctxt->parser->end_element (ctxt, name, ctxt->user_data);
614 html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
616 const gchar *next = NULL;
617 ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
618 next = ctxt->buf->str;
620 if (next[0] == '<') {
621 gchar *element = NULL;
622 /* find <blahblah> */
623 if (!strchr (next, '>')) {
624 /* no tag end point. buffer will be process in next time */
628 next = string_token (next, ">", &element);
630 if (g_str_has_suffix (next, "/")) {
632 element[strlen (element) - 1] = '\0';
633 html_context_handle_element (ctxt, element + 1, TRUE);
634 } else if (element[1] == '/') {
636 ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
639 html_context_handle_element (ctxt, element + 1, FALSE);
642 } else if (strchr (next, '<')) {
645 next = string_token (next, "<", &text);
646 text = g_strstrip (text);
647 length = strlen (text);
648 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
652 gchar *text = (gchar *) next;
654 text = g_strstrip (text);
655 length = strlen (text);
656 ctxt->parser->text (ctxt, text, length, ctxt->user_data);
657 ctxt->buf = g_string_assign (ctxt->buf, "");
662 ctxt->buf = g_string_assign (ctxt->buf, next);
666 has_tag (GString * str, const gchar tag)
668 return strrchr (str->str, tag);
672 sami_context_push_state (GstSamiContext * sctx, char state)
674 GST_LOG ("state %c", state);
675 g_string_append_c (sctx->state, state);
679 sami_context_pop_state (GstSamiContext * sctx, char state)
681 GString *str = g_string_new ("");
682 GString *context_state = sctx->state;
685 GST_LOG ("state %c", state);
686 for (i = context_state->len - 1; i >= 0; i--) {
687 switch (context_state->str[i]) {
688 case ITALIC_TAG: /* <i> */
690 g_string_append (str, "</i>");
693 case SPAN_TAG: /* <span foreground= > */
695 g_string_append (str, "</span>");
698 case RUBY_TAG: /* <span size= > -- ruby */
702 case RT_TAG: /* ruby */
704 /* FIXME: support for furigana/ruby once implemented in pango */
705 g_string_append (sctx->rubybuf, "</span>");
706 if (has_tag (context_state, ITALIC_TAG)) {
707 g_string_append (sctx->rubybuf, "</i>");
715 if (context_state->str[i] == state) {
716 g_string_append (sctx->buf, str->str);
717 g_string_free (str, TRUE);
718 g_string_truncate (context_state, i);
722 if (state == CLEAR_TAG) {
723 g_string_append (sctx->buf, str->str);
724 g_string_truncate (context_state, 0);
726 g_string_free (str, TRUE);
/*
 * handle_start_sync:
 * @sctx: SAMI context
 * @atts: NULL-terminated array of attribute name/value pairs
 *
 * Handles an opening 'sync' tag: closes all open tags, then for a "start"
 * attribute shifts the timestamps and moves the collected content into
 * the result buffer.
 */
730 handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
/* A new sync point closes every tag that is still open. */
734 sami_context_pop_state (sctx, CLEAR_TAG);
736 for (i = 0; (atts[i] != NULL); i += 2) {
737 const gchar *key, *value;
744 if (!g_ascii_strcasecmp ("start", key)) {
745 /* Only set a new start time if we don't have text pending */
746 if (sctx->resultbuf->len == 0)
747 sctx->time1 = sctx->time2;
/* The attribute value is multiplied by GST_MSECOND. NOTE(review): atoi()
 * reports no errors and returns an int, so a malformed or negative value
 * is not detected here. */
749 sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
750 #ifdef SUBPARSE_MODIFICATION
751 sctx->time3 = sctx->time2;
/* Never let the current time fall behind the previous one. */
753 sctx->time2 = MAX (sctx->time2, sctx->time1);
754 g_string_append (sctx->resultbuf, sctx->buf->str);
755 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
756 g_string_truncate (sctx->buf, 0);
/*
 * handle_start_font:
 * @sctx: SAMI context
 * @atts: NULL-terminated array of attribute name/value pairs
 *
 * Handles an opening 'font' tag: closes a still-open span, then appends a
 * new "<span ...>" to the content buffer. A "color" attribute becomes
 * foreground="..." and a "face" attribute becomes font_family="...".
 * The new span is recorded as open.
 */
763 handle_start_font (GstSamiContext * sctx, const gchar ** atts)
767 sami_context_pop_state (sctx, SPAN_TAG);
769 g_string_append (sctx->buf, "<span");
770 for (i = 0; (atts[i] != NULL); i += 2) {
771 const gchar *key, *value;
778 if (!g_ascii_strcasecmp ("color", key)) {
780 * There are invalid color value in many
782 * It will fix hex color value that start without '#'
784 const gchar *sharp = "";
785 int len = strlen (value);
787 if (!(*value == '#' && len == 7)) {
790 /* check if it looks like hex */
791 if (strtol ((const char *) value, &r, 16) >= 0 &&
792 ((gchar *) r == (value + 6) && len == 6)) {
796 /* some colours can be found in many sami files, but X RGB database
797 * doesn't contain a colour by this name, so map explicitly */
798 if (!g_ascii_strcasecmp ("aqua", value)) {
800 } else if (!g_ascii_strcasecmp ("crimson", value)) {
802 } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
804 } else if (!g_ascii_strcasecmp ("indigo", value)) {
806 } else if (!g_ascii_strcasecmp ("lime", value)) {
808 } else if (!g_ascii_strcasecmp ("olive", value)) {
810 } else if (!g_ascii_strcasecmp ("silver", value)) {
812 } else if (!g_ascii_strcasecmp ("teal", value)) {
815 g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
817 } else if (!g_ascii_strcasecmp ("face", key)) {
/* NOTE(review): the attribute value is placed into the markup as is; no
 * escaping is visible here. */
818 g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
821 g_string_append_c (sctx->buf, '>');
822 sami_context_push_state (sctx, SPAN_TAG);
/*
 * handle_p:
 * @sctx: SAMI context
 * @atts: NULL-terminated array of attribute name/value pairs
 *
 * Handles an opening 'p' tag: stores the value of its "class" attribute as
 * the language currently being parsed and, when that language differs
 * from the stored one, rewinds the timestamps to the time saved in time3.
 */
826 #ifdef SUBPARSE_MODIFICATION
828 handle_p (GstSamiContext * sctx, const gchar ** atts)
833 for (i = 0; (atts[i] != NULL); i += 2) {
834 const gchar *key, *value;
/* NOTE(review): this comparison comes before the "class" test below, so
 * it looks as if the value of any attribute could flag a language change;
 * confirm against the lines not shown. */
839 if (sctx->current_language && value && strcmp(sctx->current_language, value))
840 sctx->language_changed = TRUE;
842 else if (!sctx->current_language)
/* Fixed 128-byte buffer, allocated once and reused. NOTE(review): the
 * malloc() result is not checked on the visible lines. */
843 sctx->current_language = (gchar*) malloc (128);
845 if (key && !g_ascii_strcasecmp ("class", key) && value) {
/* NOTE(review): strcpy() is not bounded by the 128-byte buffer; the class
 * value comes from the subtitle file. */
846 strcpy (sctx->current_language, value);
848 if (sctx->language_changed)
850 sctx->time1 = sctx->time3;
851 sctx->time2 = sctx->time1;
852 sctx->time_set = FALSE;
853 sctx->language_changed = FALSE;
/*
 * handle_start_language_list:
 * @sctx: SAMI context
 * @atts: NULL-terminated array of key/value pairs
 *
 * Appends one GstLangStruct per pair to sctx->lang_list, storing copies of
 * the key and the value. The first key seen also becomes the desired
 * language when none has been chosen yet.
 */
862 handle_start_language_list (GstSamiContext * sctx, const gchar ** atts)
866 GstLangStruct *new = NULL;
867 GstLangStruct *temp = NULL;
/* i counts pairs, attrIndex walks the flat array two entries at a time. */
870 for (i = 0; (atts[attrIndex] != NULL); i++) {
871 const gchar *key, *value;
873 key = atts[attrIndex++];
874 value = atts[attrIndex++];
876 GST_LOG ("Inside handle_start_language_list key: %s, value: %s", key, value);
881 new = g_new0 (GstLangStruct, 1);
/* NOTE(review): strlen (value) runs before value is tested for NULL on the
 * next line; the same holds for key below. Confirm that an earlier guard
 * exists on the lines not shown. */
882 new->language_code = (gchar*) malloc (strlen(value) + 1);
883 if (new->language_code && value)
884 strcpy (new->language_code, value);
885 new->language_key = (gchar*) malloc (strlen(key) + 1);
886 if (new->language_key && key)
887 strcpy (new->language_key, key);
888 sctx->lang_list = g_list_append (sctx->lang_list, new);
/* NOTE(review): the lookup uses the pair index of this call, which only
 * matches the entry just appended when the list was empty on entry. */
889 temp = g_list_nth_data (sctx->lang_list, i);
890 if (sctx->desired_language == NULL && key){
/* NOTE(review): the malloc() result is used without a check. */
891 sctx->desired_language = (gchar*) malloc (strlen(key) + 1);
892 strcpy(sctx->desired_language, key);
896 GST_LOG ("Inside handle_start_language_list of glist key: %s, value: %s",
897 temp->language_key, temp->language_code);
/*
 * handle_start_element:
 * @ctx: HTML parser context (not used on the visible lines)
 * @name: element name, matched case-insensitively
 * @atts: array of attribute name/value pairs
 * @user_data: the GstSamiContext
 *
 * start_element callback of samiParser. It dispatches on the element name:
 * sync, font, ruby, br, rt, i, and with SUBPARSE_MODIFICATION also p and
 * the "!--P" pseudo element carrying the language list.
 */
904 handle_start_element (HtmlContext * ctx, const gchar * name,
905 const char **atts, gpointer user_data)
907 GstSamiContext *sctx = (GstSamiContext *) user_data;
909 GST_LOG ("name:%s", name);
911 if (!g_ascii_strcasecmp ("sync", name)) {
912 handle_start_sync (sctx, atts);
913 sctx->in_sync = TRUE;
914 } else if (!g_ascii_strcasecmp ("font", name)) {
915 handle_start_font (sctx, atts);
916 } else if (!g_ascii_strcasecmp ("ruby", name)) {
917 sami_context_push_state (sctx, RUBY_TAG);
918 } else if (!g_ascii_strcasecmp ("br", name)) {
919 #ifdef SUBPARSE_MODIFICATION
/* With the modification enabled, the line break is only emitted while the
 * language being parsed is the desired one. */
920 if (sctx->current_language && sctx->desired_language &&
921 !strcmp(sctx->current_language, sctx->desired_language))
923 g_string_append_c (sctx->buf, '\n');
924 /* FIXME: support for furigana/ruby once implemented in pango */
925 } else if (!g_ascii_strcasecmp ("rt", name)) {
/* Ruby text is collected in rubybuf, wrapped in a small span and in
 * italics when an italic tag is open. */
926 if (has_tag (sctx->state, ITALIC_TAG)) {
927 g_string_append (sctx->rubybuf, "<i>");
929 g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
930 sami_context_push_state (sctx, RT_TAG);
931 } else if (!g_ascii_strcasecmp ("i", name)) {
932 #ifdef SUBPARSE_MODIFICATION
/* Same language filter as for 'br' above. */
933 if (sctx->current_language && sctx->desired_language &&
934 !strcmp(sctx->current_language, sctx->desired_language))
936 g_string_append (sctx->buf, "<i>");
937 sami_context_push_state (sctx, ITALIC_TAG);
938 } else if (!g_ascii_strcasecmp ("p", name)) {
939 #ifdef SUBPARSE_MODIFICATION
940 handle_p (sctx, atts);
941 } else if (!g_ascii_strcasecmp ("!--P", name)) {
942 handle_start_language_list (sctx, atts);
/*
 * handle_end_element:
 * @ctx: HTML parser context (not used on the visible lines)
 * @name: element name, matched case-insensitively
 * @user_data: the GstSamiContext
 *
 * end_element callback of samiParser. Closing 'sync' leaves sync mode;
 * closing 'body' or 'sami' flushes the last collected content; closing
 * 'font', 'ruby' or 'i' pops the matching entry from the open-tag list.
 */
948 handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
950 GstSamiContext *sctx = (GstSamiContext *) user_data;
952 GST_LOG ("name:%s", name);
954 if (!g_ascii_strcasecmp ("sync", name)) {
955 sctx->in_sync = FALSE;
956 } else if ((!g_ascii_strcasecmp ("body", name)) ||
957 (!g_ascii_strcasecmp ("sami", name))) {
958 /* We will usually have one buffer left when the body is closed
959 * as we need the next sync to actually send it */
960 if (sctx->buf->len != 0) {
961 /* Only set a new start time if we don't have text pending */
962 if (sctx->resultbuf->len == 0)
963 sctx->time1 = sctx->time2;
/* No following sync exists, so the end time of this last result is left
 * undefined. */
965 sctx->time2 = GST_CLOCK_TIME_NONE;
966 g_string_append (sctx->resultbuf, sctx->buf->str);
967 sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
968 g_string_truncate (sctx->buf, 0);
970 } else if (!g_ascii_strcasecmp ("font", name)) {
971 sami_context_pop_state (sctx, SPAN_TAG);
972 } else if (!g_ascii_strcasecmp ("ruby", name)) {
973 sami_context_pop_state (sctx, RUBY_TAG);
974 } else if (!g_ascii_strcasecmp ("i", name)) {
975 sami_context_pop_state (sctx, ITALIC_TAG);
/*
 * handle_text:
 * @ctx: HTML parser context (not used on the visible lines)
 * @text: whitespace-stripped text found between tags
 * @text_len: length of @text (not used on the visible lines)
 * @user_data: the GstSamiContext
 *
 * text callback of samiParser. Text inside an open 'rt' tag goes to the
 * ruby buffer, padded with a space on each side; other text goes to the
 * content buffer. With SUBPARSE_MODIFICATION both paths are additionally
 * limited to the desired language.
 */
980 handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
983 GstSamiContext *sctx = (GstSamiContext *) user_data;
985 /* Skip everything except content of the sync elements */
988 #ifdef SUBPARSE_MODIFICATION
989 if (has_tag (sctx->state, RT_TAG) && (sctx->current_language && sctx->desired_language &&
990 !strcmp(sctx->current_language, sctx->desired_language))) {
992 if (has_tag (sctx->state, RT_TAG)) {
994 g_string_append_c (sctx->rubybuf, ' ');
995 g_string_append (sctx->rubybuf, text);
996 g_string_append_c (sctx->rubybuf, ' ');
998 #ifdef SUBPARSE_MODIFICATION
999 if (sctx->current_language && sctx->desired_language &&
1000 !strcmp(sctx->current_language, sctx->desired_language))
1002 g_string_append (sctx->buf, text);
/* Callback table that connects the generic HTML-like parser to the SAMI
 * handlers; it is passed to html_context_new() in sami_context_init(). */
1006 static HtmlParser samiParser = {
1007 handle_start_element, /* start_element */
1008 handle_end_element, /* end_element */
1009 handle_text, /* text */
/*
 * sami_context_init:
 * @state: parser state; its user_data must still be NULL
 *
 * Allocates a zeroed GstSamiContext with empty buffers and a fresh HTML
 * parser context, and stores it in state->user_data.
 */
1013 sami_context_init (ParserState * state)
1015 GstSamiContext *context;
1017 g_assert (state->user_data == NULL);
1019 context = g_new0 (GstSamiContext, 1);
/* The SAMI context itself is the user data handed to every callback. */
1021 context->htmlctxt = html_context_new (&samiParser, context);
1022 context->buf = g_string_new ("");
1023 context->rubybuf = g_string_new ("");
1024 context->resultbuf = g_string_new ("");
1025 context->state = g_string_new ("");
1026 #ifdef SUBPARSE_MODIFICATION
1027 context->current_language = NULL;
1028 context->desired_language = NULL;
1029 context->time_set = FALSE;
1030 context->lang_list = NULL;
1031 context->language_changed = FALSE;
1033 state->user_data = context;
/*
 * sami_context_deinit:
 * @state: parser state whose user_data holds the GstSamiContext
 *
 * Releases the HTML parser context, the four string buffers and, with
 * SUBPARSE_MODIFICATION, the language list entries and the current
 * language string, then clears state->user_data.
 */
1037 sami_context_deinit (ParserState * state)
1039 GstSamiContext *context = (GstSamiContext *) state->user_data;
1040 #ifdef SUBPARSE_MODIFICATION
1041 GstLangStruct *temp = NULL;
1045 html_context_free (context->htmlctxt);
1046 context->htmlctxt = NULL;
1047 g_string_free (context->buf, TRUE);
1048 g_string_free (context->rubybuf, TRUE);
1049 g_string_free (context->resultbuf, TRUE);
1050 g_string_free (context->state, TRUE);
1051 #ifdef SUBPARSE_MODIFICATION
1052 if (context->lang_list) {
/* Entries are fetched by position. NOTE(review): the advance of i and the
 * release of each entry struct are not visible here; confirm both. */
1053 while ((temp = g_list_nth_data (context->lang_list, i))) {
1054 if (temp->language_code)
1055 free (temp->language_code);
1056 temp->language_code = NULL;
1057 if (temp->language_key)
1058 free (temp->language_key);
1059 temp->language_key = NULL;
1063 g_list_free (context->lang_list);
1065 context->lang_list = NULL;
1067 if (context->current_language)
1068 free (context->current_language);
1069 context->current_language = NULL;
/* NOTE(review): desired_language is only reset, with no free() visible
 * here; parse_sami() also stores this pointer in state->current_language,
 * so confirm who owns and releases the string. */
1071 context->desired_language = NULL;
1074 state->user_data = NULL;
/*
 * sami_context_reset:
 * @state: parser state whose user_data holds the GstSamiContext
 *
 * Empties the content, ruby, result and open-tag buffers and clears the
 * has_result and in_sync flags, keeping the allocations for reuse.
 */
1079 sami_context_reset (ParserState * state)
1081 GstSamiContext *context = (GstSamiContext *) state->user_data;
1084 g_string_truncate (context->buf, 0);
1085 g_string_truncate (context->rubybuf, 0);
1086 g_string_truncate (context->resultbuf, 0);
1087 g_string_truncate (context->state, 0);
1088 context->has_result = FALSE;
1089 context->in_sync = FALSE;
/*
 * sami_context_change_language:
 * @state: parser state; state->current_language names the new language
 *
 * Replaces the desired language of the SAMI context by
 * state->current_language and marks the language as set by the user.
 */
1095 #ifdef SUBPARSE_MODIFICATION
1097 sami_context_change_language (ParserState * state)
1099 GstSamiContext *context = (GstSamiContext *) state->user_data;
1100 GST_LOG ("**********desired language was %s**************", context->desired_language);
/* NOTE(review): parse_sami() stores context->desired_language in
 * state->current_language, so both pointers can refer to the same string.
 * If the caller has not replaced state->current_language before calling,
 * the assignment below keeps a pointer to memory freed here; if the caller
 * already freed the old string, this free() is a second release. Confirm
 * the ownership rules with the caller. */
1101 free (context->desired_language);
/* The pointer is shared with the parser state, not copied. */
1102 context->desired_language = state->current_language;
1103 context->time_set = TRUE;
1104 GST_LOG ("desired language changed to %s", context->desired_language);
1109 parse_sami (ParserState * state, const gchar * line)
1112 GstSamiContext *context = (GstSamiContext *) state->user_data;
1114 gchar *unescaped = unescape_string (line);
1115 html_context_parse (context->htmlctxt, (gchar *) unescaped,
1116 strlen (unescaped));
1117 #ifdef SUBPARSE_MODIFICATION
1118 if (context->lang_list)
1119 state->language_list = context->lang_list;
1121 if (context->desired_language)
1122 state->current_language = context->desired_language;
1125 #ifdef SUBPARSE_MODIFICATION
1126 if (context->desired_language && context->current_language) {
1127 if (!strcmp(context->current_language, context->desired_language)) {
1129 if (context->has_result) {
1130 if (context->rubybuf->len) {
1131 context->rubybuf = g_string_append_c (context->rubybuf, '\n');
1132 g_string_prepend (context->resultbuf, context->rubybuf->str);
1133 context->rubybuf = g_string_truncate (context->rubybuf, 0);
1136 ret = g_string_free (context->resultbuf, FALSE);
1137 context->resultbuf = g_string_new ("");
1138 state->start_time = context->time1;
1139 state->duration = context->time2 - context->time1;
1140 context->has_result = FALSE;
1142 #ifdef SUBPARSE_MODIFICATION