From: Tim-Philipp Müller Date: Wed, 10 Jun 2009 13:37:36 +0000 (+0100) Subject: subparse: recognise more subrip timestamp variants X-Git-Tag: 1.19.3~511^2~9567 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=40bea96ff60c20207c2c54125e12730fa4f24958;p=platform%2Fupstream%2Fgstreamer.git subparse: recognise more subrip timestamp variants Be even less restrictive in what we accept for .srt timestamps when typefinding and parsing subrip subtitles and add a unit test for the 'new' format. Fixes #585197. --- diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c index 030ea94..6aa312c 100644 --- a/gst/subparse/gstsubparse.c +++ b/gst/subparse/gstsubparse.c @@ -730,11 +730,59 @@ subrip_fix_up_markup (gchar ** p_txt) } } +static gboolean +parse_subrip_time (const gchar * ts_string, GstClockTime * t) +{ + gchar s[128] = { '\0', }; + gchar *end, *p; + guint hour, min, sec, msec, len; + + while (*ts_string == ' ') + ++ts_string; + + g_strlcpy (s, ts_string, sizeof (s)); + if ((end = strstr (s, "-->"))) + *end = '\0'; + g_strchomp (s); + + /* ms may be in these formats: + * hh:mm:ss,500 = 500ms + * hh:mm:ss,  5 = 5ms + * hh:mm:ss, 5 = 50ms + * hh:mm:ss, 50 = 50ms + * hh:mm:ss,5 = 500ms + * and sscanf() doesn't differentiate between ' 5' and '5' so munge + * the white spaces within the timestamp to '0' (I'm sure there's a + * way to make sscanf() do this for us, but how?) 
+ */ + g_strdelimit (s, " ", '0'); + + /* make sure we have exactly three digits after the comma */ + p = strchr (s, ','); + g_assert (p != NULL); + ++p; + len = strlen (p); + if (len > 3) { + p[3] = '\0'; + } else + while (len < 3) { + g_strlcat (&p[len], "0", 2); + ++len; + } + + GST_LOG ("parsing timestamp '%s'", s); + if (sscanf (s, "%u:%u:%u,%u", &hour, &min, &sec, &msec) != 4) { + GST_WARNING ("failed to parse subrip timestamp string '%s'", s); + return FALSE; + } + + *t = ((hour * 3600) + (min * 60) + sec) * GST_SECOND + msec * GST_MSECOND; + return TRUE; +} + static gchar * parse_subrip (ParserState * state, const gchar * line) { - guint h1, m1, s1, ms1; - guint h2, m2, s2, ms2; int subnum; gchar *ret; @@ -745,21 +793,24 @@ parse_subrip (ParserState * state, const gchar * line) state->state = 1; return NULL; case 1: + { + GstClockTime ts_start, ts_end; + gchar *end_time; + /* looking for start_time --> end_time */ - if (sscanf (line, "%u:%u:%u,%u --> %u:%u:%u,%u", - &h1, &m1, &s1, &ms1, &h2, &m2, &s2, &ms2) == 8) { + if ((end_time = strstr (line, " --> ")) && + parse_subrip_time (line, &ts_start) && + parse_subrip_time (end_time + strlen (" --> "), &ts_end) && + state->start_time <= ts_end) { state->state = 2; - state->start_time = - (((guint64) h1) * 3600 + m1 * 60 + s1) * GST_SECOND + - ms1 * GST_MSECOND; - state->duration = - (((guint64) h2) * 3600 + m2 * 60 + s2) * GST_SECOND + - ms2 * GST_MSECOND - state->start_time; + state->start_time = ts_start; + state->duration = ts_end - ts_start; } else { - GST_DEBUG ("error parsing subrip time line"); + GST_DEBUG ("error parsing subrip time line '%s'", line); state->state = 0; } return NULL; + } case 2: { /* No need to parse that text if it's out of segment */ @@ -993,9 +1044,9 @@ gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype) } break; case GST_SUB_PARSE_REGEX_SUBRIP: - result = (gpointer) g_regex_new ("^([ 0-9]){0,3}[0-9](\x0d)?\x0a" - "[ 0-9][0-9]:[ 0-9][0-9]:[ 0-9][0-9],[ 
0-9]{2}[0-9]" - " --> ([ 0-9])?[0-9]:[ 0-9][0-9]:[ 0-9][0-9],[ 0-9]{2}[0-9]", + result = (gpointer) g_regex_new ("^([ 0-9]){0,3}[0-9]\\s*(\x0d)?\x0a" + "[ 0-9][0-9]:[ 0-9][0-9]:[ 0-9][0-9],[ 0-9]{0,2}[0-9]" + " +--> +([ 0-9])?[0-9]:[ 0-9][0-9]:[ 0-9][0-9],[ 0-9]{0,2}[0-9]", 0, 0, &gerr); if (result == NULL) { g_warning ("Compilation of subrip regex failed: %s", gerr->message); @@ -1083,7 +1134,7 @@ gst_sub_parse_format_autodetect (GstSubParse * self) gchar *data; GstSubParseFormat format; - if (strlen (self->textbuf->str) < 35) { + if (strlen (self->textbuf->str) < 30) { GST_DEBUG ("File too small to be a subtitles file"); return NULL; } diff --git a/tests/check/elements/subparse.c b/tests/check/elements/subparse.c index 56beab4..70ec1c6 100644 --- a/tests/check/elements/subparse.c +++ b/tests/check/elements/subparse.c @@ -139,6 +139,18 @@ static SubParseInputChunk srt_input2[] = { 0, 3 * GST_SECOND + 50 * GST_MSECOND, "Just testing."} }; +/* starts with chunk number 0 and has less than three digits after the comma + * and a few extra spaces before the arrow or at the end of the line */ +static SubParseInputChunk srt_input3[] = { + { + "0\n00:00:01,0 --> 00:00:02,0\nOne\n\n", + 1000 * GST_MSECOND, 2000 * GST_MSECOND, "One"}, { + "1\n00:00:02,5 --> 00:00:03,  5 \nTwo\n\n", + 2500 * GST_MSECOND, 3005 * GST_MSECOND, "Two"}, { + "2\n00:00:03, 9 --> 00:00:04,0 \nThree\n\n", + 3090 * GST_MSECOND, 4000 * GST_MSECOND, "Three"} +}; + static void setup_subparse (void) { @@ -247,6 +259,9 @@ GST_START_TEST (test_srt) /* try with UTF-8 BOM at the start */ test_srt_do_test (srt_input1, 0, G_N_ELEMENTS (srt_input2)); + + /* try with fewer than three post-comma digits, and some extra spaces */ + test_srt_do_test (srt_input3, 0, G_N_ELEMENTS (srt_input3)); } GST_END_TEST;