gst/subparse/gstsubparse.c

   1 /* GStreamer
   2  * Copyright (C) <1999> Erik Walthinsen <omega@cse.ogi.edu>
   3  * Copyright (C) 2004 Ronald S. Bultje <rbultje@ronald.bitfreak.net>
   4  * Copyright (C) 2006 Tim-Philipp Müller <tim centricular net>
   5  * Copyright (C) 2016 Philippe Normand <pnormand@igalia.com>
   6  * Copyright (C) 2016 Jan Schmidt <jan@centricular.com>
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Library General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Library General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Library General Public
  19  * License along with this library; if not, write to the
  20  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  21  * Boston, MA 02110-1301, USA.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include "config.h"
  26 #endif
  27
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31 #include <sys/types.h>
  32 #include <glib.h>
  33
  34 #include "gstsubparse.h"
  35 #include "gstssaparse.h"
  36 #include "samiparse.h"
  37 #include "tmplayerparse.h"
  38 #include "mpl2parse.h"
  39 #include "qttextparse.h"
  40
     /* Defines the sub_parse_debug debug category. */
  41 GST_DEBUG_CATEGORY (sub_parse_debug);
  42
  43 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
     /* Convenience wrappers around the GMutex stored in the element's
      * seek_lock member; `elem` must be a pointer to the element. */
  44 #define SUBPARSE_SEEK_GET_LOCK(elem)    (&elem->seek_lock)
  45 #define SUBPARSE_SEEK_LOCK(elem)        g_mutex_lock(SUBPARSE_SEEK_GET_LOCK(elem))
  46 #define SUBPARSE_SEEK_TRYLOCK(elem)     g_mutex_trylock(SUBPARSE_SEEK_GET_LOCK(elem))
  47 #define SUBPARSE_SEEK_UNLOCK(elem)      g_mutex_unlock(SUBPARSE_SEEK_GET_LOCK(elem))
  48 #endif
     /* Default value of the "subtitle-encoding" property. */
  49 #define DEFAULT_ENCODING   NULL
     /* Regex fragment matching the optional attribute part of a markup tag;
      * it is spliced into the tag patterns built by the subrip_* helpers. */
  50 #define ATTRIBUTE_REGEX "\\s?[a-zA-Z0-9\\. \t\\(\\)]*"
     /* NULL-terminated tag whitelists; presumably selected per subtitle format
      * by the SubRip and WebVTT parsers - confirm against their callers. */
  51 static const gchar *allowed_srt_tags[] = { "i", "b", "u", NULL };
  52 static const gchar *allowed_vtt_tags[] =
  53     { "i", "b", "c", "u", "v", "ruby", "rt", NULL };
  54
  55 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
     /* Default value of the "current-language" property. */
  56 #define DEFAULT_CURRENT_LANGUAGE   NULL
  57 #endif
  58 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
     /* Default value of the "drop-out-of-segment" property. */
  59 #define DEFAULT_DROP_OUT_OF_SEGMENT TRUE
  60 #endif
     /* GObject property identifiers, used when installing the properties in
      * class_init and when dispatching in set_property/get_property. */
  61 enum
  62 {
  63   PROP_0,
  64   PROP_ENCODING,
  65   PROP_VIDEOFPS,
  66 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
  67   PROP_EXTSUB_CURRENT_LANGUAGE,
  68 #endif
  69 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
  70   PROP_DROP_OUT_OF_SEGMENT,
  71 #endif
  72 };
  73 #ifdef TIZEN_FEATURE_HLS_WEBVTT
     /* Scales t by 100000/9 (= 10^9 / 90000). NOTE(review): presumably
      * converts 90 kHz MPEG clock ticks to nanoseconds - confirm against the
      * users of this macro. */
  74 #define MPEGTIME_TO_GSTTIME(t) ((t) * (guint64)100000 / 9)
  75 #endif
  76
     /* GObject property accessors (defined further down in this file). */
  77 static void
  78 gst_sub_parse_set_property (GObject * object, guint prop_id,
  79     const GValue * value, GParamSpec * pspec);
  80 static void
  81 gst_sub_parse_get_property (GObject * object, guint prop_id,
  82     GValue * value, GParamSpec * pspec);
  83
  84
     /* Always-present sink pad: accepts the subtitle media types listed in
      * the caps string below. */
  85 static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink",
  86     GST_PAD_SINK,
  87     GST_PAD_ALWAYS,
  88     GST_STATIC_CAPS ("application/x-subtitle; application/x-subtitle-sami; "
  89         "application/x-subtitle-tmplayer; application/x-subtitle-mpl2; "
  90         "application/x-subtitle-dks; application/x-subtitle-qttext;"
  91         "application/x-subtitle-lrc; application/x-subtitle-vtt")
  92     );
  93
     /* Always-present source pad: text/x-raw in either pango-markup or utf8
      * format. */
  94 static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src",
  95     GST_PAD_SRC,
  96     GST_PAD_ALWAYS,
  97     GST_STATIC_CAPS ("text/x-raw, format= { pango-markup, utf8 }")
  98     );
  99
 100
     /* Pad event/query handlers, installed on the pads in gst_sub_parse_init(). */
 101 static gboolean gst_sub_parse_src_event (GstPad * pad, GstObject * parent,
 102     GstEvent * event);
 103 static gboolean gst_sub_parse_src_query (GstPad * pad, GstObject * parent,
 104     GstQuery * query);
 105 static gboolean gst_sub_parse_sink_event (GstPad * pad, GstObject * parent,
 106     GstEvent * event);
 107
     /* GstElement state-change vfunc, installed in gst_sub_parse_class_init(). */
 108 static GstStateChangeReturn gst_sub_parse_change_state (GstElement * element,
 109     GstStateChange transition);
 110
     /* Sink pad chain function, installed in gst_sub_parse_init(). */
 111 static GstFlowReturn gst_sub_parse_chain (GstPad * sinkpad, GstObject * parent,
 112     GstBuffer * buf);
 113 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 114 static gboolean gst_sub_parse_check_byte_seekability (GstSubParse * subparse);
 115 #endif
     /* G_DEFINE_TYPE generates gst_sub_parse_parent_class; the alias lets the
      * file refer to it as plain parent_class. */
 116 #define gst_sub_parse_parent_class parent_class
 117 G_DEFINE_TYPE (GstSubParse, gst_sub_parse, GST_TYPE_ELEMENT);
 118
 119 static void
 120 gst_sub_parse_dispose (GObject * object)
 121 {
 122   GstSubParse *subparse = GST_SUBPARSE (object);
 123
 124   GST_DEBUG_OBJECT (subparse, "cleaning up subtitle parser");
 125
 126   if (subparse->encoding) {
 127     g_free (subparse->encoding);
 128     subparse->encoding = NULL;
 129   }
 130
 131   if (subparse->detected_encoding) {
 132     g_free (subparse->detected_encoding);
 133     subparse->detected_encoding = NULL;
 134   }
 135
 136   if (subparse->adapter) {
 137     g_object_unref (subparse->adapter);
 138     subparse->adapter = NULL;
 139   }
 140
 141   if (subparse->textbuf) {
 142     g_string_free (subparse->textbuf, TRUE);
 143     subparse->textbuf = NULL;
 144   }
 145 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 146   g_free (subparse->state.current_language);
 147   subparse->state.current_language = NULL;
 148
 149   g_mutex_clear (&subparse->seek_lock);
 150 #endif
 151   GST_CALL_PARENT (G_OBJECT_CLASS, dispose, (object));
 152 }
 153
 154 static void
 155 gst_sub_parse_class_init (GstSubParseClass * klass)
 156 {
 157   GObjectClass *object_class = G_OBJECT_CLASS (klass);
 158   GstElementClass *element_class = GST_ELEMENT_CLASS (klass);
 159
 160   object_class->dispose = gst_sub_parse_dispose;
 161   object_class->set_property = gst_sub_parse_set_property;
 162   object_class->get_property = gst_sub_parse_get_property;
 163
 164   gst_element_class_add_static_pad_template (element_class, &sink_templ);
 165   gst_element_class_add_static_pad_template (element_class, &src_templ);
 166   gst_element_class_set_static_metadata (element_class,
 167       "Subtitle parser", "Codec/Parser/Subtitle",
 168       "Parses subtitle (.sub) files into text streams",
 169       "Gustavo J. A. M. Carneiro <gjc@inescporto.pt>, "
 170       "GStreamer maintainers <gstreamer-devel@lists.freedesktop.org>");
 171
 172   element_class->change_state = gst_sub_parse_change_state;
 173
 174   g_object_class_install_property (object_class, PROP_ENCODING,
 175       g_param_spec_string ("subtitle-encoding", "subtitle charset encoding",
 176           "Encoding to assume if input subtitles are not in UTF-8 or any other "
 177           "Unicode encoding. If not set, the GST_SUBTITLE_ENCODING environment "
 178           "variable will be checked for an encoding to use. If that is not set "
 179           "either, ISO-8859-15 will be assumed.", DEFAULT_ENCODING,
 180           G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
 181
 182   g_object_class_install_property (object_class, PROP_VIDEOFPS,
 183       gst_param_spec_fraction ("video-fps", "Video framerate",
 184           "Framerate of the video stream. This is needed by some subtitle "
 185           "formats to synchronize subtitles and video properly. If not set "
 186           "and the subtitle format requires it subtitles may be out of sync.",
 187           0, 1, G_MAXINT, 1, 24000, 1001,
 188           G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
 189 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 190   g_object_class_install_property (object_class, PROP_EXTSUB_CURRENT_LANGUAGE,
 191       g_param_spec_string ("current-language", "Current language",
 192           "Current language of the subtitle in external subtitle case.",
 193           DEFAULT_CURRENT_LANGUAGE,
 194           G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
 195 #endif
 196
 197 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
 198   g_object_class_install_property (object_class, PROP_DROP_OUT_OF_SEGMENT,
 199       g_param_spec_boolean ("drop-out-of-segment",
 200           "Drop out-of-segment buffers",
 201           "Drop and don't send out-of-segment buffers",
 202           DEFAULT_DROP_OUT_OF_SEGMENT,
 203           G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
 204 #endif
 205 }
 206
 207 static void
 208 gst_sub_parse_init (GstSubParse * subparse)
 209 {
 210   subparse->sinkpad = gst_pad_new_from_static_template (&sink_templ, "sink");
 211   gst_pad_set_chain_function (subparse->sinkpad,
 212       GST_DEBUG_FUNCPTR (gst_sub_parse_chain));
 213   gst_pad_set_event_function (subparse->sinkpad,
 214       GST_DEBUG_FUNCPTR (gst_sub_parse_sink_event));
 215   gst_element_add_pad (GST_ELEMENT (subparse), subparse->sinkpad);
 216
 217   subparse->srcpad = gst_pad_new_from_static_template (&src_templ, "src");
 218   gst_pad_set_event_function (subparse->srcpad,
 219       GST_DEBUG_FUNCPTR (gst_sub_parse_src_event));
 220   gst_pad_set_query_function (subparse->srcpad,
 221       GST_DEBUG_FUNCPTR (gst_sub_parse_src_query));
 222   gst_element_add_pad (GST_ELEMENT (subparse), subparse->srcpad);
 223
 224   subparse->textbuf = g_string_new (NULL);
 225   subparse->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
 226 #ifdef TIZEN_FEATURE_UPSTREAM
 227   subparse->strip_pango_markup = FALSE;
 228 #endif
 229   subparse->flushing = FALSE;
 230   gst_segment_init (&subparse->segment, GST_FORMAT_TIME);
 231   subparse->need_segment = TRUE;
 232   subparse->encoding = g_strdup (DEFAULT_ENCODING);
 233   subparse->detected_encoding = NULL;
 234   subparse->adapter = gst_adapter_new ();
 235
 236   subparse->fps_n = 24000;
 237   subparse->fps_d = 1001;
 238 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 239   subparse->state.language_list = NULL;
 240   subparse->state.current_language = NULL;
 241   subparse->state.langlist_msg_posted = FALSE;
 242   g_mutex_init (&subparse->seek_lock);
 243 #endif
 244 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
 245   subparse->state.drop_out_of_segment = DEFAULT_DROP_OUT_OF_SEGMENT;
 246 #endif
 247 }
 248
 249 /*
 250  * Source pad functions.
 251  */
 252
 253 static gboolean
 254 gst_sub_parse_src_query (GstPad * pad, GstObject * parent, GstQuery * query)
 255 {
 256   GstSubParse *self = GST_SUBPARSE (parent);
 257   gboolean ret = FALSE;
 258
 259   GST_DEBUG ("Handling %s query", GST_QUERY_TYPE_NAME (query));
 260
 261   switch (GST_QUERY_TYPE (query)) {
 262     case GST_QUERY_POSITION:{
 263       GstFormat fmt;
 264
 265       gst_query_parse_position (query, &fmt, NULL);
 266       if (fmt != GST_FORMAT_TIME) {
 267         ret = gst_pad_peer_query (self->sinkpad, query);
 268       } else {
 269         ret = TRUE;
 270         gst_query_set_position (query, GST_FORMAT_TIME, self->segment.position);
 271       }
 272       break;
 273     }
 274     case GST_QUERY_SEEKING:
 275     {
 276       GstFormat fmt;
 277       gboolean seekable = FALSE;
 278
 279       ret = TRUE;
 280
 281       gst_query_parse_seeking (query, &fmt, NULL, NULL, NULL);
 282       if (fmt == GST_FORMAT_TIME) {
 283         GstQuery *peerquery = gst_query_new_seeking (GST_FORMAT_BYTES);
 284
 285         seekable = gst_pad_peer_query (self->sinkpad, peerquery);
 286         if (seekable)
 287           gst_query_parse_seeking (peerquery, NULL, &seekable, NULL, NULL);
 288         gst_query_unref (peerquery);
 289       }
 290
 291       gst_query_set_seeking (query, fmt, seekable, seekable ? 0 : -1, -1);
 292       break;
 293     }
 294     default:
 295       ret = gst_pad_query_default (pad, parent, query);
 296       break;
 297   }
 298
 299   return ret;
 300 }
 301
 302 static gboolean
 303 gst_sub_parse_src_event (GstPad * pad, GstObject * parent, GstEvent * event)
 304 {
 305   GstSubParse *self = GST_SUBPARSE (parent);
 306   gboolean ret = FALSE;
 307
 308   GST_DEBUG ("Handling %s event", GST_EVENT_TYPE_NAME (event));
 309
 310   switch (GST_EVENT_TYPE (event)) {
 311     case GST_EVENT_SEEK:
 312     {
 313       GstFormat format;
 314       GstSeekFlags flags;
 315       GstSeekType start_type, stop_type;
 316       gint64 start, stop;
 317       gdouble rate;
 318       gboolean update;
 319
 320       gst_event_parse_seek (event, &rate, &format, &flags,
 321           &start_type, &start, &stop_type, &stop);
 322
 323       if (format != GST_FORMAT_TIME) {
 324         GST_WARNING_OBJECT (self, "we only support seeking in TIME format");
 325         gst_event_unref (event);
 326         goto beach;
 327       }
 328 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 329       if (!gst_sub_parse_check_byte_seekability (self)) {
 330         ret = gst_pad_event_default (pad, parent, event);
 331         break;
 332       }
 333
 334       SUBPARSE_SEEK_LOCK (self);
 335 #endif
 336       /* Convert that seek to a seeking in bytes at position 0,
 337          FIXME: could use an index */
 338       ret = gst_pad_push_event (self->sinkpad,
 339           gst_event_new_seek (rate, GST_FORMAT_BYTES, flags,
 340               GST_SEEK_TYPE_SET, 0, GST_SEEK_TYPE_NONE, 0));
 341
 342       if (ret) {
 343         /* Apply the seek to our segment */
 344         gst_segment_do_seek (&self->segment, rate, format, flags,
 345             start_type, start, stop_type, stop, &update);
 346
 347         GST_DEBUG_OBJECT (self, "segment after seek: %" GST_SEGMENT_FORMAT,
 348             &self->segment);
 349
 350         /* will mark need_segment when receiving segment from upstream,
 351          * after FLUSH and all that has happened,
 352          * rather than racing with chain */
 353       } else {
 354         GST_WARNING_OBJECT (self, "seek to 0 bytes failed");
 355       }
 356
 357 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 358       SUBPARSE_SEEK_UNLOCK (self);
 359 #endif
 360
 361       gst_event_unref (event);
 362       break;
 363     }
 364     default:
 365       ret = gst_pad_event_default (pad, parent, event);
 366       break;
 367   }
 368
 369 beach:
 370   return ret;
 371 }
 372
 373 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 374 static gboolean
 375 gst_sub_parse_check_byte_seekability (GstSubParse * subparse)
 376 {
 377   GstQuery *query;
 378   gboolean seekable = FALSE;
 379
 380   query = gst_query_new_seeking (GST_FORMAT_BYTES);
 381
 382   if (gst_pad_peer_query (subparse->sinkpad, query)) {
 383     gst_query_parse_seeking (query, NULL, &seekable, NULL, NULL);
 384   } else {
 385     GST_DEBUG_OBJECT (subparse, "seeking query failed");
 386   }
 387
 388   gst_query_unref (query);
 389
 390   GST_INFO_OBJECT (subparse, "byte seekable: %d", seekable);
 391
 392   return seekable;
 393 }
 394 #endif
 395
 396 static void
 397 gst_sub_parse_set_property (GObject * object, guint prop_id,
 398     const GValue * value, GParamSpec * pspec)
 399 {
 400   GstSubParse *subparse = GST_SUBPARSE (object);
 401
 402   GST_OBJECT_LOCK (subparse);
 403   switch (prop_id) {
 404     case PROP_ENCODING:
 405       g_free (subparse->encoding);
 406       subparse->encoding = g_value_dup_string (value);
 407       GST_LOG_OBJECT (object, "subtitle encoding set to %s",
 408           GST_STR_NULL (subparse->encoding));
 409       break;
 410     case PROP_VIDEOFPS:
 411     {
 412       subparse->fps_n = gst_value_get_fraction_numerator (value);
 413       subparse->fps_d = gst_value_get_fraction_denominator (value);
 414       GST_DEBUG_OBJECT (object, "video framerate set to %d/%d", subparse->fps_n,
 415           subparse->fps_d);
 416
 417       if (!subparse->state.have_internal_fps) {
 418         subparse->state.fps_n = subparse->fps_n;
 419         subparse->state.fps_d = subparse->fps_d;
 420       }
 421       break;
 422     }
 423 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 424     case PROP_EXTSUB_CURRENT_LANGUAGE:
 425       g_free (subparse->state.current_language);
 426       subparse->state.current_language = g_value_dup_string (value);
 427       GST_LOG_OBJECT (subparse, "subtitle current language set to %s",
 428           GST_STR_NULL (subparse->state.current_language));
 429       sami_context_change_language (&subparse->state);
 430       break;
 431 #endif
 432 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
 433     case PROP_DROP_OUT_OF_SEGMENT:
 434       subparse->state.drop_out_of_segment = g_value_get_boolean (value);
 435       GST_DEBUG_OBJECT (object, "Drop out of segment set to %d",
 436           subparse->state.drop_out_of_segment);
 437       break;
 438 #endif
 439     default:
 440       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
 441       break;
 442   }
 443   GST_OBJECT_UNLOCK (subparse);
 444 }
 445
 446 static void
 447 gst_sub_parse_get_property (GObject * object, guint prop_id,
 448     GValue * value, GParamSpec * pspec)
 449 {
 450   GstSubParse *subparse = GST_SUBPARSE (object);
 451
 452   GST_OBJECT_LOCK (subparse);
 453   switch (prop_id) {
 454     case PROP_ENCODING:
 455       g_value_set_string (value, subparse->encoding);
 456       break;
 457     case PROP_VIDEOFPS:
 458       gst_value_set_fraction (value, subparse->fps_n, subparse->fps_d);
 459       break;
 460 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 461     case PROP_EXTSUB_CURRENT_LANGUAGE:
 462       g_value_set_string (value, subparse->state.current_language);
 463       break;
 464 #endif
 465 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
 466     case PROP_DROP_OUT_OF_SEGMENT:
 467       g_value_set_boolean (value, subparse->state.drop_out_of_segment);
 468       break;
 469 #endif
 470     default:
 471       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
 472       break;
 473   }
 474   GST_OBJECT_UNLOCK (subparse);
 475 }
 476
 477 static const gchar *
 478 gst_sub_parse_get_format_description (GstSubParseFormat format)
 479 {
 480   switch (format) {
 481     case GST_SUB_PARSE_FORMAT_MDVDSUB:
 482       return "MicroDVD";
 483     case GST_SUB_PARSE_FORMAT_SUBRIP:
 484       return "SubRip";
 485     case GST_SUB_PARSE_FORMAT_MPSUB:
 486       return "MPSub";
 487     case GST_SUB_PARSE_FORMAT_SAMI:
 488       return "SAMI";
 489     case GST_SUB_PARSE_FORMAT_TMPLAYER:
 490       return "TMPlayer";
 491     case GST_SUB_PARSE_FORMAT_MPL2:
 492       return "MPL2";
 493     case GST_SUB_PARSE_FORMAT_SUBVIEWER:
 494       return "SubViewer";
 495     case GST_SUB_PARSE_FORMAT_DKS:
 496       return "DKS";
 497     case GST_SUB_PARSE_FORMAT_VTT:
 498       return "WebVTT";
 499     case GST_SUB_PARSE_FORMAT_QTTEXT:
 500       return "QTtext";
 501     case GST_SUB_PARSE_FORMAT_LRC:
 502       return "LRC";
 503     default:
 504     case GST_SUB_PARSE_FORMAT_UNKNOWN:
 505       break;
 506   }
 507   return NULL;
 508 }
 509
 510 static gchar *
 511 gst_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding,
 512     gsize * consumed, GError ** err)
 513 {
 514   gchar *ret = NULL;
 515
 516   *consumed = 0;
 517   /* The char cast is necessary in glib < 2.24 */
 518   ret =
 519       g_convert_with_fallback (str, len, "UTF-8", encoding, (char *) "*",
 520       consumed, NULL, err);
 521   if (ret == NULL)
 522     return ret;
 523
 524   /* + 3 to skip UTF-8 BOM if it was added */
 525   len = strlen (ret);
 526   if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB
 527       && (guint8) ret[2] == 0xBF)
 528     memmove (ret, ret + 3, len + 1 - 3);
 529
 530   return ret;
 531 }
 532
 533 static gchar *
 534 detect_encoding (const gchar * str, gsize len)
 535 {
 536   if (len >= 3 && (guint8) str[0] == 0xEF && (guint8) str[1] == 0xBB
 537       && (guint8) str[2] == 0xBF)
 538     return g_strdup ("UTF-8");
 539
 540   if (len >= 2 && (guint8) str[0] == 0xFE && (guint8) str[1] == 0xFF)
 541     return g_strdup ("UTF-16BE");
 542
 543   if (len >= 2 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE)
 544     return g_strdup ("UTF-16LE");
 545
 546   if (len >= 4 && (guint8) str[0] == 0x00 && (guint8) str[1] == 0x00
 547       && (guint8) str[2] == 0xFE && (guint8) str[3] == 0xFF)
 548     return g_strdup ("UTF-32BE");
 549
 550   if (len >= 4 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE
 551       && (guint8) str[2] == 0x00 && (guint8) str[3] == 0x00)
 552     return g_strdup ("UTF-32LE");
 553
 554   return NULL;
 555 }
 556
 557 static gchar *
 558 convert_encoding (GstSubParse * self, const gchar * str, gsize len,
 559     gsize * consumed)
 560 {
 561   const gchar *encoding;
 562   GError *err = NULL;
 563   gchar *ret = NULL;
 564
 565   *consumed = 0;
 566
 567   /* First try any detected encoding */
 568   if (self->detected_encoding) {
 569     ret =
 570         gst_convert_to_utf8 (str, len, self->detected_encoding, consumed, &err);
 571
 572     if (!err)
 573       return ret;
 574
 575     GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
 576         self->detected_encoding, err->message);
 577     g_free (self->detected_encoding);
 578     self->detected_encoding = NULL;
 579     g_clear_error (&err);
 580   }
 581
 582   /* Otherwise check if it's UTF8 */
 583   if (self->valid_utf8) {
 584     if (g_utf8_validate (str, len, NULL)) {
 585       GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
 586       *consumed = len;
 587       return g_strndup (str, len);
 588     }
 589     GST_INFO_OBJECT (self, "invalid UTF-8!");
 590     self->valid_utf8 = FALSE;
 591   }
 592
 593   /* Else try fallback */
 594   encoding = self->encoding;
 595   if (encoding == NULL || *encoding == '\0') {
 596     encoding = g_getenv ("GST_SUBTITLE_ENCODING");
 597   }
 598   if (encoding == NULL || *encoding == '\0') {
 599     /* if local encoding is UTF-8 and no encoding specified
 600      * via the environment variable, assume ISO-8859-15 */
 601     if (g_get_charset (&encoding)) {
 602       encoding = "ISO-8859-15";
 603     }
 604   }
 605
 606   ret = gst_convert_to_utf8 (str, len, encoding, consumed, &err);
 607
 608   if (err) {
 609     GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
 610         encoding, err->message);
 611     g_clear_error (&err);
 612
 613 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 614     if (!g_strcmp0 (self->encoding, "EUC-KR")) {
 615       GST_LOG_OBJECT (self, "use CP949 as fallback");
 616       g_free (self->encoding);
 617       self->encoding = g_strdup ("CP949");
 618       encoding = self->encoding;
 619       ret = gst_convert_to_utf8 (str, len, encoding, consumed, &err);
 620     } else {
 621 #endif
 622       /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
 623       GST_LOG_OBJECT (self, "use ISO-8859-15 as fallback");
 624       ret = gst_convert_to_utf8 (str, len, "ISO-8859-15", consumed, NULL);
 625 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
 626     }
 627 #endif
 628   }
 629
 630   GST_LOG_OBJECT (self,
 631       "successfully converted %" G_GSIZE_FORMAT " characters from %s to UTF-8",
 632       len, encoding);
 633
 634   return ret;
 635 }
 636
 637 static gchar *
 638 get_next_line (GstSubParse * self)
 639 {
 640   char *line = NULL;
 641   const char *line_end;
 642   int line_len;
 643   gboolean have_r = FALSE;
 644
 645   line_end = strchr (self->textbuf->str, '\n');
 646
 647   if (!line_end) {
 648     /* end-of-line not found; return for more data */
 649     return NULL;
 650   }
 651
 652   /* get rid of '\r' */
 653   if (line_end != self->textbuf->str && *(line_end - 1) == '\r') {
 654     line_end--;
 655     have_r = TRUE;
 656   }
 657
 658   line_len = line_end - self->textbuf->str;
 659   line = g_strndup (self->textbuf->str, line_len);
 660   self->textbuf = g_string_erase (self->textbuf, 0,
 661       line_len + (have_r ? 2 : 1));
 662   return line;
 663 }
 664
 665 static gchar *
 666 parse_mdvdsub (ParserState * state, const gchar * line)
 667 {
 668   const gchar *line_split;
 669   gchar *line_chunk;
 670   guint start_frame, end_frame;
 671   guint64 clip_start = 0, clip_stop = 0;
 672   gboolean in_seg = FALSE;
 673   GString *markup;
 674   gchar *ret;
 675
 676   /* style variables */
 677   gboolean italic;
 678   gboolean bold;
 679   guint fontsize;
 680   gdouble fps = 0.0;
 681
 682   if (sscanf (line, "{%u}{%u}", &start_frame, &end_frame) != 2) {
 683     g_warning ("Parse of the following line, assumed to be in microdvd .sub"
 684         " format, failed:\n%s", line);
 685     return NULL;
 686   }
 687
 688   /* skip the {%u}{%u} part */
 689   line = strchr (line, '}') + 1;
 690   line = strchr (line, '}') + 1;
 691
 692   /* see if there's a first line with a framerate */
 693   if (start_frame == 1 && end_frame == 1) {
 694     gchar *rest, *end = NULL;
 695
 696     rest = g_strdup (line);
 697     g_strdelimit (rest, ",", '.');
 698     fps = g_ascii_strtod (rest, &end);
 699     if (end != rest) {
 700       gst_util_double_to_fraction (fps, &state->fps_n, &state->fps_d);
 701       GST_INFO ("framerate from file: %d/%d ('%s')", state->fps_n,
 702           state->fps_d, rest);
 703     }
 704     g_free (rest);
 705     return NULL;
 706   }
 707
 708   state->start_time =
 709       gst_util_uint64_scale (start_frame, GST_SECOND * state->fps_d,
 710       state->fps_n);
 711   state->duration =
 712       gst_util_uint64_scale (end_frame - start_frame, GST_SECOND * state->fps_d,
 713       state->fps_n);
 714
 715   /* Check our segment start/stop */
 716 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
 717   if (state->drop_out_of_segment) {
 718 #endif
 719   in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
 720       state->start_time, state->start_time + state->duration, &clip_start,
 721       &clip_stop);
 722
 723   /* No need to parse that text if it's out of segment */
 724   if (in_seg) {
 725     state->start_time = clip_start;
 726     state->duration = clip_stop - clip_start;
 727   } else {
 728     return NULL;
 729   }
 730 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
 731   }
 732 #endif
 733
 734   markup = g_string_new (NULL);
 735   while (1) {
 736     italic = FALSE;
 737     bold = FALSE;
 738     fontsize = 0;
 739     /* parse style markup */
 740     if (strncmp (line, "{y:i}", 5) == 0) {
 741       italic = TRUE;
 742       line = strchr (line, '}') + 1;
 743     }
 744     if (strncmp (line, "{y:b}", 5) == 0) {
 745       bold = TRUE;
 746       line = strchr (line, '}') + 1;
 747     }
 748     if (sscanf (line, "{s:%u}", &fontsize) == 1) {
 749       line = strchr (line, '}') + 1;
 750     }
 751     /* forward slashes at beginning/end signify italics too */
 752     if (g_str_has_prefix (line, "/")) {
 753       italic = TRUE;
 754       ++line;
 755     }
 756     if ((line_split = strchr (line, '|')))
 757       line_chunk = g_markup_escape_text (line, line_split - line);
 758     else
 759       line_chunk = g_markup_escape_text (line, strlen (line));
 760
 761     /* Remove italics markers at end of line/stanza (CHECKME: are end slashes
 762      * always at the end of a line or can they span multiple lines?) */
 763     if (g_str_has_suffix (line_chunk, "/")) {
 764       line_chunk[strlen (line_chunk) - 1] = '\0';
 765     }
 766
 767     markup = g_string_append (markup, "<span");
 768     if (italic)
 769       g_string_append (markup, " style=\"italic\"");
 770     if (bold)
 771       g_string_append (markup, " weight=\"bold\"");
 772     if (fontsize)
 773       g_string_append_printf (markup, " size=\"%u\"", fontsize * 1000);
 774     g_string_append_printf (markup, ">%s</span>", line_chunk);
 775     g_free (line_chunk);
 776     if (line_split) {
 777       g_string_append (markup, "\n");
 778       line = line_split + 1;
 779     } else {
 780       break;
 781     }
 782   }
 783   ret = markup->str;
 784   g_string_free (markup, FALSE);
 785   GST_DEBUG ("parse_mdvdsub returning (%f+%f): %s",
 786       state->start_time / (double) GST_SECOND,
 787       state->duration / (double) GST_SECOND, ret);
 788   return ret;
 789 }
 790
 791 static void
 792 strip_trailing_newlines (gchar * txt)
 793 {
 794   if (txt) {
 795     guint len;
 796
 797     len = strlen (txt);
 798     while (len > 1 && txt[len - 1] == '\n') {
 799       txt[len - 1] = '\0';
 800       --len;
 801     }
 802   }
 803 }
 804
 805 /* we want to escape text in general, but retain basic markup like
 806  * <i></i>, <u></u>, and <b></b>. The easiest and safest way is to
 807  * just unescape a white list of allowed markups again after
 808  * escaping everything (the text between these simple markers isn't
 809  * necessarily escaped, so it seems best to do it like this) */
 810 static void
 811 subrip_unescape_formatting (gchar * txt, gconstpointer allowed_tags_ptr,
 812     gboolean allows_tag_attributes)
 813 {
 814   gchar *res;
 815   GRegex *tag_regex;
 816   gchar *allowed_tags_pattern, *search_pattern;
 817   const gchar *replace_pattern;
 818
 819   /* No processing needed if no escaped tag marker found in the string. */
 820   if (strstr (txt, "&lt;") == NULL)
 821     return;
 822
 823   /* Build a list of alternates for our regexp.
 824    * FIXME: Could be built once and stored */
 825   allowed_tags_pattern = g_strjoinv ("|", (gchar **) allowed_tags_ptr);
 826   /* Look for starting/ending escaped tags with optional attributes. */
 827   search_pattern = g_strdup_printf ("&lt;(/)?\\ *(%s)(%s)&gt;",
 828       allowed_tags_pattern, ATTRIBUTE_REGEX);
 829   /* And unescape appropriately */
 830   if (allows_tag_attributes) {
 831     replace_pattern = "<\\1\\2\\3>";
 832   } else {
 833     replace_pattern = "<\\1\\2>";
 834   }
 835
 836   tag_regex = g_regex_new (search_pattern, 0, 0, NULL);
 837   res = g_regex_replace (tag_regex, txt, strlen (txt), 0,
 838       replace_pattern, 0, NULL);
 839
 840   /* res will always be shorter than the input or identical, so this
 841    * copy is OK */
 842   strcpy (txt, res);
 843
 844   g_free (res);
 845   g_free (search_pattern);
 846   g_free (allowed_tags_pattern);
 847
 848   g_regex_unref (tag_regex);
 849 }
 850
 851
 852 static gboolean
 853 subrip_remove_unhandled_tag (gchar * start, gchar * stop)
 854 {
 855   gchar *tag, saved;
 856
 857   tag = start + strlen ("&lt;");
 858   if (*tag == '/')
 859     ++tag;
 860
 861   if (g_ascii_tolower (*tag) < 'a' || g_ascii_tolower (*tag) > 'z')
 862     return FALSE;
 863
 864   saved = *stop;
 865   *stop = '\0';
 866   GST_LOG ("removing unhandled tag '%s'", start);
 867   *stop = saved;
 868   memmove (start, stop, strlen (stop) + 1);
 869   return TRUE;
 870 }
 871
 872 /* remove tags we haven't explicitly allowed earlier on, like font tags
 873  * for example */
 874 static void
 875 subrip_remove_unhandled_tags (gchar * txt)
 876 {
 877   gchar *pos, *gt;
 878
 879   for (pos = txt; pos != NULL && *pos != '\0'; ++pos) {
 880     if (strncmp (pos, "&lt;", 4) == 0 && (gt = strstr (pos + 4, "&gt;"))) {
 881       if (subrip_remove_unhandled_tag (pos, gt + strlen ("&gt;")))
 882         --pos;
 883     }
 884   }
 885 }
 886
 887 /* we only allow a fixed set of tags like <i>, <u> and <b>, so let's
 888  * take a simple approach. This code assumes the input has been
 889  * escaped and subrip_unescape_formatting() has then been run over the
 890  * input! This function adds missing closing markup tags and removes
 891  * broken closing tags for tags that have never been opened. */
 892 static void
 893 subrip_fix_up_markup (gchar ** p_txt, gconstpointer allowed_tags_ptr)
 894 {
 895   gchar *cur, *next_tag;
 896   GPtrArray *open_tags = NULL;
 897   guint num_open_tags = 0;
 898   const gchar *iter_tag;
 899   guint offset = 0;
 900   guint index;
 901   gchar *cur_tag;
 902   gchar *end_tag;
 903   GRegex *tag_regex;
 904   GMatchInfo *match_info;
 905   gchar **allowed_tags = (gchar **) allowed_tags_ptr;
 906
 907   g_assert (*p_txt != NULL);
 908
 909   open_tags = g_ptr_array_new_with_free_func (g_free);
 910   cur = *p_txt;
 911   while (*cur != '\0') {
 912     next_tag = strchr (cur, '<');
 913     if (next_tag == NULL)
 914       break;
 915     offset = 0;
 916     index = 0;
 917     while (index < g_strv_length (allowed_tags)) {
 918       iter_tag = allowed_tags[index];
 919       /* Look for a white listed tag */
 920       cur_tag = g_strconcat ("<", iter_tag, ATTRIBUTE_REGEX, ">", NULL);
 921       tag_regex = g_regex_new (cur_tag, 0, 0, NULL);
 922       (void) g_regex_match (tag_regex, next_tag, 0, &match_info);
 923
 924       if (g_match_info_matches (match_info)) {
 925         gint start_pos, end_pos;
 926         gchar *word = g_match_info_fetch (match_info, 0);
 927         g_match_info_fetch_pos (match_info, 0, &start_pos, &end_pos);
 928         if (start_pos == 0) {
 929           offset = strlen (word);
 930         }
 931         g_free (word);
 932       }
 933       g_match_info_free (match_info);
 934       g_regex_unref (tag_regex);
 935       g_free (cur_tag);
 936       index++;
 937       if (offset) {
 938         /* OK we found a tag, let's keep track of it */
 939         g_ptr_array_add (open_tags, g_ascii_strdown (iter_tag, -1));
 940         ++num_open_tags;
 941         break;
 942       }
 943     }
 944
 945     if (offset) {
 946       next_tag += offset;
 947       cur = next_tag;
 948       continue;
 949     }
 950
 951     if (*next_tag == '<' && *(next_tag + 1) == '/') {
 952       end_tag = strchr (cur, '>');
 953       if (end_tag) {
 954         const gchar *last = NULL;
 955         if (num_open_tags > 0)
 956           last = g_ptr_array_index (open_tags, num_open_tags - 1);
 957         if (num_open_tags == 0
 958             || g_ascii_strncasecmp (end_tag - 1, last, strlen (last))) {
 959           GST_LOG ("broken input, closing tag '%s' is not open", end_tag - 1);
 960           memmove (next_tag, end_tag + 1, strlen (end_tag) + 1);
 961           next_tag -= strlen (end_tag);
 962         } else {
 963           --num_open_tags;
 964           g_ptr_array_remove_index (open_tags, num_open_tags);
 965         }
 966       }
 967     }
 968     ++next_tag;
 969     cur = next_tag;
 970   }
 971
 972   if (num_open_tags > 0) {
 973     GString *s;
 974
 975     s = g_string_new (*p_txt);
 976     while (num_open_tags > 0) {
 977       GST_LOG ("adding missing closing tag '%s'",
 978           (char *) g_ptr_array_index (open_tags, num_open_tags - 1));
 979       g_string_append_c (s, '<');
 980       g_string_append_c (s, '/');
 981       g_string_append (s, g_ptr_array_index (open_tags, num_open_tags - 1));
 982       g_string_append_c (s, '>');
 983       --num_open_tags;
 984     }
 985     g_free (*p_txt);
 986     *p_txt = g_string_free (s, FALSE);
 987   }
 988   g_ptr_array_free (open_tags, TRUE);
 989 }
 990
 991 static gboolean
 992 parse_subrip_time (const gchar * ts_string, GstClockTime * t)
 993 {
 994   gchar s[128] = { '\0', };
 995   gchar *end, *p;
 996   guint hour, min, sec, msec, len;
 997
 998   while (*ts_string == ' ')
 999     ++ts_string;
1000
1001   g_strlcpy (s, ts_string, sizeof (s));
1002   if ((end = strstr (s, "-->")))
1003     *end = '\0';
1004   g_strchomp (s);
1005
1006   /* ms may be in these formats:
1007    * hh:mm:ss,500 = 500ms
1008    * hh:mm:ss,  5 =   5ms
1009    * hh:mm:ss, 5  =  50ms
1010    * hh:mm:ss, 50 =  50ms
1011    * hh:mm:ss,5   = 500ms
1012    * and the same with . instead of ,.
1013    * sscanf() doesn't differentiate between '  5' and '5' so munge
1014    * the white spaces within the timestamp to '0' (I'm sure there's a
1015    * way to make sscanf() do this for us, but how?)
1016    */
1017   g_strdelimit (s, " ", '0');
1018   g_strdelimit (s, ".", ',');
1019
1020   /* make sure we have exactly three digits after he comma */
1021   p = strchr (s, ',');
1022   if (p == NULL) {
1023     /* If there isn't a ',' the timestamp is broken */
1024     /* https://gitlab.freedesktop.org/gstreamer/gst-plugins-base/issues/532#note_100179 */
1025     GST_WARNING ("failed to parse subrip timestamp string '%s'", s);
1026     return FALSE;
1027   }
1028
1029   ++p;
1030   len = strlen (p);
1031   if (len > 3) {
1032     p[3] = '\0';
1033   } else
1034     while (len < 3) {
1035       g_strlcat (&p[len], "0", 2);
1036       ++len;
1037     }
1038
1039   GST_LOG ("parsing timestamp '%s'", s);
1040   if (sscanf (s, "%u:%u:%u,%u", &hour, &min, &sec, &msec) != 4) {
1041 #ifdef TIZEN_FEATURE_UPSTREAM
1042     /* https://www.w3.org/TR/webvtt1/#webvtt-timestamp
1043      *
1044      * The hours component is optional with webVTT, for example
1045      * mm:ss,500 is a valid webVTT timestamp. When not present,
1046      * hours is 0.
1047      */
1048     hour = 0;
1049
1050     if (sscanf (s, "%u:%u,%u", &min, &sec, &msec) != 3) {
1051       GST_WARNING ("failed to parse subrip timestamp string '%s'", s);
1052       return FALSE;
1053     }
1054 #else
1055     GST_WARNING ("failed to parse subrip timestamp string '%s'", s);
1056     return FALSE;
1057 #endif
1058   }
1059
1060   *t = ((hour * 3600) + (min * 60) + sec) * GST_SECOND + msec * GST_MSECOND;
1061   return TRUE;
1062 }
1063
1064 /* cue settings are part of the WebVTT specification. They are
1065  * declared after the time interval in the first line of the
1066  * cue. Example: 00:00:01,000 --> 00:00:02,000 D:vertical-lr A:start
1067  * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html
1068  */
1069 static void
1070 parse_webvtt_cue_settings (ParserState * state, const gchar * settings)
1071 {
1072   gchar **splitted_settings = g_strsplit_set (settings, " \t", -1);
1073   gint i = 0;
1074   gint16 text_position, text_size;
1075   gint16 line_position;
1076   gboolean vertical_found = FALSE;
1077   gboolean alignment_found = FALSE;
1078
1079   while (i < g_strv_length (splitted_settings)) {
1080     gboolean valid_tag = FALSE;
1081     switch (splitted_settings[i][0]) {
1082       case 'T':
1083         if (sscanf (splitted_settings[i], "T:%" G_GINT16_FORMAT "%%",
1084                 &text_position) > 0) {
1085           state->text_position = (guint8) text_position;
1086           valid_tag = TRUE;
1087         }
1088         break;
1089       case 'D':
1090         if (strlen (splitted_settings[i]) > 2) {
1091           vertical_found = TRUE;
1092           g_free (state->vertical);
1093           state->vertical = g_strdup (splitted_settings[i] + 2);
1094           valid_tag = TRUE;
1095         }
1096         break;
1097       case 'L':
1098         if (g_str_has_suffix (splitted_settings[i], "%")) {
1099           if (sscanf (splitted_settings[i], "L:%" G_GINT16_FORMAT "%%",
1100                   &line_position) > 0) {
1101             state->line_position = line_position;
1102             valid_tag = TRUE;
1103           }
1104         } else {
1105           if (sscanf (splitted_settings[i], "L:%" G_GINT16_FORMAT,
1106                   &line_position) > 0) {
1107             state->line_number = line_position;
1108             valid_tag = TRUE;
1109           }
1110         }
1111         break;
1112       case 'S':
1113         if (sscanf (splitted_settings[i], "S:%" G_GINT16_FORMAT "%%",
1114                 &text_size) > 0) {
1115           state->text_size = (guint8) text_size;
1116           valid_tag = TRUE;
1117         }
1118         break;
1119       case 'A':
1120         if (strlen (splitted_settings[i]) > 2) {
1121           g_free (state->alignment);
1122           state->alignment = g_strdup (splitted_settings[i] + 2);
1123           alignment_found = TRUE;
1124           valid_tag = TRUE;
1125         }
1126         break;
1127       default:
1128         break;
1129     }
1130     if (!valid_tag) {
1131       GST_LOG ("Invalid or unrecognised setting found: %s",
1132           splitted_settings[i]);
1133     }
1134     i++;
1135   }
1136   g_strfreev (splitted_settings);
1137   if (!vertical_found) {
1138     g_free (state->vertical);
1139     state->vertical = g_strdup ("");
1140   }
1141   if (!alignment_found) {
1142     g_free (state->alignment);
1143     state->alignment = g_strdup ("");
1144   }
1145 }
1146
1147 #ifdef TIZEN_FEATURE_HLS_WEBVTT
1148 static void
1149 parse_timestamp_map (ParserState * state, const gchar * timestamp_map)
1150 {
1151   GstClockTime local = 0;
1152   guint64 mpegts = 0;
1153   gchar *local_start = NULL;
1154   gchar *mpegts_start = NULL;
1155
1156   if (!timestamp_map)
1157     return;
1158
1159   local_start = g_strrstr (timestamp_map, "LOCAL:");
1160   if (local_start)
1161     parse_subrip_time (local_start + strlen ("LOCAL:"), &local);
1162
1163   mpegts_start = g_strrstr (timestamp_map, "MPEGTS:");
1164   if (mpegts_start)
1165     mpegts = g_ascii_strtoull (mpegts_start + strlen ("MPEGTS:"), NULL, 10);
1166
1167   GST_LOG ("parsed local time %" GST_TIME_FORMAT " MPEGTS: %" G_GUINT64_FORMAT,
1168       GST_TIME_ARGS (local), mpegts);
1169
1170   state->local = local;
1171   state->mpegts = mpegts;
1172 }
1173
1174 static void
1175 send_fragment_timestamp_event (GstSubParse * self, GstClockTime timestamp)
1176 {
1177   GstEvent *event = NULL;
1178
1179   if (!GST_CLOCK_TIME_IS_VALID (timestamp))
1180     return;
1181
1182   GST_LOG ("send fragment_timestamp %" GST_TIME_FORMAT,
1183       GST_TIME_ARGS (timestamp));
1184
1185   event = gst_event_new_custom (GST_EVENT_CUSTOM_DOWNSTREAM,
1186       gst_structure_new ("fragment_timestamp",
1187           "timestamp", G_TYPE_UINT64, timestamp, NULL));
1188
1189   gst_pad_push_event (self->srcpad, event);
1190 }
1191 #endif
1192
1193 static gchar *
1194 parse_subrip (ParserState * state, const gchar * line)
1195 {
1196   gchar *ret;
1197
1198   switch (state->state) {
1199     case 0:{
1200       char *endptr;
1201       guint64 id;
1202
1203       /* looking for a single integer as a Cue ID, but we
1204        * don't actually use it */
1205       errno = 0;
1206       id = g_ascii_strtoull (line, &endptr, 10);
1207       if (id == G_MAXUINT64 && errno == ERANGE)
1208         state->state = 1;
1209       else if (id == 0 && errno == EINVAL)
1210         state->state = 1;
1211       else if (endptr != line && *endptr == '\0')
1212         state->state = 1;
1213       return NULL;
1214     }
1215     case 1:
1216     {
1217       GstClockTime ts_start, ts_end;
1218       gchar *end_time;
1219
1220       /* looking for start_time --> end_time */
1221       if ((end_time = strstr (line, " --> ")) &&
1222           parse_subrip_time (line, &ts_start) &&
1223           parse_subrip_time (end_time + strlen (" --> "), &ts_end) &&
1224 #ifdef TIZEN_FEATURE_HLS_WEBVTT
1225           state->start_time <=
1226               ts_end + MPEGTIME_TO_GSTTIME (state->mpegts) - state->local) {
1227 #else
1228           state->start_time <= ts_end) {
1229 #endif
1230         state->state = 2;
1231         state->start_time = ts_start;
1232 #ifdef TIZEN_FEATURE_HLS_WEBVTT
1233         state->start_time += MPEGTIME_TO_GSTTIME (state->mpegts) - state->local;
1234 #endif
1235         state->duration = ts_end - ts_start;
1236       } else {
1237         GST_DEBUG ("error parsing subrip time line '%s'", line);
1238         state->state = 0;
1239       }
1240       return NULL;
1241     }
1242     case 2:
1243     {
1244       /* No need to parse that text if it's out of segment */
1245 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1246       if (state->drop_out_of_segment) {
1247 #endif
1248       guint64 clip_start = 0, clip_stop = 0;
1249       gboolean in_seg = FALSE;
1250
1251       /* Check our segment start/stop */
1252       in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
1253           state->start_time, state->start_time + state->duration,
1254           &clip_start, &clip_stop);
1255
1256       if (in_seg) {
1257         state->start_time = clip_start;
1258         state->duration = clip_stop - clip_start;
1259       } else {
1260         state->state = 0;
1261         return NULL;
1262       }
1263 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1264       }
1265 #endif
1266     }
1267       /* looking for subtitle text; empty line ends this subtitle entry */
1268       if (state->buf->len)
1269         g_string_append_c (state->buf, '\n');
1270       g_string_append (state->buf, line);
1271       if (strlen (line) == 0) {
1272         ret = g_markup_escape_text (state->buf->str, state->buf->len);
1273         g_string_truncate (state->buf, 0);
1274         state->state = 0;
1275         subrip_unescape_formatting (ret, state->allowed_tags,
1276             state->allows_tag_attributes);
1277         subrip_remove_unhandled_tags (ret);
1278         strip_trailing_newlines (ret);
1279         subrip_fix_up_markup (&ret, state->allowed_tags);
1280         return ret;
1281       }
1282       return NULL;
1283     default:
1284       g_return_val_if_reached (NULL);
1285   }
1286 }
1287
1288 static gchar *
1289 parse_lrc (ParserState * state, const gchar * line)
1290 {
1291   gint m, s, c;
1292   const gchar *start;
1293   gint milli;
1294
1295   if (line[0] != '[')
1296     return NULL;
1297
1298   if (sscanf (line, "[%u:%02u.%03u]", &m, &s, &c) != 3 &&
1299       sscanf (line, "[%u:%02u.%02u]", &m, &s, &c) != 3)
1300     return NULL;
1301
1302   start = strchr (line, ']');
1303   if (start - line == 9)
1304     milli = 10;
1305   else
1306     milli = 1;
1307
1308   state->start_time = gst_util_uint64_scale (m, 60 * GST_SECOND, 1)
1309       + gst_util_uint64_scale (s, GST_SECOND, 1)
1310       + gst_util_uint64_scale (c, milli * GST_MSECOND, 1);
1311   state->duration = GST_CLOCK_TIME_NONE;
1312
1313   return g_strdup (start + 1);
1314 }
1315
1316 /* WebVTT is a new subtitle format for the upcoming HTML5 video track
1317  * element. This format is similar to Subrip, the biggest differences
1318  * are that there can be cue settings detailing how to display the cue
1319  * text and more markup tags are allowed.
1320  * See also http://www.whatwg.org/specs/web-apps/current-work/webvtt.html
1321  */
1322 static gchar *
1323 parse_webvtt (ParserState * state, const gchar * line)
1324 {
1325   /* Cue IDs are optional in WebVTT, but not in subrip,
1326    * so when in state 0 (cue ID), also check if we're
1327    * already at the start --> end time marker */
1328   if (state->state == 0 || state->state == 1) {
1329     GstClockTime ts_start, ts_end;
1330     gchar *end_time;
1331     gchar *cue_settings = NULL;
1332
1333     /* looking for start_time --> end_time */
1334     if ((end_time = strstr (line, " --> ")) &&
1335         parse_subrip_time (line, &ts_start) &&
1336         parse_subrip_time (end_time + strlen (" --> "), &ts_end) &&
1337         state->start_time <= ts_end) {
1338       state->state = 2;
1339       state->start_time = ts_start;
1340 #ifdef TIZEN_FEATURE_HLS_WEBVTT
1341       state->start_time += MPEGTIME_TO_GSTTIME (state->mpegts) - state->local;
1342 #endif
1343       state->duration = ts_end - ts_start;
1344       cue_settings = strstr (end_time + strlen (" --> "), " ");
1345 #ifdef TIZEN_FEATURE_HLS_WEBVTT
1346     } else if (strstr (line, "X-TIMESTAMP-MAP")) {
1347       GST_DEBUG ("got X-TIMESTAMP-MAP '%s'", line);
1348       parse_timestamp_map (state, line);
1349       state->state = 0;
1350 #endif
1351     } else {
1352       GST_DEBUG ("error parsing subrip time line '%s'", line);
1353       state->state = 0;
1354     }
1355
1356     state->text_position = 0;
1357     state->text_size = 0;
1358     state->line_position = 0;
1359     state->line_number = 0;
1360
1361     if (cue_settings)
1362       parse_webvtt_cue_settings (state, cue_settings + 1);
1363     else {
1364       g_free (state->vertical);
1365       state->vertical = g_strdup ("");
1366       g_free (state->alignment);
1367       state->alignment = g_strdup ("");
1368     }
1369
1370     return NULL;
1371   } else
1372     return parse_subrip (state, line);
1373 }
1374
1375 static void
1376 unescape_newlines_br (gchar * read)
1377 {
1378   gchar *write = read;
1379
1380   /* Replace all occurences of '[br]' with a newline as version 2
1381    * of the subviewer format uses this for newlines */
1382
1383   if (read[0] == '\0' || read[1] == '\0' || read[2] == '\0' || read[3] == '\0')
1384     return;
1385
1386   do {
1387     if (strncmp (read, "[br]", 4) == 0) {
1388       *write = '\n';
1389       read += 4;
1390     } else {
1391       *write = *read;
1392       read++;
1393     }
1394     write++;
1395   } while (*read);
1396
1397   *write = '\0';
1398 }
1399
1400 static gchar *
1401 parse_subviewer (ParserState * state, const gchar * line)
1402 {
1403   guint h1, m1, s1, ms1;
1404   guint h2, m2, s2, ms2;
1405   gchar *ret;
1406
1407   /* TODO: Maybe also parse the fields in the header, especially DELAY.
1408    * For examples see the unit test or
1409    * http://www.doom9.org/index.html?/sub.htm */
1410
1411   switch (state->state) {
1412     case 0:
1413       /* looking for start_time,end_time */
1414       if (sscanf (line, "%u:%u:%u.%u,%u:%u:%u.%u",
1415               &h1, &m1, &s1, &ms1, &h2, &m2, &s2, &ms2) == 8) {
1416         state->state = 1;
1417         state->start_time =
1418             (((guint64) h1) * 3600 + m1 * 60 + s1) * GST_SECOND +
1419             ms1 * GST_MSECOND;
1420         state->duration =
1421             (((guint64) h2) * 3600 + m2 * 60 + s2) * GST_SECOND +
1422             ms2 * GST_MSECOND - state->start_time;
1423       }
1424       return NULL;
1425     case 1:
1426     {
1427       /* No need to parse that text if it's out of segment */
1428 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1429       if (state->drop_out_of_segment) {
1430 #endif
1431       guint64 clip_start = 0, clip_stop = 0;
1432       gboolean in_seg = FALSE;
1433
1434       /* Check our segment start/stop */
1435       in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
1436           state->start_time, state->start_time + state->duration,
1437           &clip_start, &clip_stop);
1438
1439       if (in_seg) {
1440         state->start_time = clip_start;
1441         state->duration = clip_stop - clip_start;
1442       } else {
1443         state->state = 0;
1444         return NULL;
1445       }
1446 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1447       }
1448 #endif
1449     }
1450       /* looking for subtitle text; empty line ends this subtitle entry */
1451       if (state->buf->len)
1452         g_string_append_c (state->buf, '\n');
1453       g_string_append (state->buf, line);
1454       if (strlen (line) == 0) {
1455         ret = g_strdup (state->buf->str);
1456         unescape_newlines_br (ret);
1457         strip_trailing_newlines (ret);
1458         g_string_truncate (state->buf, 0);
1459         state->state = 0;
1460         return ret;
1461       }
1462       return NULL;
1463     default:
1464       g_assert_not_reached ();
1465       return NULL;
1466   }
1467 }
1468
1469 static gchar *
1470 parse_mpsub (ParserState * state, const gchar * line)
1471 {
1472   gchar *ret;
1473   float t1, t2;
1474
1475   switch (state->state) {
1476     case 0:
1477       /* looking for two floats (offset, duration) */
1478       if (sscanf (line, "%f %f", &t1, &t2) == 2) {
1479         state->state = 1;
1480         state->start_time += state->duration + GST_SECOND * t1;
1481         state->duration = GST_SECOND * t2;
1482       }
1483       return NULL;
1484     case 1:
1485     {                           /* No need to parse that text if it's out of segment */
1486 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1487       if (state->drop_out_of_segment) {
1488 #endif
1489       guint64 clip_start = 0, clip_stop = 0;
1490       gboolean in_seg = FALSE;
1491
1492       /* Check our segment start/stop */
1493       in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
1494           state->start_time, state->start_time + state->duration,
1495           &clip_start, &clip_stop);
1496
1497       if (in_seg) {
1498         state->start_time = clip_start;
1499         state->duration = clip_stop - clip_start;
1500       } else {
1501         state->state = 0;
1502         return NULL;
1503       }
1504 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1505       }
1506 #endif
1507     }
1508       /* looking for subtitle text; empty line ends this
1509        * subtitle entry */
1510       if (state->buf->len)
1511         g_string_append_c (state->buf, '\n');
1512       g_string_append (state->buf, line);
1513       if (strlen (line) == 0) {
1514         ret = g_strdup (state->buf->str);
1515         g_string_truncate (state->buf, 0);
1516         state->state = 0;
1517         return ret;
1518       }
1519       return NULL;
1520     default:
1521       g_assert_not_reached ();
1522       return NULL;
1523   }
1524 }
1525
1526 static const gchar *
1527 dks_skip_timestamp (const gchar * line)
1528 {
1529   while (*line && *line != ']')
1530     line++;
1531   if (*line == ']')
1532     line++;
1533   return line;
1534 }
1535
1536 static gchar *
1537 parse_dks (ParserState * state, const gchar * line)
1538 {
1539   guint h, m, s;
1540
1541   switch (state->state) {
1542     case 0:
1543       /* Looking for the start time and text */
1544       if (sscanf (line, "[%u:%u:%u]", &h, &m, &s) == 3) {
1545         const gchar *text;
1546         state->start_time = (((guint64) h) * 3600 + m * 60 + s) * GST_SECOND;
1547         text = dks_skip_timestamp (line);
1548         if (*text) {
1549           state->state = 1;
1550           g_string_append (state->buf, text);
1551         }
1552       }
1553       return NULL;
1554     case 1:
1555     {
1556       guint64 clip_start = 0, clip_stop = 0;
1557       gboolean in_seg;
1558       gchar *ret;
1559
1560       /* Looking for the end time */
1561       if (sscanf (line, "[%u:%u:%u]", &h, &m, &s) == 3) {
1562         state->state = 0;
1563         state->duration = (((guint64) h) * 3600 + m * 60 + s) * GST_SECOND -
1564             state->start_time;
1565       } else {
1566         GST_WARNING ("Failed to parse subtitle end time");
1567         return NULL;
1568       }
1569 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1570       if (state->drop_out_of_segment) {
1571 #endif
1572       /* Check if this subtitle is out of the current segment */
1573       in_seg = gst_segment_clip (state->segment, GST_FORMAT_TIME,
1574           state->start_time, state->start_time + state->duration,
1575           &clip_start, &clip_stop);
1576
1577       if (!in_seg) {
1578         return NULL;
1579       }
1580 #ifdef TIZEN_FEATURE_SUBPARSE_DROP_OUT_OF_SEGMENT
1581       }
1582 #endif
1583
1584       state->start_time = clip_start;
1585       state->duration = clip_stop - clip_start;
1586
1587       ret = g_strdup (state->buf->str);
1588       g_string_truncate (state->buf, 0);
1589       unescape_newlines_br (ret);
1590       return ret;
1591     }
1592     default:
1593       g_assert_not_reached ();
1594       return NULL;
1595   }
1596 }
1597
1598 static void
1599 parser_state_init (ParserState * state)
1600 {
1601   GST_DEBUG ("initialising parser");
1602
1603   if (state->buf) {
1604     g_string_truncate (state->buf, 0);
1605   } else {
1606     state->buf = g_string_new (NULL);
1607   }
1608
1609   state->start_time = 0;
1610   state->duration = 0;
1611   state->max_duration = 0;      /* no limit */
1612   state->state = 0;
1613   state->segment = NULL;
1614 #ifdef TIZEN_FEATURE_HLS_WEBVTT
1615   state->local = 0;
1616   state->mpegts = 0;
1617 #endif
1618 }
1619
1620 static void
1621 parser_state_dispose (GstSubParse * self, ParserState * state)
1622 {
1623   if (state->buf) {
1624     g_string_free (state->buf, TRUE);
1625     state->buf = NULL;
1626   }
1627
1628   g_free (state->vertical);
1629   state->vertical = NULL;
1630   g_free (state->alignment);
1631   state->alignment = NULL;
1632
1633   if (state->user_data) {
1634     switch (self->parser_type) {
1635       case GST_SUB_PARSE_FORMAT_QTTEXT:
1636         qttext_context_deinit (state);
1637         break;
1638       case GST_SUB_PARSE_FORMAT_SAMI:
1639         sami_context_deinit (state);
1640         break;
1641       default:
1642         break;
1643     }
1644   }
1645   state->allowed_tags = NULL;
1646 }
1647
/* Identifies which format-detection regex
 * gst_sub_parse_data_format_autodetect_regex_once() should compile.
 * The numeric values run from 0 to 4 in declaration order. */
typedef enum
{
  GST_SUB_PARSE_REGEX_UNKNOWN = 0,      /* no regex is compiled for this */
  GST_SUB_PARSE_REGEX_MDVDSUB,  /* 1 */
  GST_SUB_PARSE_REGEX_SUBRIP,   /* 2 */
  GST_SUB_PARSE_REGEX_DKS,      /* 3 */
  GST_SUB_PARSE_REGEX_VTT       /* 4 */
} GstSubParseRegex;
1657
1658 static gpointer
1659 gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype)
1660 {
1661   gpointer result = NULL;
1662   GError *gerr = NULL;
1663   switch (regtype) {
1664     case GST_SUB_PARSE_REGEX_MDVDSUB:
1665       result =
1666           (gpointer) g_regex_new ("^\\{[0-9]+\\}\\{[0-9]+\\}",
1667           G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
1668       if (result == NULL) {
1669         g_warning ("Compilation of mdvd regex failed: %s", gerr->message);
1670         g_clear_error (&gerr);
1671       }
1672       break;
1673     case GST_SUB_PARSE_REGEX_SUBRIP:
1674       result = (gpointer)
1675           g_regex_new ("^[\\s\\n]*[\\n]? {0,3}[ 0-9]{1,4}\\s*(\x0d)?\x0a"
1676           " ?[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,3}"
1677           " +--> +[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,2}",
1678           G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
1679       if (result == NULL) {
1680         g_warning ("Compilation of subrip regex failed: %s", gerr->message);
1681         g_clear_error (&gerr);
1682       }
1683       break;
1684     case GST_SUB_PARSE_REGEX_DKS:
1685       result = (gpointer) g_regex_new ("^\\[[0-9]+:[0-9]+:[0-9]+\\].*",
1686           G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr);
1687       if (result == NULL) {
1688         g_warning ("Compilation of dks regex failed: %s", gerr->message);
1689         g_clear_error (&gerr);
1690       }
1691       break;
1692     case GST_SUB_PARSE_REGEX_VTT:
1693       result = (gpointer)
1694           g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0,
1695           &gerr);
1696       if (result == NULL) {
1697         g_warning ("Compilation of vtt regex failed: %s", gerr->message);
1698         g_error_free (gerr);
1699       }
1700       break;
1701
1702     default:
1703       GST_WARNING ("Trying to allocate regex of unknown type %u", regtype);
1704   }
1705   return result;
1706 }
1707
1708 /*
1709  * FIXME: maybe we should pass along a second argument, the preceding
1710  * text buffer, because that is how this originally worked, even though
1711  * I don't really see the use of that.
1712  */
1713
1714 static GstSubParseFormat
1715 gst_sub_parse_data_format_autodetect (gchar * match_str)
1716 {
1717   guint n1, n2, n3;
1718
1719   static GOnce mdvd_rx_once = G_ONCE_INIT;
1720   static GOnce subrip_rx_once = G_ONCE_INIT;
1721   static GOnce dks_rx_once = G_ONCE_INIT;
1722   static GOnce vtt_rx_once = G_ONCE_INIT;
1723
1724   GRegex *mdvd_grx;
1725   GRegex *subrip_grx;
1726   GRegex *dks_grx;
1727   GRegex *vtt_grx;
1728
1729   g_once (&mdvd_rx_once,
1730       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
1731       (gpointer) GST_SUB_PARSE_REGEX_MDVDSUB);
1732   g_once (&subrip_rx_once,
1733       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
1734       (gpointer) GST_SUB_PARSE_REGEX_SUBRIP);
1735   g_once (&dks_rx_once,
1736       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
1737       (gpointer) GST_SUB_PARSE_REGEX_DKS);
1738   g_once (&vtt_rx_once,
1739       (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once,
1740       (gpointer) GST_SUB_PARSE_REGEX_VTT);
1741
1742   mdvd_grx = (GRegex *) mdvd_rx_once.retval;
1743   subrip_grx = (GRegex *) subrip_rx_once.retval;
1744   dks_grx = (GRegex *) dks_rx_once.retval;
1745   vtt_grx = (GRegex *) vtt_rx_once.retval;
1746
1747   if (g_regex_match (mdvd_grx, match_str, 0, NULL)) {
1748     GST_LOG ("MicroDVD (frame based) format detected");
1749     return GST_SUB_PARSE_FORMAT_MDVDSUB;
1750   }
1751   if (g_regex_match (subrip_grx, match_str, 0, NULL)) {
1752     GST_LOG ("SubRip (time based) format detected");
1753     return GST_SUB_PARSE_FORMAT_SUBRIP;
1754   }
1755   if (g_regex_match (dks_grx, match_str, 0, NULL)) {
1756     GST_LOG ("DKS (time based) format detected");
1757     return GST_SUB_PARSE_FORMAT_DKS;
1758   }
1759   if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) {
1760     GST_LOG ("WebVTT (time based) format detected");
1761     return GST_SUB_PARSE_FORMAT_VTT;
1762   }
1763
1764   if (!strncmp (match_str, "FORMAT=TIME", 11)) {
1765     GST_LOG ("MPSub (time based) format detected");
1766     return GST_SUB_PARSE_FORMAT_MPSUB;
1767   }
1768   if (strstr (match_str, "<SAMI>") != NULL ||
1769       strstr (match_str, "<sami>") != NULL) {
1770     GST_LOG ("SAMI (time based) format detected");
1771     return GST_SUB_PARSE_FORMAT_SAMI;
1772   }
1773   /* we're boldly assuming the first subtitle appears within the first hour */
1774   if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 ||
1775       sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 ||
1776       sscanf (match_str, "00:%02u:%02u:", &n1, &n2) == 2 ||
1777       sscanf (match_str, "00:%02u:%02u=", &n1, &n2) == 2 ||
1778       sscanf (match_str, "00:%02u:%02u,%u=", &n1, &n2, &n3) == 3) {
1779     GST_LOG ("TMPlayer (time based) format detected");
1780     return GST_SUB_PARSE_FORMAT_TMPLAYER;
1781   }
1782   if (sscanf (match_str, "[%u][%u]", &n1, &n2) == 2) {
1783     GST_LOG ("MPL2 (time based) format detected");
1784     return GST_SUB_PARSE_FORMAT_MPL2;
1785   }
1786   if (strstr (match_str, "[INFORMATION]") != NULL) {
1787     GST_LOG ("SubViewer (time based) format detected");
1788     return GST_SUB_PARSE_FORMAT_SUBVIEWER;
1789   }
1790   if (strstr (match_str, "{QTtext}") != NULL) {
1791     GST_LOG ("QTtext (time based) format detected");
1792     return GST_SUB_PARSE_FORMAT_QTTEXT;
1793   }
1794   /* We assume the LRC file starts immediately */
1795   if (match_str[0] == '[') {
1796     gboolean all_lines_good = TRUE;
1797     gchar **split;
1798     gchar **ptr;
1799
1800     ptr = split = g_strsplit (match_str, "\n", -1);
1801     while (*ptr && *(ptr + 1)) {
1802       gchar *str = *ptr;
1803       gint len = strlen (str);
1804
1805       if (sscanf (str, "[%u:%02u.%02u]", &n1, &n2, &n3) == 3 ||
1806           sscanf (str, "[%u:%02u.%03u]", &n1, &n2, &n3) == 3) {
1807         all_lines_good = TRUE;
1808       } else if (str[len - 1] == ']' && strchr (str, ':') != NULL) {
1809         all_lines_good = TRUE;
1810       } else {
1811         all_lines_good = FALSE;
1812         break;
1813       }
1814
1815       ptr++;
1816     }
1817     g_strfreev (split);
1818
1819     if (all_lines_good)
1820       return GST_SUB_PARSE_FORMAT_LRC;
1821   }
1822
1823   GST_DEBUG ("no subtitle format detected");
1824   return GST_SUB_PARSE_FORMAT_UNKNOWN;
1825 }
1826
1827 static GstCaps *
1828 gst_sub_parse_format_autodetect (GstSubParse * self)
1829 {
1830   gchar *data;
1831   GstSubParseFormat format;
1832
1833 #ifdef TIZEN_FEATURE_UPSTREAM
1834   if (strlen (self->textbuf->str) < 6) {
1835 #else
1836   if (strlen (self->textbuf->str) < 30) {
1837 #endif
1838     GST_DEBUG ("File too small to be a subtitles file");
1839     return NULL;
1840   }
1841
1842   data = g_strndup (self->textbuf->str, 35);
1843   format = gst_sub_parse_data_format_autodetect (data);
1844   g_free (data);
1845
1846   self->parser_type = format;
1847   self->subtitle_codec = gst_sub_parse_get_format_description (format);
1848   parser_state_init (&self->state);
1849   self->state.allowed_tags = NULL;
1850
1851   switch (format) {
1852     case GST_SUB_PARSE_FORMAT_MDVDSUB:
1853       self->parse_line = parse_mdvdsub;
1854       return gst_caps_new_simple ("text/x-raw",
1855           "format", G_TYPE_STRING, "pango-markup", NULL);
1856     case GST_SUB_PARSE_FORMAT_SUBRIP:
1857       self->state.allowed_tags = (gpointer) allowed_srt_tags;
1858       self->state.allows_tag_attributes = FALSE;
1859       self->parse_line = parse_subrip;
1860       return gst_caps_new_simple ("text/x-raw",
1861           "format", G_TYPE_STRING, "pango-markup", NULL);
1862     case GST_SUB_PARSE_FORMAT_MPSUB:
1863       self->parse_line = parse_mpsub;
1864       return gst_caps_new_simple ("text/x-raw",
1865           "format", G_TYPE_STRING, "utf8", NULL);
1866     case GST_SUB_PARSE_FORMAT_SAMI:
1867       self->parse_line = parse_sami;
1868       sami_context_init (&self->state);
1869       return gst_caps_new_simple ("text/x-raw",
1870           "format", G_TYPE_STRING, "pango-markup", NULL);
1871     case GST_SUB_PARSE_FORMAT_TMPLAYER:
1872       self->parse_line = parse_tmplayer;
1873       self->state.max_duration = 5 * GST_SECOND;
1874       return gst_caps_new_simple ("text/x-raw",
1875           "format", G_TYPE_STRING, "utf8", NULL);
1876     case GST_SUB_PARSE_FORMAT_MPL2:
1877       self->parse_line = parse_mpl2;
1878       return gst_caps_new_simple ("text/x-raw",
1879           "format", G_TYPE_STRING, "pango-markup", NULL);
1880     case GST_SUB_PARSE_FORMAT_DKS:
1881       self->parse_line = parse_dks;
1882       return gst_caps_new_simple ("text/x-raw",
1883           "format", G_TYPE_STRING, "utf8", NULL);
1884     case GST_SUB_PARSE_FORMAT_VTT:
1885       self->state.allowed_tags = (gpointer) allowed_vtt_tags;
1886       self->state.allows_tag_attributes = TRUE;
1887       self->parse_line = parse_webvtt;
1888       return gst_caps_new_simple ("text/x-raw",
1889           "format", G_TYPE_STRING, "pango-markup", NULL);
1890     case GST_SUB_PARSE_FORMAT_SUBVIEWER:
1891       self->parse_line = parse_subviewer;
1892       return gst_caps_new_simple ("text/x-raw",
1893           "format", G_TYPE_STRING, "utf8", NULL);
1894     case GST_SUB_PARSE_FORMAT_QTTEXT:
1895       self->parse_line = parse_qttext;
1896       qttext_context_init (&self->state);
1897       return gst_caps_new_simple ("text/x-raw",
1898           "format", G_TYPE_STRING, "pango-markup", NULL);
1899     case GST_SUB_PARSE_FORMAT_LRC:
1900       self->parse_line = parse_lrc;
1901       return gst_caps_new_simple ("text/x-raw",
1902           "format", G_TYPE_STRING, "utf8", NULL);
1903     case GST_SUB_PARSE_FORMAT_UNKNOWN:
1904     default:
1905       GST_DEBUG ("no subtitle format detected");
1906       GST_ELEMENT_ERROR (self, STREAM, WRONG_TYPE,
1907           ("The input is not a valid/supported subtitle file"), (NULL));
1908       return NULL;
1909   }
1910 }
1911
1912 static void
1913 feed_textbuf (GstSubParse * self, GstBuffer * buf)
1914 {
1915   gboolean discont;
1916   gsize consumed;
1917   gchar *input = NULL;
1918   const guint8 *data;
1919   gsize avail;
1920
1921   discont = GST_BUFFER_IS_DISCONT (buf);
1922
1923   if (GST_BUFFER_OFFSET_IS_VALID (buf) &&
1924       GST_BUFFER_OFFSET (buf) != self->offset) {
1925     self->offset = GST_BUFFER_OFFSET (buf);
1926     discont = TRUE;
1927   }
1928
1929   if (discont) {
1930     GST_INFO ("discontinuity");
1931     /* flush the parser state */
1932     parser_state_init (&self->state);
1933     g_string_truncate (self->textbuf, 0);
1934     gst_adapter_clear (self->adapter);
1935     if (self->parser_type == GST_SUB_PARSE_FORMAT_SAMI)
1936       sami_context_reset (&self->state);
1937     /* we could set a flag to make sure that the next buffer we push out also
1938      * has the DISCONT flag set, but there's no point really given that it's
1939      * subtitles which are discontinuous by nature. */
1940   }
1941
1942   self->offset += gst_buffer_get_size (buf);
1943
1944   gst_adapter_push (self->adapter, buf);
1945
1946   avail = gst_adapter_available (self->adapter);
1947   data = gst_adapter_map (self->adapter, avail);
1948   input = convert_encoding (self, (const gchar *) data, avail, &consumed);
1949
1950   if (input && consumed > 0) {
1951     self->textbuf = g_string_append (self->textbuf, input);
1952     gst_adapter_unmap (self->adapter);
1953     gst_adapter_flush (self->adapter, consumed);
1954   } else {
1955     gst_adapter_unmap (self->adapter);
1956   }
1957
1958   g_free (input);
1959 }
1960
1961 #ifdef TIZEN_FEATURE_UPSTREAM
1962 static void
1963 xml_text (GMarkupParseContext * context,
1964     const gchar * text, gsize text_len, gpointer user_data, GError ** error)
1965 {
1966   gchar **accum = (gchar **) user_data;
1967   gchar *concat;
1968
1969   if (*accum) {
1970     concat = g_strconcat (*accum, text, NULL);
1971     g_free (*accum);
1972     *accum = concat;
1973   } else {
1974     *accum = g_strdup (text);
1975   }
1976 }
1977
1978 static gchar *
1979 strip_pango_markup (gchar * markup, GError ** error)
1980 {
1981   GMarkupParser parser = { 0, };
1982   GMarkupParseContext *context;
1983   gchar *accum = NULL;
1984
1985   parser.text = xml_text;
1986   context = g_markup_parse_context_new (&parser, 0, &accum, NULL);
1987
1988   g_markup_parse_context_parse (context, "<root>", 6, NULL);
1989   g_markup_parse_context_parse (context, markup, strlen (markup), error);
1990   g_markup_parse_context_parse (context, "</root>", 7, NULL);
1991   if (*error)
1992     goto error;
1993
1994   g_markup_parse_context_end_parse (context, error);
1995   if (*error)
1996     goto error;
1997
1998 done:
1999   g_markup_parse_context_free (context);
2000   return accum;
2001
2002 error:
2003   g_free (accum);
2004   accum = NULL;
2005   goto done;
2006 }
2007
2008 static gboolean
2009 gst_sub_parse_negotiate (GstSubParse * self, GstCaps * preferred)
2010 {
2011   GstCaps *caps;
2012   gboolean ret = FALSE;
2013   const GstStructure *s1, *s2;
2014
2015   caps = gst_pad_get_allowed_caps (self->srcpad);
2016
2017   s1 = gst_caps_get_structure (preferred, 0);
2018
2019   if (!g_strcmp0 (gst_structure_get_string (s1, "format"), "utf8")) {
2020     GstCaps *intersected = gst_caps_intersect (caps, preferred);
2021     gst_caps_unref (caps);
2022     caps = intersected;
2023   }
2024
2025   caps = gst_caps_fixate (caps);
2026
2027   if (gst_caps_is_empty (caps)) {
2028     goto done;
2029   }
2030
2031   s2 = gst_caps_get_structure (caps, 0);
2032
2033   self->strip_pango_markup =
2034       !g_strcmp0 (gst_structure_get_string (s2, "format"), "utf8")
2035       && !g_strcmp0 (gst_structure_get_string (s1, "format"), "pango-markup");
2036
2037   if (self->strip_pango_markup) {
2038     GST_INFO_OBJECT (self, "We will convert from pango-markup to utf8");
2039   }
2040
2041   ret = gst_pad_set_caps (self->srcpad, caps);
2042
2043 done:
2044   gst_caps_unref (caps);
2045   return ret;
2046 }
2047 #endif
2048
2049 static GstFlowReturn
2050 handle_buffer (GstSubParse * self, GstBuffer * buf)
2051 {
2052   GstFlowReturn ret = GST_FLOW_OK;
2053 #ifndef TIZEN_FEATURE_UPSTREAM
2054   GstCaps *caps = NULL;
2055 #endif
2056   gchar *line, *subtitle;
2057   gboolean need_tags = FALSE;
2058 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
2059   GstMessage *m = NULL;
2060 #endif
2061 #ifdef TIZEN_FEATURE_HLS_WEBVTT
2062   GstClockTime fragment_timestamp = GST_CLOCK_TIME_NONE;
2063 #endif
2064
2065   if (self->first_buffer) {
2066     GstMapInfo map;
2067
2068     gst_buffer_map (buf, &map, GST_MAP_READ);
2069     self->detected_encoding = detect_encoding ((gchar *) map.data, map.size);
2070     gst_buffer_unmap (buf, &map);
2071     self->first_buffer = FALSE;
2072     self->state.fps_n = self->fps_n;
2073     self->state.fps_d = self->fps_d;
2074   }
2075 #ifdef TIZEN_FEATURE_HLS_WEBVTT
2076   if (GST_BUFFER_IS_DISCONT (buf) && GST_BUFFER_PTS_IS_VALID (buf))
2077     fragment_timestamp = GST_BUFFER_PTS (buf);
2078 #endif
2079
2080   feed_textbuf (self, buf);
2081
2082   /* make sure we know the format */
2083   if (G_UNLIKELY (self->parser_type == GST_SUB_PARSE_FORMAT_UNKNOWN)) {
2084 #ifdef TIZEN_FEATURE_UPSTREAM
2085     GstCaps *preferred;
2086
2087     if (!(preferred = gst_sub_parse_format_autodetect (self))) {
2088       return GST_FLOW_NOT_NEGOTIATED;
2089     }
2090
2091     if (!gst_sub_parse_negotiate (self, preferred)) {
2092       gst_caps_unref (preferred);
2093       return GST_FLOW_NOT_NEGOTIATED;
2094     }
2095
2096     gst_caps_unref (preferred);
2097 #else
2098     if (!(caps = gst_sub_parse_format_autodetect (self))) {
2099       return GST_FLOW_EOS;
2100     }
2101     if (!gst_pad_set_caps (self->srcpad, caps)) {
2102       gst_caps_unref (caps);
2103       return GST_FLOW_EOS;
2104     }
2105     gst_caps_unref (caps);
2106 #endif
2107     need_tags = TRUE;
2108   }
2109
2110   /* Push newsegment if needed */
2111   if (self->need_segment) {
2112     GST_LOG_OBJECT (self, "pushing newsegment event with %" GST_SEGMENT_FORMAT,
2113         &self->segment);
2114
2115     gst_pad_push_event (self->srcpad, gst_event_new_segment (&self->segment));
2116     self->need_segment = FALSE;
2117   }
2118
2119   if (need_tags) {
2120     /* push tags */
2121     if (self->subtitle_codec != NULL) {
2122       GstTagList *tags;
2123
2124       tags = gst_tag_list_new (GST_TAG_SUBTITLE_CODEC, self->subtitle_codec,
2125           NULL);
2126       gst_pad_push_event (self->srcpad, gst_event_new_tag (tags));
2127     }
2128   }
2129 #ifdef TIZEN_FEATURE_HLS_WEBVTT
2130   if (self->parser_type == GST_SUB_PARSE_FORMAT_VTT)
2131     send_fragment_timestamp_event (self, fragment_timestamp);
2132 #endif
2133
2134   while (!self->flushing && (line = get_next_line (self))) {
2135     guint offset = 0;
2136
2137     /* Set segment on our parser state machine */
2138     self->state.segment = &self->segment;
2139     /* Now parse the line, out of segment lines will just return NULL */
2140     GST_LOG_OBJECT (self, "State %d. Parsing line '%s'", self->state.state,
2141         line + offset);
2142     subtitle = self->parse_line (&self->state, line + offset);
2143     g_free (line);
2144 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
2145     if (!self->state.langlist_msg_posted && self->state.language_list) {
2146       m = gst_message_new_element (GST_OBJECT_CAST (self),
2147           gst_structure_new ("Ext_Sub_Language_List", "lang_list",
2148               G_TYPE_POINTER, self->state.language_list, NULL));
2149
2150       gst_element_post_message (GST_ELEMENT_CAST (self), m);
2151       self->state.langlist_msg_posted = TRUE;
2152       GST_DEBUG_OBJECT (self, "curr lang as : %s ",
2153           GST_STR_NULL (self->state.current_language));
2154     }
2155 #endif
2156     if (subtitle) {
2157 #ifdef TIZEN_FEATURE_UPSTREAM
2158       guint subtitle_len;
2159
2160       if (self->strip_pango_markup) {
2161         GError *error = NULL;
2162         gchar *stripped;
2163
2164         if ((stripped = strip_pango_markup (subtitle, &error))) {
2165           g_free (subtitle);
2166           subtitle = stripped;
2167         } else {
2168           GST_WARNING_OBJECT (self, "Failed to strip pango markup: %s",
2169               error->message);
2170         }
2171       }
2172
2173       subtitle_len = strlen (subtitle);
2174 #else
2175       guint subtitle_len = strlen (subtitle);
2176 #endif
2177       /* +1 for terminating NUL character */
2178       buf = gst_buffer_new_and_alloc (subtitle_len + 1);
2179
2180       /* copy terminating NUL character as well */
2181       gst_buffer_fill (buf, 0, subtitle, subtitle_len + 1);
2182       gst_buffer_set_size (buf, subtitle_len);
2183
2184       GST_BUFFER_TIMESTAMP (buf) = self->state.start_time;
2185       GST_BUFFER_DURATION (buf) = self->state.duration;
2186
2187       /* in some cases (e.g. tmplayer) we can only determine the duration
2188        * of a text chunk from the timestamp of the next text chunk; in those
2189        * cases, we probably want to limit the duration to something
2190        * reasonable, so we don't end up showing some text for e.g. 40 seconds
2191        * just because nothing else is being said during that time */
2192       if (self->state.max_duration > 0 && GST_BUFFER_DURATION_IS_VALID (buf)) {
2193         if (GST_BUFFER_DURATION (buf) > self->state.max_duration)
2194           GST_BUFFER_DURATION (buf) = self->state.max_duration;
2195       }
2196
2197       self->segment.position = self->state.start_time;
2198
2199       GST_DEBUG_OBJECT (self, "Sending text '%s', %" GST_TIME_FORMAT " + %"
2200           GST_TIME_FORMAT, subtitle, GST_TIME_ARGS (self->state.start_time),
2201           GST_TIME_ARGS (self->state.duration));
2202
2203       g_free (self->state.vertical);
2204       self->state.vertical = NULL;
2205       g_free (self->state.alignment);
2206       self->state.alignment = NULL;
2207
2208       ret = gst_pad_push (self->srcpad, buf);
2209
2210       /* move this forward (the tmplayer parser needs this) */
2211       if (self->state.duration != GST_CLOCK_TIME_NONE)
2212         self->state.start_time += self->state.duration;
2213
2214       g_free (subtitle);
2215       subtitle = NULL;
2216
2217       if (ret != GST_FLOW_OK) {
2218         GST_DEBUG_OBJECT (self, "flow: %s", gst_flow_get_name (ret));
2219         break;
2220       }
2221     }
2222   }
2223
2224   return ret;
2225 }
2226
2227 static GstFlowReturn
2228 gst_sub_parse_chain (GstPad * sinkpad, GstObject * parent, GstBuffer * buf)
2229 {
2230   GstFlowReturn ret;
2231   GstSubParse *self;
2232
2233   self = GST_SUBPARSE (parent);
2234
2235   ret = handle_buffer (self, buf);
2236
2237   return ret;
2238 }
2239
2240 static gboolean
2241 gst_sub_parse_sink_event (GstPad * pad, GstObject * parent, GstEvent * event)
2242 {
2243   GstSubParse *self = GST_SUBPARSE (parent);
2244   gboolean ret = FALSE;
2245
2246   GST_LOG_OBJECT (self, "%s event", GST_EVENT_TYPE_NAME (event));
2247
2248   switch (GST_EVENT_TYPE (event)) {
2249     case GST_EVENT_STREAM_GROUP_DONE:
2250     case GST_EVENT_EOS:{
2251       /* Make sure the last subrip chunk is pushed out even
2252        * if the file does not have an empty line at the end */
2253       if (self->parser_type == GST_SUB_PARSE_FORMAT_SUBRIP ||
2254           self->parser_type == GST_SUB_PARSE_FORMAT_TMPLAYER ||
2255           self->parser_type == GST_SUB_PARSE_FORMAT_MPL2 ||
2256           self->parser_type == GST_SUB_PARSE_FORMAT_QTTEXT ||
2257           self->parser_type == GST_SUB_PARSE_FORMAT_VTT) {
2258         gchar term_chars[] = { '\n', '\n', '\0' };
2259         GstBuffer *buf = gst_buffer_new_and_alloc (2 + 1);
2260
2261         GST_DEBUG_OBJECT (self, "%s: force pushing of any remaining text",
2262             GST_EVENT_TYPE_NAME (event));
2263
2264         gst_buffer_fill (buf, 0, term_chars, 3);
2265         gst_buffer_set_size (buf, 2);
2266
2267         GST_BUFFER_OFFSET (buf) = self->offset;
2268         gst_sub_parse_chain (pad, parent, buf);
2269       }
2270       ret = gst_pad_event_default (pad, parent, event);
2271       break;
2272     }
2273     case GST_EVENT_SEGMENT:
2274     {
2275       const GstSegment *s;
2276
2277 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
2278       if (self->first_buffer) {
2279         if (!SUBPARSE_SEEK_TRYLOCK (self)) {
2280           /* new seeking request is in process */
2281           GST_WARNING_OBJECT (self, "ignore the old newsegment event");
2282           ret = TRUE;
2283           gst_event_unref (event);
2284           break;
2285         }
2286       } else {
2287         SUBPARSE_SEEK_LOCK (self);
2288       }
2289 #endif
2290
2291       gst_event_parse_segment (event, &s);
2292       if (s->format == GST_FORMAT_TIME)
2293         gst_event_copy_segment (event, &self->segment);
2294       GST_DEBUG_OBJECT (self, "newsegment (%s)",
2295           gst_format_get_name (self->segment.format));
2296
2297       /* if not time format, we'll either start with a 0 timestamp anyway or
2298        * it's following a seek in which case we'll have saved the requested
2299        * seek segment and don't want to overwrite it (remember that on a seek
2300        * we always just seek back to the start in BYTES format and just throw
2301        * away all text that's before the requested position; if the subtitles
2302        * come from an upstream demuxer, it won't be able to handle our BYTES
2303        * seek request and instead send us a newsegment from the seek request
2304        * it received via its video pads instead, so all is fine then too) */
2305       ret = TRUE;
2306       gst_event_unref (event);
2307       /* in either case, let's not simply discard this event;
2308        * trigger sending of the saved requested seek segment
2309        * or the one taken here from upstream */
2310       self->need_segment = TRUE;
2311
2312 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
2313       SUBPARSE_SEEK_UNLOCK (self);
2314 #endif
2315
2316       break;
2317     }
2318     case GST_EVENT_FLUSH_START:
2319     {
2320       self->flushing = TRUE;
2321
2322       ret = gst_pad_event_default (pad, parent, event);
2323       break;
2324     }
2325     case GST_EVENT_FLUSH_STOP:
2326     {
2327       self->flushing = FALSE;
2328
2329       ret = gst_pad_event_default (pad, parent, event);
2330       break;
2331     }
2332     default:
2333       ret = gst_pad_event_default (pad, parent, event);
2334       break;
2335   }
2336
2337   return ret;
2338 }
2339
2340
2341 static GstStateChangeReturn
2342 gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
2343 {
2344   GstStateChangeReturn ret = GST_STATE_CHANGE_SUCCESS;
2345   GstSubParse *self = GST_SUBPARSE (element);
2346
2347   switch (transition) {
2348     case GST_STATE_CHANGE_READY_TO_PAUSED:
2349       /* format detection will init the parser state */
2350       self->offset = 0;
2351       self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
2352 #ifdef TIZEN_FEATURE_UPSTREAM
2353       self->strip_pango_markup = FALSE;
2354 #endif
2355       self->valid_utf8 = TRUE;
2356       self->first_buffer = TRUE;
2357       g_free (self->detected_encoding);
2358       self->detected_encoding = NULL;
2359       g_string_truncate (self->textbuf, 0);
2360       gst_adapter_clear (self->adapter);
2361       break;
2362     default:
2363       break;
2364   }
2365
2366   ret = GST_ELEMENT_CLASS (parent_class)->change_state (element, transition);
2367   if (ret == GST_STATE_CHANGE_FAILURE)
2368     return ret;
2369
2370   switch (transition) {
2371     case GST_STATE_CHANGE_PAUSED_TO_READY:
2372       parser_state_dispose (self, &self->state);
2373       self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
2374       break;
2375     default:
2376       break;
2377   }
2378
2379   return ret;
2380 }
2381
2382 /*
2383  * Typefind support.
2384  */
2385
2386 /* FIXME 0.11: these caps are ugly, use app/x-subtitle + type field or so;
2387  * also, give different  subtitle formats really different types */
2388 static GstStaticCaps mpl2_caps =
2389 GST_STATIC_CAPS ("application/x-subtitle-mpl2");
2390 #define SUB_CAPS (gst_static_caps_get (&sub_caps))
2391
2392 static GstStaticCaps tmp_caps =
2393 GST_STATIC_CAPS ("application/x-subtitle-tmplayer");
2394 #define TMP_CAPS (gst_static_caps_get (&tmp_caps))
2395
2396 static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle");
2397 #define MPL2_CAPS (gst_static_caps_get (&mpl2_caps))
2398
2399 static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami");
2400 #define SAMI_CAPS (gst_static_caps_get (&smi_caps))
2401
2402 static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks");
2403 #define DKS_CAPS (gst_static_caps_get (&dks_caps))
2404
2405 static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt");
2406 #define VTT_CAPS (gst_static_caps_get (&vtt_caps))
2407
2408 static GstStaticCaps qttext_caps =
2409 GST_STATIC_CAPS ("application/x-subtitle-qttext");
2410 #define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps))
2411
2412 static GstStaticCaps lrc_caps = GST_STATIC_CAPS ("application/x-subtitle-lrc");
2413 #define LRC_CAPS (gst_static_caps_get (&lrc_caps))
2414
2415 static void
2416 gst_subparse_type_find (GstTypeFind * tf, gpointer private)
2417 {
2418   GstSubParseFormat format;
2419   const guint8 *data;
2420   GstCaps *caps;
2421   gchar *str;
2422   gchar *encoding = NULL;
2423   const gchar *end;
2424
2425   if (!(data = gst_type_find_peek (tf, 0, 129)))
2426     return;
2427
2428   /* make sure string passed to _autodetect() is NUL-terminated */
2429   str = g_malloc0 (129);
2430   memcpy (str, data, 128);
2431
2432   if ((encoding = detect_encoding (str, 128)) != NULL) {
2433     gchar *converted_str;
2434     GError *err = NULL;
2435     gsize tmp;
2436
2437     converted_str = gst_convert_to_utf8 (str, 128, encoding, &tmp, &err);
2438     if (converted_str == NULL) {
2439       GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding,
2440           err->message);
2441       g_clear_error (&err);
2442     } else {
2443       g_free (str);
2444       str = converted_str;
2445     }
2446     g_free (encoding);
2447   }
2448
2449   /* Check if at least the first 120 chars are valid UTF8,
2450    * otherwise convert as always */
2451   if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) {
2452     gchar *converted_str;
2453     gsize tmp;
2454     const gchar *enc;
2455
2456     enc = g_getenv ("GST_SUBTITLE_ENCODING");
2457     if (enc == NULL || *enc == '\0') {
2458       /* if local encoding is UTF-8 and no encoding specified
2459        * via the environment variable, assume ISO-8859-15 */
2460       if (g_get_charset (&enc)) {
2461         enc = "ISO-8859-15";
2462       }
2463     }
2464     converted_str = gst_convert_to_utf8 (str, 128, enc, &tmp, NULL);
2465     if (converted_str != NULL) {
2466       g_free (str);
2467       str = converted_str;
2468     }
2469   }
2470
2471   format = gst_sub_parse_data_format_autodetect (str);
2472   g_free (str);
2473
2474   switch (format) {
2475     case GST_SUB_PARSE_FORMAT_MDVDSUB:
2476       GST_DEBUG ("MicroDVD format detected");
2477       caps = SUB_CAPS;
2478       break;
2479     case GST_SUB_PARSE_FORMAT_SUBRIP:
2480       GST_DEBUG ("SubRip format detected");
2481       caps = SUB_CAPS;
2482       break;
2483     case GST_SUB_PARSE_FORMAT_MPSUB:
2484       GST_DEBUG ("MPSub format detected");
2485       caps = SUB_CAPS;
2486       break;
2487     case GST_SUB_PARSE_FORMAT_SAMI:
2488       GST_DEBUG ("SAMI (time-based) format detected");
2489       caps = SAMI_CAPS;
2490       break;
2491     case GST_SUB_PARSE_FORMAT_TMPLAYER:
2492       GST_DEBUG ("TMPlayer (time based) format detected");
2493       caps = TMP_CAPS;
2494       break;
2495       /* FIXME: our MPL2 typefinding is not really good enough to warrant
2496        * returning a high probability (however, since we registered our
2497        * typefinder here with a rank of MARGINAL we should pretty much only
2498        * be called if most other typefinders have already run */
2499     case GST_SUB_PARSE_FORMAT_MPL2:
2500       GST_DEBUG ("MPL2 (time based) format detected");
2501       caps = MPL2_CAPS;
2502       break;
2503     case GST_SUB_PARSE_FORMAT_SUBVIEWER:
2504       GST_DEBUG ("SubViewer format detected");
2505       caps = SUB_CAPS;
2506       break;
2507     case GST_SUB_PARSE_FORMAT_DKS:
2508       GST_DEBUG ("DKS format detected");
2509       caps = DKS_CAPS;
2510       break;
2511     case GST_SUB_PARSE_FORMAT_QTTEXT:
2512       GST_DEBUG ("QTtext format detected");
2513       caps = QTTEXT_CAPS;
2514       break;
2515     case GST_SUB_PARSE_FORMAT_LRC:
2516       GST_DEBUG ("LRC format detected");
2517       caps = LRC_CAPS;
2518       break;
2519     case GST_SUB_PARSE_FORMAT_VTT:
2520       GST_DEBUG ("WebVTT format detected");
2521       caps = VTT_CAPS;
2522       break;
2523     default:
2524     case GST_SUB_PARSE_FORMAT_UNKNOWN:
2525       GST_DEBUG ("no subtitle format detected");
2526       return;
2527   }
2528
2529   /* if we're here, it's ok */
2530   gst_type_find_suggest (tf, GST_TYPE_FIND_MAXIMUM, caps);
2531 }
2532
2533 static gboolean
2534 plugin_init (GstPlugin * plugin)
2535 {
2536   GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser");
2537
2538   if (!gst_type_find_register (plugin, "subparse_typefind", GST_RANK_MARGINAL,
2539           gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt",
2540           SUB_CAPS, NULL, NULL))
2541     return FALSE;
2542
2543   if (!gst_element_register (plugin, "subparse",
2544           GST_RANK_PRIMARY, GST_TYPE_SUBPARSE) ||
2545       !gst_element_register (plugin, "ssaparse",
2546           GST_RANK_PRIMARY, GST_TYPE_SSA_PARSE)) {
2547     return FALSE;
2548   }
2549
2550   return TRUE;
2551 }
2552
2553 GST_PLUGIN_DEFINE (GST_VERSION_MAJOR,
2554     GST_VERSION_MINOR,
2555     subparse,
2556     "Subtitle parsing",
2557     plugin_init, VERSION, "LGPL", GST_PACKAGE_NAME, GST_PACKAGE_ORIGIN)