From e7b42af896131f265c9b3f14f5b5c670500eea68 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Sebastian=20Dr=C3=B6ge?= <slomo@circular-chaos.org>
Date: Mon, 13 Oct 2008 08:58:29 +0000
Subject: [PATCH] gst/subparse/: Add support for UTF16/UTF32 subtitles as long
 as the first bytes of the first buffer contain the BOM. ...

Original commit message from CVS:
* gst/subparse/Makefile.am:
* gst/subparse/gstsubparse.c: (gst_sub_parse_dispose),
(gst_sub_parse_class_init), (gst_sub_parse_init),
(gst_convert_to_utf8), (detect_encoding), (convert_encoding),
(get_next_line), (gst_sub_parse_data_format_autodetect),
(feed_textbuf), (handle_buffer), (gst_sub_parse_change_state),
(gst_subparse_type_find):
* gst/subparse/gstsubparse.h:
Add support for UTF16/UTF32 subtitles as long as the first bytes of
the first buffer contain the BOM. This also adds support for other
encodings that allow NUL bytes via the encoding property.
Fixes bugs #552237 and #456788.
---
 ChangeLog                  |  15 ++++
 gst/subparse/Makefile.am   |   4 +-
 gst/subparse/gstsubparse.c | 171 +++++++++++++++++++++++++++++++++++++--------
 gst/subparse/gstsubparse.h |   7 +-
 4 files changed, 163 insertions(+), 34 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a5be1ec..39f5863 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2008-10-13  Sebastian Dröge  <sebastian.droege@collabora.co.uk>
 
+	* gst/subparse/Makefile.am:
+	* gst/subparse/gstsubparse.c: (gst_sub_parse_dispose),
+	(gst_sub_parse_class_init), (gst_sub_parse_init),
+	(gst_convert_to_utf8), (detect_encoding), (convert_encoding),
+	(get_next_line), (gst_sub_parse_data_format_autodetect),
+	(feed_textbuf), (handle_buffer), (gst_sub_parse_change_state),
+	(gst_subparse_type_find):
+	* gst/subparse/gstsubparse.h:
+	Add support for UTF16/UTF32 subtitles as long as the first bytes of
+	the first buffer contain the BOM. This also adds support for other
+	encodings that allow NUL bytes via the encoding property.
+	Fixes bugs #552237 and #456788.
+
+2008-10-13  Sebastian Dröge  <sebastian.droege@collabora.co.uk>
+
 	* gst-libs/gst/tag/tags.c: (gst_tag_image_data_to_image_buffer):
 	Don't drop the last byte of image tags if they're not an URI list.
 	Fixes bug #556066.
diff --git a/gst/subparse/Makefile.am b/gst/subparse/Makefile.am
index 18e091a..e958242 100644
--- a/gst/subparse/Makefile.am
+++ b/gst/subparse/Makefile.am
@@ -17,9 +17,9 @@ libgstsubparse_la_SOURCES = \
 	mpl2parse.c \
 	mpl2parse.h
 
-libgstsubparse_la_CFLAGS = $(GST_CFLAGS)
+libgstsubparse_la_CFLAGS = $(GST_CFLAGS) $(GST_BASE_CFLAGS)
 libgstsubparse_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
-libgstsubparse_la_LIBADD = $(GST_LIBS)
+libgstsubparse_la_LIBADD = $(GST_LIBS) $(GST_BASE_LIBS)
 
 noinst_HEADERS = \
 	gstssaparse.h \
diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c
index 5fbcfdd..8200f1b 100644
--- a/gst/subparse/gstsubparse.c
+++ b/gst/subparse/gstsubparse.c
@@ -142,6 +142,17 @@ gst_sub_parse_dispose (GObject * object)
     g_free (subparse->encoding);
     subparse->encoding = NULL;
   }
+
+  if (subparse->detected_encoding) {
+    g_free (subparse->detected_encoding);
+    subparse->detected_encoding = NULL;
+  }
+
+  if (subparse->adapter) {
+    g_object_unref (subparse->adapter);  /* GstAdapter is a plain GObject */
+    subparse->adapter = NULL;
+  }
+
   if (subparse->textbuf) {
     g_string_free (subparse->textbuf, TRUE);
     subparse->textbuf = NULL;
@@ -169,10 +180,10 @@ gst_sub_parse_class_init (GstSubParseClass * klass)
 
   g_object_class_install_property (object_class, PROP_ENCODING,
       g_param_spec_string ("subtitle-encoding", "subtitle charset encoding",
-          "Encoding to assume if input subtitles are not in UTF-8 encoding. "
-          "If not set, the GST_SUBTITLE_ENCODING environment variable will "
-          "be checked for an encoding to use. If that is not set either, "
-          "ISO-8859-15 will be assumed.", DEFAULT_ENCODING,
+          "Encoding to assume if input subtitles are not in UTF-8 or any other "
+          "Unicode encoding. If not set, the GST_SUBTITLE_ENCODING environment "
+          "variable will be checked for an encoding to use. If that is not set "
+          "either, ISO-8859-15 will be assumed.", DEFAULT_ENCODING,
           G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
 }
 
@@ -197,6 +208,8 @@ gst_sub_parse_init (GstSubParse * subparse)
   gst_segment_init (&subparse->segment, GST_FORMAT_TIME);
   subparse->need_segment = TRUE;
   subparse->encoding = g_strdup (DEFAULT_ENCODING);
+  subparse->detected_encoding = NULL;
+  subparse->adapter = gst_adapter_new ();
 }
 
 /*
@@ -304,21 +317,88 @@ gst_sub_parse_get_property (GObject * object, guint prop_id,
 }
 
 static gchar *
-convert_encoding (GstSubParse * self, const gchar * str, gsize len)
+gst_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding,
+    gsize * consumed, GError ** err)
+{
+  gchar *ret = NULL;
+
+  *consumed = 0;
+  ret =
+      g_convert_with_fallback (str, len, "UTF-8", encoding, "*", consumed, NULL,
+      err);
+  if (ret == NULL)
+    return ret;
+
+  /* Drop a leading UTF-8 BOM (EF BB BF) if the output starts with one */
+  len = strlen (ret);
+  if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB
+      && (guint8) ret[2] == 0xBF)
+    g_memmove (ret, ret + 3, len + 1 - 3);
+
+  return ret;
+}
+
+static gchar *
+detect_encoding (const gchar * str, gsize len)
+{
+  if (len >= 3 && (guint8) str[0] == 0xEF && (guint8) str[1] == 0xBB
+      && (guint8) str[2] == 0xBF)
+    return g_strdup ("UTF-8");
+  /* UTF-32 before UTF-16: the UTF-32LE BOM starts with the UTF-16LE BOM */
+  if (len >= 4 && (guint8) str[0] == 0x00 && (guint8) str[1] == 0x00
+      && (guint8) str[2] == 0xFE && (guint8) str[3] == 0xFF)
+    return g_strdup ("UTF-32BE");
+
+  if (len >= 4 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE
+      && (guint8) str[2] == 0x00 && (guint8) str[3] == 0x00)
+    return g_strdup ("UTF-32LE");
+
+  if (len >= 2 && (guint8) str[0] == 0xFE && (guint8) str[1] == 0xFF)
+    return g_strdup ("UTF-16BE");
+
+  if (len >= 2 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE)
+    return g_strdup ("UTF-16LE");
+
+  return NULL;
+}
+
+static gchar *
+convert_encoding (GstSubParse * self, const gchar * str, gsize len,
+    gsize * consumed)
 {
   const gchar *encoding;
   GError *err = NULL;
-  gchar *ret;
+  gchar *ret = NULL;
 
+  *consumed = 0;
+
+  /* First try any detected encoding */
+  if (self->detected_encoding) {
+    ret =
+        gst_convert_to_utf8 (str, len, self->detected_encoding, consumed, &err);
+
+    if (!err)
+      return ret;
+    /* Conversion failed: log it and forget the detected encoding */
+    GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
+        self->detected_encoding, err->message);
+    g_free (self->detected_encoding);
+    self->detected_encoding = NULL;
+    g_clear_error (&err);
+  }
+
+  /* Otherwise check if it's UTF8 */
   if (self->valid_utf8) {
     if (g_utf8_validate (str, len, NULL)) {
       GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
+      *consumed = len;
       return g_strndup (str, len);
     }
     GST_INFO_OBJECT (self, "invalid UTF-8!");
     self->valid_utf8 = FALSE;
   }
 
+  /* Else try fallback */
   encoding = self->encoding;
   if (encoding == NULL || *encoding == '\0') {
     encoding = g_getenv ("GST_SUBTITLE_ENCODING");
@@ -331,8 +411,7 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len)
     }
   }
 
-  ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL,
-      NULL, &err);
+  ret = gst_convert_to_utf8 (str, len, encoding, consumed, &err);
 
   if (err) {
     GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
@@ -340,8 +419,7 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len)
     g_error_free (err);
 
     /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
-    ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*",
-        NULL, NULL, NULL);
+    ret = gst_convert_to_utf8 (str, len, "ISO-8859-15", consumed, NULL);
   }
 
   GST_LOG_OBJECT (self,
@@ -373,7 +451,7 @@ get_next_line (GstSubParse * self)
   }
 
   line_len = line_end - self->textbuf->str;
-  line = convert_encoding (self, self->textbuf->str, line_len);
+  line = g_strndup (self->textbuf->str, line_len);
   self->textbuf = g_string_erase (self->textbuf, 0,
       line_len + (have_r ? 2 : 1));
   return line;
@@ -922,11 +1000,6 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
     }
   }
 
-  /* If the string contains a UTF-8 BOM drop it */
-  if ((guint8) match_str[0] == 0xEF && (guint8) match_str[1] == 0xBB
-      && (guint8) match_str[2] == 0xBF)
-    match_str += 3;
-
   if (regexec (&mdvd_rx, match_str, 0, NULL, 0) == 0) {
     GST_LOG ("MicroDVD (frame based) format detected");
     return GST_SUB_PARSE_FORMAT_MDVDSUB;
@@ -1026,6 +1099,8 @@ static void
 feed_textbuf (GstSubParse * self, GstBuffer * buf)
 {
   gboolean discont;
+  gsize consumed;
+  gchar *input = NULL;
 
   discont = GST_BUFFER_IS_DISCONT (buf);
 
@@ -1040,6 +1115,7 @@ feed_textbuf (GstSubParse * self, GstBuffer * buf)
     /* flush the parser state */
     parser_state_init (&self->state);
     g_string_truncate (self->textbuf, 0);
+    gst_adapter_clear (self->adapter);
 #ifndef GST_DISABLE_XML
     sami_context_reset (&self->state);
 #endif
@@ -1048,12 +1124,22 @@ feed_textbuf (GstSubParse * self, GstBuffer * buf)
      * subtitles which are discontinuous by nature. */
   }
 
-  self->textbuf = g_string_append_len (self->textbuf,
-      (gchar *) GST_BUFFER_DATA (buf), GST_BUFFER_SIZE (buf));
   self->offset = GST_BUFFER_OFFSET (buf) + GST_BUFFER_SIZE (buf);
   self->next_offset = self->offset;
 
-  gst_buffer_unref (buf);
+  gst_adapter_push (self->adapter, buf);
+
+  input =
+      convert_encoding (self, (const gchar *) gst_adapter_peek (self->adapter,
+          gst_adapter_available (self->adapter)),
+      (gsize) gst_adapter_available (self->adapter), &consumed);
+
+  if (input && consumed > 0) {
+    self->textbuf = g_string_append (self->textbuf, input);
+    gst_adapter_flush (self->adapter, consumed);
+  }
+
+  g_free (input);
 }
 
 static GstFlowReturn
@@ -1063,6 +1149,13 @@ handle_buffer (GstSubParse * self, GstBuffer * buf)
   GstCaps *caps = NULL;
   gchar *line, *subtitle;
 
+  if (self->first_buffer) {
+    self->detected_encoding =
+        detect_encoding ((gchar *) GST_BUFFER_DATA (buf),
+        GST_BUFFER_SIZE (buf));
+    self->first_buffer = FALSE;
+  }
+
   feed_textbuf (self, buf);
 
   /* make sure we know the format */
@@ -1080,15 +1173,6 @@ handle_buffer (GstSubParse * self, GstBuffer * buf)
   while ((line = get_next_line (self)) && !self->flushing) {
     guint offset = 0;
 
-    /* If this is the first line and it contains a UTF-8 BOM drop it */
-    if (self->first_line && strlen (line) >= 3 &&
-        (guint8) line[0] == 0xEF && (guint8) line[1] == 0xBB
-        && (guint8) line[2] == 0xBF) {
-      offset = 3;
-    }
-
-    self->first_line = FALSE;
-
     /* Set segment on our parser state machine */
     self->state.segment = &self->segment;
     /* Now parse the line, out of segment lines will just return NULL */
@@ -1268,8 +1352,11 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
       self->next_offset = 0;
       self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
       self->valid_utf8 = TRUE;
-      self->first_line = TRUE;
+      self->first_buffer = TRUE;
+      g_free (self->detected_encoding);
+      self->detected_encoding = NULL;
       g_string_truncate (self->textbuf, 0);
+      gst_adapter_clear (self->adapter);
       break;
     default:
       break;
@@ -1320,12 +1407,34 @@ gst_subparse_type_find (GstTypeFind * tf, gpointer private)
   const guint8 *data;
   GstCaps *caps;
   gchar *str;
+  gchar *encoding = NULL;
 
-  if (!(data = gst_type_find_peek (tf, 0, 36)))
+  if (!(data = gst_type_find_peek (tf, 0, 129)))
     return;
 
   /* make sure string passed to _autodetect() is NUL-terminated */
-  str = g_strndup ((gchar *) data, 35);
+  str = g_malloc0 (129);
+  memcpy (str, data, 128);
+
+  if ((encoding = detect_encoding (str, 128)) != NULL) {
+    gchar *converted_str;
+    GError *err = NULL;
+    gsize tmp;
+
+    converted_str = gst_convert_to_utf8 (str, 128, encoding, &tmp, &err);
+    if (converted_str == NULL) {
+      GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding,
+          err->message);
+      g_error_free (err);
+      g_free (encoding);
+      g_free (str);
+      return;
+    }
+    g_free (str);
+
+    str = converted_str;
+  }
+
   format = gst_sub_parse_data_format_autodetect (str);
   g_free (str);
 
diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h
index 510c3e8..f40ccee 100644
--- a/gst/subparse/gstsubparse.h
+++ b/gst/subparse/gstsubparse.h
@@ -22,6 +22,7 @@
 #define __GST_SUBPARSE_H__
 
 #include <gst/gst.h>
+#include <gst/base/gstadapter.h>
 
 GST_DEBUG_CATEGORY_EXTERN (sub_parse_debug);
 #define GST_CAT_DEFAULT sub_parse_debug
@@ -73,6 +74,9 @@ struct _GstSubParse {
 
   GstPad *sinkpad,*srcpad;
 
+  /* contains the input in the input encoding */
+  GstAdapter *adapter;
+  /* contains the UTF-8 decoded input */
   GString *textbuf;
 
   GstSubParseFormat parser_type;
@@ -92,9 +96,10 @@ struct _GstSubParse {
   
   gboolean flushing;
   gboolean valid_utf8;
+  gchar   *detected_encoding;
   gchar   *encoding;
 
-  gboolean first_line;
+  gboolean first_buffer;
 };
 
 struct _GstSubParseClass {
-- 
2.7.4