From f9cc8757c9a1ba59b3b201d0816873c9108e2a29 Mon Sep 17 00:00:00 2001
From: Mathieu Duponchelle <mathieu@centricular.com>
Date: Fri, 20 Mar 2020 19:09:17 +0100
Subject: [PATCH] subparse: convert from pango-markup to utf8 ..

when downstream requires it

Change-Id: I855f284401d8f5abbc4a1b1351f541c9883c92e4
---
 gst/subparse/gstsubparse.c      | 131 +++++++++++++++++++++++++++++++++++++++-
 gst/subparse/gstsubparse.h      |   4 +-
 tests/check/elements/subparse.c |  35 +++++++++++
 3 files changed, 168 insertions(+), 2 deletions(-)
diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c
index c702a33..a046840 100644
--- a/gst/subparse/gstsubparse.c
+++ b/gst/subparse/gstsubparse.c
@@ -206,6 +206,9 @@ gst_sub_parse_init (GstSubParse * subparse)
 
   subparse->textbuf = g_string_new (NULL);
   subparse->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
+#ifdef TIZEN_FEATURE_UPSTREAM
+  subparse->strip_pango_markup = FALSE;
+#endif
   subparse->flushing = FALSE;
   gst_segment_init (&subparse->segment, GST_FORMAT_TIME);
   subparse->need_segment = TRUE;
@@ -1876,11 +1879,101 @@ feed_textbuf (GstSubParse * self, GstBuffer * buf)
   g_free (input);
 }
 
+#ifdef TIZEN_FEATURE_UPSTREAM
+/* GMarkupParser text callback: appends @text_len bytes of character data to
+ * the string @user_data (a gchar **) points to. The GMarkupParser docs say
+ * @text is not NUL-terminated, so it must not be treated as a C string. */
+static void
+xml_text (GMarkupParseContext * context,
+    const gchar * text, gsize text_len, gpointer user_data, GError ** error)
+{
+  gchar **accum = (gchar **) user_data;
+  gchar *chunk = g_strndup (text, text_len);
+  gchar *joined = g_strconcat (*accum ? *accum : "", chunk, NULL);
+
+  g_free (chunk);
+  g_free (*accum);
+  *accum = joined;
+}
+
+/* Strips all tags from @markup and returns the remaining character data as
+ * a newly allocated string. Markup without any character data yields an
+ * empty string. Returns NULL, with @error set by GMarkup when @error is
+ * non-NULL, if @markup does not parse. */
+static gchar *
+strip_pango_markup (gchar * markup, GError ** error)
+{
+  GMarkupParser parser = { 0, };
+  GMarkupParseContext *context;
+  gchar *accum = NULL;
+
+  parser.text = xml_text;
+  context = g_markup_parse_context_new (&parser, 0, &accum, NULL);
+
+  /* Wrap in a root element since @markup may hold several top-level nodes.
+   * Stop at the first failure: a failed context must not be fed again. */
+  if (!g_markup_parse_context_parse (context, "<root>", 6, error)
+      || !g_markup_parse_context_parse (context, markup, strlen (markup), error)
+      || !g_markup_parse_context_parse (context, "</root>", 7, error)
+      || !g_markup_parse_context_end_parse (context, error)) {
+    g_free (accum);
+    accum = NULL;
+  } else if (accum == NULL) {
+    accum = g_strdup ("");
+  }
+
+  g_markup_parse_context_free (context);
+  return accum;
+}
+
+/* Sets the source pad caps from what downstream allows. @preferred is the
+ * caps of the detected format; stripping is enabled when that is
+ * pango-markup but the fixated caps are utf8. Returns FALSE on failure. */
+static gboolean
+gst_sub_parse_negotiate (GstSubParse * self, GstCaps * preferred)
+{
+  GstCaps *caps;
+  gboolean ret = FALSE;
+  const gchar *in_fmt, *out_fmt;
+
+  /* NULL means the source pad has no peer, so there is nothing to set */
+  if (!(caps = gst_pad_get_allowed_caps (self->srcpad)))
+    return FALSE;
+
+  in_fmt = gst_structure_get_string (gst_caps_get_structure (preferred, 0),
+      "format");
+  if (!g_strcmp0 (in_fmt, "utf8")) {
+    GstCaps *intersected = gst_caps_intersect (caps, preferred);
+    gst_caps_unref (caps);
+    caps = intersected;
+  }
+
+  /* test before fixating: empty caps have no structure to fixate */
+  if (gst_caps_is_empty (caps))
+    goto done;
+
+  caps = gst_caps_fixate (caps);
+  out_fmt = gst_structure_get_string (gst_caps_get_structure (caps, 0),
+      "format");
+  self->strip_pango_markup = !g_strcmp0 (out_fmt, "utf8")
+      && !g_strcmp0 (in_fmt, "pango-markup");
+  if (self->strip_pango_markup)
+    GST_INFO_OBJECT (self, "We will convert from pango-markup to utf8");
+
+  ret = gst_pad_set_caps (self->srcpad, caps);
+done:
+  gst_caps_unref (caps);
+  return ret;
+}
+#endif
+
 static GstFlowReturn
 handle_buffer (GstSubParse * self, GstBuffer * buf)
 {
   GstFlowReturn ret = GST_FLOW_OK;
+#ifndef TIZEN_FEATURE_UPSTREAM
   GstCaps *caps = NULL;
+#endif
   gchar *line, *subtitle;
   gboolean need_tags = FALSE;
 #ifdef TIZEN_FEATURE_SUBPARSE_MODIFICATION
@@ -1902,6 +1995,20 @@ handle_buffer (GstSubParse * self, GstBuffer * buf)
 
   /* make sure we know the format */
   if (G_UNLIKELY (self->parser_type == GST_SUB_PARSE_FORMAT_UNKNOWN)) {
+#ifdef TIZEN_FEATURE_UPSTREAM
+    GstCaps *preferred;
+
+    if (!(preferred = gst_sub_parse_format_autodetect (self))) {
+      return GST_FLOW_NOT_NEGOTIATED;
+    }
+
+    if (!gst_sub_parse_negotiate (self, preferred)) {
+      gst_caps_unref (preferred);
+      return GST_FLOW_NOT_NEGOTIATED;
+    }
+
+    gst_caps_unref (preferred);
+#else
     if (!(caps = gst_sub_parse_format_autodetect (self))) {
       return GST_FLOW_EOS;
     }
@@ -1910,6 +2017,7 @@ handle_buffer (GstSubParse * self, GstBuffer * buf)
       return GST_FLOW_EOS;
     }
     gst_caps_unref (caps);
+#endif
     need_tags = TRUE;
   }
 
@@ -1960,8 +2068,26 @@ handle_buffer (GstSubParse * self, GstBuffer * buf)
     }
 #endif
     if (subtitle) {
-      guint subtitle_len = strlen (subtitle);
+#ifdef TIZEN_FEATURE_UPSTREAM
+      guint subtitle_len;
+
+      if (self->strip_pango_markup) {
+        GError *error = NULL;
+        gchar *stripped;
+
+        if ((stripped = strip_pango_markup (subtitle, &error))) {
+          g_free (subtitle);
+          subtitle = stripped;
+        } else {
+          GST_WARNING_OBJECT (self, "Failed to strip pango markup: %s",
+              error->message);
+        }
+      }
 
+      subtitle_len = strlen (subtitle);
+#else
+      guint subtitle_len = strlen (subtitle);
+#endif
       /* +1 for terminating NUL character */
       buf = gst_buffer_new_and_alloc (subtitle_len + 1);
 
@@ -2137,6 +2263,9 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
       /* format detection will init the parser state */
       self->offset = 0;
       self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
+#ifdef TIZEN_FEATURE_UPSTREAM
+      self->strip_pango_markup = FALSE;
+#endif
       self->valid_utf8 = TRUE;
       self->first_buffer = TRUE;
       g_free (self->detected_encoding);
diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h
index af7520e..59f6d75 100644
--- a/gst/subparse/gstsubparse.h
+++ b/gst/subparse/gstsubparse.h
@@ -118,7 +118,9 @@ struct _GstSubParse {
   gboolean valid_utf8;
   gchar   *detected_encoding;
   gchar   *encoding;
-
+#ifdef TIZEN_FEATURE_UPSTREAM
+  gboolean strip_pango_markup;
+#endif
   gboolean first_buffer;
 
   /* used by frame based parsers */
diff --git a/tests/check/elements/subparse.c b/tests/check/elements/subparse.c
index e6c75e0..c20ee3e 100644
--- a/tests/check/elements/subparse.c
+++ b/tests/check/elements/subparse.c
@@ -22,6 +22,9 @@
 #endif
 
 #include <gst/check/gstcheck.h>
+#ifdef TIZEN_FEATURE_UPSTREAM
+#include <gst/check/gstharness.h>
+#endif
 
 #include <string.h>
 
@@ -1036,6 +1039,35 @@ GST_START_TEST (test_lrc)
 
 GST_END_TEST;
 
+#ifdef TIZEN_FEATURE_UPSTREAM
+/* Pushes one buffer built from srt_input[5].in through subparse with
+ * downstream restricted to text/x-raw, format=utf8, and expects the output
+ * buffer to hold exactly the 3 bytes "Six". NOTE(review): presumably that
+ * input carries markup around "Six" - confirm against the srt_input table. */
+GST_START_TEST (test_raw_conversion)
+{
+  GstHarness *harness = gst_harness_new ("subparse");
+  GstBuffer *in_buf, *out_buf;
+  GstMapInfo info;
+
+  gst_harness_set_src_caps_str (harness, "application/x-subtitle");
+  gst_harness_set_sink_caps_str (harness, "text/x-raw, format=utf8");
+
+  in_buf = buffer_from_static_string (srt_input[5].in);
+  out_buf = gst_harness_push_and_pull (harness, in_buf);
+
+  gst_buffer_map (out_buf, &info, GST_MAP_READ);
+  fail_unless_equals_int (info.size, 3);
+  fail_unless_equals_string ((gchar *) info.data, "Six");
+  gst_buffer_unmap (out_buf, &info);
+
+  gst_clear_buffer (&out_buf);
+  gst_harness_teardown (harness);
+}
+
+GST_END_TEST;
+#endif
+
 /* TODO:
  *  - add/modify tests so that lines aren't dogfed to the parsers in complete
  *    lines or sets of complete lines, but rather in random chunks
@@ -1071,6 +1103,9 @@ subparse_suite (void)
   tcase_add_test (tc_chain, test_sami_bad_entities);
   tcase_add_test (tc_chain, test_sami_comment);
   tcase_add_test (tc_chain, test_lrc);
+#ifdef TIZEN_FEATURE_UPSTREAM
+  tcase_add_test (tc_chain, test_raw_conversion);
+#endif
   return s;
 }
 
-- 
2.7.4