From b6f8d0544c1a42304563b52a687864314dbb4a49 Mon Sep 17 00:00:00 2001 From: =?utf8?q?St=C3=A9phane=20Cerveau?= Date: Mon, 30 Nov 2020 11:00:30 +0100 Subject: [PATCH] subparse: allow per feature registration Split plugin into features including elements and device providers which can be individually registered during a static build. More details here: https://gitlab.freedesktop.org/gstreamer/gst-build/-/merge_requests/199 https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/661 Part-of: --- gst/subparse/gstssaparse.c | 6 + gst/subparse/gstsubparse.c | 420 ++---------------------------------- gst/subparse/gstsubparse.h | 19 +- gst/subparse/gstsubparseelement.c | 426 +++++++++++++++++++++++++++++++++++++ gst/subparse/gstsubparseelements.h | 58 +++++ gst/subparse/gstsubparseplugin.c | 45 ++++ gst/subparse/meson.build | 2 + 7 files changed, 553 insertions(+), 423 deletions(-) create mode 100644 gst/subparse/gstsubparseelement.c create mode 100644 gst/subparse/gstsubparseelements.h create mode 100644 gst/subparse/gstsubparseplugin.c diff --git a/gst/subparse/gstssaparse.c b/gst/subparse/gstssaparse.c index c849c08..a9cdcd1 100644 --- a/gst/subparse/gstssaparse.c +++ b/gst/subparse/gstssaparse.c @@ -28,8 +28,11 @@ #include #include "gstssaparse.h" +#include "gstsubparseelements.h" + GST_DEBUG_CATEGORY_STATIC (ssa_parse_debug); +#undef GST_CAT_DEFAULT #define GST_CAT_DEFAULT ssa_parse_debug static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink", @@ -46,6 +49,9 @@ static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src", #define gst_ssa_parse_parent_class parent_class G_DEFINE_TYPE (GstSsaParse, gst_ssa_parse, GST_TYPE_ELEMENT); +GST_ELEMENT_REGISTER_DEFINE_WITH_CODE (ssaparse, "ssaparse", + GST_RANK_PRIMARY, GST_TYPE_SSA_PARSE, sub_parse_element_init (plugin)); + static GstStateChangeReturn gst_ssa_parse_change_state (GstElement * element, GstStateChange transition); diff --git a/gst/subparse/gstsubparse.c 
b/gst/subparse/gstsubparse.c index 382e430..e9bb24e 100644 --- a/gst/subparse/gstsubparse.c +++ b/gst/subparse/gstsubparse.c @@ -32,13 +32,13 @@ #include #include "gstsubparse.h" + #include "gstssaparse.h" #include "samiparse.h" #include "tmplayerparse.h" #include "mpl2parse.h" #include "qttextparse.h" - -GST_DEBUG_CATEGORY (sub_parse_debug); +#include "gstsubparseelements.h" #define DEFAULT_ENCODING NULL #define ATTRIBUTE_REGEX "\\s?[a-zA-Z0-9\\. \t\\(\\)]*" @@ -93,8 +93,11 @@ static GstFlowReturn gst_sub_parse_chain (GstPad * sinkpad, GstObject * parent, #define gst_sub_parse_parent_class parent_class G_DEFINE_TYPE (GstSubParse, gst_sub_parse, GST_TYPE_ELEMENT); -static void -gst_sub_parse_dispose (GObject * object) +GST_ELEMENT_REGISTER_DEFINE_WITH_CODE (subparse, "subparse", + GST_RANK_PRIMARY, GST_TYPE_SUBPARSE, sub_parse_element_init (plugin)) + + + static void gst_sub_parse_dispose (GObject * object) { GstSubParse *subparse = GST_SUBPARSE (object); @@ -392,52 +395,9 @@ gst_sub_parse_get_format_description (GstSubParseFormat format) return NULL; } -static gchar * -gst_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding, - gsize * consumed, GError ** err) -{ - gchar *ret = NULL; - - *consumed = 0; - /* The char cast is necessary in glib < 2.24 */ - ret = - g_convert_with_fallback (str, len, "UTF-8", encoding, (char *) "*", - consumed, NULL, err); - if (ret == NULL) - return ret; - - /* + 3 to skip UTF-8 BOM if it was added */ - len = strlen (ret); - if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB - && (guint8) ret[2] == 0xBF) - memmove (ret, ret + 3, len + 1 - 3); - - return ret; -} - -static gchar * -detect_encoding (const gchar * str, gsize len) -{ - if (len >= 3 && (guint8) str[0] == 0xEF && (guint8) str[1] == 0xBB - && (guint8) str[2] == 0xBF) - return g_strdup ("UTF-8"); - - if (len >= 2 && (guint8) str[0] == 0xFE && (guint8) str[1] == 0xFF) - return g_strdup ("UTF-16BE"); - - if (len >= 2 && (guint8) str[0] == 0xFF && 
(guint8) str[1] == 0xFE) - return g_strdup ("UTF-16LE"); - if (len >= 4 && (guint8) str[0] == 0x00 && (guint8) str[1] == 0x00 - && (guint8) str[2] == 0xFE && (guint8) str[3] == 0xFF) - return g_strdup ("UTF-32BE"); - if (len >= 4 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE - && (guint8) str[2] == 0x00 && (guint8) str[3] == 0x00) - return g_strdup ("UTF-32LE"); - return NULL; -} static gchar * convert_encoding (GstSubParse * self, const gchar * str, gsize len, @@ -452,7 +412,8 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, /* First try any detected encoding */ if (self->detected_encoding) { ret = - gst_convert_to_utf8 (str, len, self->detected_encoding, consumed, &err); + gst_sub_parse_gst_convert_to_utf8 (str, len, self->detected_encoding, + consumed, &err); if (!err) return ret; @@ -488,7 +449,7 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, } } - ret = gst_convert_to_utf8 (str, len, encoding, consumed, &err); + ret = gst_sub_parse_gst_convert_to_utf8 (str, len, encoding, consumed, &err); if (err) { GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s", @@ -496,7 +457,9 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, g_clear_error (&err); /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */ - ret = gst_convert_to_utf8 (str, len, "ISO-8859-15", consumed, NULL); + ret = + gst_sub_parse_gst_convert_to_utf8 (str, len, "ISO-8859-15", consumed, + NULL); } GST_LOG_OBJECT (self, @@ -1416,184 +1379,7 @@ parser_state_dispose (GstSubParse * self, ParserState * state) state->allowed_tags = NULL; } -/* regex type enum */ -typedef enum -{ - GST_SUB_PARSE_REGEX_UNKNOWN = 0, - GST_SUB_PARSE_REGEX_MDVDSUB = 1, - GST_SUB_PARSE_REGEX_SUBRIP = 2, - GST_SUB_PARSE_REGEX_DKS = 3, - GST_SUB_PARSE_REGEX_VTT = 4, -} GstSubParseRegex; - -static gpointer -gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype) -{ - gpointer result = NULL; - GError 
*gerr = NULL; - switch (regtype) { - case GST_SUB_PARSE_REGEX_MDVDSUB: - result = - (gpointer) g_regex_new ("^\\{[0-9]+\\}\\{[0-9]+\\}", - G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr); - if (result == NULL) { - g_warning ("Compilation of mdvd regex failed: %s", gerr->message); - g_clear_error (&gerr); - } - break; - case GST_SUB_PARSE_REGEX_SUBRIP: - result = (gpointer) - g_regex_new ("^[\\s\\n]*[\\n]? {0,3}[ 0-9]{1,4}\\s*(\x0d)?\x0a" - " ?[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,3}" - " +--> +[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,2}", - G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr); - if (result == NULL) { - g_warning ("Compilation of subrip regex failed: %s", gerr->message); - g_clear_error (&gerr); - } - break; - case GST_SUB_PARSE_REGEX_DKS: - result = (gpointer) g_regex_new ("^\\[[0-9]+:[0-9]+:[0-9]+\\].*", - G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr); - if (result == NULL) { - g_warning ("Compilation of dks regex failed: %s", gerr->message); - g_clear_error (&gerr); - } - break; - case GST_SUB_PARSE_REGEX_VTT: - result = (gpointer) - g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0, - &gerr); - if (result == NULL) { - g_warning ("Compilation of vtt regex failed: %s", gerr->message); - g_error_free (gerr); - } - break; - default: - GST_WARNING ("Trying to allocate regex of unknown type %u", regtype); - } - return result; -} - -/* - * FIXME: maybe we should pass along a second argument, the preceding - * text buffer, because that is how this originally worked, even though - * I don't really see the use of that. 
- */ - -static GstSubParseFormat -gst_sub_parse_data_format_autodetect (gchar * match_str) -{ - guint n1, n2, n3; - - static GOnce mdvd_rx_once = G_ONCE_INIT; - static GOnce subrip_rx_once = G_ONCE_INIT; - static GOnce dks_rx_once = G_ONCE_INIT; - static GOnce vtt_rx_once = G_ONCE_INIT; - - GRegex *mdvd_grx; - GRegex *subrip_grx; - GRegex *dks_grx; - GRegex *vtt_grx; - - g_once (&mdvd_rx_once, - (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, - (gpointer) GST_SUB_PARSE_REGEX_MDVDSUB); - g_once (&subrip_rx_once, - (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, - (gpointer) GST_SUB_PARSE_REGEX_SUBRIP); - g_once (&dks_rx_once, - (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, - (gpointer) GST_SUB_PARSE_REGEX_DKS); - g_once (&vtt_rx_once, - (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, - (gpointer) GST_SUB_PARSE_REGEX_VTT); - - mdvd_grx = (GRegex *) mdvd_rx_once.retval; - subrip_grx = (GRegex *) subrip_rx_once.retval; - dks_grx = (GRegex *) dks_rx_once.retval; - vtt_grx = (GRegex *) vtt_rx_once.retval; - - if (g_regex_match (mdvd_grx, match_str, 0, NULL)) { - GST_LOG ("MicroDVD (frame based) format detected"); - return GST_SUB_PARSE_FORMAT_MDVDSUB; - } - if (g_regex_match (subrip_grx, match_str, 0, NULL)) { - GST_LOG ("SubRip (time based) format detected"); - return GST_SUB_PARSE_FORMAT_SUBRIP; - } - if (g_regex_match (dks_grx, match_str, 0, NULL)) { - GST_LOG ("DKS (time based) format detected"); - return GST_SUB_PARSE_FORMAT_DKS; - } - if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) { - GST_LOG ("WebVTT (time based) format detected"); - return GST_SUB_PARSE_FORMAT_VTT; - } - - if (!strncmp (match_str, "FORMAT=TIME", 11)) { - GST_LOG ("MPSub (time based) format detected"); - return GST_SUB_PARSE_FORMAT_MPSUB; - } - if (strstr (match_str, "") != NULL || - strstr (match_str, "") != NULL) { - GST_LOG ("SAMI (time based) format detected"); - return GST_SUB_PARSE_FORMAT_SAMI; - } - /* we're boldly 
assuming the first subtitle appears within the first hour */ - if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 || - sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 || - sscanf (match_str, "00:%02u:%02u:", &n1, &n2) == 2 || - sscanf (match_str, "00:%02u:%02u=", &n1, &n2) == 2 || - sscanf (match_str, "00:%02u:%02u,%u=", &n1, &n2, &n3) == 3) { - GST_LOG ("TMPlayer (time based) format detected"); - return GST_SUB_PARSE_FORMAT_TMPLAYER; - } - if (sscanf (match_str, "[%u][%u]", &n1, &n2) == 2) { - GST_LOG ("MPL2 (time based) format detected"); - return GST_SUB_PARSE_FORMAT_MPL2; - } - if (strstr (match_str, "[INFORMATION]") != NULL) { - GST_LOG ("SubViewer (time based) format detected"); - return GST_SUB_PARSE_FORMAT_SUBVIEWER; - } - if (strstr (match_str, "{QTtext}") != NULL) { - GST_LOG ("QTtext (time based) format detected"); - return GST_SUB_PARSE_FORMAT_QTTEXT; - } - /* We assume the LRC file starts immediately */ - if (match_str[0] == '[') { - gboolean all_lines_good = TRUE; - gchar **split; - gchar **ptr; - - ptr = split = g_strsplit (match_str, "\n", -1); - while (*ptr && *(ptr + 1)) { - gchar *str = *ptr; - gint len = strlen (str); - - if (sscanf (str, "[%u:%02u.%02u]", &n1, &n2, &n3) == 3 || - sscanf (str, "[%u:%02u.%03u]", &n1, &n2, &n3) == 3) { - all_lines_good = TRUE; - } else if (str[len - 1] == ']' && strchr (str, ':') != NULL) { - all_lines_good = TRUE; - } else { - all_lines_good = FALSE; - break; - } - - ptr++; - } - g_strfreev (split); - - if (all_lines_good) - return GST_SUB_PARSE_FORMAT_LRC; - } - - GST_DEBUG ("no subtitle format detected"); - return GST_SUB_PARSE_FORMAT_UNKNOWN; -} static GstCaps * gst_sub_parse_format_autodetect (GstSubParse * self) @@ -1823,7 +1609,8 @@ handle_buffer (GstSubParse * self, GstBuffer * buf) GstMapInfo map; gst_buffer_map (buf, &map, GST_MAP_READ); - self->detected_encoding = detect_encoding ((gchar *) map.data, map.size); + self->detected_encoding = + gst_sub_parse_detect_encoding ((gchar *) map.data, 
map.size); gst_buffer_unmap (buf, &map); self->first_buffer = FALSE; self->state.fps_n = self->fps_n; @@ -2081,180 +1868,3 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition) return ret; } - -/* - * Typefind support. - */ - -/* FIXME 0.11: these caps are ugly, use app/x-subtitle + type field or so; - * also, give different subtitle formats really different types */ -static GstStaticCaps mpl2_caps = -GST_STATIC_CAPS ("application/x-subtitle-mpl2"); -#define SUB_CAPS (gst_static_caps_get (&sub_caps)) - -static GstStaticCaps tmp_caps = -GST_STATIC_CAPS ("application/x-subtitle-tmplayer"); -#define TMP_CAPS (gst_static_caps_get (&tmp_caps)) - -static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle"); -#define MPL2_CAPS (gst_static_caps_get (&mpl2_caps)) - -static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami"); -#define SAMI_CAPS (gst_static_caps_get (&smi_caps)) - -static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks"); -#define DKS_CAPS (gst_static_caps_get (&dks_caps)) - -static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt"); -#define VTT_CAPS (gst_static_caps_get (&vtt_caps)) - -static GstStaticCaps qttext_caps = -GST_STATIC_CAPS ("application/x-subtitle-qttext"); -#define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps)) - -static GstStaticCaps lrc_caps = GST_STATIC_CAPS ("application/x-subtitle-lrc"); -#define LRC_CAPS (gst_static_caps_get (&lrc_caps)) - -static void -gst_subparse_type_find (GstTypeFind * tf, gpointer private) -{ - GstSubParseFormat format; - const guint8 *data; - GstCaps *caps; - gchar *str; - gchar *encoding = NULL; - const gchar *end; - - if (!(data = gst_type_find_peek (tf, 0, 129))) - return; - - /* make sure string passed to _autodetect() is NUL-terminated */ - str = g_malloc0 (129); - memcpy (str, data, 128); - - if ((encoding = detect_encoding (str, 128)) != NULL) { - gchar *converted_str; - GError *err = NULL; - gsize 
tmp; - - converted_str = gst_convert_to_utf8 (str, 128, encoding, &tmp, &err); - if (converted_str == NULL) { - GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding, - err->message); - g_clear_error (&err); - } else { - g_free (str); - str = converted_str; - } - g_free (encoding); - } - - /* Check if at least the first 120 chars are valid UTF8, - * otherwise convert as always */ - if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) { - gchar *converted_str; - gsize tmp; - const gchar *enc; - - enc = g_getenv ("GST_SUBTITLE_ENCODING"); - if (enc == NULL || *enc == '\0') { - /* if local encoding is UTF-8 and no encoding specified - * via the environment variable, assume ISO-8859-15 */ - if (g_get_charset (&enc)) { - enc = "ISO-8859-15"; - } - } - converted_str = gst_convert_to_utf8 (str, 128, enc, &tmp, NULL); - if (converted_str != NULL) { - g_free (str); - str = converted_str; - } - } - - format = gst_sub_parse_data_format_autodetect (str); - g_free (str); - - switch (format) { - case GST_SUB_PARSE_FORMAT_MDVDSUB: - GST_DEBUG ("MicroDVD format detected"); - caps = SUB_CAPS; - break; - case GST_SUB_PARSE_FORMAT_SUBRIP: - GST_DEBUG ("SubRip format detected"); - caps = SUB_CAPS; - break; - case GST_SUB_PARSE_FORMAT_MPSUB: - GST_DEBUG ("MPSub format detected"); - caps = SUB_CAPS; - break; - case GST_SUB_PARSE_FORMAT_SAMI: - GST_DEBUG ("SAMI (time-based) format detected"); - caps = SAMI_CAPS; - break; - case GST_SUB_PARSE_FORMAT_TMPLAYER: - GST_DEBUG ("TMPlayer (time based) format detected"); - caps = TMP_CAPS; - break; - /* FIXME: our MPL2 typefinding is not really good enough to warrant - * returning a high probability (however, since we registered our - * typefinder here with a rank of MARGINAL we should pretty much only - * be called if most other typefinders have already run */ - case GST_SUB_PARSE_FORMAT_MPL2: - GST_DEBUG ("MPL2 (time based) format detected"); - caps = MPL2_CAPS; - break; - case GST_SUB_PARSE_FORMAT_SUBVIEWER: - 
GST_DEBUG ("SubViewer format detected"); - caps = SUB_CAPS; - break; - case GST_SUB_PARSE_FORMAT_DKS: - GST_DEBUG ("DKS format detected"); - caps = DKS_CAPS; - break; - case GST_SUB_PARSE_FORMAT_QTTEXT: - GST_DEBUG ("QTtext format detected"); - caps = QTTEXT_CAPS; - break; - case GST_SUB_PARSE_FORMAT_LRC: - GST_DEBUG ("LRC format detected"); - caps = LRC_CAPS; - break; - case GST_SUB_PARSE_FORMAT_VTT: - GST_DEBUG ("WebVTT format detected"); - caps = VTT_CAPS; - break; - default: - case GST_SUB_PARSE_FORMAT_UNKNOWN: - GST_DEBUG ("no subtitle format detected"); - return; - } - - /* if we're here, it's ok */ - gst_type_find_suggest (tf, GST_TYPE_FIND_MAXIMUM, caps); -} - -static gboolean -plugin_init (GstPlugin * plugin) -{ - GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser"); - - if (!gst_type_find_register (plugin, "subparse_typefind", GST_RANK_MARGINAL, - gst_subparse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt", - SUB_CAPS, NULL, NULL)) - return FALSE; - - if (!gst_element_register (plugin, "subparse", - GST_RANK_PRIMARY, GST_TYPE_SUBPARSE) || - !gst_element_register (plugin, "ssaparse", - GST_RANK_PRIMARY, GST_TYPE_SSA_PARSE)) { - return FALSE; - } - - return TRUE; -} - -GST_PLUGIN_DEFINE (GST_VERSION_MAJOR, - GST_VERSION_MINOR, - subparse, - "Subtitle parsing", - plugin_init, VERSION, "LGPL", GST_PACKAGE_NAME, GST_PACKAGE_ORIGIN) diff --git a/gst/subparse/gstsubparse.h b/gst/subparse/gstsubparse.h index 6e156fb..7a1e9b0 100644 --- a/gst/subparse/gstsubparse.h +++ b/gst/subparse/gstsubparse.h @@ -24,30 +24,13 @@ #include #include -GST_DEBUG_CATEGORY_EXTERN (sub_parse_debug); -#define GST_CAT_DEFAULT sub_parse_debug +#include "gstsubparseelements.h" G_BEGIN_DECLS #define GST_TYPE_SUBPARSE (gst_sub_parse_get_type ()) G_DECLARE_FINAL_TYPE (GstSubParse, gst_sub_parse, GST, SUBPARSE, GstElement) -/* format enum */ -typedef enum -{ - GST_SUB_PARSE_FORMAT_UNKNOWN = 0, - GST_SUB_PARSE_FORMAT_MDVDSUB = 1, - GST_SUB_PARSE_FORMAT_SUBRIP = 2, - 
GST_SUB_PARSE_FORMAT_MPSUB = 3, - GST_SUB_PARSE_FORMAT_SAMI = 4, - GST_SUB_PARSE_FORMAT_TMPLAYER = 5, - GST_SUB_PARSE_FORMAT_MPL2 = 6, - GST_SUB_PARSE_FORMAT_SUBVIEWER = 7, - GST_SUB_PARSE_FORMAT_DKS = 8, - GST_SUB_PARSE_FORMAT_QTTEXT = 9, - GST_SUB_PARSE_FORMAT_LRC = 10, - GST_SUB_PARSE_FORMAT_VTT = 11 -} GstSubParseFormat; typedef struct { int state; diff --git a/gst/subparse/gstsubparseelement.c b/gst/subparse/gstsubparseelement.c new file mode 100644 index 0000000..7b40fd5 --- /dev/null +++ b/gst/subparse/gstsubparseelement.c @@ -0,0 +1,426 @@ +/* GStreamer + * Copyright (C) 2020 Huawei Technologies Co., Ltd. + * @Author: Stéphane Cerveau + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#include "gstsubparseelements.h" + +GST_DEBUG_CATEGORY (sub_parse_debug); + +/* regex type enum */ +typedef enum +{ + GST_SUB_PARSE_REGEX_UNKNOWN = 0, + GST_SUB_PARSE_REGEX_MDVDSUB = 1, + GST_SUB_PARSE_REGEX_SUBRIP = 2, + GST_SUB_PARSE_REGEX_DKS = 3, + GST_SUB_PARSE_REGEX_VTT = 4, +} GstSubParseRegex; + +static gpointer +gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype) +{ + gpointer result = NULL; + GError *gerr = NULL; + switch (regtype) { + case GST_SUB_PARSE_REGEX_MDVDSUB: + result = + (gpointer) g_regex_new ("^\\{[0-9]+\\}\\{[0-9]+\\}", + G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr); + if (result == NULL) { + g_warning ("Compilation of mdvd regex failed: %s", gerr->message); + g_clear_error (&gerr); + } + break; + case GST_SUB_PARSE_REGEX_SUBRIP: + result = (gpointer) + g_regex_new ("^[\\s\\n]*[\\n]? {0,3}[ 0-9]{1,4}\\s*(\x0d)?\x0a" + " ?[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] {0,2}[0-9]{1,3}" + " +--> +[0-9]{1,2}: ?[0-9]{1,2}: ?[0-9]{1,2}[,.] 
{0,2}[0-9]{1,2}", + G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr); + if (result == NULL) { + g_warning ("Compilation of subrip regex failed: %s", gerr->message); + g_clear_error (&gerr); + } + break; + case GST_SUB_PARSE_REGEX_DKS: + result = (gpointer) g_regex_new ("^\\[[0-9]+:[0-9]+:[0-9]+\\].*", + G_REGEX_RAW | G_REGEX_OPTIMIZE, 0, &gerr); + if (result == NULL) { + g_warning ("Compilation of dks regex failed: %s", gerr->message); + g_clear_error (&gerr); + } + break; + case GST_SUB_PARSE_REGEX_VTT: + result = (gpointer) + g_regex_new ("^(\\xef\\xbb\\xbf)?WEBVTT[\\xa\\xd\\x20\\x9]", 0, 0, + &gerr); + if (result == NULL) { + g_warning ("Compilation of vtt regex failed: %s", gerr->message); + g_error_free (gerr); + } + break; + + default: + GST_WARNING ("Trying to allocate regex of unknown type %u", regtype); + } + return result; +} + +/* + * FIXME: maybe we should pass along a second argument, the preceding + * text buffer, because that is how this originally worked, even though + * I don't really see the use of that. 
+ */ + +GstSubParseFormat +gst_sub_parse_data_format_autodetect (gchar * match_str) +{ + guint n1, n2, n3; + + static GOnce mdvd_rx_once = G_ONCE_INIT; + static GOnce subrip_rx_once = G_ONCE_INIT; + static GOnce dks_rx_once = G_ONCE_INIT; + static GOnce vtt_rx_once = G_ONCE_INIT; + + GRegex *mdvd_grx; + GRegex *subrip_grx; + GRegex *dks_grx; + GRegex *vtt_grx; + + g_once (&mdvd_rx_once, + (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, + (gpointer) GST_SUB_PARSE_REGEX_MDVDSUB); + g_once (&subrip_rx_once, + (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, + (gpointer) GST_SUB_PARSE_REGEX_SUBRIP); + g_once (&dks_rx_once, + (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, + (gpointer) GST_SUB_PARSE_REGEX_DKS); + g_once (&vtt_rx_once, + (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, + (gpointer) GST_SUB_PARSE_REGEX_VTT); + + mdvd_grx = (GRegex *) mdvd_rx_once.retval; + subrip_grx = (GRegex *) subrip_rx_once.retval; + dks_grx = (GRegex *) dks_rx_once.retval; + vtt_grx = (GRegex *) vtt_rx_once.retval; + + if (g_regex_match (mdvd_grx, match_str, 0, NULL)) { + GST_LOG ("MicroDVD (frame based) format detected"); + return GST_SUB_PARSE_FORMAT_MDVDSUB; + } + if (g_regex_match (subrip_grx, match_str, 0, NULL)) { + GST_LOG ("SubRip (time based) format detected"); + return GST_SUB_PARSE_FORMAT_SUBRIP; + } + if (g_regex_match (dks_grx, match_str, 0, NULL)) { + GST_LOG ("DKS (time based) format detected"); + return GST_SUB_PARSE_FORMAT_DKS; + } + if (g_regex_match (vtt_grx, match_str, 0, NULL) == TRUE) { + GST_LOG ("WebVTT (time based) format detected"); + return GST_SUB_PARSE_FORMAT_VTT; + } + + if (!strncmp (match_str, "FORMAT=TIME", 11)) { + GST_LOG ("MPSub (time based) format detected"); + return GST_SUB_PARSE_FORMAT_MPSUB; + } + if (strstr (match_str, "") != NULL || + strstr (match_str, "") != NULL) { + GST_LOG ("SAMI (time based) format detected"); + return GST_SUB_PARSE_FORMAT_SAMI; + } + /* we're boldly assuming 
the first subtitle appears within the first hour */ + if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 || + sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 || + sscanf (match_str, "00:%02u:%02u:", &n1, &n2) == 2 || + sscanf (match_str, "00:%02u:%02u=", &n1, &n2) == 2 || + sscanf (match_str, "00:%02u:%02u,%u=", &n1, &n2, &n3) == 3) { + GST_LOG ("TMPlayer (time based) format detected"); + return GST_SUB_PARSE_FORMAT_TMPLAYER; + } + if (sscanf (match_str, "[%u][%u]", &n1, &n2) == 2) { + GST_LOG ("MPL2 (time based) format detected"); + return GST_SUB_PARSE_FORMAT_MPL2; + } + if (strstr (match_str, "[INFORMATION]") != NULL) { + GST_LOG ("SubViewer (time based) format detected"); + return GST_SUB_PARSE_FORMAT_SUBVIEWER; + } + if (strstr (match_str, "{QTtext}") != NULL) { + GST_LOG ("QTtext (time based) format detected"); + return GST_SUB_PARSE_FORMAT_QTTEXT; + } + /* We assume the LRC file starts immediately */ + if (match_str[0] == '[') { + gboolean all_lines_good = TRUE; + gchar **split; + gchar **ptr; + + ptr = split = g_strsplit (match_str, "\n", -1); + while (*ptr && *(ptr + 1)) { + gchar *str = *ptr; + gint len = strlen (str); + + if (sscanf (str, "[%u:%02u.%02u]", &n1, &n2, &n3) == 3 || + sscanf (str, "[%u:%02u.%03u]", &n1, &n2, &n3) == 3) { + all_lines_good = TRUE; + } else if (str[len - 1] == ']' && strchr (str, ':') != NULL) { + all_lines_good = TRUE; + } else { + all_lines_good = FALSE; + break; + } + + ptr++; + } + g_strfreev (split); + + if (all_lines_good) + return GST_SUB_PARSE_FORMAT_LRC; + } + + GST_DEBUG ("no subtitle format detected"); + return GST_SUB_PARSE_FORMAT_UNKNOWN; +} + +gchar * +gst_sub_parse_gst_convert_to_utf8 (const gchar * str, gsize len, + const gchar * encoding, gsize * consumed, GError ** err) +{ + gchar *ret = NULL; + + *consumed = 0; + /* The char cast is necessary in glib < 2.24 */ + ret = + g_convert_with_fallback (str, len, "UTF-8", encoding, (char *) "*", + consumed, NULL, err); + if (ret == NULL) + return ret; + + /* + 3 
to skip UTF-8 BOM if it was added */ + len = strlen (ret); + if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB + && (guint8) ret[2] == 0xBF) + memmove (ret, ret + 3, len + 1 - 3); + + return ret; +} + +gchar * +gst_sub_parse_detect_encoding (const gchar * str, gsize len) +{ + if (len >= 3 && (guint8) str[0] == 0xEF && (guint8) str[1] == 0xBB + && (guint8) str[2] == 0xBF) + return g_strdup ("UTF-8"); + + if (len >= 2 && (guint8) str[0] == 0xFE && (guint8) str[1] == 0xFF) + return g_strdup ("UTF-16BE"); + + if (len >= 2 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE) + return g_strdup ("UTF-16LE"); + + if (len >= 4 && (guint8) str[0] == 0x00 && (guint8) str[1] == 0x00 + && (guint8) str[2] == 0xFE && (guint8) str[3] == 0xFF) + return g_strdup ("UTF-32BE"); + + if (len >= 4 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE + && (guint8) str[2] == 0x00 && (guint8) str[3] == 0x00) + return g_strdup ("UTF-32LE"); + + return NULL; +} + +/* + * Typefind support. + */ + +/* FIXME 0.11: these caps are ugly, use app/x-subtitle + type field or so; + * also, give different subtitle formats really different types */ +static GstStaticCaps mpl2_caps = +GST_STATIC_CAPS ("application/x-subtitle-mpl2"); +#define SUB_CAPS (gst_static_caps_get (&sub_caps)) + +static GstStaticCaps tmp_caps = +GST_STATIC_CAPS ("application/x-subtitle-tmplayer"); +#define TMP_CAPS (gst_static_caps_get (&tmp_caps)) + +static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle"); +#define MPL2_CAPS (gst_static_caps_get (&mpl2_caps)) + +static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami"); +#define SAMI_CAPS (gst_static_caps_get (&smi_caps)) + +static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks"); +#define DKS_CAPS (gst_static_caps_get (&dks_caps)) + +static GstStaticCaps vtt_caps = GST_STATIC_CAPS ("application/x-subtitle-vtt"); +#define VTT_CAPS (gst_static_caps_get (&vtt_caps)) + +static GstStaticCaps qttext_caps 
= +GST_STATIC_CAPS ("application/x-subtitle-qttext"); +#define QTTEXT_CAPS (gst_static_caps_get (&qttext_caps)) + +static GstStaticCaps lrc_caps = GST_STATIC_CAPS ("application/x-subtitle-lrc"); +#define LRC_CAPS (gst_static_caps_get (&lrc_caps)) + +static void +gst_sub_parse_type_find (GstTypeFind * tf, gpointer private) +{ + GstSubParseFormat format; + const guint8 *data; + GstCaps *caps; + gchar *str; + gchar *encoding = NULL; + const gchar *end; + + if (!(data = gst_type_find_peek (tf, 0, 129))) + return; + + /* make sure string passed to _autodetect() is NUL-terminated */ + str = g_malloc0 (129); + memcpy (str, data, 128); + + if ((encoding = gst_sub_parse_detect_encoding (str, 128)) != NULL) { + gchar *converted_str; + GError *err = NULL; + gsize tmp; + + converted_str = + gst_sub_parse_gst_convert_to_utf8 (str, 128, encoding, &tmp, &err); + if (converted_str == NULL) { + GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding, + err->message); + g_clear_error (&err); + } else { + g_free (str); + str = converted_str; + } + g_free (encoding); + } + + /* Check if at least the first 120 chars are valid UTF8, + * otherwise convert as always */ + if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) { + gchar *converted_str; + gsize tmp; + const gchar *enc; + + enc = g_getenv ("GST_SUBTITLE_ENCODING"); + if (enc == NULL || *enc == '\0') { + /* if local encoding is UTF-8 and no encoding specified + * via the environment variable, assume ISO-8859-15 */ + if (g_get_charset (&enc)) { + enc = "ISO-8859-15"; + } + } + converted_str = + gst_sub_parse_gst_convert_to_utf8 (str, 128, enc, &tmp, NULL); + if (converted_str != NULL) { + g_free (str); + str = converted_str; + } + } + + format = gst_sub_parse_data_format_autodetect (str); + g_free (str); + + switch (format) { + case GST_SUB_PARSE_FORMAT_MDVDSUB: + GST_DEBUG ("MicroDVD format detected"); + caps = SUB_CAPS; + break; + case GST_SUB_PARSE_FORMAT_SUBRIP: + GST_DEBUG ("SubRip format detected"); + 
caps = SUB_CAPS; + break; + case GST_SUB_PARSE_FORMAT_MPSUB: + GST_DEBUG ("MPSub format detected"); + caps = SUB_CAPS; + break; + case GST_SUB_PARSE_FORMAT_SAMI: + GST_DEBUG ("SAMI (time-based) format detected"); + caps = SAMI_CAPS; + break; + case GST_SUB_PARSE_FORMAT_TMPLAYER: + GST_DEBUG ("TMPlayer (time based) format detected"); + caps = TMP_CAPS; + break; + /* FIXME: our MPL2 typefinding is not really good enough to warrant + * returning a high probability (however, since we registered our + * typefinder here with a rank of MARGINAL we should pretty much only + * be called if most other typefinders have already run */ + case GST_SUB_PARSE_FORMAT_MPL2: + GST_DEBUG ("MPL2 (time based) format detected"); + caps = MPL2_CAPS; + break; + case GST_SUB_PARSE_FORMAT_SUBVIEWER: + GST_DEBUG ("SubViewer format detected"); + caps = SUB_CAPS; + break; + case GST_SUB_PARSE_FORMAT_DKS: + GST_DEBUG ("DKS format detected"); + caps = DKS_CAPS; + break; + case GST_SUB_PARSE_FORMAT_QTTEXT: + GST_DEBUG ("QTtext format detected"); + caps = QTTEXT_CAPS; + break; + case GST_SUB_PARSE_FORMAT_LRC: + GST_DEBUG ("LRC format detected"); + caps = LRC_CAPS; + break; + case GST_SUB_PARSE_FORMAT_VTT: + GST_DEBUG ("WebVTT format detected"); + caps = VTT_CAPS; + break; + default: + case GST_SUB_PARSE_FORMAT_UNKNOWN: + GST_DEBUG ("no subtitle format detected"); + return; + } + + /* if we're here, it's ok */ + gst_type_find_suggest (tf, GST_TYPE_FIND_MAXIMUM, caps); +} + +GST_TYPE_FIND_REGISTER_DEFINE (subparse, "subparse_typefind", GST_RANK_MARGINAL, + gst_sub_parse_type_find, "srt,sub,mpsub,mdvd,smi,txt,dks,vtt", SUB_CAPS, + NULL, NULL) + + gboolean +sub_parse_element_init (GstPlugin * plugin) +{ + static gsize res = FALSE; + gboolean ret = TRUE; + if (g_once_init_enter (&res)) { + GST_DEBUG_CATEGORY_INIT (sub_parse_debug, "subparse", 0, ".sub parser"); + + ret |= GST_TYPE_FIND_REGISTER (subparse, plugin); + + g_once_init_leave (&res, TRUE); + } + return ret; +} diff --git 
a/gst/subparse/gstsubparseelements.h b/gst/subparse/gstsubparseelements.h new file mode 100644 index 0000000..222ce16 --- /dev/null +++ b/gst/subparse/gstsubparseelements.h @@ -0,0 +1,58 @@ +/* GStreamer + * Copyright (C) <2002> David A. Schleef + * Copyright (C) <1999> Erik Walthinsen + * Copyright (C) 2020 Huawei Technologies Co., Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#ifndef __GST_SUBPARSE_ELEMENT_H__ +#define __GST_SUBPARSE_ELEMENT_H__ + +#include + +/* format enum */ +typedef enum +{ + GST_SUB_PARSE_FORMAT_UNKNOWN = 0, + GST_SUB_PARSE_FORMAT_MDVDSUB = 1, + GST_SUB_PARSE_FORMAT_SUBRIP = 2, + GST_SUB_PARSE_FORMAT_MPSUB = 3, + GST_SUB_PARSE_FORMAT_SAMI = 4, + GST_SUB_PARSE_FORMAT_TMPLAYER = 5, + GST_SUB_PARSE_FORMAT_MPL2 = 6, + GST_SUB_PARSE_FORMAT_SUBVIEWER = 7, + GST_SUB_PARSE_FORMAT_DKS = 8, + GST_SUB_PARSE_FORMAT_QTTEXT = 9, + GST_SUB_PARSE_FORMAT_LRC = 10, + GST_SUB_PARSE_FORMAT_VTT = 11 +} GstSubParseFormat; + + +G_GNUC_INTERNAL GstSubParseFormat gst_sub_parse_data_format_autodetect (gchar * match_str); +G_GNUC_INTERNAL gchar * gst_sub_parse_detect_encoding (const gchar * str, gsize len); +G_GNUC_INTERNAL gchar * gst_sub_parse_gst_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding, + gsize * consumed, GError ** err); +G_GNUC_INTERNAL gboolean sub_parse_element_init (GstPlugin * plugin); + +GST_ELEMENT_REGISTER_DECLARE (subparse); +GST_ELEMENT_REGISTER_DECLARE (ssaparse); + +GST_TYPE_FIND_REGISTER_DECLARE (subparse); + +GST_DEBUG_CATEGORY_EXTERN (sub_parse_debug); +#define GST_CAT_DEFAULT sub_parse_debug + +#endif /* __GST_SUBPARSE_ELEMENT_H__ */ diff --git a/gst/subparse/gstsubparseplugin.c b/gst/subparse/gstsubparseplugin.c new file mode 100644 index 0000000..81d113d --- /dev/null +++ b/gst/subparse/gstsubparseplugin.c @@ -0,0 +1,45 @@ +/* GStreamer + * Copyright (C) <1999> Erik Walthinsen + * Copyright (C) 2004 Ronald S. Bultje + * Copyright (C) 2006 Tim-Philipp Müller + * Copyright (C) 2016 Philippe Normand + * Copyright (C) 2016 Jan Schmidt + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gstsubparseelements.h" + +static gboolean +plugin_init (GstPlugin * plugin) +{ + gboolean ret = FALSE; + + ret |= GST_ELEMENT_REGISTER (subparse, plugin); + ret |= GST_ELEMENT_REGISTER (ssaparse, plugin); + + return ret; +} + +GST_PLUGIN_DEFINE (GST_VERSION_MAJOR, + GST_VERSION_MINOR, + subparse, + "Subtitle parsing", + plugin_init, VERSION, "LGPL", GST_PACKAGE_NAME, GST_PACKAGE_ORIGIN) diff --git a/gst/subparse/meson.build b/gst/subparse/meson.build index 9a76601..7be6c2b 100644 --- a/gst/subparse/meson.build +++ b/gst/subparse/meson.build @@ -1,6 +1,8 @@ subparse_sources = [ 'gstssaparse.c', 'gstsubparse.c', + 'gstsubparseelement.c', + 'gstsubparseplugin.c', 'samiparse.c', 'tmplayerparse.c', 'mpl2parse.c', -- 2.7.4