From e67aa28de9df1deae94697dcc32ef8c3e7c35fa4 Mon Sep 17 00:00:00 2001 From: Vincent Penquerc'h Date: Fri, 30 Sep 2011 20:00:50 +0100 Subject: [PATCH] typefind: typefind UTF-16 and UTF-32 This avoids the MP3 typefinder from getting the highest score every time it thinks there's something it might possibly be able to parse. https://bugzilla.gnome.org/show_bug.cgi?id=607619 --- gst/typefind/Makefile.am | 2 +- gst/typefind/gsttypefindfunctions.c | 157 ++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 1 deletion(-) diff --git a/gst/typefind/Makefile.am b/gst/typefind/Makefile.am index 3258b15..f64cdc3 100644 --- a/gst/typefind/Makefile.am +++ b/gst/typefind/Makefile.am @@ -7,7 +7,7 @@ libgsttypefindfunctions_la_CFLAGS = \ libgsttypefindfunctions_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS) libgsttypefindfunctions_la_LIBADD = \ $(top_builddir)/gst-libs/gst/pbutils/libgstpbutils-@GST_MAJORMINOR@.la \ - $(GST_LIBS) $(GIO_LIBS) + $(GST_BASE_LIBS) $(GST_LIBS) $(GIO_LIBS) libgsttypefindfunctions_la_LIBTOOLFLAGS = --tag=disable-static diff --git a/gst/typefind/gsttypefindfunctions.c b/gst/typefind/gsttypefindfunctions.c index 1675add..22ad2fb 100644 --- a/gst/typefind/gsttypefindfunctions.c +++ b/gst/typefind/gsttypefindfunctions.c @@ -41,6 +41,7 @@ #include #include +#include GST_DEBUG_CATEGORY_STATIC (type_find_debug); #define GST_CAT_DEFAULT type_find_debug @@ -206,6 +207,157 @@ utf8_type_find (GstTypeFind * tf, gpointer unused) gst_type_find_suggest (tf, (start_prob + mid_prob) / 2, UTF8_CAPS); } +/*** text/utf-16 and text/utf-32} ***/ +/* While UTF-8 is unicode too, using text/plain for UTF-16 and UTF-32 + is going to break stuff. */ + +typedef struct +{ + size_t bomlen; + const char *const bom; + gboolean (*checker) (const guint8 *, gint, gint); + int boost; + int endianness; +} GstUnicodeTester; + +static gboolean +check_utf16 (const guint8 * data, gint len, gint endianness) +{ + GstByteReader br; + guint16 high, low; + + if (len & 1) + return FALSE; + + gst_byte_reader_init (&br, data, len); + while (len >= 2) { + /* test first for a single 16 bit value in the BMP */ + if (endianness == G_BIG_ENDIAN) + gst_byte_reader_get_uint16_be (&br, &high); + else + gst_byte_reader_get_uint16_le (&br, &high); + if (high >= 0xD800 && high <= 0xDBFF) { + /* start of a surrogate pair */ + if (len < 4) + return FALSE; + len -= 2; + if (endianness == G_BIG_ENDIAN) + gst_byte_reader_get_uint16_be (&br, &low); + else + gst_byte_reader_get_uint16_le (&br, &low); + if (low >= 0xDC00 && low <= 0xDFFF) { + /* second half of the surrogate pair */ + } else + return FALSE; + } else { + if (high >= 0xDC00 && high <= 0xDFFF) + return FALSE; + } + len -= 2; + } + return TRUE; +} + +static gboolean +check_utf32 (const guint8 * data, gint len, gint endianness) +{ + if (len & 3) + return FALSE; + while (len > 3) { + guint32 v; + if (endianness == G_BIG_ENDIAN) + v = (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3]; + else + v = (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0]; + if (v >= 0x10FFFF) + return FALSE; + data += 4; + len -= 4; + } + return TRUE; +} + +static void +unicode_type_find (GstTypeFind * tf, const GstUnicodeTester * tester, + guint n_tester, const char *media_type) +{ + size_t n; + gint len = 4; + const guint8 *data = gst_type_find_peek (tf, 0, len); + int prob = -1; + const gint max_scan_size = 256 * 1024; + int endianness; + + if (!data) { + len = 2; + data = gst_type_find_peek (tf, 0, len); + if (!data) + return; + } + + /* find a large enough size that works */ + while (len < max_scan_size) { + size_t newlen = len << 1; + const guint8 *newdata = gst_type_find_peek (tf, 0, newlen); + if (!newdata) + break; + len = newlen; + data = newdata; + } + + for (n = 0; n < n_tester; ++n) { + int bom_boost = 0, tmpprob; + if (len >= tester[n].bomlen) { + if (!memcmp (data, tester[n].bom, tester[n].bomlen)) + bom_boost = tester[n].boost; + } + if (!(*tester[n].checker) (data, len, tester[n].endianness)) + continue; + tmpprob = GST_TYPE_FIND_POSSIBLE - 20 + bom_boost; + if (tmpprob > prob) { + prob = tmpprob; + endianness = tester[n].endianness; + } + } + + if (prob > 0) { + GST_DEBUG ("This is valid %s %s", media_type, + endianness == G_BIG_ENDIAN ? "be" : "le"); + gst_type_find_suggest_simple (tf, prob, media_type, + "endianness", G_TYPE_INT, endianness, NULL); + } +} + +static GstStaticCaps utf16_caps = GST_STATIC_CAPS ("text/utf-16"); + +#define UTF16_CAPS gst_static_caps_get(&utf16_caps) + +static void +utf16_type_find (GstTypeFind * tf, gpointer unused) +{ + static const GstUnicodeTester utf16tester[2] = { + {2, "\xff\xfe", check_utf16, 10, G_LITTLE_ENDIAN}, + {2, "\xfe\xff", check_utf16, 20, G_BIG_ENDIAN}, + }; + unicode_type_find (tf, utf16tester, G_N_ELEMENTS (utf16tester), + "text/utf-16"); +} + +static GstStaticCaps utf32_caps = GST_STATIC_CAPS ("text/utf-32"); + +#define UTF32_CAPS gst_static_caps_get(&utf32_caps) + +static void +utf32_type_find (GstTypeFind * tf, gpointer unused) +{ + static const GstUnicodeTester utf32tester[2] = { + {4, "\xff\xfe\x00\x00", check_utf32, 10, G_LITTLE_ENDIAN}, + {4, "\x00\x00\xfe\xff", check_utf32, 20, G_BIG_ENDIAN} + }; + unicode_type_find (tf, utf32tester, G_N_ELEMENTS (utf32tester), + "text/utf-32"); +} + /*** text/uri-list ***/ static GstStaticCaps uri_caps = GST_STATIC_CAPS ("text/uri-list"); @@ -4262,6 +4414,7 @@ plugin_init (GstPlugin * plugin) static const gchar *rm_exts[] = { "ra", "ram", "rm", "rmvb", NULL }; static const gchar *swf_exts[] = { "swf", "swfl", NULL }; static const gchar *utf8_exts[] = { "txt", NULL }; + static const gchar *unicode_exts[] = { "txt", NULL }; static const gchar *wav_exts[] = { "wav", NULL }; static const gchar *aiff_exts[] = { "aiff", "aif", "aifc", NULL }; static const gchar *svx_exts[] = { "iff", "svx", NULL }; @@ -4436,6 +4589,10 @@ plugin_init (GstPlugin * plugin) flv_exts, "FLV", 3, GST_TYPE_FIND_MAXIMUM); TYPE_FIND_REGISTER (plugin, "text/plain", GST_RANK_MARGINAL, utf8_type_find, utf8_exts, UTF8_CAPS, NULL, NULL); + TYPE_FIND_REGISTER (plugin, "text/utf-16", GST_RANK_MARGINAL, utf16_type_find, + unicode_exts, UTF16_CAPS, NULL, NULL); + TYPE_FIND_REGISTER (plugin, "text/utf-32", GST_RANK_MARGINAL, utf32_type_find, + unicode_exts, UTF32_CAPS, NULL, NULL); TYPE_FIND_REGISTER (plugin, "text/uri-list", GST_RANK_MARGINAL, uri_type_find, uri_exts, URI_CAPS, NULL, NULL); TYPE_FIND_REGISTER (plugin, "application/x-hls", GST_RANK_MARGINAL, -- 2.7.4