gst-plugins-base: typefind: typefind UTF-16 and UTF-32
Vincent Penquerc'h
vincent at kemper.freedesktop.org
Mon Nov 28 07:59:01 PST 2011
Module: gst-plugins-base
Branch: master
Commit: e67aa28de9df1deae94697dcc32ef8c3e7c35fa4
URL: http://cgit.freedesktop.org/gstreamer/gst-plugins-base/commit/?id=e67aa28de9df1deae94697dcc32ef8c3e7c35fa4
Author: Vincent Penquerc'h <vincent.penquerch at collabora.co.uk>
Date: Fri Sep 30 20:00:50 2011 +0100
typefind: typefind UTF-16 and UTF-32
This prevents the MP3 typefinder from getting the highest score
every time it thinks there's something it might possibly be
able to parse.
https://bugzilla.gnome.org/show_bug.cgi?id=607619
---
gst/typefind/Makefile.am | 2 +-
gst/typefind/gsttypefindfunctions.c | 157 +++++++++++++++++++++++++++++++++++
2 files changed, 158 insertions(+), 1 deletions(-)
diff --git a/gst/typefind/Makefile.am b/gst/typefind/Makefile.am
index 3258b15..f64cdc3 100644
--- a/gst/typefind/Makefile.am
+++ b/gst/typefind/Makefile.am
@@ -7,7 +7,7 @@ libgsttypefindfunctions_la_CFLAGS = \
libgsttypefindfunctions_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
libgsttypefindfunctions_la_LIBADD = \
$(top_builddir)/gst-libs/gst/pbutils/libgstpbutils-@GST_MAJORMINOR@.la \
- $(GST_LIBS) $(GIO_LIBS)
+ $(GST_BASE_LIBS) $(GST_LIBS) $(GIO_LIBS)
libgsttypefindfunctions_la_LIBTOOLFLAGS = --tag=disable-static
diff --git a/gst/typefind/gsttypefindfunctions.c b/gst/typefind/gsttypefindfunctions.c
index 1675add..22ad2fb 100644
--- a/gst/typefind/gsttypefindfunctions.c
+++ b/gst/typefind/gsttypefindfunctions.c
@@ -41,6 +41,7 @@
#include <ctype.h>
#include <gst/pbutils/pbutils.h>
+#include <gst/base/gstbytereader.h>
GST_DEBUG_CATEGORY_STATIC (type_find_debug);
#define GST_CAT_DEFAULT type_find_debug
@@ -206,6 +207,157 @@ utf8_type_find (GstTypeFind * tf, gpointer unused)
gst_type_find_suggest (tf, (start_prob + mid_prob) / 2, UTF8_CAPS);
}
+/*** text/utf-16 and text/utf-32 ***/
+/* While UTF-8 is Unicode too, using text/plain for UTF-16 and UTF-32
+ is going to break stuff. */
+
+/* One candidate encoding/byte-order combination tried by
+ * unicode_type_find(). Instances are filled in positionally, so the field
+ * order matters. */
+typedef struct
+{
+ /* number of bytes of bom that are compared against the start of the data */
+ size_t bomlen;
+ /* byte-order mark expected at the start of the data */
+ const char *const bom;
+ /* validator, called as (data, len, endianness); TRUE means the data is
+  * acceptable for this candidate */
+ gboolean (*checker) (const guint8 *, gint, gint);
+ /* added to the suggested probability when the BOM matches */
+ int boost;
+ /* byte order (G_LITTLE_ENDIAN or G_BIG_ENDIAN in the tables below); passed
+  * to checker and reported as the "endianness" field of the suggestion */
+ int endianness;
+} GstUnicodeTester;
+
+/* Checks whether a buffer is well-formed UTF-16 in the given byte order.
+ *
+ * data: bytes to validate
+ * len: number of bytes in data; an odd length is rejected
+ * endianness: G_BIG_ENDIAN reads big-endian code units, any other value
+ *   reads little-endian ones
+ *
+ * Returns TRUE when every 16-bit unit is either outside the surrogate range
+ * or part of a complete high/low surrogate pair, FALSE otherwise. */
+static gboolean
+check_utf16 (const guint8 * data, gint len, gint endianness)
+{
+  GstByteReader reader;
+  const gboolean big_endian = (endianness == G_BIG_ENDIAN);
+  guint16 lead, trail;
+  gint remaining;
+
+  if (len & 1)
+    return FALSE;
+
+  gst_byte_reader_init (&reader, data, len);
+  for (remaining = len; remaining >= 2; remaining -= 2) {
+    if (big_endian)
+      gst_byte_reader_get_uint16_be (&reader, &lead);
+    else
+      gst_byte_reader_get_uint16_le (&reader, &lead);
+
+    /* a low surrogate must never show up first */
+    if (lead >= 0xDC00 && lead <= 0xDFFF)
+      return FALSE;
+
+    /* anything else that is not a high surrogate is a complete character */
+    if (lead < 0xD800 || lead > 0xDBFF)
+      continue;
+
+    /* high surrogate: its low half has to follow within the buffer */
+    if (remaining < 4)
+      return FALSE;
+    remaining -= 2;
+    if (big_endian)
+      gst_byte_reader_get_uint16_be (&reader, &trail);
+    else
+      gst_byte_reader_get_uint16_le (&reader, &trail);
+    if (trail < 0xDC00 || trail > 0xDFFF)
+      return FALSE;
+  }
+  return TRUE;
+}
+
+/* Checks whether a buffer is well-formed UTF-32 in the given byte order.
+ *
+ * data: bytes to validate
+ * len: number of bytes in data; a length that is not a multiple of 4 is
+ *   rejected
+ * endianness: G_BIG_ENDIAN reads big-endian values, any other value reads
+ *   little-endian ones
+ *
+ * Returns TRUE when every 32-bit value is a Unicode scalar value, i.e. at
+ * most 0x10FFFF and outside the surrogate range 0xD800-0xDFFF, FALSE
+ * otherwise. */
+static gboolean
+check_utf32 (const guint8 * data, gint len, gint endianness)
+{
+  if (len & 3)
+    return FALSE;
+  while (len > 3) {
+    guint32 v;
+
+    /* widen each byte before shifting: a bare data[n] << 24 is evaluated in
+     * signed int and overflows for bytes >= 0x80 */
+    if (endianness == G_BIG_ENDIAN)
+      v = ((guint32) data[0] << 24) | ((guint32) data[1] << 16) |
+          ((guint32) data[2] << 8) | (guint32) data[3];
+    else
+      v = ((guint32) data[3] << 24) | ((guint32) data[2] << 16) |
+          ((guint32) data[1] << 8) | (guint32) data[0];
+    /* U+10FFFF itself is the last valid code point */
+    if (v > 0x10FFFF)
+      return FALSE;
+    /* surrogates only exist for UTF-16 and are not valid UTF-32 values */
+    if (v >= 0xD800 && v <= 0xDFFF)
+      return FALSE;
+    data += 4;
+    len -= 4;
+  }
+  return TRUE;
+}
+
+/* Shared implementation behind the UTF-16 and UTF-32 type finders.
+ *
+ * tf: type-find context to peek data from and to post the suggestion to
+ * tester: candidate encodings to try
+ * n_tester: number of entries in tester
+ * media_type: media type name used for the suggestion
+ *
+ * Peeks 4 bytes (falling back to 2) from the start of the stream and keeps
+ * doubling that window until a peek fails or 256 KiB is reached, so only a
+ * power-of-two sized prefix is examined. Every tester whose checker accepts
+ * that prefix scores GST_TYPE_FIND_POSSIBLE - 20, plus its boost when the
+ * prefix starts with its BOM. The best-scoring tester (the earlier one on a
+ * tie) is suggested together with its "endianness". Nothing is suggested
+ * when no data can be peeked or no checker accepts it. */
+static void
+unicode_type_find (GstTypeFind * tf, const GstUnicodeTester * tester,
+    guint n_tester, const char *media_type)
+{
+  guint n;
+  gint len = 4;
+  const guint8 *data = gst_type_find_peek (tf, 0, len);
+  int prob = -1;
+  const gint max_scan_size = 256 * 1024;
+  /* only read once a tester has raised prob above 0; initialised so that
+   * compilers cannot flag it as possibly uninitialised */
+  int endianness = 0;
+
+  if (!data) {
+    len = 2;
+    data = gst_type_find_peek (tf, 0, len);
+    if (!data)
+      return;
+  }
+
+  /* find a large enough size that works */
+  while (len < max_scan_size) {
+    gint newlen = len << 1;
+    const guint8 *newdata = gst_type_find_peek (tf, 0, newlen);
+    if (!newdata)
+      break;
+    len = newlen;
+    data = newdata;
+  }
+
+  for (n = 0; n < n_tester; ++n) {
+    int bom_boost = 0, tmpprob;
+    /* len is at least 2 here, so the cast cannot change its value */
+    if ((size_t) len >= tester[n].bomlen) {
+      if (!memcmp (data, tester[n].bom, tester[n].bomlen))
+        bom_boost = tester[n].boost;
+    }
+    if (!(*tester[n].checker) (data, len, tester[n].endianness))
+      continue;
+    tmpprob = GST_TYPE_FIND_POSSIBLE - 20 + bom_boost;
+    if (tmpprob > prob) {
+      prob = tmpprob;
+      endianness = tester[n].endianness;
+    }
+  }
+
+  if (prob > 0) {
+    GST_DEBUG ("This is valid %s %s", media_type,
+        endianness == G_BIG_ENDIAN ? "be" : "le");
+    gst_type_find_suggest_simple (tf, prob, media_type,
+        "endianness", G_TYPE_INT, endianness, NULL);
+  }
+}
+
+/* Static caps describing "text/utf-16". */
+static GstStaticCaps utf16_caps = GST_STATIC_CAPS ("text/utf-16");
+
+/* Shorthand that fetches the caps held by utf16_caps. */
+#define UTF16_CAPS gst_static_caps_get(&utf16_caps)
+
+/* Type-find function for "text/utf-16"; the second argument is not used.
+ * Hands a little-endian and a big-endian candidate to
+ * unicode_type_find(). */
+static void
+utf16_type_find (GstTypeFind * tf, gpointer unused)
+{
+ /* fields per entry: bomlen, bom, checker, boost, endianness */
+ static const GstUnicodeTester utf16tester[2] = {
+ {2, "\xff\xfe", check_utf16, 10, G_LITTLE_ENDIAN},
+ {2, "\xfe\xff", check_utf16, 20, G_BIG_ENDIAN},
+ };
+ unicode_type_find (tf, utf16tester, G_N_ELEMENTS (utf16tester),
+ "text/utf-16");
+}
+
+/* Static caps describing "text/utf-32". */
+static GstStaticCaps utf32_caps = GST_STATIC_CAPS ("text/utf-32");
+
+/* Shorthand that fetches the caps held by utf32_caps. */
+#define UTF32_CAPS gst_static_caps_get(&utf32_caps)
+
+/* Type-find function for "text/utf-32"; the second argument is not used.
+ * Hands a little-endian and a big-endian candidate to
+ * unicode_type_find(). */
+static void
+utf32_type_find (GstTypeFind * tf, gpointer unused)
+{
+ /* fields per entry: bomlen, bom, checker, boost, endianness */
+ static const GstUnicodeTester utf32tester[2] = {
+ {4, "\xff\xfe\x00\x00", check_utf32, 10, G_LITTLE_ENDIAN},
+ {4, "\x00\x00\xfe\xff", check_utf32, 20, G_BIG_ENDIAN}
+ };
+ unicode_type_find (tf, utf32tester, G_N_ELEMENTS (utf32tester),
+ "text/utf-32");
+}
+
/*** text/uri-list ***/
static GstStaticCaps uri_caps = GST_STATIC_CAPS ("text/uri-list");
@@ -4262,6 +4414,7 @@ plugin_init (GstPlugin * plugin)
static const gchar *rm_exts[] = { "ra", "ram", "rm", "rmvb", NULL };
static const gchar *swf_exts[] = { "swf", "swfl", NULL };
static const gchar *utf8_exts[] = { "txt", NULL };
+ static const gchar *unicode_exts[] = { "txt", NULL };
static const gchar *wav_exts[] = { "wav", NULL };
static const gchar *aiff_exts[] = { "aiff", "aif", "aifc", NULL };
static const gchar *svx_exts[] = { "iff", "svx", NULL };
@@ -4436,6 +4589,10 @@ plugin_init (GstPlugin * plugin)
flv_exts, "FLV", 3, GST_TYPE_FIND_MAXIMUM);
TYPE_FIND_REGISTER (plugin, "text/plain", GST_RANK_MARGINAL, utf8_type_find,
utf8_exts, UTF8_CAPS, NULL, NULL);
+ TYPE_FIND_REGISTER (plugin, "text/utf-16", GST_RANK_MARGINAL, utf16_type_find,
+ unicode_exts, UTF16_CAPS, NULL, NULL);
+ TYPE_FIND_REGISTER (plugin, "text/utf-32", GST_RANK_MARGINAL, utf32_type_find,
+ unicode_exts, UTF32_CAPS, NULL, NULL);
TYPE_FIND_REGISTER (plugin, "text/uri-list", GST_RANK_MARGINAL, uri_type_find,
uri_exts, URI_CAPS, NULL, NULL);
TYPE_FIND_REGISTER (plugin, "application/x-hls", GST_RANK_MARGINAL,
More information about the gstreamer-commits
mailing list