telepathy-idle: Sanitize incoming messages to remove UTF-8 non-characters.

Will Thompson wjt at kemper.freedesktop.org
Mon Oct 29 03:18:37 PDT 2012


Module: telepathy-idle
Branch: master
Commit: 79425a010af79070a6b93c25deb5690cc72daf87
URL:    http://cgit.freedesktop.org/telepathy/telepathy-idle/commit/?id=79425a010af79070a6b93c25deb5690cc72daf87

Author: Will Thompson <will.thompson at collabora.co.uk>
Date:   Sun Oct 10 00:26:53 2010 +0100

Sanitize incoming messages to remove UTF-8 non-characters.

https://bugs.freedesktop.org/show_bug.cgi?id=30741

---

 src/idle-connection.c                  |   41 +++++++++++++++++++++++++
 tests/twisted/Makefile.am              |    1 +
 tests/twisted/messages/invalid-utf8.py |   52 ++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+), 0 deletions(-)

diff --git a/src/idle-connection.c b/src/idle-connection.c
index 3a64922..c9e1829 100644
--- a/src/idle-connection.c
+++ b/src/idle-connection.c
@@ -1391,6 +1391,37 @@ static gboolean idle_connection_hton(IdleConnection *obj, const gchar *input, gc
 	return TRUE;
 }
 
+#define U_FFFD_REPLACEMENT_CHARACTER_UTF8 "\357\277\275" /* bytes EF BF BD: U+FFFD encoded as UTF-8 */
+/* Build a copy of the first `bytes` bytes of supposed_utf8 in which every byte that stops g_utf8_validate() is replaced by U+FFFD. */
+static gchar *
+idle_salvage_utf8 (gchar *supposed_utf8, gssize bytes)
+{
+	GString *salvaged = g_string_sized_new (bytes);
+	const gchar *end;
+	gchar *ret;
+	gsize ret_len;
+
+	while (!g_utf8_validate (supposed_utf8, bytes, &end)) {
+		gssize valid_bytes = end - supposed_utf8; /* length of the prefix that validated */
+
+		g_string_append_len (salvaged, supposed_utf8, valid_bytes);
+		g_string_append_len (salvaged, U_FFFD_REPLACEMENT_CHARACTER_UTF8, 3); /* 3 == byte length of the replacement */
+
+		supposed_utf8 += (valid_bytes + 1); /* step past the valid prefix and the single offending byte */
+		bytes -= (valid_bytes + 1);
+	}
+
+	g_string_append_len (salvaged, supposed_utf8, bytes); /* whatever remains validated cleanly */
+
+	ret_len = salvaged->len;
+	ret = g_string_free (salvaged, FALSE); /* FALSE: keep the character buffer; it becomes the return value */
+
+	/* It had better be valid now… */
+	g_return_val_if_fail (g_utf8_validate (ret, ret_len, NULL), ret); /* warns but still returns ret on failure */
+	return ret;
+}
+
+
 static gchar *
 idle_connection_ntoh(IdleConnection *obj, const gchar *input) {
 	IdleConnectionPrivate *priv = IDLE_CONNECTION_GET_PRIVATE(obj);
@@ -1415,6 +1446,16 @@ idle_connection_ntoh(IdleConnection *obj, const gchar *input) {
 			if (*p & (1 << 7))
 				*p = '?';
 		}
+	} else if (!g_utf8_validate (ret, bytes_written, NULL)) {
+		/* Annoyingly g_convert(UTF-8, UTF-8) doesn't filter out well-formed
+		 * non-characters, so we have to do some further processing.
+		 */
+		gchar *salvaged;
+
+		IDLE_DEBUG("Invalid UTF-8, salvaging what we can...");
+		salvaged = idle_salvage_utf8(ret, bytes_written);
+		g_free(ret);
+		ret = salvaged;
 	}
 
 	return ret;
diff --git a/tests/twisted/Makefile.am b/tests/twisted/Makefile.am
index e916242..da7e1e5 100644
--- a/tests/twisted/Makefile.am
+++ b/tests/twisted/Makefile.am
@@ -17,6 +17,7 @@ TWISTED_TESTS = \
 		channels/muc-channel-topic.py \
 		messages/accept-invalid-nicks.py \
 		messages/contactinfo-request.py \
+		messages/invalid-utf8.py \
 		messages/messages-iface.py \
 		messages/message-order.py \
 		messages/leading-space.py \
diff --git a/tests/twisted/messages/invalid-utf8.py b/tests/twisted/messages/invalid-utf8.py
new file mode 100644
index 0000000..9f3d057
--- /dev/null
+++ b/tests/twisted/messages/invalid-utf8.py
@@ -0,0 +1,52 @@
+# coding=utf-8
+"""
+Test that incoming messages containing well-formed but invalid UTF-8 code
+points don't make Idle fall off the bus. This is a regression test for
+<https://bugs.freedesktop.org/show_bug.cgi?id=30741>.
+"""
+
+from idletest import exec_test
+from servicetest import assertEquals
+
+def test(q, bus, conn, stream):
+    conn.Connect()
+    q.expect('dbus-signal', signal='StatusChanged', args=[0, 1])  # wait for the status change before sending
+
+    test_with_message(q, stream, ["I'm no ", " Buddhist"])  # invalid sequence in the middle
+    # Check that valid exotic characters don't get lost
+    test_with_message(q, stream, [u"björk"] * 5)  # four invalid sequences between non-ASCII text
+
+    test_with_message(q, stream, ["", "lolllllll"])  # invalid sequence at the very start
+    test_with_message(q, stream, ["hello", ""])  # invalid sequence at the very end
+    test_with_message(q, stream, "I am a stabbing robot".split(" "))  # invalid sequence in place of each space
+
+# This is the UTF-8 encoding of U+FDD2, which is not a valid Unicode character.
+WELL_FORMED_BUT_INVALID_UTF8_BYTES = "\xef\xb7\x92"
+
+def test_with_message(q, stream, parts):  # parts: text fragments to be glued together by the invalid sequence
+    invalid_utf8 = WELL_FORMED_BUT_INVALID_UTF8_BYTES.join(
+        part.encode('utf-8') for part in parts)
+
+    # Idle's default character set is UTF-8. We send it a message which is
+    # basically UTF-8, except that the code point between each pair of parts is invalid.
+    stream.sendMessage('PRIVMSG', stream.nick, ':%s' % invalid_utf8,
+        prefix='remoteuser')
+
+    # Idle should signal that *something* was received. If it hasn't validated
+    # & sanitized the message properly, the dbus-daemon will kick it off.
+    signal = q.expect('dbus-signal', signal='MessageReceived')
+
+    message_parts = signal.args[0]
+    text_plain = message_parts[1]  # second entry of the message; its 'content' key holds the text
+    content = text_plain['content']
+
+    # Don't make any assumption about how many U+FFFD REPLACEMENT CHARACTERs
+    # are used to replace surprising bytes.
+    received_parts = [ part for part in content.split(u"\ufffd")
+                       if part != u''
+                     ]
+    assertEquals(filter(lambda s: s != u'', parts), received_parts)  # empty fragments are dropped on both sides
+
+if __name__ == '__main__':
+    exec_test(test)
+



More information about the telepathy-commits mailing list