telepathy-idle: Sanitize incoming messages to remove UTF-8 non-characters.
Will Thompson
wjt at kemper.freedesktop.org
Mon Oct 29 03:18:37 PDT 2012
Module: telepathy-idle
Branch: master
Commit: 79425a010af79070a6b93c25deb5690cc72daf87
URL: http://cgit.freedesktop.org/telepathy/telepathy-idle/commit/?id=79425a010af79070a6b93c25deb5690cc72daf87
Author: Will Thompson <will.thompson at collabora.co.uk>
Date: Sun Oct 10 00:26:53 2010 +0100
Sanitize incoming messages to remove UTF-8 non-characters.
https://bugs.freedesktop.org/show_bug.cgi?id=30741
---
src/idle-connection.c | 41 +++++++++++++++++++++++++
tests/twisted/Makefile.am | 1 +
tests/twisted/messages/invalid-utf8.py | 52 ++++++++++++++++++++++++++++++++
3 files changed, 94 insertions(+), 0 deletions(-)
diff --git a/src/idle-connection.c b/src/idle-connection.c
index 3a64922..c9e1829 100644
--- a/src/idle-connection.c
+++ b/src/idle-connection.c
@@ -1391,6 +1391,37 @@ static gboolean idle_connection_hton(IdleConnection *obj, const gchar *input, gc
return TRUE;
}
+#define U_FFFD_REPLACEMENT_CHARACTER_UTF8 "\357\277\275" /* octal escapes for bytes EF BF BD */
+/* Build a copy of the first @bytes bytes of @supposed_utf8 in which every byte where validation fails is replaced by U+FFFD. */
+static gchar *
+idle_salvage_utf8 (gchar *supposed_utf8, gssize bytes)
+{
+ GString *salvaged = g_string_sized_new (bytes);
+ const gchar *end;
+ gchar *ret;
+ gsize ret_len;
+ /* Each pass handles one failure: keep the valid prefix, emit U+FFFD, skip a single byte, re-validate the rest. */
+ while (!g_utf8_validate (supposed_utf8, bytes, &end)) {
+ gssize valid_bytes = end - supposed_utf8; /* end is where validation stopped */
+
+ g_string_append_len (salvaged, supposed_utf8, valid_bytes);
+ g_string_append_len (salvaged, U_FFFD_REPLACEMENT_CHARACTER_UTF8, 3); /* 3 = length of the macro's byte sequence */
+
+ supposed_utf8 += (valid_bytes + 1); /* step past the valid prefix plus the one offending byte */
+ bytes -= (valid_bytes + 1);
+ }
+ /* Whatever is left validated cleanly; copy it through unchanged. */
+ g_string_append_len (salvaged, supposed_utf8, bytes);
+
+ ret_len = salvaged->len; /* read the length before the GString wrapper is freed */
+ ret = g_string_free (salvaged, FALSE); /* FALSE: keep the character data and hand it to the caller */
+
+ /* It had better be valid now... ret is returned whether or not this check passes. */
+ g_return_val_if_fail (g_utf8_validate (ret, ret_len, NULL), ret);
+ return ret;
+}
+
+
static gchar *
idle_connection_ntoh(IdleConnection *obj, const gchar *input) {
IdleConnectionPrivate *priv = IDLE_CONNECTION_GET_PRIVATE(obj);
@@ -1415,6 +1446,16 @@ idle_connection_ntoh(IdleConnection *obj, const gchar *input) {
if (*p & (1 << 7))
*p = '?';
}
+ } else if (!g_utf8_validate (ret, bytes_written, NULL)) {
+ /* Annoyingly g_convert(UTF-8, UTF-8) doesn't filter out well-formed
+ * non-characters, so we have to do some further processing.
+ */
+ gchar *salvaged;
+
+ IDLE_DEBUG("Invalid UTF-8, salvaging what we can...");
+ salvaged = idle_salvage_utf8(ret, bytes_written);
+ g_free(ret);
+ ret = salvaged;
}
return ret;
diff --git a/tests/twisted/Makefile.am b/tests/twisted/Makefile.am
index e916242..da7e1e5 100644
--- a/tests/twisted/Makefile.am
+++ b/tests/twisted/Makefile.am
@@ -17,6 +17,7 @@ TWISTED_TESTS = \
channels/muc-channel-topic.py \
messages/accept-invalid-nicks.py \
messages/contactinfo-request.py \
+ messages/invalid-utf8.py \
messages/messages-iface.py \
messages/message-order.py \
messages/leading-space.py \
diff --git a/tests/twisted/messages/invalid-utf8.py b/tests/twisted/messages/invalid-utf8.py
new file mode 100644
index 0000000..9f3d057
--- /dev/null
+++ b/tests/twisted/messages/invalid-utf8.py
@@ -0,0 +1,52 @@
+# coding=utf-8
+"""
+Test that incoming messages containing well-formed but invalid UTF-8 code
+points don't make Idle fall off the bus. This is a regression test for
+<https://bugs.freedesktop.org/show_bug.cgi?id=30741>.
+"""
+
+from idletest import exec_test
+from servicetest import assertEquals
+
+def test(q, bus, conn, stream):
+ conn.Connect()
+ q.expect('dbus-signal', signal='StatusChanged', args=[0, 1])  # wait for this status change before sending anything
+ # Each list below is joined with the invalid byte sequence by test_with_message().
+ test_with_message(q, stream, ["I'm no ", " Buddhist"])  # one invalid sequence in the middle
+ # Check that valid exotic characters don't get lost
+ test_with_message(q, stream, [u"björk"] * 5)
+
+ test_with_message(q, stream, ["", "lolllllll"])  # empty first part: message starts with the invalid sequence
+ test_with_message(q, stream, ["hello", ""])  # empty last part: message ends with the invalid sequence
+ test_with_message(q, stream, "I am a stabbing robot".split(" "))  # five parts, so four invalid sequences
+
+# This is the UTF-8 encoding of U+FDD2, which is not a valid Unicode character.
+WELL_FORMED_BUT_INVALID_UTF8_BYTES = "\xef\xb7\x92"
+
+def test_with_message(q, stream, parts):
+ invalid_utf8 = WELL_FORMED_BUT_INVALID_UTF8_BYTES.join(  # invalid bytes go between consecutive parts
+ part.encode('utf-8') for part in parts)
+
+ # Idle's default character set is UTF-8. We send it a message which is
+ # basically UTF-8, except that one of its code points is invalid.
+ stream.sendMessage('PRIVMSG', stream.nick, ':%s' % invalid_utf8,
+ prefix='remoteuser')
+
+ # Idle should signal that *something* was received. If it hasn't validated
+ # & sanitized the message properly, the dbus-daemon will kick it off.
+ signal = q.expect('dbus-signal', signal='MessageReceived')
+
+ message_parts = signal.args[0]
+ text_plain = message_parts[1]  # second element of the signal's first argument
+ content = text_plain['content']
+
+ # Don't make any assumption about how many U+FFFD REPLACEMENT CHARACTERs
+ # are used to replace surprising bytes.
+ received_parts = [ part for part in content.split(u"\ufffd")
+ if part != u''
+ ]
+ assertEquals(filter(lambda s: s != u'', parts), received_parts)  # NOTE(review): relies on Python 2 filter() returning a list
+
+if __name__ == '__main__':
+ exec_test(test)
+
More information about the telepathy-commits
mailing list