[systemd-commits] 3 commits - src/libsystemd-terminal src/shared src/test

Mon Dec 22 11:33:00 PST 2014

 src/libsystemd-terminal/subterm.c          |    5 +
 src/libsystemd-terminal/term-parser.c      |   43 --------------
 src/libsystemd-terminal/term-screen.c      |    3 -
 src/libsystemd-terminal/term.h             |    1 
 src/libsystemd-terminal/test-term-parser.c |    3 -
 src/shared/json.c                          |   81 ++++++++++++++++++++-------
 src/shared/utf8.c                          |   85 +++++++++++++++++++++++------
 src/shared/utf8.h                          |   14 ++++
 src/test/test-json.c                       |    4 +
 src/test/test-utf8.c                       |   13 ++++
 10 files changed, 166 insertions(+), 86 deletions(-)

New commits:
commit 9bae67d49b861b1f142f1a1e27753fe08e63ade7
Author: Tom Gundersen <teg at jklm.no>
Date:   Mon Dec 22 14:53:40 2014 +0100

    shared: json - support escaping utf16 surrogate pairs
    
    We originally only supported escape sequences for UCS-2 encoded characters
    (\uxxxx), which only cover the BMP. Also support escaped UTF-16 surrogate pairs
    (of the form \uxxxx\uyyyy) so that all of Unicode can be represented.

diff --git a/src/shared/json.c b/src/shared/json.c
index 47f801c..bb3d26f 100644
--- a/src/shared/json.c
+++ b/src/shared/json.c
@@ -53,6 +53,42 @@ static void inc_lines(unsigned *line, const char *s, size_t n) {
         }
 }
 
+static int unhex_ucs2(const char *c, uint16_t *ret) {
+        int aa, bb, cc, dd;
+        uint16_t x;
+
+        assert(c);
+        assert(ret);
+
+        aa = unhexchar(c[0]);
+        if (aa < 0)
+                return -EINVAL;
+
+        bb = unhexchar(c[1]);
+        if (bb < 0)
+                return -EINVAL;
+
+        cc = unhexchar(c[2]);
+        if (cc < 0)
+                return -EINVAL;
+
+        dd = unhexchar(c[3]);
+        if (dd < 0)
+                return -EINVAL;
+
+        x =     ((uint16_t) aa << 12) |
+                ((uint16_t) bb << 8) |
+                ((uint16_t) cc << 4) |
+                ((uint16_t) dd);
+
+        if (x <= 0)
+                return -EINVAL;
+
+        *ret = x;
+
+        return 0;
+}
+
 static int json_parse_string(const char **p, char **ret) {
         _cleanup_free_ char *s = NULL;
         size_t n = 0, allocated = 0;
@@ -119,39 +155,40 @@ static int json_parse_string(const char **p, char **ret) {
                         else if (*c == 't')
                                 ch = '\t';
                         else if (*c == 'u') {
-                                int aa, bb, cc, dd;
                                 uint16_t x;
+                                int r;
 
-                                aa = unhexchar(c[1]);
-                                if (aa < 0)
-                                        return -EINVAL;
+                                r = unhex_ucs2(c + 1, &x);
+                                if (r < 0)
+                                        return r;
 
-                                bb = unhexchar(c[2]);
-                                if (bb < 0)
-                                        return -EINVAL;
+                                c += 5;
 
-                                cc = unhexchar(c[3]);
-                                if (cc < 0)
-                                        return -EINVAL;
+                                if (!GREEDY_REALLOC(s, allocated, n + 4))
+                                        return -ENOMEM;
 
-                                dd = unhexchar(c[4]);
-                                if (dd < 0)
+                                if (!utf16_is_surrogate(x))
+                                        n += utf8_encode_unichar(s + n, x);
+                                else if (utf16_is_trailing_surrogate(x))
                                         return -EINVAL;
+                                else {
+                                        uint16_t y;
 
+                                        if (c[0] != '\\' || c[1] != 'u')
+                                                return -EINVAL;
 
-                                x =     ((uint16_t) aa << 12) |
-                                        ((uint16_t) bb << 8) |
-                                        ((uint16_t) cc << 4) |
-                                        ((uint16_t) dd);
+                                        r = unhex_ucs2(c + 2, &y);
+                                        if (r < 0)
+                                                return r;
 
-                                if (x <= 0)
-                                        return -EINVAL;
+                                        c += 6;
 
-                                if (!GREEDY_REALLOC(s, allocated, n + 4))
-                                        return -ENOMEM;
+                                        if (!utf16_is_trailing_surrogate(y))
+                                                return -EINVAL;
+
+                                        n += utf8_encode_unichar(s + n, utf16_surrogate_pair_to_unichar(x, y));
+                                }
 
-                                n += utf8_encode_unichar(s + n, x);
-                                c += 5;
                                 continue;
                         } else
                                 return -EINVAL;
diff --git a/src/test/test-json.c b/src/test/test-json.c
index e53e8ed..b091318 100644
--- a/src/test/test-json.c
+++ b/src/test/test-json.c
@@ -99,6 +99,9 @@ int main(int argc, char *argv[]) {
         test_one("\"\xef\xbf\xbd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
         test_one("\"\\ufffd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
         test_one("\"\\uf\"", -EINVAL);
+        test_one("\"\\ud800a\"", -EINVAL);
+        test_one("\"\\udc00\\udc00\"", -EINVAL);
+        test_one("\"\\ud801\\udc37\"", JSON_STRING, "\xf0\x90\x90\xb7", JSON_END);
 
         return 0;
 }

commit 04166cb7dd90918385835f246c43d8ec22af0d68
Author: Tom Gundersen <teg at jklm.no>
Date:   Mon Dec 22 12:57:05 2014 +0100

    shared: utf8 - support decoding the full utf16
    
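    We originally only supported the BMP (i.e., we treated UTF-16 as UCS-2).
    Surrogate pairs are now combined into a single code point (see RFC 2781,
    section 2.2); unpaired surrogates are skipped.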

diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index ab57f42..013c110 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -310,12 +310,42 @@ char *utf16_to_utf8(const void *s, size_t length) {
         const uint8_t *f;
         char *r, *t;
 
-        r = new(char, (length*3+1)/2 + 1);
+        r = new(char, (length * 4 + 1) / 2 + 1);
         if (!r)
                 return NULL;
 
-        for (f = s, t = r; f < (const uint8_t*) s + length; f += 2)
-                t += utf8_encode_unichar(t, (f[1] << 8) | f[0]);
+        f = s;
+        t = r;
+
+        while (f < (const uint8_t*) s + length) {
+                uint16_t w1, w2;
+
+                /* see RFC 2781 section 2.2 */
+
+                w1 = f[1] << 8 | f[0];
+                f += 2;
+
+                if (!utf16_is_surrogate(w1)) {
+                        t += utf8_encode_unichar(t, w1);
+
+                        continue;
+                }
+
+                if (utf16_is_trailing_surrogate(w1))
+                        continue;
+                else if (f >= (const uint8_t*) s + length)
+                        break;
+
+                w2 = f[1] << 8 | f[0];
+                f += 2;
+
+                if (!utf16_is_trailing_surrogate(w2)) {
+                        f -= 2;
+                        continue;
+                }
+
+                t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
+        }
 
         *t = 0;
         return r;
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index 3d5a4c3..77f6634 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -41,3 +41,15 @@ char *utf16_to_utf8(const void *s, size_t length);
 
 int utf8_encoded_valid_unichar(const char *str);
 int utf8_encoded_to_unichar(const char *str);
+
+static inline bool utf16_is_surrogate(uint16_t c) {
+        return (0xd800 <= c && c <= 0xdfff);
+}
+
+static inline bool utf16_is_trailing_surrogate(uint16_t c) {
+        return (0xdc00 <= c && c <= 0xdfff);
+}
+
+static inline uint32_t utf16_surrogate_pair_to_unichar(uint16_t lead, uint16_t trail) {
+                return ((lead - 0xd800) << 10) + (trail - 0xdc00) + 0x10000;
+}
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
index 3399f2b..befa385 100644
--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@@ -93,6 +93,18 @@ static void test_utf8_escaping_printable(void) {
         assert_se(utf8_is_valid(p6));
 }
 
+static void test_utf16_to_utf8(void) {
+        char *a = NULL;
+        const uint16_t utf16[] = { 'a', 0xd800, 'b', 0xdc00, 'c', 0xd801, 0xdc37 };
+        const char utf8[] = { 'a', 'b', 'c', 0xf0, 0x90, 0x90, 0xb7, 0 };
+
+        a = utf16_to_utf8(utf16, 14);
+        assert_se(a);
+        assert_se(streq(a, utf8));
+
+        free(a);
+}
+
 int main(int argc, char *argv[]) {
         test_utf8_is_valid();
         test_utf8_is_printable();
@@ -100,6 +112,7 @@ int main(int argc, char *argv[]) {
         test_utf8_encoded_valid_unichar();
         test_utf8_escaping();
         test_utf8_escaping_printable();
+        test_utf16_to_utf8();
 
         return 0;
 }

commit 2bb4c7e384c31de4727f1330da3f4de2f0bb7784
Author: Tom Gundersen <teg at jklm.no>
Date:   Mon Dec 22 00:58:26 2014 +0100

    shared: utf8 - support ucs4 -> utf8
    
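    Originally utf8_encode_unichar() only supported UCS-2, so move the UCS-4 capable
    encoder (term_utf8_encode()) from libsystemd-terminal to shared, rename it to
    utf8_encode_unichar() and use it everywhere. Note that the argument order
    changes: the output buffer now comes first, the character second.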

diff --git a/src/libsystemd-terminal/subterm.c b/src/libsystemd-terminal/subterm.c
index 78efc9d..7c119ac 100644
--- a/src/libsystemd-terminal/subterm.c
+++ b/src/libsystemd-terminal/subterm.c
@@ -41,6 +41,7 @@
 #include "sd-event.h"
 #include "term-internal.h"
 #include "util.h"
+#include "utf8.h"
 
 typedef struct Output Output;
 typedef struct Terminal Terminal;
@@ -459,7 +460,7 @@ static int output_draw_cell_fn(term_screen *screen,
                 output_printf(o, " ");
         } else {
                 for (k = 0; k < n_ch; ++k) {
-                        ulen = term_utf8_encode(utf8, ch[k]);
+                        ulen = utf8_encode_unichar(utf8, ch[k]);
                         output_write(o, utf8, ulen);
                 }
         }
@@ -625,7 +626,7 @@ static int terminal_push_tmp(Terminal *t, uint32_t ucs4) {
 
         assert(t);
 
-        len = term_utf8_encode(buf, ucs4);
+        len = utf8_encode_unichar(buf, ucs4);
         if (len < 1)
                 return 0;
 
diff --git a/src/libsystemd-terminal/term-parser.c b/src/libsystemd-terminal/term-parser.c
index d8206a4..8dc1da2 100644
--- a/src/libsystemd-terminal/term-parser.c
+++ b/src/libsystemd-terminal/term-parser.c
@@ -152,49 +152,6 @@ void term_attr_to_argb32(const term_attr *attr, uint32_t *fg, uint32_t *bg, cons
 }
 
 /**
- * term_utf8_encode() - Encode single UCS-4 character as UTF-8
- * @out_utf8: output buffer of at least 4 bytes or NULL
- * @g: UCS-4 character to encode
- *
- * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
- * The length of the character is returned. It is not zero-terminated! If the
- * output buffer is NULL, only the length is returned.
- *
- * Returns: The length in bytes that the UTF-8 representation does or would
- *          occupy.
- */
-size_t term_utf8_encode(char *out_utf8, uint32_t g) {
-        if (g < (1 << 7)) {
-                if (out_utf8)
-                        out_utf8[0] = g & 0x7f;
-                return 1;
-        } else if (g < (1 << 11)) {
-                if (out_utf8) {
-                        out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
-                        out_utf8[1] = 0x80 | (g & 0x3f);
-                }
-                return 2;
-        } else if (g < (1 << 16)) {
-                if (out_utf8) {
-                        out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
-                        out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
-                        out_utf8[2] = 0x80 | (g & 0x3f);
-                }
-                return 3;
-        } else if (g < (1 << 21)) {
-                if (out_utf8) {
-                        out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
-                        out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
-                        out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
-                        out_utf8[3] = 0x80 | (g & 0x3f);
-                }
-                return 4;
-        } else {
-                return 0;
-        }
-}
-
-/**
  * term_utf8_decode() - Try decoding the next UCS-4 character
  * @p: decoder object to operate on or NULL
  * @out_len: output storage for pointer to decoded UCS-4 string or NULL
diff --git a/src/libsystemd-terminal/term-screen.c b/src/libsystemd-terminal/term-screen.c
index f021ffe..0e38ff4 100644
--- a/src/libsystemd-terminal/term-screen.c
+++ b/src/libsystemd-terminal/term-screen.c
@@ -51,6 +51,7 @@
 #include "macro.h"
 #include "term-internal.h"
 #include "util.h"
+#include "utf8.h"
 
 int term_screen_new(term_screen **out, term_screen_write_fn write_fn, void *write_fn_data, term_screen_cmd_fn cmd_fn, void *cmd_fn_data) {
         _cleanup_(term_screen_unrefp) term_screen *screen = NULL;
@@ -4107,7 +4108,7 @@ static char *screen_map_key(term_screen *screen,
 
         /* map unicode keys */
         for (i = 0; i < n_syms; ++i)
-                p += term_utf8_encode(p, ucs4[i]);
+                p += utf8_encode_unichar(p, ucs4[i]);
 
         return p;
 }
diff --git a/src/libsystemd-terminal/term.h b/src/libsystemd-terminal/term.h
index eae6c63..1a78a81 100644
--- a/src/libsystemd-terminal/term.h
+++ b/src/libsystemd-terminal/term.h
@@ -112,7 +112,6 @@ struct term_utf8 {
         unsigned int valid : 1;
 };
 
-size_t term_utf8_encode(char *out_utf8, uint32_t g);
 size_t term_utf8_decode(term_utf8 *p, uint32_t **out_buf, char c);
 
 /*
diff --git a/src/libsystemd-terminal/test-term-parser.c b/src/libsystemd-terminal/test-term-parser.c
index e8d5dcf..e22614d 100644
--- a/src/libsystemd-terminal/test-term-parser.c
+++ b/src/libsystemd-terminal/test-term-parser.c
@@ -30,6 +30,7 @@
 #include "macro.h"
 #include "term-internal.h"
 #include "util.h"
+#include "utf8.h"
 
 static void test_term_utf8_invalid(void) {
         term_utf8 p = { };
@@ -74,7 +75,7 @@ static void test_term_utf8_range(void) {
         /* Convert all ucs-4 chars to utf-8 and back */
 
         for (i = 0; i < 0x10FFFF; ++i) {
-                ulen = term_utf8_encode(u8, i);
+                ulen = utf8_encode_unichar(u8, i);
                 if (!ulen)
                         continue;
 
diff --git a/src/shared/json.c b/src/shared/json.c
index f1495e9..47f801c 100644
--- a/src/shared/json.c
+++ b/src/shared/json.c
@@ -150,7 +150,7 @@ static int json_parse_string(const char **p, char **ret) {
                                 if (!GREEDY_REALLOC(s, allocated, n + 4))
                                         return -ENOMEM;
 
-                                n += utf8_encode_unichar(x, s + n);
+                                n += utf8_encode_unichar(s + n, x);
                                 c += 5;
                                 continue;
                         } else
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 03a0abe..ab57f42 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -263,21 +263,46 @@ char *ascii_is_valid(const char *str) {
         return (char*) str;
 }
 
-int utf8_encode_unichar(uint16_t c, char *p) {
-        uint8_t *t = (uint8_t*) p;
-
-        if (c < 0x80) {
-                t[0] = (uint8_t) c;
+/**
+ * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
+ * @out_utf8: output buffer of at least 4 bytes or NULL
+ * @g: UCS-4 character to encode
+ *
+ * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
+ * The length of the character is returned. It is not zero-terminated! If the
+ * output buffer is NULL, only the length is returned.
+ *
+ * Returns: The length in bytes that the UTF-8 representation does or would
+ *          occupy.
+ */
+size_t utf8_encode_unichar(char *out_utf8, uint32_t g) {
+        if (g < (1 << 7)) {
+                if (out_utf8)
+                        out_utf8[0] = g & 0x7f;
                 return 1;
-        } else if (c < 0x800) {
-                t[0] = (uint8_t) (0xc0 | (c >> 6));
-                t[1] = (uint8_t) (0x80 | (c & 0x3f));
+        } else if (g < (1 << 11)) {
+                if (out_utf8) {
+                        out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
+                        out_utf8[1] = 0x80 | (g & 0x3f);
+                }
                 return 2;
-        } else {
-                t[0] = (uint8_t) (0xe0 | (c >> 12));
-                t[1] = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
-                t[2] = (uint8_t) (0x80 | (c & 0x3f));
+        } else if (g < (1 << 16)) {
+                if (out_utf8) {
+                        out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
+                        out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
+                        out_utf8[2] = 0x80 | (g & 0x3f);
+                }
                 return 3;
+        } else if (g < (1 << 21)) {
+                if (out_utf8) {
+                        out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
+                        out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
+                        out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
+                        out_utf8[3] = 0x80 | (g & 0x3f);
+                }
+                return 4;
+        } else {
+                return 0;
         }
 }
 
@@ -290,7 +315,7 @@ char *utf16_to_utf8(const void *s, size_t length) {
                 return NULL;
 
         for (f = s, t = r; f < (const uint8_t*) s + length; f += 2)
-                t += utf8_encode_unichar((f[1] << 8) | f[0], t);
+                t += utf8_encode_unichar(t, (f[1] << 8) | f[0]);
 
         *t = 0;
         return r;
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index dcf8588..3d5a4c3 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -36,7 +36,7 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool newline) _pu
 char *utf8_escape_invalid(const char *s);
 char *utf8_escape_non_printable(const char *str);
 
-int utf8_encode_unichar(uint16_t c, char *p);
+size_t utf8_encode_unichar(char *out_utf8, uint32_t g);
 char *utf16_to_utf8(const void *s, size_t length);
 
 int utf8_encoded_valid_unichar(const char *str);
diff --git a/src/test/test-json.c b/src/test/test-json.c
index 0076835..e53e8ed 100644
--- a/src/test/test-json.c
+++ b/src/test/test-json.c
@@ -98,6 +98,7 @@ int main(int argc, char *argv[]) {
         test_one("{\"foo\" : [true, false]}", JSON_OBJECT_OPEN, JSON_STRING, "foo", JSON_COLON, JSON_ARRAY_OPEN, JSON_BOOLEAN, true, JSON_COMMA, JSON_BOOLEAN, false, JSON_ARRAY_CLOSE, JSON_OBJECT_CLOSE, JSON_END);
         test_one("\"\xef\xbf\xbd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
         test_one("\"\\ufffd\"", JSON_STRING, "\xef\xbf\xbd", JSON_END);
+        test_one("\"\\uf\"", -EINVAL);
 
         return 0;
 }