[systemd-devel] [PATCH 4/4] utf8: implement utf8_filter, apply to env file reading

Dave Reisner dreisner at archlinux.org
Wed Sep 18 12:56:55 PDT 2013


This bring backs utf8_filter from the dead, but separates the
implementation from utf8_validate, since the two functions really
shouldn't be one and the same. The new version offers the option to
replace invalid utf8 sequences with a single character rather than
silently dropping them.

Using this new function, resolve some FIXMEs in src/shared/fileio.c.
---
Is it worth whining (via log_warning) about the invalid utf8 sequences when
replacing them?

 src/shared/fileio.c    | 78 +++++++++++++++++++++++++++-----------------------
 src/shared/utf8.c      | 37 ++++++++++++++++++++++++
 src/shared/utf8.h      |  1 +
 src/test/test-fileio.c | 12 ++++++--
 src/test/test-utf8.c   | 25 ++++++++++++++++
 5 files changed, 114 insertions(+), 39 deletions(-)

diff --git a/src/shared/fileio.c b/src/shared/fileio.c
index 7b12c96..3d02f3e 100644
--- a/src/shared/fileio.c
+++ b/src/shared/fileio.c
@@ -461,35 +461,38 @@ fail:
 
 static int parse_env_file_push(const char *filename, unsigned line,
                                const char *key, char *value, void *userdata) {
+        const char *k;
+        va_list* ap = (va_list*) userdata;
+        va_list aq;
+
         assert(utf8_validate(key));
 
-        if (value && !utf8_validate(value))
-                /* FIXME: filter UTF-8 */
-                log_error("%s:%u: invalid UTF-8 for key %s: '%s', ignoring.",
-                          filename, line, key, value);
-        else {
-                const char *k;
-                va_list* ap = (va_list*) userdata;
-                va_list aq;
+        if (value && !utf8_validate(value)) {
+                char *filtered = utf8_filter(value, '_');
+                if (filtered == NULL)
+                        return log_oom();
 
-                va_copy(aq, *ap);
+                free(value);
+                value = filtered;
+        }
 
-                while ((k = va_arg(aq, const char *))) {
-                        char **v;
+        va_copy(aq, *ap);
 
-                        v = va_arg(aq, char **);
+        while ((k = va_arg(aq, const char *))) {
+                char **v;
 
-                        if (streq(key, k)) {
-                                va_end(aq);
-                                free(*v);
-                                *v = value;
-                                return 1;
-                        }
-                }
+                v = va_arg(aq, char **);
 
-                va_end(aq);
+                if (streq(key, k)) {
+                        va_end(aq);
+                        free(*v);
+                        *v = value;
+                        return 1;
+                }
         }
 
+        va_end(aq);
+
         free(value);
         return 0;
 }
@@ -513,26 +516,29 @@ int parse_env_file(
 
 static int load_env_file_push(const char *filename, unsigned line,
                               const char *key, char *value, void *userdata) {
+        char ***m = userdata;
+        char *p;
+        int r;
+
         assert(utf8_validate(key));
 
-        if (value && !utf8_validate(value))
-                /* FIXME: filter UTF-8 */
-                log_error("%s:%u: invalid UTF-8 for key %s: '%s', ignoring.",
-                          filename, line, key, value);
-        else {
-                char ***m = userdata;
-                char *p;
-                int r;
+        if (value && !utf8_validate(value)) {
+                char *filtered = utf8_filter(value, '_');
+                if (filtered == NULL)
+                        return log_oom();
 
-                p = strjoin(key, "=", strempty(value), NULL);
-                if (!p)
-                        return -ENOMEM;
+                free(value);
+                value = filtered;
+        }
 
-                r = strv_push(m, p);
-                if (r < 0) {
-                        free(p);
-                        return r;
-                }
+        p = strjoin(key, "=", strempty(value), NULL);
+        if (!p)
+                return -ENOMEM;
+
+        r = strv_push(m, p);
+        if (r < 0) {
+                free(p);
+                return r;
         }
 
         free(value);
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index ee15f80..80684d8 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -170,6 +170,43 @@ const char *utf8_validate(const char *str) {
         return str;
 }
 
+char *utf8_filter(const char *str, char substitute) {
+        const uint8_t *p;
+        char *o, *out;
+        bool last_invalid = false;
+
+        if (str == NULL)
+                return NULL;
+
+        out = new0(char, strlen(str) + 1);
+        if (out == NULL)
+                return NULL;
+        o = out;
+
+        for (p = (const uint8_t*) str; *p; ) {
+                int len = utf8_encoded_valid_unichar((const char *)p);
+
+                if (len < 0) {
+                        /* avoid writing multiple substitution chars for a
+                         * "single" invalid unicode char */
+                        if (substitute != '\0' && !last_invalid)
+                                *o++ = substitute;
+                        p++;
+
+                        last_invalid = true;
+                        continue;
+                }
+
+                memcpy(o, p, len);
+                last_invalid = false;
+
+                o += len;
+                p += len;
+        }
+
+        return out;
+}
+
 char *ascii_is_valid(const char *str) {
         const char *p;
 
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index d843eea..d328d67 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -31,6 +31,7 @@ char *ascii_is_valid(const char *s) _pure_;
 bool utf8_is_printable(const char* str, size_t length) _pure_;
 
 char *ascii_filter(const char *s);
+char *utf8_filter(const char *s, char substitute);
 
 char *utf16_to_utf8(const void *s, size_t length);
 
diff --git a/src/test/test-fileio.c b/src/test/test-fileio.c
index 525354b..81f38b0 100644
--- a/src/test/test-fileio.c
+++ b/src/test/test-fileio.c
@@ -34,7 +34,8 @@ static void test_parse_env_file(void) {
         int fd, r;
         FILE *f;
         _cleanup_free_ char *one = NULL, *two = NULL, *three = NULL, *four = NULL, *five = NULL,
-                        *six = NULL, *seven = NULL, *eight = NULL, *nine = NULL, *ten = NULL;
+                        *six = NULL, *seven = NULL, *eight = NULL, *nine = NULL, *ten = NULL,
+                        *invalid = NULL;
         _cleanup_strv_free_ char **a = NULL, **b = NULL;
         char **i;
         unsigned k;
@@ -63,7 +64,8 @@ static void test_parse_env_file(void) {
               "seven=\"sevenval\" #nocomment\n"
               "eight=eightval #nocomment\n"
               "export nine=nineval\n"
-              "ten=", f);
+              "ten=\n"
+              "invalid=uni\341\204ode", f);
 
         fflush(f);
         fclose(f);
@@ -84,7 +86,8 @@ static void test_parse_env_file(void) {
         assert_se(streq(a[7], "eight=eightval #nocomment"));
         assert_se(streq(a[8], "export nine=nineval"));
         assert_se(streq(a[9], "ten="));
-        assert_se(a[10] == NULL);
+        assert_se(streq(a[10], "invalid=uni_ode"));
+        assert_se(a[11] == NULL);
 
         strv_env_clean_log(a, "test");
 
@@ -106,6 +109,7 @@ static void test_parse_env_file(void) {
                        "eight", &eight,
                        "export nine", &nine,
                        "ten", &ten,
+                       "invalid", &invalid,
                        NULL);
 
         assert_se(r >= 0);
@@ -120,6 +124,7 @@ static void test_parse_env_file(void) {
         log_info("eight=[%s]", strna(eight));
         log_info("export nine=[%s]", strna(nine));
         log_info("ten=[%s]", strna(nine));
+        log_info("invalid=[%s]", strna(invalid));
 
         assert_se(streq(one, "BAR"));
         assert_se(streq(two, "bar"));
@@ -131,6 +136,7 @@ static void test_parse_env_file(void) {
         assert_se(streq(eight, "eightval #nocomment"));
         assert_se(streq(nine, "nineval"));
         assert_se(ten == NULL);
+        assert_se(streq(invalid, "uni_ode"));
 
         r = write_env_file(p, a);
         assert_se(r >= 0);
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
index e753950..9b5229a 100644
--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@@ -65,12 +65,37 @@ static void test_utf8_encoded_valid_unichar(void) {
 
 }
 
+static void test_utf8_filter(void) {
+        char *f;
+
+        f = utf8_filter("foo\341\204bar", '_');
+        puts(f);
+        assert_se(streq(f, "foo_bar"));
+        free(f);
+
+        f = utf8_filter("foo\341\204bar", '\0');
+        puts(f);
+        assert_se(streq(f, "foobar"));
+        free(f);
+
+        f = utf8_filter("\341\204\341\204\341\204", '\0');
+        puts(f);
+        assert_se(streq(f, ""));
+        free(f);
+
+        f = utf8_filter("\341\204\341\204\341\204", '?');
+        puts(f);
+        assert_se(streq(f, "?"));
+        free(f);
+}
+
 int main(int argc, char *argv[]) {
         test_utf8_validate();
         test_utf8_is_printable();
         test_ascii_is_valid();
         test_ascii_filter();
         test_utf8_encoded_valid_unichar();
+        test_utf8_filter();
 
         return 0;
 }
-- 
1.8.4



More information about the systemd-devel mailing list