[systemd-devel] [PATCH 4/4] utf8: implement utf8_filter, apply to env file reading
Dave Reisner
dreisner at archlinux.org
Wed Sep 18 12:56:55 PDT 2013
This bring backs utf8_filter from the dead, but separates the
implementation from utf8_validate, since the two functions really
shouldn't be one and the same. The new version offers the option to
replace invalid utf8 sequences with a single character rather than
silently dropping them.
Using this new function, resolve some FIXMEs in src/shared/fileio.c.
---
Is it worth whining (via log_warning) about the invalid utf8 sequences when
replacing them?
src/shared/fileio.c | 78 +++++++++++++++++++++++++++-----------------------
src/shared/utf8.c | 37 ++++++++++++++++++++++++
src/shared/utf8.h | 1 +
src/test/test-fileio.c | 12 ++++++--
src/test/test-utf8.c | 25 ++++++++++++++++
5 files changed, 114 insertions(+), 39 deletions(-)
diff --git a/src/shared/fileio.c b/src/shared/fileio.c
index 7b12c96..3d02f3e 100644
--- a/src/shared/fileio.c
+++ b/src/shared/fileio.c
@@ -461,35 +461,38 @@ fail:
static int parse_env_file_push(const char *filename, unsigned line,
const char *key, char *value, void *userdata) {
+ const char *k;
+ va_list* ap = (va_list*) userdata;
+ va_list aq;
+
assert(utf8_validate(key));
- if (value && !utf8_validate(value))
- /* FIXME: filter UTF-8 */
- log_error("%s:%u: invalid UTF-8 for key %s: '%s', ignoring.",
- filename, line, key, value);
- else {
- const char *k;
- va_list* ap = (va_list*) userdata;
- va_list aq;
+ if (value && !utf8_validate(value)) {
+ char *filtered = utf8_filter(value, '_');
+ if (filtered == NULL)
+ return log_oom();
- va_copy(aq, *ap);
+ free(value);
+ value = filtered;
+ }
- while ((k = va_arg(aq, const char *))) {
- char **v;
+ va_copy(aq, *ap);
- v = va_arg(aq, char **);
+ while ((k = va_arg(aq, const char *))) {
+ char **v;
- if (streq(key, k)) {
- va_end(aq);
- free(*v);
- *v = value;
- return 1;
- }
- }
+ v = va_arg(aq, char **);
- va_end(aq);
+ if (streq(key, k)) {
+ va_end(aq);
+ free(*v);
+ *v = value;
+ return 1;
+ }
}
+ va_end(aq);
+
free(value);
return 0;
}
@@ -513,26 +516,29 @@ int parse_env_file(
static int load_env_file_push(const char *filename, unsigned line,
const char *key, char *value, void *userdata) {
+ char ***m = userdata;
+ char *p;
+ int r;
+
assert(utf8_validate(key));
- if (value && !utf8_validate(value))
- /* FIXME: filter UTF-8 */
- log_error("%s:%u: invalid UTF-8 for key %s: '%s', ignoring.",
- filename, line, key, value);
- else {
- char ***m = userdata;
- char *p;
- int r;
+ if (value && !utf8_validate(value)) {
+ char *filtered = utf8_filter(value, '_');
+ if (filtered == NULL)
+ return log_oom();
- p = strjoin(key, "=", strempty(value), NULL);
- if (!p)
- return -ENOMEM;
+ free(value);
+ value = filtered;
+ }
- r = strv_push(m, p);
- if (r < 0) {
- free(p);
- return r;
- }
+ p = strjoin(key, "=", strempty(value), NULL);
+ if (!p)
+ return -ENOMEM;
+
+ r = strv_push(m, p);
+ if (r < 0) {
+ free(p);
+ return r;
}
free(value);
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index ee15f80..80684d8 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -170,6 +170,43 @@ const char *utf8_validate(const char *str) {
return str;
}
+char *utf8_filter(const char *str, char substitute) {
+ const uint8_t *p;
+ char *o, *out;
+ bool last_invalid = false;
+
+ if (str == NULL)
+ return NULL;
+
+ out = new0(char, strlen(str) + 1);
+ if (out == NULL)
+ return NULL;
+ o = out;
+
+ for (p = (const uint8_t*) str; *p; ) {
+ int len = utf8_encoded_valid_unichar((const char *)p);
+
+ if (len < 0) {
+ /* avoid writing multiple substitution chars for a
+ * "single" invalid unicode char */
+ if (substitute != '\0' && !last_invalid)
+ *o++ = substitute;
+ p++;
+
+ last_invalid = true;
+ continue;
+ }
+
+ memcpy(o, p, len);
+ last_invalid = false;
+
+ o += len;
+ p += len;
+ }
+
+ return out;
+}
+
char *ascii_is_valid(const char *str) {
const char *p;
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index d843eea..d328d67 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -31,6 +31,7 @@ char *ascii_is_valid(const char *s) _pure_;
bool utf8_is_printable(const char* str, size_t length) _pure_;
char *ascii_filter(const char *s);
+char *utf8_filter(const char *s, char substitute);
char *utf16_to_utf8(const void *s, size_t length);
diff --git a/src/test/test-fileio.c b/src/test/test-fileio.c
index 525354b..81f38b0 100644
--- a/src/test/test-fileio.c
+++ b/src/test/test-fileio.c
@@ -34,7 +34,8 @@ static void test_parse_env_file(void) {
int fd, r;
FILE *f;
_cleanup_free_ char *one = NULL, *two = NULL, *three = NULL, *four = NULL, *five = NULL,
- *six = NULL, *seven = NULL, *eight = NULL, *nine = NULL, *ten = NULL;
+ *six = NULL, *seven = NULL, *eight = NULL, *nine = NULL, *ten = NULL,
+ *invalid = NULL;
_cleanup_strv_free_ char **a = NULL, **b = NULL;
char **i;
unsigned k;
@@ -63,7 +64,8 @@ static void test_parse_env_file(void) {
"seven=\"sevenval\" #nocomment\n"
"eight=eightval #nocomment\n"
"export nine=nineval\n"
- "ten=", f);
+ "ten=\n"
+ "invalid=uni\341\204ode", f);
fflush(f);
fclose(f);
@@ -84,7 +86,8 @@ static void test_parse_env_file(void) {
assert_se(streq(a[7], "eight=eightval #nocomment"));
assert_se(streq(a[8], "export nine=nineval"));
assert_se(streq(a[9], "ten="));
- assert_se(a[10] == NULL);
+ assert_se(streq(a[10], "invalid=uni_ode"));
+ assert_se(a[11] == NULL);
strv_env_clean_log(a, "test");
@@ -106,6 +109,7 @@ static void test_parse_env_file(void) {
"eight", &eight,
"export nine", &nine,
"ten", &ten,
+ "invalid", &invalid,
NULL);
assert_se(r >= 0);
@@ -120,6 +124,7 @@ static void test_parse_env_file(void) {
log_info("eight=[%s]", strna(eight));
log_info("export nine=[%s]", strna(nine));
log_info("ten=[%s]", strna(nine));
+ log_info("invalid=[%s]", strna(invalid));
assert_se(streq(one, "BAR"));
assert_se(streq(two, "bar"));
@@ -131,6 +136,7 @@ static void test_parse_env_file(void) {
assert_se(streq(eight, "eightval #nocomment"));
assert_se(streq(nine, "nineval"));
assert_se(ten == NULL);
+ assert_se(streq(invalid, "uni_ode"));
r = write_env_file(p, a);
assert_se(r >= 0);
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
index e753950..9b5229a 100644
--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@@ -65,12 +65,37 @@ static void test_utf8_encoded_valid_unichar(void) {
}
+static void test_utf8_filter(void) {
+ char *f;
+
+ f = utf8_filter("foo\341\204bar", '_');
+ puts(f);
+ assert_se(streq(f, "foo_bar"));
+ free(f);
+
+ f = utf8_filter("foo\341\204bar", '\0');
+ puts(f);
+ assert_se(streq(f, "foobar"));
+ free(f);
+
+ f = utf8_filter("\341\204\341\204\341\204", '\0');
+ puts(f);
+ assert_se(streq(f, ""));
+ free(f);
+
+ f = utf8_filter("\341\204\341\204\341\204", '?');
+ puts(f);
+ assert_se(streq(f, "?"));
+ free(f);
+}
+
int main(int argc, char *argv[]) {
test_utf8_validate();
test_utf8_is_printable();
test_ascii_is_valid();
test_ascii_filter();
test_utf8_encoded_valid_unichar();
+ test_utf8_filter();
return 0;
}
--
1.8.4
More information about the systemd-devel
mailing list