[systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable
David Herrmann
dh.herrmann at gmail.com
Wed Nov 19 03:35:14 PST 2014
Hi
On Wed, Nov 12, 2014 at 11:49 AM, WaLyong Cho <walyong.cho at samsung.com> wrote:
> ---
> src/shared/utf8.c | 39 +++++++++++++++++++++++++++++++++++++++
> src/shared/utf8.h | 1 +
> src/test/test-utf8.c | 25 +++++++++++++++++++++++++
> 3 files changed, 65 insertions(+)
>
> diff --git a/src/shared/utf8.c b/src/shared/utf8.c
> index 8702ceb..0b6c38e 100644
> --- a/src/shared/utf8.c
> +++ b/src/shared/utf8.c
> @@ -212,6 +212,45 @@ char *utf8_escape_invalid(const char *str) {
> return p;
> }
>
> +char *utf8_escape_non_printable(const char *str) {
> + char *p, *s;
> +
> + assert(str);
> +
> + p = s = malloc(strlen(str) * 4 + 1);
> + if (!p)
> + return NULL;
> +
> + while (*str) {
> + int len;
> +
> + len = utf8_encoded_valid_unichar(str);
> + if (len > 0) {
> + if (utf8_is_printable(str, len)) {
> + s = mempcpy(s, str, len);
> + str += len;
> + } else {
> + if ((*str < ' ') || (*str >= 127)) {
> + *(s++) = '\\';
> + *(s++) = 'x';
> + *(s++) = hexchar((int) *str >> 4);
> + *(s++) = hexchar((int) *str);
> + } else
> + *(s++) = *str;
> +
> + str += 1;
This part is wrong. You cannot rely on ``*str'' to be the correct
Unicode value for the character. utf8_is_printable() returns false
also for multi-byte UTF8 characters. By taking it unmodified, it will
include the UTF8 management bits, which we really don't want here.
If you really want this, I'd prefer if you decode each UTF8 character,
and if it is non-printable you print "\uABCD" or "\UABCDWXYZ" (like
C++ does) as a 6-byte or 10-byte sequence. Other characters are just
printed normally.
Thanks
David
> + }
> + } else {
> + s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER));
> + str += 1;
> + }
> + }
> +
> + *s = '\0';
> +
> + return p;
> +}
> +
> char *ascii_is_valid(const char *str) {
> const char *p;
>
> diff --git a/src/shared/utf8.h b/src/shared/utf8.h
> index c087995..1fe1a35 100644
> --- a/src/shared/utf8.h
> +++ b/src/shared/utf8.h
> @@ -30,6 +30,7 @@
> const char *utf8_is_valid(const char *s) _pure_;
> char *ascii_is_valid(const char *s) _pure_;
> char *utf8_escape_invalid(const char *s);
> +char *utf8_escape_non_printable(const char *str);
>
> bool utf8_is_printable_newline(const char* str, size_t length, bool newline) _pure_;
> _pure_ static inline bool utf8_is_printable(const char* str, size_t length) {
> diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
> index b7d988f..6dde63c 100644
> --- a/src/test/test-utf8.c
> +++ b/src/test/test-utf8.c
> @@ -66,12 +66,37 @@ static void test_utf8_escaping(void) {
> assert_se(utf8_is_valid(p3));
> }
>
> +static void test_utf8_escaping_printable(void) {
> + _cleanup_free_ char *p1, *p2, *p3, *p4, *p5;
> +
> + p1 = utf8_escape_non_printable("goo goo goo");
> + puts(p1);
> + assert_se(utf8_is_valid(p1));
> +
> + p2 = utf8_escape_non_printable("\341\204\341\204");
> + puts(p2);
> + assert_se(utf8_is_valid(p2));
> +
> + p3 = utf8_escape_non_printable("\341\204");
> + puts(p3);
> + assert_se(utf8_is_valid(p3));
> +
> + p4 = utf8_escape_non_printable("ąę\n가너도루\n1234\n\341\204\341\204\n\001 \019\20\a");
> + puts(p4);
> + assert_se(utf8_is_valid(p4));
> +
> + p5 = utf8_escape_non_printable("\001 \019\20\a");
> + puts(p5);
> + assert_se(utf8_is_valid(p5));
> +}
> +
> int main(int argc, char *argv[]) {
> test_utf8_is_valid();
> test_utf8_is_printable();
> test_ascii_is_valid();
> test_utf8_encoded_valid_unichar();
> test_utf8_escaping();
> + test_utf8_escaping_printable();
>
> return 0;
> }
> --
> 1.9.3
>
> _______________________________________________
> systemd-devel mailing list
> systemd-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/systemd-devel
More information about the systemd-devel
mailing list