[systemd-devel] [PATCH 1/3] util: ellipsize_mem: do not print partial utf-8 characters
Shawn Landden
shawn at churchofgit.com
Wed Sep 11 07:47:37 PDT 2013
---
Makefile.am | 7 +++
TODO | 4 --
src/shared/utf8.c | 120 +++++++++++++++++++++++++++++++++++++++++++++
src/shared/utf8.h | 100 +++++++++++++++++++++++++++++++++++++
src/shared/util.c | 83 +++++++++++++++++++++++++++++--
src/shared/util.h | 3 ++
src/test/test-wellipsize.c | 42 ++++++++++++++++
7 files changed, 351 insertions(+), 8 deletions(-)
create mode 100644 src/test/test-wellipsize.c
diff --git a/Makefile.am b/Makefile.am
index cdbfdea..8813299 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1124,6 +1124,7 @@ tests += \
test-unit-file \
test-util \
test-date \
+ test-wellipsize \
test-sleep \
test-replace-var \
test-sched-prio \
@@ -1302,6 +1303,12 @@ test_date_SOURCES = \
test_date_LDADD = \
libsystemd-core.la
+test_wellipsize_SOURCES = \
+ src/test/test-wellipsize.c
+
+test_wellipsize_LDADD = \
+ libsystemd-core.la
+
test_sleep_SOURCES = \
src/test/test-sleep.c
diff --git a/TODO b/TODO
index fe305ec..a77ebe5 100644
--- a/TODO
+++ b/TODO
@@ -19,10 +19,6 @@ Bugfixes:
* properly handle .mount unit state tracking when two mount points are stacked one on top of another on the exact same mount point.
-* ellipsize_mem must take into account multi-byte unicode characters, and
- - make the resulting line the requested number of *characters*, not *bytes*,
- - avoid truncuating multi-byte sequences in the middle.
-
* When we detect invalid UTF-8, we cant't use it in an error message:
log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue);
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 655cc77..a9308b5 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -22,6 +22,11 @@
/* This file is based on the GLIB utf8 validation functions. The
* original license text follows. */
+/* gunicode.h - Unicode manipulation functions
+ *
+ * Copyright (C) 1999, 2000 Tom Tromey
+ * Copyright 2000, 2005 Red Hat, Inc.
+ */
/* gutf8.c - Operations on UTF-8 strings.
*
* Copyright (C) 1999 Tom Tromey
@@ -317,3 +322,118 @@ char *utf16_to_utf8(const void *s, size_t length) {
return r;
}
+
+/**
+ * utf8_prev_char (derived from glib's g_utf8_prev_char):
+ * @p: a pointer to a position within a UTF-8 encoded string
+ *
+ * Finds the previous UTF-8 character in the string before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. There is no lower bound: if @p is
+ * already the start of the string, this reads before the buffer.
+ *
+ * Return value: a pointer to the found character.
+ **/
+char *
+utf8_prev_char (const char *p)
+{
+ while (1)
+ {
+ p--; /* step back one byte at a time */
+ if ((*p & 0xc0) != 0x80) /* not a 10xxxxxx continuation byte: a character starts here */
+ return (char *)p;
+ }
+}
+
+/**
+ * utf8_get_char (derived from glib's g_utf8_get_char):
+ * @p: a pointer to Unicode character encoded as UTF-8
+ *
+ * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
+ * The lead byte selects the sequence length (1 to 6 bytes); every
+ * following byte must be a 10xxxxxx continuation byte. Overlong
+ * encodings, surrogates and out-of-range values are not rejected by
+ * this function.
+ *
+ * Return value: the resulting character, or (unichar)-1 if the lead byte or a continuation byte is invalid
+ **/
+unichar
+utf8_get_char (const char *p)
+{
+ int i, mask = 0, len;
+ unichar result;
+ unsigned char c = (unsigned char) *p;
+
+ UTF8_COMPUTE (c, mask, len); /* derive payload mask and sequence length from the lead byte */
+ if (len == -1) /* c cannot start a UTF-8 sequence */
+ return (unichar)-1;
+ UTF8_GET (result, p, i, mask, len); /* fold in the continuation bytes; result becomes -1 if one is malformed */
+
+ return result;
+}
+
+/* Inclusive range [start, end] of Unicode code points. */
+struct Interval
+{
+  unichar start, end;
+};
+
+/* bsearch() comparator. @key points to the unichar being looked up and @elt
+ * to a struct Interval; the result is negative, zero or positive when the
+ * code point lies below, inside or above the interval. */
+static int
+interval_compare (const void *key, const void *elt)
+{
+  unichar c = *(const unichar *) key;
+  const struct Interval *interval = elt;
+
+  if (c < interval->start)
+    return -1;
+  if (c > interval->end)
+    return +1;
+
+  return 0;
+}
+
+/*
+ * NOTE:
+ *
+ * The table for unichar_iswide() comes from glib, where the tables for
+ * g_unichar_iswide() and g_unichar_iswide_cjk() are generated from the
+ * Unicode Character Database's file
+ * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py
+ * in this way:
+ *
+ *   ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt
+ *
+ * Last update for Unicode 6.0.
+ */
+
+/**
+ * unichar_iswide (derived from glib's g_unichar_iswide):
+ * @c: a Unicode character
+ *
+ * Determines if a character is typically rendered in a double-width
+ * cell.
+ *
+ * Return value: true if the character is wide, false otherwise
+ **/
+bool
+unichar_iswide (unichar c)
+{
+  /* See NOTE earlier for how to update this table. bsearch() needs the
+   * intervals to stay sorted in ascending order and non-overlapping. */
+  static const struct Interval wide[] = {
+    {0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+    {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096},
+    {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA},
+    {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE},
+    {0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C},
+    {0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
+    {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6},
+    {0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A},
+    {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD},
+    {0x30000, 0x3FFFD}
+  };
+
+  /* The key is passed by address; the code point is not disguised as a
+   * pointer value, which would be an implementation-defined conversion. */
+  return bsearch (&c, wide, sizeof (wide) / sizeof (wide[0]), sizeof (wide[0]),
+                  interval_compare) != NULL;
+}
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index f805ea6..f1be180 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -34,3 +34,103 @@ char *utf8_filter(const char *s);
char *ascii_filter(const char *s);
char *utf16_to_utf8(const void *s, size_t length);
+
+typedef uint32_t unichar; /* one Unicode code point; a real type instead of a textual macro alias */
+
+char *utf8_prev_char (const char *p);
+unichar utf8_get_char (const char *p);
+
+#define UTF8_COMPUTE(Char, Mask, Len) /* classify lead byte Char: Len = sequence length, Mask = payload bits of the lead byte; expands to a bare if/else chain and evaluates Char several times */ \
+ if (Char < 128) /* 0xxxxxxx: single byte (ASCII) */ \
+ { \
+ Len = 1; \
+ Mask = 0x7f; \
+ } \
+ else if ((Char & 0xe0) == 0xc0) /* 110xxxxx */ \
+ { \
+ Len = 2; \
+ Mask = 0x1f; \
+ } \
+ else if ((Char & 0xf0) == 0xe0) /* 1110xxxx */ \
+ { \
+ Len = 3; \
+ Mask = 0x0f; \
+ } \
+ else if ((Char & 0xf8) == 0xf0) /* 11110xxx */ \
+ { \
+ Len = 4; \
+ Mask = 0x07; \
+ } \
+ else if ((Char & 0xfc) == 0xf8) /* 111110xx */ \
+ { \
+ Len = 5; \
+ Mask = 0x03; \
+ } \
+ else if ((Char & 0xfe) == 0xfc) /* 1111110x */ \
+ { \
+ Len = 6; \
+ Mask = 0x01; \
+ } \
+ else /* 10xxxxxx, 0xfe or 0xff: cannot start a sequence; Mask is left untouched */ \
+ Len = -1;
+
+#define UTF8_LENGTH(Char) /* number of bytes (1 to 6) needed to encode code point Char; Char is evaluated several times */ \
+ ((Char) < 0x80 ? 1 : \
+ ((Char) < 0x800 ? 2 : \
+ ((Char) < 0x10000 ? 3 : \
+ ((Char) < 0x200000 ? 4 : \
+ ((Char) < 0x4000000 ? 5 : 6)))))
+
+
+#define UTF8_GET(Result, Chars, Count, Mask, Len) /* decode the Len-byte sequence at Chars into Result; Count is a scratch loop index */ \
+ (Result) = (Chars)[0] & (Mask); /* payload bits of the lead byte */ \
+ for ((Count) = 1; (Count) < (Len); ++(Count)) \
+ { \
+ if (((Chars)[(Count)] & 0xc0) != 0x80) /* not a 10xxxxxx continuation byte */ \
+ { \
+ (Result) = -1; \
+ break; \
+ } \
+ (Result) <<= 6; /* each continuation byte contributes six bits */ \
+ (Result) |= ((Chars)[(Count)] & 0x3f); \
+ }
+
+/*
+ * Check whether a Unicode (5.2) char is in a valid range.
+ *
+ * The first check comes from the Unicode guarantee to never encode
+ * a point above 0x0010ffff, since UTF-16 couldn't represent it.
+ *
+ * The second check rejects the surrogate range U+D800..U+DFFF (category Cs).
+ *
+ * @param Char the character; evaluated twice, so it must have no side effects
+ */
+#define UNICODE_VALID(Char) \
+ ((Char) < 0x110000 && \
+ (((Char) & 0xFFFFF800) != 0xD800))
+
+/* Length in bytes of a UTF-8 sequence, indexed by its lead byte. Bytes that
+ * cannot start a sequence (10xxxxxx continuation bytes, 0xfe, 0xff) map to 1,
+ * so stepping through a string always makes progress. */
+static const char utf8_skip_data[256] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+
+/**
+ * utf8_next_char (derived from glib's g_utf8_next_char):
+ * @p: Pointer to the start of a valid UTF-8 character
+ *
+ * Skips to the next character in a UTF-8 string. The string must be
+ * valid; this macro is as fast as possible, and has no error-checking.
+ * You would use this macro to iterate over a string character by
+ * character. The macro returns the start of the next UTF-8 character.
+ * Validate strings that may contain invalid UTF-8 before using it.
+ *
+ * The lead byte is read as unsigned char: read through a plain char, which
+ * may be signed, every byte >= 0x80 would turn into a negative index into
+ * utf8_skip_data. @p is evaluated twice.
+ */
+#define utf8_next_char(p) ((char *)((p) + utf8_skip_data[*(const unsigned char *)(p)]))
+
+bool unichar_iswide (unichar c);
\ No newline at end of file
diff --git a/src/shared/util.c b/src/shared/util.c
index 1dde8af..b791433 100644
--- a/src/shared/util.c
+++ b/src/shared/util.c
@@ -73,6 +73,7 @@
#include "hashmap.h"
#include "env-util.h"
#include "fileio.h"
+#include "utf8.h"
int saved_argc = 0;
char **saved_argv = NULL;
@@ -3285,8 +3286,8 @@ int running_in_chroot(void) {
a.st_ino != b.st_ino;
}
-char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
- size_t x;
+char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
+ size_t x, j;
char *r;
assert(s);
@@ -3305,17 +3306,91 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne
if (x > new_length - 3)
x = new_length - 3;
+ for (;(unsigned char)s[x-1] & 0x80;x--)
+ continue;
+
memcpy(r, s, x);
r[x] = '.';
r[x+1] = '.';
r[x+2] = '.';
+
+ for (j=(x+3);(unsigned char)s[j] & 0x80;j++)
+ continue;
+
memcpy(r + x + 3,
- s + old_length - (new_length - x - 3),
- new_length - x - 3);
+ s + old_length - (new_length - j),
+ new_length - j);
return r;
}
+char *ascii_ellipsize(const char *s, size_t length, unsigned percent) { /* convenience wrapper: measures s with strlen() and forwards to ascii_ellipsize_mem() */
+ return ascii_ellipsize_mem(s, strlen(s), length, percent);
+}
+
+/* Shorten the UTF-8 string s (old_length bytes) so that it occupies at most
+ * new_length terminal columns, replacing the removed middle part with a
+ * single "…" (U+2026, one column). Characters for which unichar_iswide() is
+ * true count as two columns. percent is the share of the output columns that
+ * should be taken from the start of s.
+ *
+ * Returns a newly allocated, NUL-terminated string, or NULL if allocation
+ * fails or s is not valid UTF-8.
+ *
+ * NOTE(review): ascii_is_valid() and utf8_is_valid() receive s without a
+ * length, so s is presumably expected to be NUL-terminated -- confirm. */
+char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
+        const char *end, *i, *j;
+        size_t x, k = 0, head, tail;
+        char *e;
+
+        assert(s);
+        assert(percent <= 100);
+        assert(new_length >= 3);
+
+        /* if no multibyte characters use ascii_ellipsize_mem for speed */
+        if (ascii_is_valid(s))
+                return ascii_ellipsize_mem(s, old_length, new_length, percent);
+
+        if (old_length <= 3 || old_length <= new_length)
+                return strndup(s, old_length);
+
+        if (!utf8_is_valid(s))
+                return NULL;
+
+        end = s + old_length;
+
+        x = (new_length * percent) / 100;
+        if (x > new_length - 3)
+                x = new_length - 3;
+
+        /* Head: take whole characters from the front until x columns are
+         * used (a wide character may overshoot by one), never running past
+         * the end of the input. */
+        for (i = s; k < x && i < end; ) {
+                k += unichar_iswide(utf8_get_char(i)) ? 2 : 1;
+
+                do /* advance to the next lead byte */
+                        i++;
+                while (i < end && ((unsigned char) *i & 0xc0) == 0x80);
+        }
+
+        /* Tail: take whole characters from the back until new_length columns
+         * are used in total or the tail meets the head. */
+        for (j = end; k < new_length && j > i; ) {
+                do /* step back to the previous lead byte */
+                        j--;
+                while (j > i && ((unsigned char) *j & 0xc0) == 0x80);
+
+                k += unichar_iswide(utf8_get_char(j)) ? 2 : 1;
+        }
+
+        /* Head and tail met: the whole string fits, nothing to cut out. */
+        if (i == j)
+                return strndup(s, old_length);
+
+        /* Give up the first character of the tail to make room for the
+         * one-column ellipsis. */
+        do
+                j++;
+        while (j < end && ((unsigned char) *j & 0xc0) == 0x80);
+
+        head = i - s;
+        tail = end - j;
+
+        /* head + 3 ellipsis bytes + tail + NUL; new0() zero-fills, which
+         * also provides the terminator. */
+        e = new0(char, head + 3 + tail + 1);
+        if (!e)
+                return NULL;
+
+        memcpy(e, s, head);
+        memcpy(e + head, "\xe2\x80\xa6", 3); /* U+2026 HORIZONTAL ELLIPSIS */
+        memcpy(e + head + 3, j, tail);
+
+        return e;
+}
+
+
char *ellipsize(const char *s, size_t length, unsigned percent) {
return ellipsize_mem(s, strlen(s), length, percent);
}
diff --git a/src/shared/util.h b/src/shared/util.h
index 63f4e3d..eb21855 100644
--- a/src/shared/util.h
+++ b/src/shared/util.h
@@ -402,7 +402,10 @@ static inline const char *ansi_highlight_off(void) {
int running_in_chroot(void);
+char *ascii_ellipsize(const char *s, size_t length, unsigned percent);
+char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent);
char *ellipsize(const char *s, size_t length, unsigned percent);
+/* old_length is in bytes, new_length is in terminal columns */
char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent);
int touch(const char *path);
diff --git a/src/test/test-wellipsize.c b/src/test/test-wellipsize.c
new file mode 100644
index 0000000..f6db82c
--- /dev/null
+++ b/src/test/test-wellipsize.c
@@ -0,0 +1,42 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+ This file is part of systemd.
+
+ Copyright 2013 Shawn Landden
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <stdio.h>
+
+#include "util.h"
+#include "utf8.h"
+
+/* Ellipsize p with ellipsize(p, 80, 70) and print the result on stdout. */
+static void test_one(const char *p) {
+        _cleanup_free_ char *t = NULL;
+
+        t = ellipsize(p, 80, 70);
+        if (!t) {
+                /* ellipsize() returns NULL on failure; puts(NULL) would be
+                 * undefined behavior */
+                fputs("ellipsize() failed\n", stderr);
+                return;
+        }
+
+        puts(t);
+}
+
+int main(int argc, char *argv[]) {
+ test_one("síêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´íêµì´");
+ test_one("æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½æ¥æ¬å½");
+ test_one("â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬â¬");
+ test_one("â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦â¦");
+ test_one("ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®ð®");
+ test_one("asdfnjaskdfnklasdgnjaskdghnkasdgfklasdfjkasdfjaksdfaskldfnaskldfnaskldfnaklsdfnaklsdfnklnaskjgdknl");
+
+ return 0;
+}
--
1.8.4.rc3
More information about the systemd-devel
mailing list