[systemd-devel] [PATCH] util, utf8: recognize wide characters in wellipsize_mem()

Wed Aug 28 15:55:22 PDT 2013

---
 src/shared/utf8.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/shared/utf8.h |  4 +++-
 src/shared/util.c | 19 ++++++++++++++---
 src/shared/util.h |  1 +
 4 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 8a37c3a..607f0c1 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -372,4 +372,68 @@ utf8_get_char (const char *p)
   UTF8_GET (result, p, i, mask, len);
 
   return result;
+}
+
+struct Interval /* one closed range of Unicode code points */
+{
+  unichar start, end; /* first and last code point of the range, both inclusive */
+};
+
+static int
+interval_compare (const void *key, const void *elt) /* bsearch() comparator: code point vs. interval */
+{
+  unichar c = (unichar) (long) (key); /* the key is the code point itself, carried as the pointer value */
+  const struct Interval *interval = elt; /* table entries are const; do not cast that away */
+
+  if (c < interval->start) /* c sorts before this interval */
+    return -1;
+  if (c > interval->end) /* c sorts after this interval */
+    return +1;
+
+  return 0; /* start <= c <= end: c is inside this interval */
+}
+
+/*
+ * NOTE:
+ *
+ * The table for unichar_iswide() is
+ * generated from the Unicode Character Database's file
+ * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py
+ * in this way:
+ *
+ *   ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt
+ *
+ * Last update for Unicode 6.0.
+ */
+
+/**
+ * unichar_iswide:
+ * @c: a Unicode character
+ *
+ * Determines if a character is typically rendered in a double-width
+ * cell.
+ *
+ * Return value: true if the character is wide
+ **/
+bool
+unichar_iswide (unichar c)
+{
+  /* See NOTE earlier for how to update this table. */
+  static const struct Interval wide[] = {
+    {0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+    {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096},
+    {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA},
+    {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE},
+    {0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C},
+    {0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
+    {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6},
+    {0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240,
+    0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}
+  };
+
+  /* bsearch() takes a pointer key: pass the code point as the pointer's value
+   * (never dereferenced); interval_compare() converts it back the same way. */
+  return bsearch ((const void *) (long) c, wide,
+                  sizeof (wide) / sizeof ((wide)[0]), sizeof wide[0],
+                  interval_compare) != NULL;
 }
\ No newline at end of file
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index 020bc27..f1be180 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -131,4 +131,6 @@ static const char utf8_skip_data[256] = {
  * Before using this macro, use g_utf8_validate() to validate strings
  * that may contain invalid UTF-8.
  */
-#define utf8_next_char(p) (char *)((p) + utf8_skip_data[*(const char *)(p)])
\ No newline at end of file
+#define utf8_next_char(p) (char *)((p) + utf8_skip_data[*(const unsigned char *)(p)]) /* unsigned: bytes >= 0x80 must not yield a negative index */
+
+bool unichar_iswide (unichar c);
\ No newline at end of file
diff --git a/src/shared/util.c b/src/shared/util.c
index 58a1787..1c73b3e 100644
--- a/src/shared/util.c
+++ b/src/shared/util.c
@@ -3357,22 +3357,35 @@ char *wellipsize_mem(const char *s, size_t old_length, size_t new_length, unsign
         if (x > new_length - 3)
                 x = new_length - 3;
 
-        for (i = (char *)s;k < x;i = utf8_next_char(i))
+        for (i = (char *)s;k < x;i = utf8_next_char(i)) {
+                c = utf8_get_char(i);
                 k++;
+                if (unichar_iswide(c))
+                        k++;
+        }
+
+        if (k > x) /* last character was wide and went over quota */
+                x++;
 
         j = i - s;
         memcpy(e, s, j);
-        e[j] = '.';   /* TODO: use … tri-dot? */
-        e[j+1] = '.'; /* 0xe2 0x80 0xa6 */
+        e[j]   = '.'; /* TODO: use … tri-dot? */
+        e[j+1] = '.'; /* 0xE2 0x80 0xA6 */
         e[j+2] = '.';
 
         k = 0;
         for (i = (char *)s + old_length;
              k < new_length - x - 3;) {
                 i = utf8_prev_char(i);
+                c = utf8_get_char(i);
                 k++;
+                if (unichar_iswide(c))
+                        k++;
         }
 
+        if (k > new_length - x - 3) /* last (reverse) character was wide and went over quota */
+                i = utf8_next_char(i);
+
         strcpy(e + j + 3, i);
 
         return e;
diff --git a/src/shared/util.h b/src/shared/util.h
index 9b17db9..97d8697 100644
--- a/src/shared/util.h
+++ b/src/shared/util.h
@@ -405,6 +405,7 @@ int running_in_chroot(void);
 char *ellipsize(const char *s, size_t length, unsigned percent);
 char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent);
 char *wellipsize(const char *s, size_t length, unsigned percent);
+                                        /* bytes              columns */
 char *wellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent);
 
 int touch(const char *path);
-- 
1.8.4.rc3