[systemd-devel] [PATCH] util, utf8: new wellipsize and wellipsize_mem that take into account multi-byte characters

Sun Sep 8 16:17:36 PDT 2013

ping

On 8/28/13, Shawn Landden <shawn at churchofgit.com> wrote:
> This version counts every multi-byte character as having a width of 1; it
> does not yet account for double-width CJK characters or zero-width characters.
> ---
>  TODO              |  4 ---
>  src/shared/utf8.c | 56 +++++++++++++++++++++++++++++++
>  src/shared/utf8.h | 98
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  src/shared/util.c | 55 +++++++++++++++++++++++++++++++
>  src/shared/util.h |  2 ++
>  5 files changed, 211 insertions(+), 4 deletions(-)
>
> diff --git a/TODO b/TODO
> index fe305ec..a77ebe5 100644
> --- a/TODO
> +++ b/TODO
> @@ -19,10 +19,6 @@ Bugfixes:
>
>  * properly handle .mount unit state tracking when two mount points are
> stacked one on top of another on the exact same mount point.
>
> -* ellipsize_mem must take into account multi-byte unicode characters, and
> -  - make the resulting line the requested number of *characters*, not
> *bytes*,
> -  - avoid truncuating multi-byte sequences in the middle.
> -
>  * When we detect invalid UTF-8, we cant't use it in an error message:
>    log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue);
>
> diff --git a/src/shared/utf8.c b/src/shared/utf8.c
> index 655cc77..8a37c3a 100644
> --- a/src/shared/utf8.c
> +++ b/src/shared/utf8.c
> @@ -22,6 +22,11 @@
>  /* This file is based on the GLIB utf8 validation functions. The
>   * original license text follows. */
>
> +/* gunicode.h - Unicode manipulation functions
> + *
> + *  Copyright (C) 1999, 2000 Tom Tromey
> + *  Copyright 2000, 2005 Red Hat, Inc.
> + */
>  /* gutf8.c - Operations on UTF-8 strings.
>   *
>   * Copyright (C) 1999 Tom Tromey
> @@ -317,3 +322,54 @@ char *utf16_to_utf8(const void *s, size_t length) {
>
>          return r;
>  }
> +
> +/**
> + * g_utf8_prev_char:
> + * @p: a pointer to a position within a UTF-8 encoded string
> + *
> + * Finds the previous UTF-8 character in the string before @p.
> + *
> + * @p does not have to be at the beginning of a UTF-8 character. No check
> + * is made to see if the character found is actually valid other than
> + * it starts with an appropriate byte. If @p might be the first
> + * character of the string, you must use g_utf8_find_prev_char() instead.
> + *
> + * Return value: a pointer to the found character.
> + **/
> +char *
> +utf8_prev_char (const char *p)
> +{
> +  while (1)
> +    {
> +      p--;
> +      if ((*p & 0xc0) != 0x80)
> +	return (char *)p;
> +    }
> +}
> +
> +/**
> + * g_utf8_get_char:
> + * @p: a pointer to Unicode character encoded as UTF-8
> + *
> + * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
> + * If @p does not point to a valid UTF-8 encoded character, results are
> + * undefined. If you are not sure that the bytes are complete
> + * valid Unicode characters, you should use g_utf8_get_char_validated()
> + * instead.
> + *
> + * Return value: the resulting character
> + **/
> +unichar
> +utf8_get_char (const char *p)
> +{
> +  int i, mask = 0, len;
> +  unichar result;
> +  unsigned char c = (unsigned char) *p;
> +
> +  UTF8_COMPUTE (c, mask, len);
> +  if (len == -1)
> +    return (unichar)-1;
> +  UTF8_GET (result, p, i, mask, len);
> +
> +  return result;
> +}
> \ No newline at end of file
> diff --git a/src/shared/utf8.h b/src/shared/utf8.h
> index f805ea6..020bc27 100644
> --- a/src/shared/utf8.h
> +++ b/src/shared/utf8.h
> @@ -34,3 +34,101 @@ char *utf8_filter(const char *s);
>  char *ascii_filter(const char *s);
>
>  char *utf16_to_utf8(const void *s, size_t length);
> +
> +#define unichar uint32_t
> +
> +char *utf8_prev_char (const char *p);
> +unichar utf8_get_char (const char *p);
> +
> +#define UTF8_COMPUTE(Char, Mask, Len)					      \
> +  if (Char < 128)							      \
> +    {									      \
> +      Len = 1;								      \
> +      Mask = 0x7f;							      \
> +    }									      \
> +  else if ((Char & 0xe0) == 0xc0)					      \
> +    {									      \
> +      Len = 2;								      \
> +      Mask = 0x1f;							      \
> +    }									      \
> +  else if ((Char & 0xf0) == 0xe0)					      \
> +    {									      \
> +      Len = 3;								      \
> +      Mask = 0x0f;							      \
> +    }									      \
> +  else if ((Char & 0xf8) == 0xf0)					      \
> +    {									      \
> +      Len = 4;								      \
> +      Mask = 0x07;							      \
> +    }									      \
> +  else if ((Char & 0xfc) == 0xf8)					      \
> +    {									      \
> +      Len = 5;								      \
> +      Mask = 0x03;							      \
> +    }									      \
> +  else if ((Char & 0xfe) == 0xfc)					      \
> +    {									      \
> +      Len = 6;								      \
> +      Mask = 0x01;							      \
> +    }									      \
> +  else									      \
> +    Len = -1;
> +
> +#define UTF8_LENGTH(Char)              \
> +  ((Char) < 0x80 ? 1 :                 \
> +   ((Char) < 0x800 ? 2 :               \
> +    ((Char) < 0x10000 ? 3 :            \
> +     ((Char) < 0x200000 ? 4 :          \
> +      ((Char) < 0x4000000 ? 5 : 6)))))
> +
> +
> +#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
> +  (Result) = (Chars)[0] & (Mask);					      \
> +  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
> +    {									      \
> +      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
> +	{								      \
> +	  (Result) = -1;						      \
> +	  break;							      \
> +	}								      \
> +      (Result) <<= 6;							      \
> +      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
> +    }
> +
> +/*
> + * Check whether a Unicode (5.2) char is in a valid range.
> + *
> + * The first check comes from the Unicode guarantee to never encode
> + * a point above 0x0010ffff, since UTF-16 couldn't represent it.
> + *
> + * The second check covers surrogate pairs (category Cs).
> + *
> + * @param Char the character
> + */
> +#define UNICODE_VALID(Char)                   \
> +    ((Char) < 0x110000 &&                     \
> +     (((Char) & 0xFFFFF800) != 0xD800))
> +
> +static const char utf8_skip_data[256] = {
> +  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> +  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> +  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
> +};
> +
> +/**
> + * g_utf8_next_char:
> + * @p: Pointer to the start of a valid UTF-8 character
> + *
> + * Skips to the next character in a UTF-8 string. The string must be
> + * valid; this macro is as fast as possible, and has no error-checking.
> + * You would use this macro to iterate over a string character by
> + * character. The macro returns the start of the next UTF-8 character.
> + * Before using this macro, use g_utf8_validate() to validate strings
> + * that may contain invalid UTF-8.
> + */
> +#define utf8_next_char(p) (char *)((p) + utf8_skip_data[*(const char
> *)(p)])
> \ No newline at end of file
> diff --git a/src/shared/util.c b/src/shared/util.c
> index 9af99aa..58a1787 100644
> --- a/src/shared/util.c
> +++ b/src/shared/util.c
> @@ -73,6 +73,7 @@
>  #include "hashmap.h"
>  #include "env-util.h"
>  #include "fileio.h"
> +#include "utf8.h"
>
>  int saved_argc = 0;
>  char **saved_argv = NULL;
> @@ -3327,6 +3328,60 @@ char *ellipsize(const char *s, size_t length,
> unsigned percent) {
>          return ellipsize_mem(s, strlen(s), length, percent);
>  }
>
> +char *wellipsize_mem(const char *s, size_t old_length, size_t new_length,
> unsigned percent) {
> +        size_t x;
> +        char *e, *i;
> +        unichar c;
> +        unsigned j, k = 0;
> +
> +        assert(s);
> +        assert(percent <= 100);
> +        assert(new_length >= 3);
> +
> +        /* if no multibyte characters use ellipsize_mem for speed */
> +        if (ascii_is_valid(s))
> +                return ellipsize_mem(s, old_length, new_length, percent);
> +
> +        if (old_length <= 3 || old_length <= new_length)
> +                return strndup(s, old_length);
> +
> +        if (!utf8_is_valid(s))
> +                return NULL;
> +
> +        e = new0(char, new_length*4 < old_length ? new_length*4 :
> old_length);
> +        if (!e)
> +                return NULL;
> +
> +        x = (new_length * percent) / 100;
> +
> +        if (x > new_length - 3)
> +                x = new_length - 3;
> +
> +        for (i = (char *)s;k < x;i = utf8_next_char(i))
> +                k++;
> +
> +        j = i - s;
> +        memcpy(e, s, j);
> +        e[j] = '.';   /* TODO: use … tri-dot? */
> +        e[j+1] = '.'; /* 0xe2 0x80 0xa6 */
> +        e[j+2] = '.';
> +
> +        k = 0;
> +        for (i = (char *)s + old_length;
> +             k < new_length - x - 3;) {
> +                i = utf8_prev_char(i);
> +                k++;
> +        }
> +
> +        strcpy(e + j + 3, i);
> +
> +        return e;
> +}
> +
> +char *wellipsize(const char *s, size_t length, unsigned percent) {
> +        return wellipsize_mem(s, strlen(s), length, percent);
> +}
> +
>  int touch(const char *path) {
>          int fd;
>
> diff --git a/src/shared/util.h b/src/shared/util.h
> index 63f4e3d..9b17db9 100644
> --- a/src/shared/util.h
> +++ b/src/shared/util.h
> @@ -404,6 +404,8 @@ int running_in_chroot(void);
>
>  char *ellipsize(const char *s, size_t length, unsigned percent);
>  char *ellipsize_mem(const char *s, size_t old_length, size_t new_length,
> unsigned percent);
> +char *wellipsize(const char *s, size_t length, unsigned percent);
> +char *wellipsize_mem(const char *s, size_t old_length, size_t new_length,
> unsigned percent);
>
>  int touch(const char *path);
>
> --
> 1.8.4.rc3
>
>


-- 

---
Shawn Landden
+1 360 389 3001 (SMS preferred)