[Spice-devel] [PATCH spice-gtk 3/4] util: add unix2dos and dos2unix

Sat Aug 24 05:17:04 PDT 2013

On Sat, Aug 24, 2013 at 12:20 PM, Hans de Goede <hdegoede at redhat.com> wrote:
> Hi,
>
>
> On 08/23/2013 10:25 PM, Marc-André Lureau wrote:
>>
>> Convert line endings from/to LF/CRLF, in utf8.
>> ---
>>   gtk/spice-util-priv.h |   2 +
>>   gtk/spice-util.c      | 122
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 124 insertions(+)
>>
>> diff --git a/gtk/spice-util-priv.h b/gtk/spice-util-priv.h
>> index ee5a42d..cc559dc 100644
>> --- a/gtk/spice-util-priv.h
>> +++ b/gtk/spice-util-priv.h
>> @@ -29,6 +29,8 @@ gboolean spice_strv_contains(const GStrv strv, const
>> gchar *str);
>>   gchar* spice_uuid_to_string(const guint8 uuid[16]);
>>   const gchar* spice_yes_no(gboolean value);
>>   guint16 spice_make_scancode(guint scancode, gboolean release);
>> +gchar* spice_unix2dos(const gchar *str, gssize len, GError **error);
>> +gchar* spice_dos2unix(const gchar *str, gssize len, GError **error);
>>
>>   #if GLIB_CHECK_VERSION(2,32,0)
>>   #define STATIC_MUTEX            GMutex
>> diff --git a/gtk/spice-util.c b/gtk/spice-util.c
>> index 774a145..be10edc 100644
>> --- a/gtk/spice-util.c
>> +++ b/gtk/spice-util.c
>> @@ -19,6 +19,7 @@
>>   #ifdef HAVE_CONFIG_H
>>   # include "config.h"
>>   #endif
>> +
>>   #include <stdlib.h>
>>   #include <string.h>
>>   #include <glib-object.h>
>> @@ -245,3 +246,124 @@ guint16 spice_make_scancode(guint scancode, gboolean
>> release)
>>
>>       g_return_val_if_reached(0);
>>   }
>> +
>> +typedef enum {
>> +    NEWLINE_TYPE_LF,
>> +    NEWLINE_TYPE_CR_LF
>> +} NewlineType;
>> +
>> +static gssize get_line(const gchar *str, gsize len,
>> +                       NewlineType type, gsize *nl_len,
>> +                       GError **error)
>> +{
>> +    const gchar *p = str;
>> +    gsize nl = 0;
>> +
>> +    if (type == NEWLINE_TYPE_CR_LF) {
>> +        while ((p - str) < len) {
>> +            p = g_utf8_strchr(p, len, '\r');
>> +            if (!p)
>> +                break;
>> +            p = g_utf8_next_char(p);
>> +            if (g_utf8_get_char(p) == '\n') {
>> +                len = (p - str) - 1;
>> +                nl = 2;
>> +                break;
>> +            }
>> +        }
>> +    } else {
>> +        p = g_utf8_strchr(str, len, '\n');
>> +        if (p) {
>> +            len = p - str;
>> +            nl = 1;
>> +        }
>> +    }
>
>
> This looks way more complicated than it needs to be. In UTF-8, the
> bytes 0x00 - 0x7f are only valid as single-byte sequences; multi-byte
> encoded characters will never contain 0x00 - 0x7f. UTF-8 was designed
> this way so that existing string parsing code for non-multi-byte
> encodings, which may look for example for ' " = or LF characters, does
> not break when parsing strings with multi-byte characters in them.
>
> TL;DR: LF and CR will never be part of a multi-byte character, so
> you can simply do: strstr(str, "\r\n") to find the CRLF.

g_utf8_strchr is implemented using a regular strstr. Speed shouldn't
be different here. I prefer to use utf8 functions on utf8 strings.

>
>
>> +
>> +    if (!g_utf8_validate(str, len, NULL)) {
>> +        g_set_error_literal(error, G_CONVERT_ERROR,
>> +                            G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
>> +                            "Invalid byte sequence in conversion input");
>> +        return -1;
>> +    }
>
>
> And once you simply treat this as a regular C-string without worrying
> about multi-byte encodings you can also drop this.

Actually, during implementation, I encountered/produced invalid
UTF-8 that would break later on in gtk+, so I prefer to validate the
output.

>> +
>> +    *nl_len = nl;
>> +    return len;
>> +}
>> +
>> +
>> +static gchar* spice_convert_newlines(const gchar *str, gssize len,
>> +                                     NewlineType from,
>> +                                     NewlineType to,
>> +                                     GError **error)
>> +{
>> +    GError *err = NULL;
>> +    gssize length;
>> +    gsize nl;
>> +    GString *output;
>> +    gboolean free_segment = FALSE;
>> +    gint i;
>> +
>> +    g_return_val_if_fail(str != NULL, NULL);
>> +    g_return_val_if_fail(len >= -1, NULL);
>> +    g_return_val_if_fail(error == NULL || *error == NULL, NULL);
>> +    /* only 2 supported combinations */
>> +    g_return_val_if_fail((from == NEWLINE_TYPE_LF &&
>> +                          to == NEWLINE_TYPE_CR_LF) ||
>> +                         (from == NEWLINE_TYPE_CR_LF &&
>> +                          to == NEWLINE_TYPE_LF), NULL);
>> +
>> +    if (len == -1)
>> +        len = strlen(str);
>> +    /* sometime we get \0 terminated strings, skip that, or it fails
>> +       to utf8 validate line with \0 end */
>> +    else if (str[len] == 0)
>> +        len -= 1;
>> +
>> +    /* allocate worst case, if it's small enough, we don't care much,
>> +     * if it's big, malloc will put us in mmap'd region, and we can
>> +     * over allocate.
>> +     */
>> +    output = g_string_sized_new(len * 2 + 1);
>> +
>> +    for (i = 0; i < len; i += length + nl) {
>> +        length = get_line(str + i, len - i, from, &nl, error);
>> +        if (length < 0)
>> +            break;
>> +
>> +        g_string_append_len(output, str + i, length);
>> +
>> +        if (nl) {
>> +            /* let's not double \r if it's already in the line */
>> +            if (to == NEWLINE_TYPE_CR_LF &&
>> +                output->str[output->len - 1] != '\r')
>> +                g_string_append_c(output, '\r');
>> +
>> +            g_string_append_c(output, '\n');
>> +        }
>> +    }
>> +
>> +    if (err) {
>> +        g_propagate_error(error, err);
>> +        free_segment = TRUE;
>> +    }
>> +
>> +    return g_string_free(output, free_segment);
>> +}
>> +
>> +G_GNUC_INTERNAL
>> +gchar* spice_dos2unix(const gchar *str, gssize len, GError **error)
>> +{
>> +    return spice_convert_newlines(str, len,
>> +                                  NEWLINE_TYPE_CR_LF,
>> +                                  NEWLINE_TYPE_LF,
>> +                                  error);
>> +}
>> +
>> +G_GNUC_INTERNAL
>> +gchar* spice_unix2dos(const gchar *str, gssize len, GError **error)
>> +{
>> +    return spice_convert_newlines(str, len,
>> +                                  NEWLINE_TYPE_LF,
>> +                                  NEWLINE_TYPE_CR_LF,
>> +                                  error);
>> +}
>>
>
> Regards,
>
> Hans

-- 
Marc-André Lureau