[pulseaudio-discuss] [PATCH 1/2] pulse: Add a JSON-parsing library

Mon May 23 15:01:09 UTC 2016

On Tue, 2016-04-26 at 17:47 +0530, arun at accosted.net wrote:
> From: Arun Raghavan <git at arunraghavan.net>
> 
> Adding this to be able to drop dependency on json-c.
> 
> Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=95135
> ---
>  src/Makefile.am       |   8 +
>  src/pulse/json.c      | 476 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  src/pulse/json.h      |  57 ++++++
>  src/tests/json-test.c | 240 +++++++++++++++++++++++++
>  4 files changed, 781 insertions(+)
>  create mode 100644 src/pulse/json.c
>  create mode 100644 src/pulse/json.h
>  create mode 100644 src/tests/json-test.c

This patch could also add src/json-test to .gitignore.

> diff --git a/src/Makefile.am b/src/Makefile.am
> index b600dfb..9d952fc 100644
> --- a/src/Makefile.am
> +++ b/src/Makefile.am
> @@ -248,6 +248,7 @@ TESTS_default = \
>  		thread-mainloop-test \
>  		utf8-test \
>  		format-test \
> +		json-test \
>  		get-binary-name-test \
>  		hook-list-test \
>  		memblock-test \
> @@ -375,6 +376,12 @@ format_test_CFLAGS = $(AM_CFLAGS) $(LIBCHECK_CFLAGS)
>  format_test_LDADD = $(AM_LDADD) libpulsecore- at PA_MAJORMINOR@.la libpulse.la libpulsecommon- at PA_MAJORMINOR@.la
>  format_test_LDFLAGS = $(AM_LDFLAGS) $(BINLDFLAGS) $(LIBCHECK_LIBS)
>  
> +json_test_SOURCES = tests/json-test.c
> +json_test_CFLAGS = $(AM_CFLAGS) $(LIBCHECK_CFLAGS)
> +json_test_LDADD = $(AM_LDADD) libpulse.la libpulsecommon- at PA_MAJORMINOR@.la
> +json_test_LDFLAGS = $(AM_LDFLAGS) $(BINLDFLAGS) $(LIBCHECK_LIBS)
> +
> +srbchannel_test_SOURCES = tests/srbchannel-test.c
>  srbchannel_test_SOURCES = tests/srbchannel-test.c

It's probably sufficient to set srbchannel_test_SOURCES just once.

>  srbchannel_test_CFLAGS = $(AM_CFLAGS) $(LIBCHECK_CFLAGS)
>  srbchannel_test_LDADD = $(AM_LDADD) libpulse.la libpulsecommon- at PA_MAJORMINOR@.la
> @@ -646,6 +653,7 @@ libpulsecommon_ at PA_MAJORMINOR@_la_SOURCES = \
>  		pulse/client-conf.c pulse/client-conf.h \
>  		pulse/fork-detect.c pulse/fork-detect.h \
>  		pulse/format.c pulse/format.h \
> +		pulse/json.c pulse/json.h \
>  		pulse/xmalloc.c pulse/xmalloc.h \
>  		pulse/proplist.c pulse/proplist.h \
>  		pulse/utf8.c pulse/utf8.h \
> diff --git a/src/pulse/json.c b/src/pulse/json.c
> new file mode 100644
> index 0000000..0e53902
> --- /dev/null
> +++ b/src/pulse/json.c
> @@ -0,0 +1,476 @@
> +/***
> +  This file is part of PulseAudio.
> +
> +  Copyright 2016 Arun Raghavan <mail at arunraghavan.net>
> +
> +  PulseAudio is free software; you can redistribute it and/or modify
> +  it under the terms of the GNU Lesser General Public License as published
> +  by the Free Software Foundation; either version 2.1 of the License,
> +  or (at your option) any later version.
> +
> +  PulseAudio is distributed in the hope that it will be useful, but
> +  WITHOUT ANY WARRANTY; without even the implied warranty of
> +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +  General Public License for more details.
> +
> +  You should have received a copy of the GNU Lesser General Public License
> +  along with PulseAudio; if not, see <http://www.gnu.org/licenses/>.
> +***/
> +
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +struct pa_json_object {
> +    PA_REFCNT_DECLARE;
> +    pa_json_type type;
> +
> +    union {
> +        int int_value;
> +        double double_value;
> +        bool bool_value;
> +        char *string_value;
> +        pa_hashmap *object_values; /* name -> object */
> +        pa_idxset *array_values; /* objects */
> +    };
> +};
> +
> +#define JSON_OBJECT_TYPE(o) ((o)->type)

What's the rationale for this macro? Why is using the macro better than
using o->type directly?

> +
> +static const char* parse_value(const char *str, const char *end, pa_json_object **obj);
> +
> +static pa_json_object* json_object_new(void) {
> +    pa_json_object *obj;
> +
> +    obj = pa_xnew0(pa_json_object, 1);
> +
> +    return obj;
> +}
> +
> +static bool is_whitespace(char c) {
> +    return c == '\t' || c == '\f' || c == '\r' || c == ' ';
> +}
> +
> +static bool is_digit(char c) {
> +    return c >= '0' && c <= '9';
> +}
> +
> +static bool is_end(const char c, const char *end) {
> +    if (!end)
> +        return c == '\0';
> +    else  {
> +        while (*end) {
> +            if (c == *end)
> +                return true;
> +            end++;
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +static const char* consume_string(const char *str, const char *expect)
> +{

The opening brace got misplaced.

> +    while (*expect) {
> +        if (*str != *expect)
> +            return NULL;
> +
> +        str++;
> +        expect++;
> +    }
> +
> +    return str;
> +}
> +
> +static const char* parse_null(const char *str, pa_json_object *obj) {
> +    str = consume_string(str, "null");
> +
> +    if (str)
> +        obj->type = PA_JSON_TYPE_NULL;
> +
> +    return str;
> +}
> +
> +static const char* parse_boolean(const char *str, pa_json_object *obj) {
> +    const char *tmp;
> +
> +    tmp = consume_string(str, "true");
> +
> +    if (tmp) {
> +        obj->type = PA_JSON_TYPE_BOOL;
> +        obj->bool_value = true;
> +    } else {
> +        tmp = consume_string(str, "false");
> +
> +        if (str) {
> +            obj->type = PA_JSON_TYPE_BOOL;
> +            obj->bool_value = false;
> +        }
> +    }
> +
> +    return tmp;
> +}
> +
> +static const char* parse_string(const char *str, pa_json_object *obj) {
> +    pa_strbuf *buf = pa_strbuf_new();
> +
> +    str++; /* Consume leading '"' */
> +
> +    while (*str != '"') {
> +        if (*str != '\\') {
> +            /* Normal character, juts consume */
> +            pa_strbuf_putc(buf, *str);

Control characters 0x00 <= *str <= 0x1F should cause an error, since
they are disallowed by the JSON spec (0x00 is maybe special, since it
signifies end-of-string, but that should cause an error nevertheless).

Also, JSON data can be UTF-8, UTF-16 or UTF-32 encoded, but this code
doesn't take that into account. If we are fed valid multi-byte
sequences, this code probably can get confused. I feel it would be
safest to only support 7-bit ASCII, and disallow any characters above
0x7F.

> +        } else {
> +            /* Need to unescape */
> +            str++;
> +
> +            switch (*str) {
> +                case '"':
> +                case '\\':
> +                case '/':
> +                    pa_strbuf_putc(buf, *str);
> +                    break;
> +
> +                case 'b':
> +                    pa_strbuf_putc(buf, '\b' /* backspace */);
> +                    break;
> +
> +                case 'f':
> +                    pa_strbuf_putc(buf, '\f' /* form feed */);
> +                    break;
> +
> +                case 'n':
> +                    pa_strbuf_putc(buf, '\n' /* new line */);
> +                    break;
> +
> +                case 'r':
> +                    pa_strbuf_putc(buf, '\r' /* carriage return */);
> +                    break;
> +
> +                case 't':
> +                    pa_strbuf_putc(buf, '\t' /* horizontal tab */);
> +                    break;
> +
> +                case 'u':
> +                    pa_log("Unicode code points are currently unsupported");
> +                    goto error;
> +
> +                default:
> +                    pa_log("Unexepcted escape value: %c", *str);
> +                    goto error;
> +            }
> +        }
> +
> +        str++;
> +    }
> +
> +    if (*str != '"') {
> +        pa_log("Failed to parse remainder of string: %s", str);
> +        goto error;
> +    }
> +
> +    str++;
> +
> +    obj->type = PA_JSON_TYPE_STRING;
> +    obj->string_value = pa_strbuf_to_string_free(buf);
> +
> +    return str;
> +
> +error:
> +    pa_strbuf_free(buf);
> +    return NULL;
> +}
> +
> +static const char* parse_number(const char *str, pa_json_object *obj) {
> +    bool negative = false, has_fraction = false, has_exponent = false;
> +    unsigned int integer = 0;
> +    unsigned int fraction = 0;
> +    unsigned int fraction_digits = 0;
> +    int exponent = 0;
> +
> +    if (*str == '-') {
> +        negative = true;
> +        str++;
> +    }
> +
> +    if (*str == '0') {
> +        str++;
> +        goto fraction;
> +    }
> +
> +    while (is_digit(*str)) {
> +        integer = (integer * 10) + (*str - '0');
> +        str++;
> +    }

Missing overflow checking (imagine "12341234123412341234123412341234").

> +
> +fraction:
> +    if (*str == '.') {
> +        has_fraction = true;
> +        str++;
> +
> +        while (is_digit(*str)) {
> +            fraction = (fraction * 10) + (*str - '0');
> +            fraction_digits++;
> +            str++;
> +        }

Missing overflow checking.

> +    }
> +
> +    if (*str == 'e' || *str == 'E') {
> +        bool exponent_negative = false;
> +
> +        has_exponent = true;
> +        str++;
> +
> +        if (*str == '-') {
> +            exponent_negative = true;
> +            str++;
> +        } else if (*str == '+')
> +            str++;
> +
> +        while (is_digit(*str)) {
> +            exponent = (exponent * 10) + (*str - '0');
> +            str++;
> +        }

Missing overflow checking.

> +
> +        if (exponent_negative)
> +            exponent *= -1;
> +    }
> +
> +    if (has_fraction || has_exponent) {
> +        obj->type = PA_JSON_TYPE_DOUBLE;
> +        obj->double_value =
> +            (negative ? -1.0 : 1.0) * (integer + (double) fraction / pow(10, fraction_digits)) * pow(10, exponent);
> +    } else {
> +        obj->type = PA_JSON_TYPE_INT;
> +        obj->int_value = (negative ? -1 : 1) * integer;
> +    }
> +
> +    return str;
> +}

parse_number() doesn't seem to ever fail. Surely we should check that
we're not fed any random crap that happens to start with "-" or with a
digit? Maybe the idea is that parse_value() will fail if there's any
non-whitespace stuff left after parse_number() returns. In that case, I
think it would be good to have a comment about that in parse_number(),
and at least those cases need to be checked where some mandatory
content is missing (like "-", "1.", "1.e3" or "1e").

> +
> +static const char *parse_object(const char *str, pa_json_object *obj) {
> +    pa_json_object *name, *value;
> +
> +    obj->object_values = pa_hashmap_new_full(pa_idxset_string_hash_func, pa_idxset_string_compare_func,
> +                                             pa_xfree, (pa_free_cb_t) pa_json_object_unref);
> +
> +    while (*str != '}') {
> +        str++; /* Consume leading '{' or ',' */
> +
> +        str = parse_value(str, ":", &name);
> +        if (!str || JSON_OBJECT_TYPE(name) != PA_JSON_TYPE_STRING) {
> +            pa_log("Could not parse key for object: %s", str);

str is either NULL or points to after the parsed non-string value. Is
that really what you want to log?

> +            goto error;
> +        }
> +
> +        /* Consume the ':' */
> +        str++;
> +
> +        str = parse_value(str, ",}", &value);

As we parse nested objects, maybe we should check that we don't recurse
too deep. Consider {"a":{"a":{"a":{"a":{.... I remember libdbus used to
be vulnerable to this kind of an attack that made libdbus allocate so
deep stack that it crashed the application.

> +        if (!str) {
> +            pa_log("Could not parse value for object: %s", str);

str is NULL, don't log it.

> +            goto error;
> +        }
> +
> +        pa_hashmap_put(obj->object_values, pa_xstrdup(pa_json_object_get_string(name)), value);
> +        pa_json_object_unref(name);
> +    }
> +
> +    /* Drop trailing '}' */
> +    str++;
> +
> +    /* We now know the value was correctly parsed */
> +    obj->type = PA_JSON_TYPE_OBJECT;
> +
> +    return str;
> +
> +error:
> +    pa_hashmap_free(obj->object_values);

obj->object_values should be set to NULL.

> +    return NULL;
> +}
> +
> +static const char *parse_array(const char *str, pa_json_object *obj) {
> +    pa_json_object *value;
> +
> +    obj->array_values = pa_idxset_new(NULL, NULL);
> +
> +    while (*str != ']') {
> +        str++; /* Consume leading '[' or ',' */
> +
> +        /* Need to chew up whitespaces as a special case to deal with the
> +         * possibility of an empty array */
> +        while (is_whitespace(*str))
> +            str++;
> +
> +        if (*str == ']')
> +            break;
> +
> +        str = parse_value(str, ",]", &value);
> +        if (!str) {
> +            pa_log("Could not parse value for array: %s", str);

str is NULL, don't log it.

> +            goto error;
> +        }
> +
> +        pa_idxset_put(obj->array_values, value, NULL);
> +    }
> +
> +    /* Drop trailing ']' */
> +    str++;
> +
> +    /* We now know the value was correctly parsed */
> +    obj->type = PA_JSON_TYPE_ARRAY;
> +
> +    return str;
> +
> +error:
> +    pa_idxset_free(obj->array_values, (pa_free_cb_t) pa_json_object_unref);

obj->array_values should be set to NULL.

> +    return NULL;
> +}
> +
> +typedef enum {
> +    JSON_PARSER_STATE_INIT,
> +    JSON_PARSER_STATE_FINISH,
> +} json_parser_state;
> +
> +static const char* parse_value(const char *str, const char *end, pa_json_object **obj) {
> +    json_parser_state state = JSON_PARSER_STATE_INIT;
> +
> +    pa_return_val_if_fail(str != NULL, NULL);

I think a regular assertion would fit better here.

> +
> +    *obj = json_object_new();

I think it's better style to assign to obj only after the parsing has
succeeded. Now you're returning a pointer to freed data in case of
failure.

> +
> +    while (!is_end(*str, end)) {
> +        switch (state) {
> +            case JSON_PARSER_STATE_INIT:
> +                if (is_whitespace(*str)) {
> +                    str++;
> +                } else if (*str == 'n') {
> +                    str = parse_null(str, *obj);
> +                    state = JSON_PARSER_STATE_FINISH;
> +                } else if (*str == 't' || *str == 'f') {
> +                    str = parse_boolean(str, *obj);
> +                    state = JSON_PARSER_STATE_FINISH;
> +                } else if (*str == '"') {
> +                    str = parse_string(str, *obj);
> +                    state = JSON_PARSER_STATE_FINISH;
> +                } else if (is_digit(*str) || *str == '-') {
> +                    str = parse_number(str, *obj);
> +                    state = JSON_PARSER_STATE_FINISH;
> +                } else if (*str == '{') {
> +                    str = parse_object(str, *obj);
> +                    state = JSON_PARSER_STATE_FINISH;
> +                } else if (*str == '[') {
> +                    str = parse_array(str, *obj);
> +                    state = JSON_PARSER_STATE_FINISH;
> +                }

There's no final "else" branch here. If none of these checks evaluate
to true, that should be an error.

> +
> +                if (!str)
> +                    goto error;
> +
> +                break;
> +
> +            case JSON_PARSER_STATE_FINISH:
> +                /* Consume trailing whitespaces */
> +                if (is_whitespace(*str)) {
> +                    str++;
> +                } else {
> +                    goto error;
> +                }
> +        }
> +    }
> +
> +    if (JSON_OBJECT_TYPE(*obj) == PA_JSON_TYPE_INIT) {
> +        /* We didn't actually get any data */
> +        pa_log("No data while parsing json string: '%s' till '%s'", str, end);

end can be NULL so maybe use pa_strnull()? I don't know if this is
required. I think glibc handles NULL gracefully, but the manual page
for printf appears silent on this, so probably this is undefined
behaviour.

> +        goto error;
> +    }
> +
> +    return str;
> +
> +error:
> +    pa_json_object_unref(*obj);
> +    return NULL;
> +}
> +
> +
> +pa_json_object* pa_json_parse(const char *str) {
> +    pa_json_object *obj;
> +
> +    str = parse_value(str, NULL, &obj);
> +    pa_assert(*str == '\0');

This will crash on any parse failure...

> +
> +    return obj;
> +}
> +
> +pa_json_type pa_json_object_get_type(const pa_json_object *obj) {
> +    return JSON_OBJECT_TYPE(obj);
> +}
> +
> +void pa_json_object_unref(pa_json_object *obj) {
> +    if (PA_REFCNT_DEC(obj) > 0)
> +        return;
> +
> +    switch (JSON_OBJECT_TYPE(obj)) {
> +        case PA_JSON_TYPE_INIT:
> +        case PA_JSON_TYPE_INT:
> +        case PA_JSON_TYPE_DOUBLE:
> +        case PA_JSON_TYPE_BOOL:
> +        case PA_JSON_TYPE_NULL:
> +            break;
> +
> +        case PA_JSON_TYPE_STRING:
> +            pa_xfree(obj->string_value);
> +            break;
> +
> +        case PA_JSON_TYPE_OBJECT:
> +            pa_hashmap_free(obj->object_values);
> +            break;
> +
> +        case PA_JSON_TYPE_ARRAY:
> +            pa_idxset_free(obj->array_values, (pa_free_cb_t) pa_json_object_unref);
> +            break;
> +
> +        default:
> +            pa_assert_not_reached();
> +    }
> +
> +    pa_xfree(obj);
> +}
> +
> +int pa_json_object_get_int(const pa_json_object *o) {
> +    pa_return_val_if_fail(JSON_OBJECT_TYPE(o) == PA_JSON_TYPE_INT, 0);

Is it a programming error to call this function with non-int object? If
yes, a regular assertion should be used. If not, then shouldn't the
caller decide whether to log something or not? (In case you didn't
know, pa_return_val_if_fail() prints "Assertion %s failed..." at debug
log level.)

This same comment applies to the following functions too.

> +    return o->int_value;
> +}
> +
> +double pa_json_object_get_double(const pa_json_object *o) {
> +    pa_return_val_if_fail(JSON_OBJECT_TYPE(o) == PA_JSON_TYPE_DOUBLE, 0);
> +    return o->double_value;
> +}
> +
> +bool pa_json_object_get_bool(const pa_json_object *o) {
> +    pa_return_val_if_fail(JSON_OBJECT_TYPE(o) == PA_JSON_TYPE_BOOL, false);
> +    return o->bool_value;
> +}
> +
> +const char* pa_json_object_get_string(const pa_json_object *o) {
> +    pa_return_val_if_fail(JSON_OBJECT_TYPE(o) == PA_JSON_TYPE_STRING, NULL);
> +    return o->string_value;
> +}
> +
> +const pa_json_object* pa_json_object_get_object_member(const pa_json_object *o, const char *name) {
> +    pa_return_val_if_fail(JSON_OBJECT_TYPE(o) == PA_JSON_TYPE_OBJECT, NULL);
> +    return pa_hashmap_get(o->object_values, name);
> +}
> +
> +int pa_json_object_get_array_length(const pa_json_object *o) {
> +    pa_return_val_if_fail(JSON_OBJECT_TYPE(o) == PA_JSON_TYPE_ARRAY, 0);
> +    return pa_idxset_size(o->array_values);
> +}
> +
> +const pa_json_object* pa_json_object_get_array_member(const pa_json_object *o, int index) {
> +    pa_return_val_if_fail(JSON_OBJECT_TYPE(o) == PA_JSON_TYPE_ARRAY, NULL);
> +    return pa_idxset_get_by_index(o->array_values, index);
> +}
> diff --git a/src/pulse/json.h b/src/pulse/json.h
> new file mode 100644
> index 0000000..6eba9a9
> --- /dev/null
> +++ b/src/pulse/json.h
> @@ -0,0 +1,57 @@
> +/***
> +  This file is part of PulseAudio.
> +
> +  Copyright 2016 Arun Raghavan <mail at arunraghavan.net>
> +
> +  PulseAudio is free software; you can redistribute it and/or modify
> +  it under the terms of the GNU Lesser General Public License as published
> +  by the Free Software Foundation; either version 2.1 of the License,
> +  or (at your option) any later version.
> +
> +  PulseAudio is distributed in the hope that it will be useful, but
> +  WITHOUT ANY WARRANTY; without even the implied warranty of
> +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +  General Public License for more details.
> +
> +  You should have received a copy of the GNU Lesser General Public License
> +  along with PulseAudio; if not, see <http://www.gnu.org/licenses/>.
> +***/
> +
> +#ifdef HAVE_CONFIG_H
> +#include 
> +#endif

This belongs in the .c file (if it's needed at all).

> +
> +#include 
> +
> +PA_C_DECL_BEGIN

This isn't a public API, so this seems unnecessary.

> diff --git a/src/tests/json-test.c b/src/tests/json-test.c
> new file mode 100644
> index 0000000..9be4dac
> --- /dev/null
> +++ b/src/tests/json-test.c
> +START_TEST(object_test) {
> +    pa_json_object *o;
> +    const pa_json_object *v;
> +
> +    o = pa_json_parse(" { \"name\" : \"A Person\" } ");
> +
> +    fail_unless(o != NULL);
> +    fail_unless(pa_json_object_get_type(o) == PA_JSON_TYPE_OBJECT);
> +
> +    v = pa_json_object_get_object_member(o, "name");
> +    fail_unless(v != NULL);
> +    fail_unless(pa_json_object_get_type(v) == PA_JSON_TYPE_STRING);
> +    fail_unless(pa_streq(pa_json_object_get_string(v), "A Person"));
> +
> +    pa_json_object_unref(o);
> +
> +    o = pa_json_parse(" { \"age\" : -45.3e-0 } ");
> +
> +    fail_unless(o != NULL);
> +    fail_unless(pa_json_object_get_type(o) == PA_JSON_TYPE_OBJECT);
> +
> +    v = pa_json_object_get_object_member(o, "age");
> +    fail_unless(v != NULL);
> +    fail_unless(pa_json_object_get_type(v) == PA_JSON_TYPE_DOUBLE);
> +    fail_unless(IS_EQUAL(pa_json_object_get_double(v), -45.3));
> +
> +    pa_json_object_unref(o);
> +
> +    o = pa_json_parse("{\"person\":true}");
> +
> +    fail_unless(o != NULL);
> +    fail_unless(pa_json_object_get_type(o) == PA_JSON_TYPE_OBJECT);
> +
> +    v = pa_json_object_get_object_member(o, "person");
> +    fail_unless(v != NULL);
> +    fail_unless(pa_json_object_get_type(v) == PA_JSON_TYPE_BOOL);
> +    fail_unless(IS_EQUAL(pa_json_object_get_bool(v), true));

IS_EQUAL is only supposed to be used with floating point values, right?

-- 
Tanu