[igt-dev] [PATCH i-g-t] tests/gem_watchdog: Initial set of tests for GPU watchdog

Chris Wilson chris at chris-wilson.co.uk
Fri Sep 21 21:14:33 UTC 2018


Quoting Carlos Santa (2018-09-21 20:54:13)
> This test adds basic set of tests to reset the different
> GPU engines through the watchdog timer.
> 
> Credits to Antonio for the original codebase this is based on.
> 
> This was verified on SKL/ULT GT3:
> 
> $./gem_watchdog --run-subtest basic-vebox
> IGT-Version: 1.23-gaaeb2007206d (x86_64) (Linux: 4.18.0-rc7+ x86_64)
> Starting subtest: basic-vebox
> Subtest basic-vebox: SUCCESS (2.402s)
> $ sudo cat /sys/kernel/debug/dri/0/i915_reset_info
> full gpu reset = 0
> GuC watchdog/media reset = 0
> rcs0 = 0
> bcs0 = 0
> vcs0 = 0
> vcs1 = 0
> vecs0 = 1
> 
> Cc: Antonio Argenziano <antonio.argenziano at intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Signed-off-by: Carlos Santa <carlos.santa at intel.com>
> ---
>  tests/Makefile.sources |   1 +
>  tests/gem_watchdog.c   | 252 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/meson.build      |   1 +
>  3 files changed, 254 insertions(+)
>  create mode 100644 tests/gem_watchdog.c
> 
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index c84933f1d971..9b298886a2e1 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -155,6 +155,7 @@ TESTS_progs = \
>         gem_unref_active_buffers \
>         gem_userptr_blits \
>         gem_wait \
> +        gem_watchdog \
>         gem_workarounds \
>         gem_write_read_ring_switch \
>         gen3_mixed_blits \
> diff --git a/tests/gem_watchdog.c b/tests/gem_watchdog.c
> new file mode 100644
> index 000000000000..e9d8bbc9a2ec
> --- /dev/null
> +++ b/tests/gem_watchdog.c
> @@ -0,0 +1,252 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +#include "igt.h"
> +#include "igt_sysfs.h"
> +
> +#include <fcntl.h>
> +
> +#define MI_STORE_DATA_IMM (0x20 << 23)

We still use its older name of MI_STORE_DWORD_IMM

> +#define WATCHDOG_THRESHOLD (100)
> +
> +#define MAX_ENGINES 5
> +#define RENDER_CLASS 0
> +#define VIDEO_DECODE_CLASS 1
> +#define VIDEO_ENHANCEMENT_CLASS 2
> +#define COPY_ENGINE_CLASS 3
> +
> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x7
> +
> +static void clear_error_state(int fd)
> +{
> +    int dir;
> +
> +    dir = igt_sysfs_open(fd, NULL);
> +    if (dir < 0)
> +       return;
> +
> +    /* Any write to the error state clears it */
> +    igt_sysfs_set(dir, "error", "");
> +    close(dir);
> +}
> +
> +static bool check_error_state(int fd)
> +{
> +    char *error, *str;
> +    bool found = false;
> +    int dir;
> +
> +    dir = igt_sysfs_open(fd, NULL);
> +
> +    error = igt_sysfs_get(dir, "error");
> +    igt_sysfs_set(dir, "error", "Begone!");
> +
> +    igt_assert(error);
> +    igt_debug("Error: %s\n", error);
> +
> +    if ((str = strstr(error, "GPU HANG"))) {
> +       igt_debug("Found error state! GPU hang triggered! %s\n", str);
> +       found = true;
> +    }
> +
> +    close(dir);
> +
> +    return found;
> +}
> +
> +static void send_canary(uint32_t fd, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle)
> +{
> +    struct drm_i915_gem_exec_object2 obj[2];
> +    struct drm_i915_gem_relocation_entry reloc;
> +    struct drm_i915_gem_execbuffer2 execbuf;
> +
> +    uint32_t *bo;
> +    uint32_t batch[16];
> +    int i = 0;
> +
> +    memset(&execbuf, 0, sizeof(execbuf));
> +    memset(&obj, 0, sizeof(obj));
> +    memset(&reloc, 0, sizeof(reloc));
> +
> +    execbuf.buffers_ptr = to_user_pointer(obj);
> +
> +    execbuf.buffer_count = 2;
> +    execbuf.flags = exec_id;
> +
> +    obj[0].handle = target;
> +    obj[1].handle = gem_create(fd, 4096);
> +
> +    obj[1].relocation_count = 1;
> +    obj[1].relocs_ptr = to_user_pointer(&reloc);
> +
> +    reloc.target_handle = obj[0].handle;
> +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> +    reloc.delta = offset * sizeof(uint32_t);
> +
> +    batch[i++] = MI_STORE_DATA_IMM | 2;
> +
> +    reloc.offset = i * sizeof(uint32_t);
> +
> +    batch[i++] = 0x0;

Incorrect for offset != 0.

> +    batch[i++] = 0x0;
> +    batch[i++] = 0xdeadbeef;
> +
> +    batch[i++] = 0x0;
> +    batch[i++] = MI_BATCH_BUFFER_END;
> +
> +    gem_write(fd, obj[1].handle, 0, batch, sizeof(batch));
> +    gem_execbuf(fd, &execbuf);
> +
> +    if (handle) {
> +        *handle = obj[1].handle;
> +        return;
> +    }
> +
> +    gem_sync(fd, obj[1].handle);

sync because?

> +
> +    /* Read back value to make sure batch executed */
> +    bo = gem_mmap__cpu(fd, obj[0].handle, 0, 4096, PROT_READ);
> +    gem_set_domain(fd, obj[0].handle, I915_GEM_DOMAIN_CPU, 0);
> +
> +    igt_assert_eq(bo[offset], 0xdeadbeef);
> +
> +    gem_close(fd, obj[1].handle);
> +}
> +
> +/* Make sure the engines are working */
> +static void verify_engines(uint32_t fd)
> +{
> +    uint32_t scratch;
> +    int i;
> +    unsigned int engines[16];
> +    unsigned int nengine = 0;
> +    unsigned int engine;
> +

> +    for_each_physical_engine(fd, engine) {
if (gem_can_store_dword(fd, engine))
> +       engines[nengine++] = engine;
> +    }
> +
> +    igt_require(nengine);
> +
> +    /* Send a batch that changes the value in BO */
> +    for (i = 0; i < nengine; i++) {
> +        scratch = gem_create(fd, 4096);
> +        send_canary(fd, engines[i], scratch, 0, NULL);
> +       gem_close(fd, scratch);

Instead of using a simple canary, try TIMESTAMP; hang; TIMESTAMP and
check the dt is less than say 2*threshold + small margin for reset +
submission. Even .5s would be enough to confirm that we used the
watchdog and not hangcheck.

> +    }
> +}
> +
> +static void context_set_watchdog(int fd, int engine_id,
> +                                 unsigned ctx, unsigned threshold)
> +{
> +    struct drm_i915_gem_context_param arg = {
> +               .param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> +               .ctx_id = ctx,
> +    };
> +    unsigned engines_threshold[MAX_ENGINES];
> +
> +    memset(&arg, 0, sizeof(arg));

But you've already set bits of arg.

> +    memset(&engines_threshold, 0, sizeof(engines_threshold));
> +    arg.ctx_id = ctx;
> +    arg.value = (uint64_t)&engines_threshold;
> +    arg.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG;

For setting:
arg.value = &{ class, instance, threshold }
arg.size = sizeof(*arg.value);

MAX_ENGINES is not going to be an implicit part of the ABI.
	
> +
> +    /* read existing values */
> +    gem_context_get_param(fd, &arg);
> +
> +    switch (engine_id & I915_EXEC_RING_MASK) {
> +    case I915_EXEC_RENDER:
> +         engines_threshold[RENDER_CLASS] = threshold;
> +         break;
> +    case I915_EXEC_BSD:
> +         engines_threshold[VIDEO_DECODE_CLASS] = threshold;
> +         break;
> +    case I915_EXEC_VEBOX:
> +         engines_threshold[VIDEO_ENHANCEMENT_CLASS] = threshold;
> +         break;
> +    default:
> +        break;
> +    }
> +
> +    gem_context_set_param(fd, &arg);
> +}
> +
> +static void inject_hang(uint32_t fd, uint32_t ctx, unsigned ring, unsigned flags)
> +{
> +    igt_hang_t hang;
> +    hang = igt_hang_ctx(fd, ctx, ring, flags);
> +    gem_sync(fd, hang.spin->handle);
> +}
> +
> +static void media_hang_simple(int fd, unsigned ring)
> +{
> +    uint32_t ctx;
> +    unsigned flags = HANG_ALLOW_CAPTURE;
> +
> +    verify_engines(fd);
> +
> +    /* Submit on default context */
> +    ctx = 0;
> +    context_set_watchdog(fd, ring, ctx, WATCHDOG_THRESHOLD);
> +
> +    clear_error_state(fd);
> +    igt_assert(!check_error_state(fd));

I recall the argument was that we wouldn't capture the error for a
watchdog.

> +
> +    inject_hang(fd, ctx, ring, flags);
> +
> +    /* Assert if error state is not clean */
> +    igt_assert(!check_error_state(fd));
> +
> +    verify_engines(fd);
> +}
> +
> +igt_main
> +{
> +    int fd;
> +
> +    igt_skip_on_simulation();
> +
> +    igt_fixture {
> +       fd = drm_open_driver(DRIVER_INTEL);
> +       igt_require_gem(fd);
> +    }
> +
> +    igt_subtest_group {
> +
> +        for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++){

Looks like you want you use the class:instance interface to match up
with the context setparam.

> +            /* default exec-id is purely symbolic */
> +            if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
> +                continue;
> +
> +            igt_subtest_f("basic-%s", e->name) {
> +                igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
> +                media_hang_simple(fd, e->exec_id | e->flags);
> +            }
> +        }
> +    }
> +
> +    igt_fixture {
> +       close(fd);
> +    }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 17deb945ec95..3b864d891a08 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -130,6 +130,7 @@ test_progs = [
>         'gem_unref_active_buffers',
>         'gem_userptr_blits',
>         'gem_wait',
> +        'gem_watchdog',
>         'gem_workarounds',
>         'gem_write_read_ring_switch',
>         'gen3_mixed_blits',
> -- 
> 2.7.4
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev


More information about the igt-dev mailing list