[igt-dev] [PATCH i-g-t] tests/gem_watchdog: Initial set of tests for GPU watchdog
Chris Wilson
chris at chris-wilson.co.uk
Fri Sep 21 21:14:33 UTC 2018
Quoting Carlos Santa (2018-09-21 20:54:13)
> This test adds basic set of tests to reset the different
> GPU engines through the watchdog timer.
>
> Credits to Antonio for the original codebase this is based on.
>
> This was verified on SKL/ULT GT3:
>
> $./gem_watchdog --run-subtest basic-vebox
> IGT-Version: 1.23-gaaeb2007206d (x86_64) (Linux: 4.18.0-rc7+ x86_64)
> Starting subtest: basic-vebox
> Subtest basic-vebox: SUCCESS (2.402s)
> $ sudo cat /sys/kernel/debug/dri/0/i915_reset_info
> full gpu reset = 0
> GuC watchdog/media reset = 0
> rcs0 = 0
> bcs0 = 0
> vcs0 = 0
> vcs1 = 0
> vecs0 = 1
>
> Cc: Antonio Argenziano <antonio.argenziano at intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Signed-off-by: Carlos Santa <carlos.santa at intel.com>
> ---
> tests/Makefile.sources | 1 +
> tests/gem_watchdog.c | 252 +++++++++++++++++++++++++++++++++++++++++++++++++
> tests/meson.build | 1 +
> 3 files changed, 254 insertions(+)
> create mode 100644 tests/gem_watchdog.c
>
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index c84933f1d971..9b298886a2e1 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -155,6 +155,7 @@ TESTS_progs = \
> gem_unref_active_buffers \
> gem_userptr_blits \
> gem_wait \
> + gem_watchdog \
> gem_workarounds \
> gem_write_read_ring_switch \
> gen3_mixed_blits \
> diff --git a/tests/gem_watchdog.c b/tests/gem_watchdog.c
> new file mode 100644
> index 000000000000..e9d8bbc9a2ec
> --- /dev/null
> +++ b/tests/gem_watchdog.c
> @@ -0,0 +1,252 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +#include "igt.h"
> +#include "igt_sysfs.h"
> +
> +#include <fcntl.h>
> +
> +#define MI_STORE_DATA_IMM (0x20 << 23)
We still use its older name of MI_STORE_DWORD_IMM
> +#define WATCHDOG_THRESHOLD (100)
> +
> +#define MAX_ENGINES 5
> +#define RENDER_CLASS 0
> +#define VIDEO_DECODE_CLASS 1
> +#define VIDEO_ENHANCEMENT_CLASS 2
> +#define COPY_ENGINE_CLASS 3
> +
> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x7
> +
> +static void clear_error_state(int fd)
> +{
> + int dir;
> +
> + dir = igt_sysfs_open(fd, NULL);
> + if (dir < 0)
> + return;
> +
> + /* Any write to the error state clears it */
> + igt_sysfs_set(dir, "error", "");
> + close(dir);
> +}
> +
> +static bool check_error_state(int fd)
> +{
> + char *error, *str;
> + bool found = false;
> + int dir;
> +
> + dir = igt_sysfs_open(fd, NULL);
> +
> + error = igt_sysfs_get(dir, "error");
> + igt_sysfs_set(dir, "error", "Begone!");
> +
> + igt_assert(error);
> + igt_debug("Error: %s\n", error);
> +
> + if ((str = strstr(error, "GPU HANG"))) {
> + igt_debug("Found error state! GPU hang triggered! %s\n", str);
> + found = true;
> + }
> +
> + close(dir);
> +
> + return found;
> +}
> +
> +static void send_canary(uint32_t fd, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle)
> +{
> + struct drm_i915_gem_exec_object2 obj[2];
> + struct drm_i915_gem_relocation_entry reloc;
> + struct drm_i915_gem_execbuffer2 execbuf;
> +
> + uint32_t *bo;
> + uint32_t batch[16];
> + int i = 0;
> +
> + memset(&execbuf, 0, sizeof(execbuf));
> + memset(&obj, 0, sizeof(obj));
> + memset(&reloc, 0, sizeof(reloc));
> +
> + execbuf.buffers_ptr = to_user_pointer(obj);
> +
> + execbuf.buffer_count = 2;
> + execbuf.flags = exec_id;
> +
> + obj[0].handle = target;
> + obj[1].handle = gem_create(fd, 4096);
> +
> + obj[1].relocation_count = 1;
> + obj[1].relocs_ptr = to_user_pointer(&reloc);
> +
> + reloc.target_handle = obj[0].handle;
> + reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> + reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> + reloc.delta = offset * sizeof(uint32_t);
> +
> + batch[i++] = MI_STORE_DATA_IMM | 2;
> +
> + reloc.offset = i * sizeof(uint32_t);
> +
> + batch[i++] = 0x0;
Incorrect for offset != 0.
> + batch[i++] = 0x0;
> + batch[i++] = 0xdeadbeef;
> +
> + batch[i++] = 0x0;
> + batch[i++] = MI_BATCH_BUFFER_END;
> +
> + gem_write(fd, obj[1].handle, 0, batch, sizeof(batch));
> + gem_execbuf(fd, &execbuf);
> +
> + if (handle) {
> + *handle = obj[1].handle;
> + return;
> + }
> +
> + gem_sync(fd, obj[1].handle);
sync because?
> +
> + /* Read back value to make sure batch executed */
> + bo = gem_mmap__cpu(fd, obj[0].handle, 0, 4096, PROT_READ);
> + gem_set_domain(fd, obj[0].handle, I915_GEM_DOMAIN_CPU, 0);
> +
> + igt_assert_eq(bo[offset], 0xdeadbeef);
> +
> + gem_close(fd, obj[1].handle);
> +}
> +
> +/* Make sure the engines are working */
> +static void verify_engines(uint32_t fd)
> +{
> + uint32_t scratch;
> + int i;
> + unsigned int engines[16];
> + unsigned int nengine = 0;
> + unsigned int engine;
> +
> + for_each_physical_engine(fd, engine) {
if (gem_can_store_dword(fd, engine))
> + engines[nengine++] = engine;
> + }
> +
> + igt_require(nengine);
> +
> + /* Send a batch that changes the value in BO */
> + for (i = 0; i < nengine; i++) {
> + scratch = gem_create(fd, 4096);
> + send_canary(fd, engines[i], scratch, 0, NULL);
> + gem_close(fd, scratch);
Instead of using a simple canary, try TIMESTAMP; hang; TIMESTAMP and
check the dt is less than say 2*threshold + small margin for reset +
submission. Even .5s would be enough to confirm that we used the
watchdog and not hangcheck.
> + }
> +}
> +
> +static void context_set_watchdog(int fd, int engine_id,
> + unsigned ctx, unsigned threshold)
> +{
> + struct drm_i915_gem_context_param arg = {
> + .param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> + .ctx_id = ctx,
> + };
> + unsigned engines_threshold[MAX_ENGINES];
> +
> + memset(&arg, 0, sizeof(arg));
But you've already set bits of arg.
> + memset(&engines_threshold, 0, sizeof(engines_threshold));
> + arg.ctx_id = ctx;
> + arg.value = (uint64_t)&engines_threshold;
> + arg.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG;
For setting:
arg.value = &{ class, instance, threshold }
arg.size = sizeof(*arg.value);
MAX_ENGINES is not going to be an implicit part of the ABI.
> +
> + /* read existing values */
> + gem_context_get_param(fd, &arg);
> +
> + switch (engine_id & I915_EXEC_RING_MASK) {
> + case I915_EXEC_RENDER:
> + engines_threshold[RENDER_CLASS] = threshold;
> + break;
> + case I915_EXEC_BSD:
> + engines_threshold[VIDEO_DECODE_CLASS] = threshold;
> + break;
> + case I915_EXEC_VEBOX:
> + engines_threshold[VIDEO_ENHANCEMENT_CLASS] = threshold;
> + break;
> + default:
> + break;
> + }
> +
> + gem_context_set_param(fd, &arg);
> +}
> +
> +static void inject_hang(uint32_t fd, uint32_t ctx, unsigned ring, unsigned flags)
> +{
> + igt_hang_t hang;
> + hang = igt_hang_ctx(fd, ctx, ring, flags);
> + gem_sync(fd, hang.spin->handle);
> +}
> +
> +static void media_hang_simple(int fd, unsigned ring)
> +{
> + uint32_t ctx;
> + unsigned flags = HANG_ALLOW_CAPTURE;
> +
> + verify_engines(fd);
> +
> + /* Submit on default context */
> + ctx = 0;
> + context_set_watchdog(fd, ring, ctx, WATCHDOG_THRESHOLD);
> +
> + clear_error_state(fd);
> + igt_assert(!check_error_state(fd));
I recall the argument was that we wouldn't capture the error for a
watchdog.
> +
> + inject_hang(fd, ctx, ring, flags);
> +
> + /* Assert if error state is not clean */
> + igt_assert(!check_error_state(fd));
> +
> + verify_engines(fd);
> +}
> +
> +igt_main
> +{
> + int fd;
> +
> + igt_skip_on_simulation();
> +
> + igt_fixture {
> + fd = drm_open_driver(DRIVER_INTEL);
> + igt_require_gem(fd);
> + }
> +
> + igt_subtest_group {
> +
> + for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++){
Looks like you want you use the class:instance interface to match up
with the context setparam.
> + /* default exec-id is purely symbolic */
> + if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
> + continue;
> +
> + igt_subtest_f("basic-%s", e->name) {
> + igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
> + media_hang_simple(fd, e->exec_id | e->flags);
> + }
> + }
> + }
> +
> + igt_fixture {
> + close(fd);
> + }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 17deb945ec95..3b864d891a08 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -130,6 +130,7 @@ test_progs = [
> 'gem_unref_active_buffers',
> 'gem_userptr_blits',
> 'gem_wait',
> + 'gem_watchdog',
> 'gem_workarounds',
> 'gem_write_read_ring_switch',
> 'gen3_mixed_blits',
> --
> 2.7.4
>
> _______________________________________________
> igt-dev mailing list
> igt-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
More information about the igt-dev
mailing list