[igt-dev] [RFC] tests/gem_watchdog: Initial set of tests for GPU watchdog
Carlos Santa
carlos.santa at intel.com
Tue Apr 16 02:02:35 UTC 2019
On Mon, 2019-04-15 at 14:50 -0700, Antonio Argenziano wrote:
>
> On 15/04/19 11:22, Carlos Santa wrote:
> > This patch adds a basic set of tests that reset the different
> > GPU engines through the watchdog timer.
> >
> > Credits to Antonio for the original codebase this is based on.
> >
> > Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> > Cc: Antonio Argenziano <antonio.argenziano at intel.com>
> > Cc: Chris Wilson <chris at chris-wilson.co.uk
>
> nit: missing '>' at the end of address ^
>
> > Signed-off-by: Carlos Santa <carlos.santa at intel.com>
> > ---
> >  tests/Makefile.sources    |   3 +
> >  tests/i915/gem_watchdog.c | 439 ++++++++++++++++++++++++++++++++++++++++++++++
> >  tests/meson.build         |   1 +
> >  3 files changed, 443 insertions(+)
> > create mode 100644 tests/i915/gem_watchdog.c
> >
> > diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> > index 214698d..7f17f20 100644
> > --- a/tests/Makefile.sources
> > +++ b/tests/Makefile.sources
> > @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
> > TESTS_progs += gem_wait
> > gem_wait_SOURCES = i915/gem_wait.c
> >
> > +TESTS_progs += gem_watchdog
> > +gem_watchdog_SOURCES = i915/gem_watchdog.c
> > +
> > TESTS_progs += gem_workarounds
> > gem_workarounds_SOURCES = i915/gem_workarounds.c
> >
> > diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
> > new file mode 100644
> > index 0000000..65e2f5b
> > --- /dev/null
> > +++ b/tests/i915/gem_watchdog.c
> > @@ -0,0 +1,439 @@
> > +/*
> > + * Copyright © 2016 Intel Corporation
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice (including the next
> > + * paragraph) shall be included in all copies or substantial portions of the
> > + * Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> > + * IN THE SOFTWARE.
> > + */
> > +#include "igt.h"
> > +#include "igt_sysfs.h"
> > +#include "sw_sync.h"
> > +
> > +#include <pthread.h>
> > +#include <fcntl.h>
> > +
> > +#include <sys/ioctl.h>
> > +#include <sys/poll.h>
> > +#include <sys/signal.h>
> > +#include "i915/gem_ring.h"
> > +
> > +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
> > +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
> > +#define HIGH 1
> > +#define LOW 0
> > +#define LOCAL_EXEC_FENCE_OUT (1 << 17)
> > +#define WATCHDOG_THRESHOLD (100)
> > +#define MAX_ENGINES 5
> > +#define RENDER_CLASS 0
> > +#define VIDEO_DECODE_CLASS 1
> > +#define VIDEO_ENHANCEMENT_CLASS 2
> > +#define COPY_ENGINE_CLASS 3
> > +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
> > +
> > +const uint64_t timeout_100ms = 100000000LL;
> > +static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
> > +
> > +struct drm_i915_gem_watchdog_timeout {
> > + union {
> > + struct {
> > + /*
> > + * Engine class & instance to be configured or queried.
> > + */
> > + __u16 engine_class;
> > + __u16 engine_instance;
> > + };
> > + /* Index based addressing mode */
> > + __u32 index;
> > + };
> > + /* GPU Engine watchdog resets timeout in us */
> > + __u32 timeout_us;
> > +};
> > +
> > +static void clear_error_state(int fd)
> > +{
> > + int dir;
> > +
> > + dir = igt_sysfs_open(fd);
> > +
> > + if (dir < 0)
> > + return;
> > +
> > + /* Any write to the error state clears it */
> > + igt_sysfs_set(dir, "error", "");
> > + close(dir);
> > +}
> > +
> > +static bool check_error_state(int fd)
> > +{
>
> Don't think this is used.
You're right, I forgot to delete it.
>
> > + char *error, *str;
> > + bool found = false;
> > + int dir;
> > +
> > + dir = igt_sysfs_open(fd);
> > +
> > + error = igt_sysfs_get(dir, "error");
> > + igt_sysfs_set(dir, "error", "Begone!");
> > +
> > + igt_assert(error);
> > + igt_debug("Error: %s\n", error);
> > +
> > + if ((str = strstr(error, "GPU HANG"))) {
> > + igt_debug("Found error state! GPU hang triggered!
> > %s\n", str);
> > + found = true;
> > + }
> > +
> > + close(dir);
> > +
> > + return found;
> > +}
> > +
> > +static void context_set_watchdog(int fd, int engine_id,
> > + unsigned ctx_id, unsigned threshold)
> > +{
> > + struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
> > + struct drm_i915_gem_context_param arg = {
> > + .param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> > + .ctx_id = ctx_id,
> > + .size = sizeof(engines_threshold),
> > + .value = (uint64_t)&engines_threshold
> > + };
> > +
> > + memset(&engines_threshold, 0, sizeof(engines_threshold));
> > +
> > + /* read existing values */
> > + gem_context_get_param(fd, &arg);
>
> Why do we need to read the existing values?
You're right, we don't need to. I just quickly tested it...
>
> > +
> > + switch (engine_id & I915_EXEC_RING_MASK) {
>
> If I read this correctly we cannot set the threshold for more than
> one
> engine at a time. Maybe use a mask?
>
> > + case I915_EXEC_RENDER:
> > + engines_threshold[RENDER_CLASS].timeout_us = threshold;
> > + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> > + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
> > + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
> > + break;
> > + case I915_EXEC_BSD:
> > + engines_threshold[RENDER_CLASS].timeout_us = 0;
> > + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
> > + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
> > + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
> > + break;
> > + case I915_EXEC_VEBOX:
> > + engines_threshold[RENDER_CLASS].timeout_us = 0;
> > + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> > + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
> > + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
> > + break;
> > + default:
> > + engines_threshold[RENDER_CLASS].timeout_us = 0;
> > + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> > + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
> > + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
>
> Not sure about clearing everything here, we should at least log
> something to say we got in an unwanted situation.
>
> nit: bit of a weird indent in this block.
You're right, we don't actually need to clear everything in the default
case. I just re-ran the tests. I initially had it as a precaution but
forgot to retest and remove it...
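On the mask idea above: a variant that takes a bitmask of engine
classes could look roughly like this (untested sketch; it reuses the
class defines from the top of the file and drops the read-back):

	static void context_set_watchdog_mask(int fd, unsigned int class_mask,
					      unsigned int ctx_id,
					      unsigned int threshold)
	{
		struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
		struct drm_i915_gem_context_param arg = {
			.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
			.ctx_id = ctx_id,
			.size = sizeof(engines_threshold),
			.value = (uint64_t)&engines_threshold
		};
		int class;

		memset(&engines_threshold, 0, sizeof(engines_threshold));

		/* Arm the watchdog on every engine class selected in the mask. */
		for (class = 0; class < MAX_ENGINES; class++)
			if (class_mask & (1 << class))
				engines_threshold[class].timeout_us = threshold;

		gem_context_set_param(fd, &arg);
	}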
>
> > + break;
> > + }
> > + gem_context_set_param(fd, &arg);
> > +}
> > +
> > +static void send_canary(uint32_t fd, uint32_t ctx_id, unsigned exec_id,
> > + uint32_t target, uint32_t offset, uint32_t *handle,
> > + uint64_t timeout)
>
> Should be possible to use gem_test_engine();
Ok, let me think about it.
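If I read the helper right, the canary could then shrink to something
like this (sketch; assumes gem_test_engine() submits a trivial batch on
the given engine and asserts it completes):

	/* Poke each engine with a nop batch instead of the
	 * hand-rolled canary. */
	for_each_physical_engine(fd, engine)
		gem_test_engine(fd, engine);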
>
> > +{
> > + struct drm_i915_gem_exec_object2 obj[2];
> > + struct drm_i915_gem_relocation_entry reloc;
> > + struct drm_i915_gem_execbuffer2 execbuf;
> > + igt_spin_t *spin = NULL;
> > +
> > + const uint32_t bbe = MI_BATCH_BUFFER_END;
> > + int i = 0;
> > + int fence;
> > +
> > + gem_quiescent_gpu(fd);
> > +
> > + memset(&execbuf, 0, sizeof(execbuf));
> > + memset(&obj, 0, sizeof(obj));
> > + memset(&reloc, 0, sizeof(reloc));
> > +
> > + execbuf.buffers_ptr = to_user_pointer(obj);
> > +
> > + execbuf.buffer_count = 2;
> > + execbuf.flags = exec_id | LOCAL_EXEC_FENCE_OUT;
> > +
> > + obj[0].handle = target;
> > + obj[1].handle = gem_create(fd, 4096);
> > +
> > + obj[1].relocation_count = 1;
> > + obj[1].relocs_ptr = to_user_pointer(&reloc);
> > +
> > + reloc.target_handle = obj[0].handle;
> > + reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> > + reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> > + reloc.delta = offset * sizeof(uint32_t);
> > +
> > + reloc.offset = i * sizeof(uint32_t);
> > + gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
> > +
> > + __sync_synchronize();
> > +
> > + if (handle) {
> > + *handle = obj[1].handle;
> > + return;
> > + }
> > + gem_sync(fd, obj[1].handle);
> > + execbuf.rsvd1 = ctx_id;
> > + execbuf.rsvd2 = -1;
> > +
> > + spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
> > + igt_spin_batch_set_timeout(spin, timeout);
> > + igt_assert(gem_bo_busy(fd, obj[0].handle));
> > +
> > + gem_execbuf_wr(fd, &execbuf);
> > + igt_spin_batch_free(fd, spin);
> > +
> > + fence = execbuf.rsvd2 >> 32;
> > +
> > +#if 0
> > + igt_info("fence:%d, fence status : %d EIO: %d
> > ctx_id:%d\n",fence, sync_fence_status(fence),-EIO, ctx_id);
> > +#endif
> > +
Any ideas why the fence wouldn't signal correctly (i.e., with -EIO)
after the reset? I need help with this...
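What I would expect to be able to assert once the watchdog fires is
something like this (sketch; the -EIO expectation is my assumption
about how a cancelled batch reports through the out-fence):

	/* Wait for the out-fence, then check it carries an error
	 * status rather than a normal completion. */
	igt_assert_eq(sync_fence_wait(fence, 1000), 0);
	igt_assert_eq(sync_fence_status(fence), -EIO);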
> > + close(fence);
> > +
> > + gem_close(fd, obj[1].handle);
> > + gem_quiescent_gpu(fd);
> > +}
> > +
> > +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
> > +{
> > + uint32_t ctx = gem_context_create(fd);
> > +
> > + switch (ctx_prio) {
> > + case HIGH:
> > + __gem_context_set_priority(fd, ctx, MAX_PRIO);
> > + igt_info("Setting MAX priority %d\n", ctx_prio);
> > + break;
> > + case LOW:
> > + __gem_context_set_priority(fd, ctx, MIN_PRIO);
> > + igt_info("Setting MIN priority %d\n", ctx_prio);
> > + break;
> > + default:
> > + igt_info("Ignoring context priority %d\n", ctx_prio);
> > + break;
> > + }
> > + return ctx;
> > +}
> > +
> > +static void bb_factory(uint32_t fd, uint32_t *ctx, int prio,
> > + uint64_t timeout, int pid)
>
> I would scrap this function. AFAICT you use it both for submitting a
> canary batch and for creating contexts. Do it explicitly in the test
> or
> somewhere it is clear what is happening.
Ok, let me think about it...
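One way to split it, roughly (sketch, keeping the existing helpers):

	/* Create one context per engine up front... */
	for (i = 0; i < nengine; i++)
		ctx[i] = (prio == HIGH || prio == LOW) ?
			 create_ctx_with_priority(fd, prio) :
			 gem_context_create(fd);

	/* ...and submit the canaries explicitly in the test body. */
	for (i = 0; i < nengine; i++) {
		scratch = gem_create(fd, 4096);
		send_canary(fd, ctx[i], engines[i], scratch, 0, NULL, timeout);
		gem_close(fd, scratch);
	}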
>
> > +{
> > + uint32_t scratch;
> > + unsigned int nengine = 0;
> > + unsigned int engines[16];
> > + int i, j;
> > + unsigned int engine;
> > +
> > + for_each_physical_engine(fd, engine)
> > + engines[nengine++] = engine;
> > +
> > + igt_require(nengine);
> > +
> > + for (i = 0; i < nengine; i++) {
> > + scratch = gem_create(fd, 4096);
> > + pthread_mutex_lock(&list_lock);
> > +
> > + if (prio == HIGH)
> > + create_ctx_with_priority(fd, HIGH);
>
> missing "ctx[j] =" ^^^ and ...
Yeah, this doesn't look right...
>
> > + else if (prio == LOW)
> > + create_ctx_with_priority(fd, LOW);
>
> ... here ^^^ ?
Yeah, this doesn't look right...
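It should assign the return value in both branches, i.e.:

	if (prio == HIGH)
		ctx[j] = create_ctx_with_priority(fd, HIGH);
	else if (prio == LOW)
		ctx[j] = create_ctx_with_priority(fd, LOW);

(j is also never initialized in this function; reusing i would do.)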
>
> > + else
> > + ctx[j] = gem_context_create(fd);
> > +
> > + send_canary(fd, ctx[j], engines[i], scratch, 0, NULL, timeout);
> > + gem_close(fd, scratch);
> > + pthread_mutex_unlock(&list_lock);
> > + j++;
> > + }
> > +}
> > +
> > +static void inject_hang(uint32_t fd, unsigned ring, uint32_t ctx_id, unsigned flags)
> > +{
> > + igt_hang_t hang;
> > + hang = igt_hang_ctx(fd, ctx_id, ring, flags);
> > + gem_sync(fd, hang.spin->handle);
> > +}
> > +
> > +static void gpu_watchdog_long_batch_2_contexts(int fd)
> > +{
> > + unsigned flags = HANG_ALLOW_CAPTURE;
> > + int i = 0, j=0;
> > +
> > + igt_fork(child, 1) {
>
> Why do you want to do this from two separate processes?
Tvrtko suggested 2 separate contexts, both with long batches: ctx1
would execute until completion, while ctx2 would get canceled some time
later by resetting it with the GPU watchdog. I thought I needed 2
separate processes as opposed to launching them from the same thread,
but maybe I am overcomplicating it...
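If a single process is enough, the flow could be as simple as this
(sketch; scratch buffer setup reduced to a gem_create()):

	/* Two contexts on the same engine: ctx1 runs to completion,
	 * ctx2 gets cancelled by the per-context watchdog. */
	uint32_t scratch = gem_create(fd, 4096);
	uint32_t ctx1 = gem_context_create(fd);
	uint32_t ctx2 = gem_context_create(fd);

	context_set_watchdog(fd, I915_EXEC_RENDER, ctx2, WATCHDOG_THRESHOLD);

	send_canary(fd, ctx1, I915_EXEC_RENDER, scratch, 0, NULL, timeout_100ms * 3);
	inject_hang(fd, I915_EXEC_RENDER, ctx2, HANG_ALLOW_CAPTURE);

	gem_close(fd, scratch);
	gem_context_destroy(fd, ctx1);
	gem_context_destroy(fd, ctx2);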
>
> > + /* Child process */
> > + uint32_t ctx[5];
> > + const uint64_t batch_timeout_ms = timeout_100ms*3;
> > + memset(&ctx, 0, sizeof(ctx));
> > + bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
> > +
> > + sleep(6);
> > +
> > + context_set_watchdog(fd, 1, 1, WATCHDOG_THRESHOLD);
> > + clear_error_state(fd);
> > +
> > + /* Engine ID: RCS0, Context ID: 1 */
> > + inject_hang(fd, 1, 1, flags);
> > +
> > + for (i = 0; i <5; i++) {
> > + printf("ctx_child: %u ctx_cnt: %d
> > tid:%d\n",ctx[i], syscall(SYS_gettid));
> > + //gem_context_destroy(fd, ctx[i]);
> > + }
> > + }
> > +
> > + {
> > + /* Parent process */
> > + uint32_t ctx[5];
> > + const uint64_t batch_timeout_ms = timeout_100ms*1;
> > + memset(&ctx, 0, sizeof(ctx));
> > + printf("%s %d , tid: %d\n", __FUNCTION__,__LINE__,
> > syscall(SYS_gettid));
> > +
> > + bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
> > +
> > + sleep(6);
> > +
> > + context_set_watchdog(fd, 4, 10, WATCHDOG_THRESHOLD);
> > + clear_error_state(fd);
> > +
> > + /* Engine ID: VECS0, Context ID: 10 */
> > + inject_hang(fd, 4, 10, flags);
>
> Why hardcoding the context ids? You can use the ones coming from
> gem_context_create();
Right, this still needs some more work....
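Using the ids returned by gem_context_create() directly should fix it,
e.g. (sketch):

	uint32_t ctx = gem_context_create(fd);

	context_set_watchdog(fd, I915_EXEC_VEBOX, ctx, WATCHDOG_THRESHOLD);
	clear_error_state(fd);
	inject_hang(fd, I915_EXEC_VEBOX, ctx, flags);

	gem_context_destroy(fd, ctx);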
>
> > +
> > + for (i = 0; i <5; i++) {
> > + igt_info("ctx_dad: %u ctx_cnt: %d tid:%d
> > \n",ctx[i], i, syscall(SYS_gettid));
> > + //gem_context_destroy(fd, ctx[i]);
> > + }
> > + }
> > +
> > + igt_waitchildren();
> > +}
> > +
> > +static void gpu_watchdog_hang_long_batch_single_engine(int fd, unsigned engine_id, int nengine)
> > +{
> > + uint32_t *ctx;
> > + int i;
> > + unsigned flags = HANG_ALLOW_CAPTURE;
> > + const uint64_t batch_timeout_ms = timeout_100ms*4;
> > +
> > + igt_require(nengine);
> > +
> > + ctx = (uint32_t *)malloc(sizeof(uint32_t)*nengine);
> > +
> > + if (!ctx) {
> > + igt_info("Out of memory\n");
> > + exit(1);
> > + }
> > +
> > + bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
>
> The call to bb_factory sends a canary batch to each engine, it would
> make more sense to do that after the hang triggered.
Ok, I think I need to separate the context creation and the canary
submission and then do what you're suggesting...
>
> > +
> > + if (engine_id == 1) {
>
> Not sure what this if{}else block is for, can you add a comment?
This allows running a batch separately per engine when calling the
"gpu-watchdog-long-batch-render" subtest, for example. I found the
values to be rcs0 = 1, bsd1 = 8194, bsd2 = 16386, vecs0 = 4.
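For reference, those numbers are just the execbuf flag encodings from
i915_drm.h, so the comparisons could be written symbolically (sketch,
first branch only):

	/* The magic numbers decode to execbuf flags:
	 *   rcs0:  I915_EXEC_RENDER                    = 1
	 *   bsd1:  I915_EXEC_BSD | I915_EXEC_BSD_RING1 = 2 | (1 << 13) = 8194
	 *   bsd2:  I915_EXEC_BSD | I915_EXEC_BSD_RING2 = 2 | (2 << 13) = 16386
	 *   vecs0: I915_EXEC_VEBOX                     = 4
	 */
	if (engine_id == I915_EXEC_RENDER) {
		context_set_watchdog(fd, engine_id, 1, WATCHDOG_THRESHOLD);
		clear_error_state(fd);
		inject_hang(fd, engine_id, 1, flags);
	}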
>
> > + context_set_watchdog(fd, engine_id, 1, WATCHDOG_THRESHOLD);
> > + clear_error_state(fd);
> > + inject_hang(fd, engine_id, 1, flags);
> > + } else if (engine_id == 8194) {
> > + context_set_watchdog(fd, engine_id, 2, WATCHDOG_THRESHOLD);
> > + clear_error_state(fd);
> > + inject_hang(fd, engine_id, 2, flags);
> > + } else if (engine_id == 16386) {
> > + context_set_watchdog(fd, engine_id, 3, WATCHDOG_THRESHOLD);
> > + clear_error_state(fd);
> > + inject_hang(fd, engine_id, 3, flags);
> > + } else if (engine_id == 4) {
> > + context_set_watchdog(fd, engine_id, 5, WATCHDOG_THRESHOLD);
> > + clear_error_state(fd);
> > + inject_hang(fd, engine_id, 5, flags);
> > + }
>
> context ids hardcoded above as well.
Because when I create the contexts I loop through all the engines and
pair each ctx id with the batch buffer being submitted
(execbuf.rsvd1 = ctx_id). Later on, when I set the watchdog, I pass the
engine_id and then inject the hang; the ctx_id that set the watchdog
timeout and the one submitting the batch work must match exactly.
>
> > +
> > + for (i = 0; i <nengine; i++)
> > + gem_context_destroy(fd, ctx[i]);
> > + free(ctx);
> > +}
> > +static void no_gpu_hang_long_batch_all_engines(int fd, int nengine, int prio)
>
> Wouldn't this test case be covered by the other tests that do normal
> submissions?
Well, this test just creates the batches and executes them until
completion, without setting the GPU watchdog or injecting a hang. The
idea behind this test was to create a long batch and let it execute.
>
> > +{
> > + uint32_t *ctx;
> > + int i;
> > + const uint64_t batch_timeout_ms = timeout_100ms*4;
> > +
> > + ctx = (uint32_t *)malloc(sizeof(uint32_t)*nengine);
> > +
> > + if (!ctx) {
> > + printf("Out of memory\n");
> > + exit(1);
> > + }
> > +
> > + bb_factory(fd, ctx, prio, batch_timeout_ms, syscall(SYS_gettid));
> > +
> > + for (i = 0; i <nengine; i++)
> > + gem_context_destroy(fd, ctx[i]);
> > +
> > + free(ctx);
> > +}
> > +
> > +igt_main
> > +{
> > + int fd;
> > + unsigned int nengine = 0;
> > + unsigned int engine;
> > + unsigned int engines[16];
> > +
> > + igt_skip_on_simulation();
> > +
> > + igt_fixture {
> > + fd = drm_open_driver(DRIVER_INTEL);
> > + igt_require_gem(fd);
> > +
> > + for_each_physical_engine(fd, engine)
> > + engines[nengine++] = engine;
> > + igt_require(nengine);
>
> I think this test should run with the hang detector always on.
Ok, let me ping you on this because I don't fully understand the hang
detector mechanism...
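From a quick look at lib/igt_aux.h it seems to boil down to forking a
helper that fails the test on any unexpected GPU hang, e.g. (sketch):

	igt_fixture {
		fd = drm_open_driver(DRIVER_INTEL);
		igt_require_gem(fd);
		/* Fail loudly if a real (unexpected) hang occurs. */
		igt_fork_hang_detector(fd);
	}

	/* ... subtests ... */

	igt_fixture {
		igt_stop_hang_detector();
		close(fd);
	}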
>
> > + }
> > +
> > + igt_subtest_group {
>
> I think we are missing a test where we set more than one engine's
> threshold and check that the watchdog only applies to the engine it
> got set for. There is also a low-hanging fruit for invalid parameters
> into the IOCTL.
Ok, I've tested the first case already, I can add it, and I can add
case 2 as well.
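For the invalid-parameters case, I am thinking of something like this
(sketch; ctx is a freshly created context, and whether the kernel
returns -EINVAL here is my assumption):

	struct drm_i915_gem_watchdog_timeout thresholds[MAX_ENGINES];
	struct drm_i915_gem_context_param arg = {
		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
		.ctx_id = ctx,
		.size = sizeof(thresholds) + 1, /* bogus size */
		.value = (uint64_t)&thresholds
	};

	memset(&thresholds, 0, sizeof(thresholds));

	/* A malformed buffer size should be rejected outright. */
	igt_assert_eq(__gem_context_set_param(fd, &arg), -EINVAL);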
Regards,
Carlos
>
> Antonio
>
> > + igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-
> > no-priority") {
> > + int prio = -1;
> > + no_gpu_hang_long_batch_all_engines(fd, nengine,
> > prio);
> > + }
> > +
> > + igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-
> > low-priority") {
> > + int prio = LOW;
> > + no_gpu_hang_long_batch_all_engines(fd, nengine,
> > prio);
> > + }
> > +
> > + igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-
> > high-priority") {
> > + int prio = HIGH;
> > + no_gpu_hang_long_batch_all_engines(fd, nengine,
> > prio);
> > + }
> > +
> > + for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++) {
> > + /* no support for gpu watchdog on BLT */
> > + if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
> > + continue;
> > +
> > + igt_subtest_f("gpu-watchdog-long-batch-%s", e->name) {
> > + igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
> > + gpu_watchdog_hang_long_batch_single_engine(fd, e->exec_id | e->flags, nengine);
> > + }
> > + }
> > +
> > + igt_subtest_f("gpu-watchdog-long-batch-2-contexts") {
> > + gpu_watchdog_long_batch_2_contexts(fd);
> > + }
> > + }
> > +
> > + igt_fixture {
> > + close(fd);
> > + }
> > +}
> > diff --git a/tests/meson.build b/tests/meson.build
> > index 5167a6c..b281b75 100644
> > --- a/tests/meson.build
> > +++ b/tests/meson.build
> > @@ -210,6 +210,7 @@ i915_progs = [
> > 'gem_unref_active_buffers',
> > 'gem_userptr_blits',
> > 'gem_wait',
> > + 'gem_watchdog',
> > 'gem_workarounds',
> > 'gem_write_read_ring_switch',
> > 'i915_fb_tiling',
> >