[igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog

Tue Apr 23 16:31:29 UTC 2019

On 23/04/2019 17:25, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> How is the test doing against the code in progress? Any issues 
> discovered etc?
> 
> In general it looks a bit untidy. Coding style is off, alignment is 
> random etc.. so a bit difficult to read. But I'll make a pass and see if 
> I can make sense of it. I won't leave comments on coding style and 
> formatting since there are too many. But please do fix that before 
> re-posting.
> 
> On 23/04/2019 01:05, Carlos Santa wrote:
>> This test adds basic set of tests to reset the different
>> GPU engines through the gpu watchdog timer.
>>
>> Credits to Antonio for the original codebase this is based on.
>>
>> v2: remove gem_context_get_param() during set (Antonio)
>>      remove clearing of the engines_threshold[] in the default case
>>      inside context_set_watchdog(). (Antonio)
>>      fix indexing when creating low/high priority contexts
>>      get rid of 2 threads idea (Antonio)
>>      fix context prio bug due to wrong indexing (Antonio)
>> v3: no need to force clear drm_i915_gem_watchdog_timeout struct
>>      when setting the watchdog timeouts (Antonio)
>>      -use gem_context_set_priority() instead to use error checking
>>      -always use a looper to test all engines (Antonio)
>>      -modify test gpu_watchodg_hang_long_batch_single_engine()
>>      to use a couple of batches and set the timeout in between them 
>> (Antonio)
>>      -make use of for_each_engine_class_instance() (Antonio)
>>
>> Cc: Ursulin Tvrtko <tvrtko.ursulin at intel.com>
>> Cc: Antonio Argenziano <antonio.argenziano at intel.com>
>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>> Signed-off-by: Carlos Santa <carlos.santa at intel.com>
>> ---
>>   tests/Makefile.sources    |   3 +
>>   tests/i915/gem_watchdog.c | 517 
>> ++++++++++++++++++++++++++++++++++++++++++++++
>>   tests/meson.build         |   1 +
>>   3 files changed, 521 insertions(+)
>>   create mode 100644 tests/i915/gem_watchdog.c
>>
>> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
>> index 214698d..7f17f20 100644
>> --- a/tests/Makefile.sources
>> +++ b/tests/Makefile.sources
>> @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
>>   TESTS_progs += gem_wait
>>   gem_wait_SOURCES = i915/gem_wait.c
>> +TESTS_progs += gem_watchdog
>> +gem_watchdog_SOURCES = i915/gem_watchdog.c
>> +
>>   TESTS_progs += gem_workarounds
>>   gem_workarounds_SOURCES = i915/gem_workarounds.c
>> diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
>> new file mode 100644
>> index 0000000..8f789ab
>> --- /dev/null
>> +++ b/tests/i915/gem_watchdog.c
>> @@ -0,0 +1,517 @@
>> +/*
>> + * Copyright © 2016 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person 
>> obtaining a
>> + * copy of this software and associated documentation files (the 
>> "Software"),
>> + * to deal in the Software without restriction, including without 
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, 
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including 
>> the next
>> + * paragraph) shall be included in all copies or substantial portions 
>> of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
>> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>> ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
>> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + */
>> +#include "igt.h"
>> +#include "igt_sysfs.h"
>> +#include "sw_sync.h"
>> +
>> +#include <pthread.h>
>> +#include <fcntl.h>
>> +
>> +#include <sys/ioctl.h>
>> +#include <sys/poll.h>
>> +#include <sys/signal.h>
>> +#include "i915/gem_ring.h"
>> +
>> +#define LOCAL_I915_EXEC_BSD_SHIFT    (13)
>> +#define LOCAL_I915_EXEC_BSD_RING1     (1 << LOCAL_I915_EXEC_BSD_SHIFT)
>> +#define LOCAL_I915_EXEC_BSD_RING2     (2 << LOCAL_I915_EXEC_BSD_SHIFT)
> 
> These are in old so no need for local definitions.
> 
>> +
>> +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
>> +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
>> +#define HIGH 1
>> +#define LOW 0
>> +#define TRUE 1
>> +#define FALSE 0
> 
> true and false are fair game in igt.
> 
>> +#define WATCHDOG_THRESHOLD (100)
>> +#define MAX_ENGINES 5
>> +#define RENDER_CLASS 0
>> +#define VIDEO_DECODE_CLASS 1
>> +#define VIDEO_ENHANCEMENT_CLASS 2
>> +#define COPY_ENGINE_CLASS 3
> 
> Classes are in i915_drm.h as well.
> 
>> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
>> +
>> +#define RS_NO_ERROR      0
>> +#define RS_BATCH_ACTIVE  (1 << 0)
>> +#define RS_BATCH_PENDING (1 << 1)
>> +#define RS_UNKNOWN       (1 << 2)
> 
> What are these? Look unused in code.
> 
>> +
>> +#define GET_RESET_STATS_IOCTL DRM_IOWR(DRM_COMMAND_BASE + 0x32, 
>> struct local_drm_i915_reset_stats)
>> +struct local_drm_i915_reset_stats {
>> +    __u32 ctx_id;
>> +    __u32 flags;
>> +    __u32 reset_count;
>> +    __u32 batch_active;
>> +    __u32 batch_pending;
>> +    __u32 pad;
>> +};
>> +
>> +const uint64_t timeout_100ms = 100000000LL;
>> +int num;
>> +
>> +struct drm_i915_gem_watchdog_timeout {
>> +    union {
>> +        struct {
>> +            /*
>> +             * Engine class & instance to be configured or queried.
>> +             */
>> +            __u16 engine_class;
>> +            __u16 engine_instance;
>> +        };
>> +        /* Index based addressing mode */
>> +        __u32 index;
>> +    };
>> +    /* GPU Engine watchdog resets timeout in us */
>> +    __u32 timeout_us;
>> +};
>> +
>> +static void clear_error_state(int fd)
>> +{
>> +    int dir;
>> +
>> +    dir = igt_sysfs_open(fd);
>> +
>> +    if (dir < 0)
>> +        return;
> 
> igt_require(dir >= 0) if test relies on this being present?
> 
>> +
>> +    /* Any write to the error state clears it */
>> +    igt_sysfs_set(dir, "error", "");
>> +    close(dir);
>> +}
>> +
>> +static unsigned int e2ring(int fd, const struct 
>> intel_execution_engine2 *e)
>> +{
>> +    return gem_class_instance_to_eb_flags(fd, e->class, e->instance);
>> +}
>> +
>> +static void context_set_watchdog(int fd, unsigned engine_id,
>> +                                 unsigned ctx_id, unsigned threshold)
>> +{
>> +    struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
>> +    struct drm_i915_gem_context_param arg = {
>> +        .param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
>> +        .ctx_id = ctx_id,
>> +        .size = sizeof(engines_threshold),
>> +        .value = (uint64_t)&engines_threshold
>> +    };
>> +
>> +    memset(&engines_threshold, 0, sizeof(engines_threshold));
>> +
>> +    switch (engine_id & I915_EXEC_RING_MASK) {
>> +    case I915_EXEC_RENDER:
>> +        engines_threshold[RENDER_CLASS].timeout_us = threshold;
> 
> Looks like uAPI is in some interim state? I was expecting to see here 
> something like:
> 
>      engines_threshold.engine_class = RENDER_CLASS;
>      engines_threshold.engine_instance = 0;
>      engines_threshold.timeout_us = threshold;
> 
> But you seem to be passing an array of structs which carry 
> class/instance data, don't use it but instead infer the class from the 
> position in the array.
> 
>> +        break;
>> +    case I915_EXEC_BSD:
>> +        engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
>> +        break;
>> +    case I915_EXEC_VEBOX:
>> +        engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 
>> threshold;
>> +        break;
>> +    default:
>> +        break;
>> +    }
>> +
>> +    gem_context_set_param(fd, &arg);
>> +}
>> +
>> +static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id, 
>> unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, 
>> uint64_t timeout, int *fence, int fence_index)
>> +{
>> +    struct drm_i915_gem_exec_object2 obj[2];
>> +    struct drm_i915_gem_relocation_entry reloc;
>> +    struct drm_i915_gem_execbuffer2 execbuf;
>> +    igt_spin_t *spin = NULL;
>> +    const uint32_t bbe = MI_BATCH_BUFFER_END;
>> +    int i = 0;
>> +
>> +    gem_quiescent_gpu(fd);
>> +
>> +    memset(&execbuf, 0, sizeof(execbuf));
>> +    memset(&obj, 0, sizeof(obj));
>> +    memset(&reloc, 0, sizeof(reloc));
>> +
>> +    execbuf.buffers_ptr = to_user_pointer(obj);
>> +
>> +    execbuf.buffer_count = 2;
>> +    execbuf.flags = exec_id | I915_EXEC_FENCE_OUT ;
>> +
>> +    obj[0].handle = target;
>> +    obj[1].handle = gem_create(fd, 4096);
>> +
>> +    obj[1].relocation_count = 1;
>> +    obj[1].relocs_ptr = to_user_pointer(&reloc);
>> +
>> +    reloc.target_handle = obj[0].handle;
>> +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
>> +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
>> +    reloc.delta = offset * sizeof(uint32_t);
>> +
>> +    reloc.offset = i * sizeof(uint32_t);
>> +    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
>> +
>> +    __sync_synchronize();
>> +
>> +    if (handle) {
>> +        *handle = obj[1].handle;
>> +        return;
> 
> Is this path used by the callers?
> 
>> +    }
>> +
>> +    gem_sync(fd, obj[1].handle);
>> +    execbuf.rsvd1 = ctx_id;
>> +    execbuf.rsvd2 = -1;
>> +
>> +    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
>> +    igt_spin_batch_set_timeout(spin, timeout);
>> +    igt_assert(gem_bo_busy(fd, obj[0].handle));
>> +
>> +    gem_execbuf_wr(fd, &execbuf);
>> +    igt_spin_batch_free(fd, spin);
>> +
>> +    fence[fence_index] = execbuf.rsvd2 >> 32;
>> +
>> +    gem_close(fd, obj[1].handle);
>> +    gem_quiescent_gpu(fd);
>> +}
>> +
>> +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
>> +{
>> +    uint32_t ctx = gem_context_create(fd);
>> +
>> +    switch (ctx_prio) {
>> +    case HIGH:
>> +        gem_context_set_priority(fd, ctx, MAX_PRIO);
>> +        igt_info("Setting MAX priority %d\n", ctx_prio);
>> +        break;
>> +    case LOW:
>> +        gem_context_set_priority(fd, ctx, MIN_PRIO);
>> +        igt_info("Setting MIN priority %d\n", ctx_prio);
>> +        break;
>> +    default:
>> +        igt_info("Ignoring context priority %d\n", ctx_prio);
>> +        break;
>> +    }
>> +
>> +    return ctx;
> 
> Why do you need this helper which translates an integer to an integer? 
> Couldn't you just pass in the real priority to start with?
> 
>> +}
>> +
>> +static int gem_reset_stats(int fd, int ctx_id,
>> +               struct local_drm_i915_reset_stats *rs)
>> +{
>> +    memset(rs, 0, sizeof(*rs));
>> +    rs->ctx_id = ctx_id;
>> +    rs->reset_count = -1;
>> +
>> +    if (drmIoctl(fd, GET_RESET_STATS_IOCTL, rs)) {
>> +        printf("error:%d\n",-errno);
>> +        return -errno;
> 
> Make it assert success here since all callers do that anyway and it 
> matches the convention for gem prefixed ioctl helpers?
> 
>> +    }
>> +    igt_assert(rs->reset_count != -1);
>> +    return 0;
>> +}
>> +
>> +static unsigned index_to_engine_id(int fd, int index)
>> +{
>> +    unsigned engine_id = I915_EXEC_RENDER;
>> +
>> +    switch (index) {
>> +    case 1:
>> +        engine_id = I915_EXEC_RENDER;
>> +        break;
>> +    case 2:
>> +        if (gem_has_bsd2(fd))
>> +            engine_id = I915_EXEC_BSD | I915_EXEC_BSD_RING1;
>> +        else
>> +            engine_id = I915_EXEC_BSD;
>> +        break;
>> +    case 3:
>> +        if (gem_has_bsd2(fd))
>> +            engine_id = I915_EXEC_BSD | I915_EXEC_BSD_RING2;
>> +        else
>> +            engine_id = I915_EXEC_BSD;
>> +        break;
>> +    case 4:
>> +        engine_id = I915_EXEC_VEBOX;
>> +        break;
>> +    default:
>> +        break;
>> +    }
> 
> I hope we can avoid this maintenance burden, will see later in patch how.
> 
>> +
>> +    return engine_id;
>> +}
>> +
>> +static void inject_hang(uint32_t fd, unsigned engine_id, uint32_t 
>> ctx_id,  unsigned flags)
>> +{
>> +    igt_hang_t hang;
>> +    hang = igt_hang_ctx(fd, ctx_id, engine_id, flags);
>> +    gem_sync(fd, hang.spin->handle);
>> +}
>> +
>> +static void gpu_watchdog_long_batch_2_contexts(int fd, int nengine, 
>> int prio_ctx1, int reset_ctx1, int prio_ctx2, int reset_ctx2)
>> +{
>> +    uint32_t ctx[48];
>> +    uint32_t scratch[48];
>> +    unsigned flags = HANG_ALLOW_CAPTURE;
>> +    const uint64_t batch_timeout_ms = timeout_100ms * 3;
> 
> ms suffix usually means the unit stored is ms. While here it seems to be 
> something else.
> 
> const unsigned long batch_timeout_ms = 300;
> const unsigned long batch_timeout_us = 300e3;
> const unsigned long batch_timeout_ns = 300e6;
> 
> Not sure which one you want. Probably use ms and scale up at the use site.
> 
>> +    const struct intel_execution_engine2 *e;
>> +    const struct intel_execution_engine2 *e2;
>> +    int i = 0, index = 0, engine_id = 0, ctx_cnt = 0;
>> +    int *fence = 0, active_count = 0;
>> +    struct local_drm_i915_reset_stats rs;
>> +
>> +    igt_require(nengine);
>> +
>> +    fence = (int *)malloc(sizeof(int) * 48);
>> +
>> +    if (!fence) {
>> +        igt_info("Out of memory\n");
>> +        exit(1);
>> +    }
> 
> Replace with igt_assert(fence).
> 
>> +
>> +    /* Intializes random number generator */
>> +    srand((unsigned) time(NULL));
>> +
>> +    /* The outer looper is ctx1 and the inner looper is ctx2
>> +     * Both contexts can set their priority, create work, and cancel 
>> themselves
>> +     */
>> +    for_each_engine_class_instance(fd, e) {
>> +        if ( strcmp(e->name, "bcs0") == 0 )
>> +            continue;
> 
> Use -ENODEV or -EINVAL from context_set_watchdog to skip.
> 
>> +
>> +        scratch[ctx_cnt] = gem_create(fd, 4096);
>> +        index = rand() % 4 + 1;
>> +        engine_id = index_to_engine_id(fd, index);
>> +
>> +        /* get current batch_active count */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        active_count = rs.batch_active;
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx1);
>> +        batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, 
>> scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
>> +        context_set_watchdog(fd, engine_id, ctx[ctx_cnt], 
>> WATCHDOG_THRESHOLD);
>> +        sleep(1);
> 
> What's the sleep for? If unavoidable at least needs a comment.
> 
>> +
>> +        if (reset_ctx1) {
>> +            clear_error_state(fd);
>> +            inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
>> +            active_count++;
>> +        }
>> +
>> +        /* check batch_active count once again after the hang */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        ctx_cnt++;
>> +        i = 1;
>> +
>> +        for_each_engine_class_instance(fd, e2) {
>> +            if ( strcmp(e->name, "bcs0") == 0 )
> 
> e2, but remove it altogether as said above.
> 
> I don't get what is the purpose of two for_each_engine_class_instance 
> loops when neither e or e2 appear to be used?
> 
> Since either I am missing something critical or the test really is very 
> confused, until clarified, I am skipping a bit lower down.
> 
>> +                continue;
>> +
>> +            scratch[ctx_cnt] = gem_create(fd, 4096);
>> +            engine_id = index_to_engine_id(fd, i);
>> +
>> +            /* get current batch_active count */
>> +            igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +            active_count = rs.batch_active;
>> +            igt_assert_eq(rs.batch_active, active_count);
>> +
>> +            ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx2);
>> +            batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, 
>> scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
>> +            context_set_watchdog(fd, engine_id, ctx[ctx_cnt], 
>> WATCHDOG_THRESHOLD);
>> +
>> +            if (reset_ctx2) {
>> +                clear_error_state(fd);
>> +                inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
>> +                active_count++;
>> +            }
>> +
>> +            /* check batch_active count once again after the hang */
>> +            igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +            igt_assert_eq(rs.batch_active, active_count);
>> +
>> +            ctx_cnt++;
>> +            i++;
>> +        }
>> +    }
>> +
>> +    for (i = 0; i < ctx_cnt; i++) {
>> +        close(fence[i]);
>> +        gem_context_destroy(fd, ctx[i]);
>> +        gem_close(fd, scratch[i]);
>> +    }
>> +}
>> +
>> +static void gpu_watchodg_hang_long_batch_single_engine(int fd, const 
>> char *name)
>> +{
>> +    uint32_t ctx[16];
>> +    uint32_t scratch[16];
>> +    int *fence;
>> +    const struct intel_execution_engine2 *e;
>> +    unsigned nengine = 0;
>> +    unsigned engine_id = 0;
>> +    int active_count = 0;
>> +    struct local_drm_i915_reset_stats rs;
>> +    int i;
>> +    unsigned flags = HANG_ALLOW_CAPTURE;
>> +    const uint64_t batch_timeout_ms = timeout_100ms * 4;
>> +
>> +    fence = (int *)malloc(sizeof(int)*16);
>> +
>> +    if (!fence) {
>> +        igt_info("Out of memory\n");
>> +        exit(1);
>> +    }
>> +
>> +    for_each_engine_class_instance(fd, e) {
>> +        if ( strcmp(e->name, "bcs0") == 0 )
>> +            continue;
>> +
>> +        engine_id = e2ring(fd, e);
>> +
>> +        scratch[nengine] = gem_create(fd, 4096);
>> +        ctx[nengine] = create_ctx_with_priority(fd, -1);
>> +
>> +        /* get current batch_active count */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        active_count = rs.batch_active;
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        /* create some work on this engine using the above ctx*/
>> +        batch_buffer_factory(fd, ctx[nengine], engine_id, 
>> scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
>> +
>> +        /* set the gpu watchdog timeout */
>> +        context_set_watchdog(fd, engine_id, ctx[nengine], 
>> WATCHDOG_THRESHOLD);
>> +        clear_error_state(fd);
>> +
>> +        /*
>> +         * give enough time for the batch buffer to run
>> +         */
>> +        nanosleep(&(struct timespec){ .tv_sec = 0,
>> +                          .tv_nsec = batch_timeout_ms * 2 }, NULL);
>> +
>> +        /* create some work on this engine using the above ctx*/
>> +        batch_buffer_factory(fd, ctx[nengine], engine_id, 
>> scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
>> +        context_set_watchdog(fd, engine_id, ctx[nengine], 
>> WATCHDOG_THRESHOLD);
>> +        clear_error_state(fd);
>> +
>> +        /* cancel only the batch requested */
>> +        if ( strcmp(e->name, name) == 0 ) {
>> +            inject_hang(fd, engine_id, ctx[nengine], flags);
>> +            active_count++;
>> +        }
>> +
>> +        /* check batch_active count once again after the hang */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        active_count = rs.batch_active;
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        nengine++;
>> +    }
>> +
>> +    for (i = 0; i < nengine; i++) {
>> +        close(fence[i]);
>> +        gem_context_destroy(fd, ctx[i]);
>> +        gem_close(fd, scratch[i]);
>> +    }
>> +}
>> +
>> +igt_main
>> +{
>> +    int fd;
>> +    unsigned int nengine = 0;
>> +    unsigned int engine;
>> +    const struct intel_execution_engine2 *e;
>> +
>> +    igt_skip_on_simulation();
>> +
>> +    igt_fixture {
>> +        fd = drm_open_driver(DRIVER_INTEL);
>> +        igt_require_gem(fd);
>> +
>> +        for_each_physical_engine(fd, engine)
>> +            nengine++;
>> +        igt_require(nengine);
> 
> Do we have GPUs with no engines?
> 
>> +    }
>> +
>> +    igt_subtest_group {
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-low-prio-ctx1-reset-ctx2-high-prio-execute") 
>> {
>> +            int prio_ctx1 = LOW;
>> +            int prio_ctx2 = HIGH;
>> +            int reset_ctx1 = TRUE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
> 
> This is kind of verbose. I recommend out usual pattern along the lines of:
> 
> struct {
>      char *name;
>      int prio[2];
>      bool reset[2];
> } tests[] = {
>      { "change me", {0, 0}, {false, false} },
>      { "change me", {-1, 0}, {true, false} },
>      ...
> };
> 
> for (i = 0; i < ARRAY_SIZE(tests); i++) {
>      igt_subtest_f("long-batch-%s", tests[i]->name)
>          long_batch(fd, tests[i].prio[0], tests[i].ctx[0], ...);
> }
> 
> So you run all subtests from a single loop.

Close email too early, there were more comments below:

>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-high-prio-ctx1-reset-ctx2-low-prio-execute") 
>> {
>> +            int prio_ctx1 = HIGH;
>> +            int prio_ctx2 = LOW;
>> +            int reset_ctx1 = TRUE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-ctx1-reset-ctx2-execute") {
>> +            int prio_ctx1 = -1;
>> +            int prio_ctx2 = -1;
>> +            int reset_ctx1 = TRUE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-reset") {
>> +            int prio_ctx1 = -1;
>> +            int prio_ctx2 = -1;
>> +            int reset_ctx1 = FALSE;
>> +            int reset_ctx2 = TRUE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-execute") {
>> +            int prio_ctx1 = -1;
>> +            int prio_ctx2 = -1;
>> +            int reset_ctx1 = FALSE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +           __for_each_engine_class_instance(e) {
>> +            igt_subtest_group {
>> +                igt_fixture {
>> +                gem_require_engine(fd,
>> +                    e->class,
>> +                    e->instance);
>> +                }
>> +            /* no support for gpu watchdog on bcs0 */
>> +            if (strcmp(e->name, "bcs0") == 0)
>> +                continue;
> 
> Software implementation can support it. So don't skip here but skip in 
> the subtest if you get -ENODEV when you try to configure it.

Here ^

>> +
>> +                    
>> igt_subtest_f("gpu-watchdog-long-batch-single-engine-reset-%s", e->name)
> 
> Don't prefix all tests with gpu-watchdog, it is very verbose to read and 
> the test itself is called gem_watchdog already.

And here ^

I did not really get to dig into the test case coverage due all the 
noise and confusion with engine iteration.

Regards,

Tvrtko

>> +                        
>> gpu_watchodg_hang_long_batch_single_engine(fd, e->name);
>> +            }
>> +        }
>> +    }
>> +
>> +    igt_fixture {
>> +    close(fd);
>> +    }
>> +}
>> diff --git a/tests/meson.build b/tests/meson.build
>> index 5167a6c..b281b75 100644
>> --- a/tests/meson.build
>> +++ b/tests/meson.build
>> @@ -210,6 +210,7 @@ i915_progs = [
>>       'gem_unref_active_buffers',
>>       'gem_userptr_blits',
>>       'gem_wait',
>> +        'gem_watchdog',
>>       'gem_workarounds',
>>       'gem_write_read_ring_switch',
>>       'i915_fb_tiling',
>>