[Intel-gfx] [PATCH i-g-t 8/8] tests/i915/gem_exec_capture: Update to support GuC based resets
Matthew Brost
matthew.brost at intel.com
Fri Oct 29 02:54:28 UTC 2021
On Thu, Oct 21, 2021 at 04:40:44PM -0700, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> When GuC submission is enabled, GuC itself manages hang detection and
> recovery. Therefore, any test that relies on being able to trigger an
> engine reset in the driver will fail. Full GT resets can still be
> triggered by the driver. However, in that situation detecting the
> specific context that caused a hang is not possible as the driver has
> no information about what is actually running on the hardware at any
> given time. Plus of course, there was no context that caused the hang
> because the hang was triggered manually, so it's basically a bogus
> mechanism in the first place!
>
> Update the capture test to cause a reset via a the hangcheck mechanism
> by submitting a hanging batch and waiting. That way it is guaranteed to
> be testing the correct reset code paths for the current platform,
> whether that is GuC enabled or not.
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> ---
> tests/i915/gem_exec_capture.c | 65 ++++++++++++++++++++++++++++-------
> 1 file changed, 53 insertions(+), 12 deletions(-)
>
> diff --git a/tests/i915/gem_exec_capture.c b/tests/i915/gem_exec_capture.c
> index 8997125ee..dda6e6a8f 100644
> --- a/tests/i915/gem_exec_capture.c
> +++ b/tests/i915/gem_exec_capture.c
> @@ -23,6 +23,7 @@
>
> #include <sys/poll.h>
> #include <zlib.h>
> +#include <sched.h>
>
> #include "i915/gem.h"
> #include "i915/gem_create.h"
> @@ -31,6 +32,8 @@
> #include "igt_rand.h"
> #include "igt_sysfs.h"
>
> +#define MAX_RESET_TIME 600
> +
> IGT_TEST_DESCRIPTION("Check that we capture the user specified objects on a hang");
>
> struct offset {
> @@ -213,7 +216,29 @@ static void configure_hangs(int fd, const struct intel_execution_engine2 *e, int
> gem_engine_property_printf(fd, e->name, "heartbeat_interval_ms", "%d", 500);
>
> /* Allow engine based resets and disable banning */
> - igt_allow_hang(fd, ctxt_id, HANG_ALLOW_CAPTURE);
> + igt_allow_hang(fd, ctxt_id, HANG_ALLOW_CAPTURE | HANG_WANT_ENGINE_RESET);
> +}
> +
> +static bool fence_busy(int fence)
> +{
> + return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
> +}
> +
> +static void wait_to_die(int fence_out)
> +{
> + struct timeval before, after, delta;
> +
> + /* Wait for a reset to occur */
> + gettimeofday(&before, NULL);
> + while (fence_busy(fence_out)) {
> + gettimeofday(&after, NULL);
> + timersub(&after, &before, &delta);
> + igt_assert(delta.tv_sec < MAX_RESET_TIME);
> + sched_yield();
> + }
> + gettimeofday(&after, NULL);
> + timersub(&after, &before, &delta);
> + igt_info("Target died after %ld.%06lds\n", delta.tv_sec, delta.tv_usec);
> }
>
> static void __capture1(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> @@ -230,7 +255,7 @@ static void __capture1(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> struct drm_i915_gem_execbuffer2 execbuf;
> uint32_t *batch, *seqno;
> struct offset offset;
> - int i;
> + int i, fence_out;
>
> configure_hangs(fd, e, ctx->id);
>
> @@ -315,18 +340,25 @@ static void __capture1(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> execbuf.flags = e->flags;
> if (gen > 3 && gen < 6)
> execbuf.flags |= I915_EXEC_SECURE;
> + execbuf.flags |= I915_EXEC_FENCE_OUT;
> execbuf.rsvd1 = ctx->id;
> + execbuf.rsvd2 = ~0UL;
>
> igt_assert(!READ_ONCE(*seqno));
> - gem_execbuf(fd, &execbuf);
> + gem_execbuf_wr(fd, &execbuf);
> +
> + fence_out = execbuf.rsvd2 >> 32;
> + igt_assert(fence_out >= 0);
>
> /* Wait for the request to start */
> while (READ_ONCE(*seqno) != 0xc0ffee)
> igt_assert(gem_bo_busy(fd, obj[SCRATCH].handle));
> munmap(seqno, 4096);
>
> + /* Wait for a reset to occur */
> + wait_to_die(fence_out);
> +
> /* Check that only the buffer we marked is reported in the error */
> - igt_force_gpu_reset(fd);
> memset(&offset, 0, sizeof(offset));
> offset.addr = obj[CAPTURE].offset;
> igt_assert_eq(check_error_state(dir, &offset, 1, target_size, false), 1);
> @@ -373,7 +405,8 @@ static int cmp(const void *A, const void *B)
> static struct offset *
> __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> const struct intel_execution_engine2 *e,
> - unsigned int size, int count, unsigned int flags)
> + unsigned int size, int count,
> + unsigned int flags, int *_fence_out)
> #define INCREMENTAL 0x1
> #define ASYNC 0x2
> {
> @@ -383,7 +416,7 @@ __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> struct drm_i915_gem_execbuffer2 execbuf;
> uint32_t *batch, *seqno;
> struct offset *offsets;
> - int i;
> + int i, fence_out;
>
> configure_hangs(fd, e, ctx->id);
>
> @@ -491,10 +524,17 @@ __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> execbuf.flags = e->flags;
> if (gen > 3 && gen < 6)
> execbuf.flags |= I915_EXEC_SECURE;
> + execbuf.flags |= I915_EXEC_FENCE_OUT;
> execbuf.rsvd1 = ctx->id;
> + execbuf.rsvd2 = ~0UL;
>
> igt_assert(!READ_ONCE(*seqno));
> - gem_execbuf(fd, &execbuf);
> + gem_execbuf_wr(fd, &execbuf);
> +
> + fence_out = execbuf.rsvd2 >> 32;
> + igt_assert(fence_out >= 0);
> + if (_fence_out)
> + *_fence_out = fence_out;
>
> /* Wait for the request to start */
> while (READ_ONCE(*seqno) != 0xc0ffee)
> @@ -502,7 +542,7 @@ __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> munmap(seqno, 4096);
>
> if (!(flags & ASYNC)) {
> - igt_force_gpu_reset(fd);
> + wait_to_die(fence_out);
> gem_sync(fd, obj[count + 1].handle);
> }
>
> @@ -549,7 +589,7 @@ static void many(int fd, int dir, uint64_t size, unsigned int flags)
> intel_require_memory(count, size, CHECK_RAM);
> ahnd = get_reloc_ahnd(fd, ctx->id);
>
> - offsets = __captureN(fd, dir, ahnd, ctx, e, size, count, flags);
> + offsets = __captureN(fd, dir, ahnd, ctx, e, size, count, flags, NULL);
>
> blobs = check_error_state(dir, offsets, count, size, !!(flags & INCREMENTAL));
> igt_info("Captured %lu %"PRId64"-blobs out of a total of %lu\n",
> @@ -602,6 +642,7 @@ static void prioinv(int fd, int dir, const intel_ctx_t *ctx,
> igt_assert(pipe(link) == 0);
> igt_fork(child, 1) {
> const intel_ctx_t *ctx2;
> + int fence_out;
> fd = gem_reopen_driver(fd);
> igt_debug("Submitting large capture [%ld x %dMiB objects]\n",
> count, (int)(size >> 20));
> @@ -613,11 +654,11 @@ static void prioinv(int fd, int dir, const intel_ctx_t *ctx,
> /* Reopen the allocator in the new process. */
> ahnd = get_reloc_ahnd(fd, ctx2->id);
>
> - free(__captureN(fd, dir, ahnd, ctx2, e, size, count, ASYNC));
> + free(__captureN(fd, dir, ahnd, ctx2, e, size, count, ASYNC, &fence_out));
> put_ahnd(ahnd);
>
> write(link[1], &fd, sizeof(fd)); /* wake the parent up */
> - igt_force_gpu_reset(fd);
> + wait_to_die(fence_out);
> write(link[1], &fd, sizeof(fd)); /* wake the parent up */
> }
> read(link[0], &dummy, sizeof(dummy));
> @@ -714,7 +755,7 @@ igt_main
> gem_require_mmap_wc(fd);
> igt_require(has_capture(fd));
> ctx = intel_ctx_create_all_physical(fd);
> - igt_allow_hang(fd, ctx->id, HANG_ALLOW_CAPTURE);
> + igt_allow_hang(fd, ctx->id, HANG_ALLOW_CAPTURE | HANG_WANT_ENGINE_RESET);
>
> dir = igt_sysfs_open(fd);
> igt_require(igt_sysfs_set(dir, "error", "Begone!"));
> --
> 2.25.1
>
More information about the Intel-gfx
mailing list