[igt-dev] [PATCH i-g-t] i915/hangman: Fix gt hang/error tests
Kamil Konieczny
kamil.konieczny at linux.intel.com
Thu Jul 28 14:59:53 UTC 2022
Hi Umesh,
On 2022-07-28 at 01:15:59 +0000, Umesh Nerlige Ramappa wrote:
> gt-engine-hang and gt-engine-error tests were still using reset=2
> setting so that ended up just doing an engine reset. Fix the tests so
> that they actually do the gt specific reset and set the expectations
> correctly.
>
> In some rare failures, some of the background spinners show up as
> innocent and keep spinning after the hang recovery. This can only happen
> if the spinners did not start for some reason. Check that the spinners
> actually started before submitting a hanging batch.
>
> For engine specific hang, only one context is marked guilty, but for a
> gt hang all contexts are marked guilty. Check for different expected
> behavior for engine vs. gt reset.
>
> v2:
> - gt-reset resets all contexts on all engines. The execlist implementation
> of gt-engine-* tests expected that a preemptible background spinner
> running on the target engine should be marked innocent. While i915 can
> mark such a context for execlist mode, GuC scheduling does not guarantee
> that the background spinner can be marked as innocent. Since the state
> of the background spinner depends on the scheduling backend, do no
> validate the state of the background spinner for the target engine.
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> ---
> tests/i915/i915_hangman.c | 84 ++++++++++++++++++++++++---------------
> 1 file changed, 53 insertions(+), 31 deletions(-)
>
> diff --git a/tests/i915/i915_hangman.c b/tests/i915/i915_hangman.c
> index c7d69fdd..d7b173ab 100644
> --- a/tests/i915/i915_hangman.c
> +++ b/tests/i915/i915_hangman.c
> @@ -295,9 +295,15 @@ static void context_unban(int fd, unsigned ctx)
> gem_context_set_param(fd, ¶m);
> }
>
> +enum reset_type {
> + GT_RESET = 1,
> + ENGINE_RESET = 2,
> +};
> +
> static void
> test_engine_hang(const intel_ctx_t *ctx,
> - const struct intel_execution_engine2 *e, unsigned int flags)
> + const struct intel_execution_engine2 *e, unsigned int flags,
> + enum reset_type reset)
> {
> const struct intel_execution_engine2 *other;
> const intel_ctx_t *local_ctx[GEM_MAX_ENGINES];
> @@ -309,6 +315,7 @@ test_engine_hang(const intel_ctx_t *ctx,
> igt_skip_on(flags & IGT_SPIN_INVALID_CS &&
> gem_engine_has_cmdparser(device, &ctx->cfg, e->flags));
>
> + igt_assert(reset == GT_RESET || reset == ENGINE_RESET);
> /*
> * Fill all engines with background load.
> * This verifies that independent engines are unaffected and gives
> @@ -324,7 +331,10 @@ test_engine_hang(const intel_ctx_t *ctx,
> .ahnd = ahndN,
> .ctx = local_ctx[num_ctx],
> .engine = other->flags,
> - .flags = IGT_SPIN_FENCE_OUT);
> + .flags = IGT_SPIN_FENCE_OUT |
> + IGT_SPIN_POLL_RUN);
> + igt_spin_busywait_until_started(spin);
> +
> num_ctx++;
>
> igt_list_move(&spin->link, &list);
> @@ -344,14 +354,43 @@ test_engine_hang(const intel_ctx_t *ctx,
> igt_assert_eq(sync_fence_status(spin->out_fence), -EIO);
> igt_spin_free(device, spin);
>
> - /* But no other engines/clients should be affected */
> - igt_list_for_each_entry_safe(spin, next, &list, link) {
> + /*
> + * engine-engine-hang: Other engines/clients should not be affected for
> + * engine reset, so innocent contexts complete successfully once the
> + * spinner is ended.
> + *
> + * gt-engine-hang: All engines/clients are guilty and complete with a
> + * -EIO fence status, however the background task that was submitted to
> + * the target engine is innocent and is expected to complete
> + * successfully.
> + */
imho we should keep test for execlist behaviour. It is not clear
why background task survives ? You stated "All ... are guilty" ?
Isn't this a race between completing and resetting GT ?
> + igt_list_for_each_entry_safe_reverse(spin, next, &list, link) {
> + bool innocent = reset == ENGINE_RESET;
> + int expect = innocent ? 1 : -EIO;
> +
> ahndN = spin->opts.ahnd;
> - igt_assert(sync_fence_wait(spin->out_fence, 0) == -ETIME);
> - igt_spin_end(spin);
> + if (innocent) {
> + igt_assert_eq(sync_fence_wait(spin->out_fence, 0), -ETIME);
> + igt_spin_end(spin);
> + }
>
> - igt_assert(sync_fence_wait(spin->out_fence, 500) == 0);
> - igt_assert_eq(sync_fence_status(spin->out_fence), 1);
> + if (spin->opts.engine == e->flags) {
> + /*
> + * gt-reset resets all contexts on all engines. The execlist
> + * implementation of gt-engine-* tests expected that a
> + * preemptible background spinner running on the target engine
> + * should be marked innocent. While i915 can mark such a
> + * context for execlist mode, GuC scheduling does not guarantee
> + * that the background spinner can be marked as innocent. Since
> + * the state of the background spinner depends on the scheduling
> + * backend, do no validate the state of the background spinner
> + * for the target engine.
> + */
> + igt_spin_end(spin);
> + } else {
> + igt_assert_eq(sync_fence_wait(spin->out_fence, 500), 0);
> + igt_assert_eq(sync_fence_status(spin->out_fence), expect);
> + }
> igt_spin_free(device, spin);
> put_ahnd(ahndN);
> }
> @@ -439,6 +478,10 @@ static void do_tests(const char *name, const char *prefix,
> {
> const struct intel_execution_engine2 *e;
> char buff[256];
> + enum reset_type reset = ENGINE_RESET;
> +
> + if (!strncmp(prefix, "gt", 2))
> + reset = GT_RESET;
>
> snprintf(buff, sizeof(buff), "Per engine error capture (%s reset)", name);
> igt_describe(buff);
> @@ -454,20 +497,9 @@ static void do_tests(const char *name, const char *prefix,
> igt_describe(buff);
> snprintf(buff, sizeof(buff), "%s-engine-hang", prefix);
> igt_subtest_with_dynamic(buff) {
> - int has_gpu_reset = 0;
> - struct drm_i915_getparam gp = {
> - .param = I915_PARAM_HAS_GPU_RESET,
> - .value = &has_gpu_reset,
> - };
> -
> - igt_require(gem_scheduler_has_preemption(device));
> - igt_params_set(device, "reset", "%u", -1);
> - ioctl(device, DRM_IOCTL_I915_GETPARAM, &gp);
> - igt_require(has_gpu_reset > 1);
> -
> for_each_ctx_engine(device, ctx, e) {
> igt_dynamic_f("%s", e->name)
> - test_engine_hang(ctx, e, 0);
> + test_engine_hang(ctx, e, 0, reset);
> }
> }
>
> @@ -475,19 +507,9 @@ static void do_tests(const char *name, const char *prefix,
> igt_describe(buff);
> snprintf(buff, sizeof(buff), "%s-engine-error", prefix);
> igt_subtest_with_dynamic(buff) {
> - int has_gpu_reset = 0;
> - struct drm_i915_getparam gp = {
> - .param = I915_PARAM_HAS_GPU_RESET,
> - .value = &has_gpu_reset,
> - };
> -
> - igt_params_set(device, "reset", "%u", -1);
> - ioctl(device, DRM_IOCTL_I915_GETPARAM, &gp);
> - igt_require(has_gpu_reset > 1);
> -
imho we need this code in test_engine_hang for execlist
subbmission for gt-engine-hang case.
Regards,
Kamil
> for_each_ctx_engine(device, ctx, e) {
> igt_dynamic_f("%s", e->name)
> - test_engine_hang(ctx, e, IGT_SPIN_INVALID_CS);
> + test_engine_hang(ctx, e, IGT_SPIN_INVALID_CS, reset);
> }
> }
> }
> --
> 2.25.1
>
More information about the igt-dev
mailing list