[Intel-gfx] [PATCH] drm/i915/selftests: Exercise far preemption rollbacks

Tue Jun 16 14:23:23 UTC 2020

Chris Wilson <chris at chris-wilson.co.uk> writes:

> Not too long ago, we realised we had issues with rolling back a
> context so far for a preemption request that we considered the resubmit
> not to be a rollback but a forward roll. This means we would issue a lite
> restore instead of forcing a full restore, continuing execution of the
> old requests rather than causing a preemption. Add a selftest to
> exercise such a far rollback, such that if we were to skip the full
> restore, we would execute invalid instructions in the ring and hang.
>
> Note that while I was able to confirm that this causes us to do a
> lite-restore preemption rollback (with commit e36ba817fa96 ("drm/i915/gt:
> Incrementally check for rewinding") disabled), it did not trick the HW
> into rolling past the old RING_TAIL. Myybe on other HW.

s/Myybe/Maybe.

>
> References: e36ba817fa96 ("drm/i915/gt: Incrementally check for rewinding")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/gt/selftest_lrc.c | 150 +++++++++++++++++++++++++
>  1 file changed, 150 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 91543494f595..3d088116a055 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -363,6 +363,155 @@ static int live_unlite_preempt(void *arg)
>  	return live_unlite_restore(arg, I915_USER_PRIORITY(I915_PRIORITY_MAX));
>  }
>  
> +static int live_unlite_ring(void *arg)
> +{
> +	struct intel_gt *gt = arg;
> +	struct intel_engine_cs *engine;
> +	struct igt_spinner spin;
> +	enum intel_engine_id id;
> +	int err = 0;
> +
> +	/*
> +	 * Setup a preemption event that will cause almost the entire ring
> +	 * to be unwound, potentially fooling our intel_ring_direction()
> +	 * into emitting a forward lite-restore instead of the rollback.
> +	 */
> +
> +	if (igt_spinner_init(&spin, gt))
> +		return -ENOMEM;
> +
> +	for_each_engine(engine, gt, id) {
> +		struct intel_context *ce[2] = {};
> +		struct i915_request *rq;
> +		struct igt_live_test t;
> +		int n;
> +
> +		if (!intel_engine_has_preemption(engine))
> +			continue;
> +
> +		if (!intel_engine_can_store_dword(engine))
> +			continue;
> +
> +		if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) {
> +			err = -EIO;
> +			break;
> +		}
> +		engine_heartbeat_disable(engine);
> +
> +		for (n = 0; n < ARRAY_SIZE(ce); n++) {
> +			struct intel_context *tmp;
> +
> +			tmp = intel_context_create(engine);
> +			if (IS_ERR(tmp)) {
> +				err = PTR_ERR(tmp);
> +				goto err_ce;
> +			}
> +
> +			err = intel_context_pin(tmp);
> +			if (err) {
> +				intel_context_put(tmp);
> +				goto err_ce;
> +			}
> +
> +			memset32(tmp->ring->vaddr,
> +				 0xdeadbeef, /* trigger a hang if executed */
> +				 tmp->ring->vma->size / sizeof(u32));
> +
> +			ce[n] = tmp;
> +		}
> +
> +		rq = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			goto err_ce;
> +		}
> +
> +		i915_request_get(rq);
> +		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

I missed this on first reading and got thoroughly lost in the woods.

Chris kindly explained the scheduling steps, in short:

This gives the first request of each context equal priority, which
causes the ce[0] ring tail to be moved back and forth once the
spinner is released.

Reviewed-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>

> +		i915_request_add(rq);
> +
> +		if (!igt_wait_for_spinner(&spin, rq)) {
> +			intel_gt_set_wedged(gt);
> +			i915_request_put(rq);
> +			err = -ETIME;
> +			goto err_ce;
> +		}
> +
> +		/* Fill the ring, until we will cause a wrap */
> +		n = 0;
> +		while (intel_ring_direction(ce[0]->ring,
> +					    rq->wa_tail,
> +					    ce[0]->ring->tail) <= 0) {
> +			struct i915_request *tmp;
> +
> +			tmp = intel_context_create_request(ce[0]);
> +			if (IS_ERR(tmp)) {
> +				err = PTR_ERR(tmp);
> +				i915_request_put(rq);
> +				goto err_ce;
> +			}
> +
> +			i915_request_add(tmp);
> +			intel_engine_flush_submission(engine);
> +			n++;
> +		}
> +		intel_engine_flush_submission(engine);
> +		pr_debug("%s: Filled ring with %d nop tails {size:%x, tail:%x, emit:%x, rq.tail:%x}\n",
> +			 engine->name, n,
> +			 ce[0]->ring->size,
> +			 ce[0]->ring->tail,
> +			 ce[0]->ring->emit,
> +			 rq->tail);
> +		GEM_BUG_ON(intel_ring_direction(ce[0]->ring,
> +						rq->tail,
> +						ce[0]->ring->tail) <= 0);
> +		i915_request_put(rq);
> +
> +		/* Create a second request to preempt the first ring */
> +		rq = intel_context_create_request(ce[1]);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			goto err_ce;
> +		}
> +
> +		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
> +		i915_request_get(rq);
> +		i915_request_add(rq);
> +
> +		err = wait_for_submit(engine, rq, HZ / 2);
> +		i915_request_put(rq);
> +		if (err) {
> +			pr_err("%s: preemption request was not submited\n",
> +			       engine->name);
> +			err = -ETIME;
> +		}
> +
> +		pr_debug("%s: ring[0]:{ tail:%x, emit:%x }, ring[1]:{ tail:%x, emit:%x }\n",
> +			 engine->name,
> +			 ce[0]->ring->tail, ce[0]->ring->emit,
> +			 ce[1]->ring->tail, ce[1]->ring->emit);
> +
> +err_ce:
> +		intel_engine_flush_submission(engine);
> +		igt_spinner_end(&spin);
> +		for (n = 0; n < ARRAY_SIZE(ce); n++) {
> +			if (IS_ERR_OR_NULL(ce[n]))
> +				break;
> +
> +			intel_context_unpin(ce[n]);
> +			intel_context_put(ce[n]);
> +		}
> +		engine_heartbeat_enable(engine);
> +		if (igt_live_test_end(&t))
> +			err = -EIO;
> +		if (err)
> +			break;
> +	}
> +
> +	igt_spinner_fini(&spin);
> +	return err;
> +}
> +
>  static int live_pin_rewind(void *arg)
>  {
>  	struct intel_gt *gt = arg;
> @@ -4374,6 +4523,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>  		SUBTEST(live_sanitycheck),
>  		SUBTEST(live_unlite_switch),
>  		SUBTEST(live_unlite_preempt),
> +		SUBTEST(live_unlite_ring),
>  		SUBTEST(live_pin_rewind),
>  		SUBTEST(live_hold_reset),
>  		SUBTEST(live_error_interrupt),
> -- 
> 2.20.1