[Intel-gfx] [PATCH 2/3] drm/i915/execlists: Reclaim the hanging virtual request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Jan 22 13:38:20 UTC 2020
On 22/01/2020 11:29, Chris Wilson wrote:
> If we encounter a hang on a virtual engine, the request may already have
> been moved back to the virtual engine by the time we process the hang on
> the physical engine. We need to reclaim the request from the virtual
> engine so that the locking is consistent and local to the real engine on
> which we will hold the request for error state capture.
>
> v2: Pull the reclamation into execlists_hold() and assert that it cannot
> be called from outside of the reset (i.e. with the tasklet disabled).
> v3: Added selftest
> v4: Drop the reference owned by the virtual engine
>
> Fixes: 748317386afb ("drm/i915/execlists: Offline error capture")
> Testcase: igt/gem_exec_balancer/hang
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_lrc.c | 29 +++++
> drivers/gpu/drm/i915/gt/selftest_lrc.c | 157 ++++++++++++++++++++++++-
> 2 files changed, 185 insertions(+), 1 deletion(-)
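>
> Not part of the patch, just a sketch for reviewers of the context in which
> the reclaim below runs. The caller name is made up; everything else mirrors
> identifiers from the hunks that follow:
>
> 	/* Hypothetical capture-path caller, running with the reset in progress */
> 	static void hold_hanging_request(struct intel_engine_cs *engine,
> 					 struct i915_request *rq)
> 	{
> 		/* Reset owns the engine: the submission tasklet is disabled */
> 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
>
> 		/*
> 		 * execlists_hold() takes engine->active.lock and, for a request
> 		 * that was already unwound to its virtual engine, nests
> 		 * ve->base.active.lock inside it while reclaiming the request
> 		 * for the physical engine.
> 		 */
> 		if (!execlists_hold(engine, rq))
> 			return; /* my reading: it completed under us; the hunk below is authoritative */
>
> 		/* rq->engine == engine from here on; safe to park for capture */
> 	}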
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 59af136e1b1d..5bacff7724e9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2403,6 +2403,35 @@ static bool execlists_hold(struct intel_engine_cs *engine,
> goto unlock;
> }
>
> + if (rq->engine != engine) { /* preempted virtual engine */
> + struct virtual_engine *ve = to_virtual_engine(rq->engine);
> +
> + /*
> + * intel_context_inflight() is only protected by virtue
> + * of process_csb() being called only by the tasklet (or
> + * directly from inside reset while the tasklet is suspended).
> + * Assert that neither of those are allowed to run while we
> + * poke at the request queues.
> + */
> + GEM_BUG_ON(!reset_in_progress(&engine->execlists));
> +
> + /*
> + * An unsubmitted request along a virtual engine will
> + * remain on the active (this) engine until we are able
> + * to process the context switch away (and so mark the
> + * context as no longer in flight). That cannot have happened
> + * yet, otherwise we would not be hanging!
> + */
> + spin_lock(&ve->base.active.lock);
> + GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
> + GEM_BUG_ON(ve->request != rq);
> + ve->request = NULL;
> + spin_unlock(&ve->base.active.lock);
> + i915_request_put(rq);
> +
> + rq->engine = engine;
> + }
> +
> /*
> * Transfer this request onto the hold queue to prevent it
> * being resubmitted to HW (and potentially completed) before we have
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index b208c2176bbd..f830bd81d913 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -335,7 +335,6 @@ static int live_hold_reset(void *arg)
>
> if (test_and_set_bit(I915_RESET_ENGINE + id,
> 			     &gt->reset.flags)) {
> - spin_unlock_irq(&engine->active.lock);
> intel_gt_set_wedged(gt);
> err = -EBUSY;
> goto out;
> @@ -3411,6 +3410,161 @@ static int live_virtual_bond(void *arg)
> return 0;
> }
>
> +static int reset_virtual_engine(struct intel_gt *gt,
> + struct intel_engine_cs **siblings,
> + unsigned int nsibling)
> +{
> + struct intel_engine_cs *engine;
> + struct intel_context *ve;
> + unsigned long *heartbeat;
> + struct igt_spinner spin;
> + struct i915_request *rq;
> + unsigned int n;
> + int err = 0;
> +
> + /*
> + * In order to support offline error capture for fast preempt reset,
> + * we need to decouple the guilty request and ensure that it and its
> + * descendants are not executed while the capture is in progress.
> + */
> +
> + heartbeat = kmalloc_array(nsibling, sizeof(*heartbeat), GFP_KERNEL);
> + if (!heartbeat)
> + return -ENOMEM;
> +
> + if (igt_spinner_init(&spin, gt)) {
> + err = -ENOMEM;
> + goto out_free;
> + }
> +
> + ve = intel_execlists_create_virtual(siblings, nsibling);
> + if (IS_ERR(ve)) {
> + err = PTR_ERR(ve);
> + goto out_spin;
> + }
> +
> + for (n = 0; n < nsibling; n++)
> + engine_heartbeat_disable(siblings[n], &heartbeat[n]);
> +
> + rq = igt_spinner_create_request(&spin, ve, MI_ARB_CHECK);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + goto out_heartbeat;
> + }
> + i915_request_add(rq);
> +
> + if (!igt_wait_for_spinner(&spin, rq)) {
> + intel_gt_set_wedged(gt);
> + err = -ETIME;
> + goto out_heartbeat;
> + }
> +
> + engine = rq->engine;
> + GEM_BUG_ON(engine == ve->engine);
> +
> + /* Take ownership of the reset and tasklet */
> + if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> +			     &gt->reset.flags)) {
> + intel_gt_set_wedged(gt);
> + err = -EBUSY;
> + goto out_heartbeat;
> + }
> + tasklet_disable(&engine->execlists.tasklet);
> +
> + engine->execlists.tasklet.func(engine->execlists.tasklet.data);
> + GEM_BUG_ON(execlists_active(&engine->execlists) != rq);
> +
> + /* Fake a preemption event; failed of course */
> + spin_lock_irq(&engine->active.lock);
> + __unwind_incomplete_requests(engine);
> + spin_unlock_irq(&engine->active.lock);
> + GEM_BUG_ON(rq->engine != ve->engine);
> +
> + /* Reset the engine while keeping our active request on hold */
> + execlists_hold(engine, rq);
> + GEM_BUG_ON(!i915_request_on_hold(rq));
> +
> + intel_engine_reset(engine, NULL);
> + GEM_BUG_ON(rq->fence.error != -EIO);
> +
> + /* Release our grasp on the engine, letting CS flow again */
> + tasklet_enable(&engine->execlists.tasklet);
> +	clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags);
> +
> + /* Check that we do not resubmit the held request */
> + i915_request_get(rq);
> + if (!i915_request_wait(rq, 0, HZ / 5)) {
> + pr_err("%s: on hold request completed!\n",
> + engine->name);
> + intel_gt_set_wedged(gt);
> + err = -EIO;
> + goto out_rq;
> + }
> + GEM_BUG_ON(!i915_request_on_hold(rq));
> +
> + /* But is resubmitted on release */
> + execlists_unhold(engine, rq);
> + if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> + pr_err("%s: held request did not complete!\n",
> + engine->name);
> + intel_gt_set_wedged(gt);
> + err = -ETIME;
> + }
> +
> +out_rq:
> + i915_request_put(rq);
> +out_heartbeat:
> + for (n = 0; n < nsibling; n++)
> + engine_heartbeat_enable(siblings[n], heartbeat[n]);
> +
> + intel_context_put(ve);
> +out_spin:
> + igt_spinner_fini(&spin);
> +out_free:
> + kfree(heartbeat);
> + return err;
> +}
> +
> +static int live_virtual_reset(void *arg)
> +{
> + struct intel_gt *gt = arg;
> + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
> + unsigned int class, inst;
> +
> + /*
> + * Check that we handle a reset event within a virtual engine.
> + * Only the physical engine is reset, but we have to check the flow
> + * of the virtual requests around the reset, and make sure the request
> + * is not forgotten.
> + */
> +
> + if (USES_GUC_SUBMISSION(gt->i915))
> + return 0;
> +
> + if (!intel_has_reset_engine(gt))
> + return 0;
> +
> + for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
> + int nsibling, err;
> +
> + nsibling = 0;
> + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
> + if (!gt->engine_class[class][inst])
> + continue;
> +
> + siblings[nsibling++] = gt->engine_class[class][inst];
> + }
> + if (nsibling < 2)
> + continue;
> +
> + err = reset_virtual_engine(gt, siblings, nsibling);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +
> int intel_execlists_live_selftests(struct drm_i915_private *i915)
> {
> static const struct i915_subtest tests[] = {
> @@ -3436,6 +3590,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
> SUBTEST(live_virtual_mask),
> SUBTEST(live_virtual_preserved),
> SUBTEST(live_virtual_bond),
> + SUBTEST(live_virtual_reset),
> };
>
> if (!HAS_EXECLISTS(i915))
>
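One note before the r-b: for anyone wanting to exercise this, if I remember
the knobs correctly the new selftest is picked up by the live selftests at
module load (needs CONFIG_DRM_I915_SELFTEST):

  # parameter name and value from memory, double-check i915_selftest.c
  modprobe i915 live_selftests=1

and the IGT side referenced by the Testcase: tag, assuming your IGT build
already has that subtest:

  gem_exec_balancer --run-subtest hang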
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko