[Intel-gfx] [PATCH 19/24] drm/i915/selftests: Be a little more lenient for reset workers
Mika Kuoppala
mika.kuoppala at linux.intel.com
Fri Feb 28 15:38:42 UTC 2020
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Give the reset worker a kick before losing hope when waiting for hang
> recovery, as the CPU scheduler is a little unreliable.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
> drivers/gpu/drm/i915/gt/selftest_lrc.c | 74 ++++++++++++++++++--------
> 1 file changed, 52 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 95da6b880e3f..af5b3da6d894 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -90,6 +90,48 @@ static int wait_for_submit(struct intel_engine_cs *engine,
> return -ETIME;
> }
>
> +static int wait_for_reset(struct intel_engine_cs *engine,
> + struct i915_request *rq,
> + unsigned long timeout)
> +{
> + timeout += jiffies;
> + do {
> + cond_resched();
> + intel_engine_flush_submission(engine);
> +
> + if (READ_ONCE(engine->execlists.pending[0]))
> + continue;
> +
> + if (i915_request_completed(rq))
> + break;
> +
> + if (READ_ONCE(rq->fence.error))
> + break;
> + } while (time_before(jiffies, timeout));
> +
> + flush_scheduled_work();
> +
> + if (rq->fence.error != -EIO) {
> + pr_err("%s: hanging request %llx:%lld not reset\n",
> + engine->name,
> + rq->fence.context,
> + rq->fence.seqno);
> + return -EINVAL;
> + }
> +
> + /* Give the request a jiffie to complete after flushing the worker */
> + if (i915_request_wait(rq, 0,
> + max(0l, (long)(timeout - jiffies)) + 1) < 0) {
> + pr_err("%s: hanging request %llx:%lld did not complete\n",
> + engine->name,
> + rq->fence.context,
> + rq->fence.seqno);
> + return -ETIME;
> + }
> +
> + return 0;
> +}
> +
> static int live_sanitycheck(void *arg)
> {
> struct intel_gt *gt = arg;
> @@ -1805,14 +1847,9 @@ static int __cancel_active0(struct live_preempt_cancel *arg)
> if (err)
> goto out;
>
> - if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> - err = -EIO;
> - goto out;
> - }
> -
> - if (rq->fence.error != -EIO) {
> - pr_err("Cancelled inflight0 request did not report -EIO\n");
> - err = -EINVAL;
> + err = wait_for_reset(arg->engine, rq, HZ / 2);
> + if (err) {
> + pr_err("Cancelled inflight0 request did not reset\n");
> goto out;
> }
>
> @@ -1870,10 +1907,9 @@ static int __cancel_active1(struct live_preempt_cancel *arg)
> goto out;
>
> igt_spinner_end(&arg->a.spin);
> - if (i915_request_wait(rq[1], 0, HZ / 5) < 0) {
> - err = -EIO;
> + err = wait_for_reset(arg->engine, rq[1], HZ / 2);
> + if (err)
> goto out;
> - }
>
> if (rq[0]->fence.error != 0) {
> pr_err("Normal inflight0 request did not complete\n");
> @@ -1953,10 +1989,9 @@ static int __cancel_queued(struct live_preempt_cancel *arg)
> if (err)
> goto out;
>
> - if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
> - err = -EIO;
> + err = wait_for_reset(arg->engine, rq[2], HZ / 2);
> + if (err)
> goto out;
> - }
>
> if (rq[0]->fence.error != -EIO) {
> pr_err("Cancelled inflight0 request did not report -EIO\n");
> @@ -2014,14 +2049,9 @@ static int __cancel_hostile(struct live_preempt_cancel *arg)
> if (err)
> goto out;
>
> - if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> - err = -EIO;
> - goto out;
> - }
> -
> - if (rq->fence.error != -EIO) {
> - pr_err("Cancelled inflight0 request did not report -EIO\n");
> - err = -EINVAL;
> + err = wait_for_reset(arg->engine, rq, HZ / 2);
> + if (err) {
> + pr_err("Cancelled inflight0 request did not reset\n");
> goto out;
> }
>
> --
> 2.25.1
More information about the Intel-gfx mailing list