[Intel-gfx] [PATCH] drm/i915/selftests: Force a failed engine reset
Mika Kuoppala
mika.kuoppala at linux.intel.com
Tue Jan 12 17:07:13 UTC 2021
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Inject a fault into the engine reset and check that the outstanding
> requests are completed despite the failed reset.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 133 +++++++++++++++++++
> 1 file changed, 133 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index ffc6eabb6404..875633cc0a75 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -540,6 +540,138 @@ static int igt_reset_nop_engine(void *arg)
> return 0;
> }
>
> +static void force_reset_timeout(struct intel_engine_cs *engine)
> +{
> + engine->reset_timeout.probability = 999;
> + atomic_set(&engine->reset_timeout.times, -1);
> +}
> +
> +static void cancel_reset_timeout(struct intel_engine_cs *engine)
> +{
> + memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
> +}
> +
> +static int igt_reset_fail_engine(void *arg)
> +{
> + struct intel_gt *gt = arg;
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> +
> + /* Check that we can engine-reset during non-user portions */
> +
> + if (!intel_has_reset_engine(gt))
> + return 0;
> +
> + for_each_engine(engine, gt, id) {
> + unsigned int count;
> + struct intel_context *ce;
> + IGT_TIMEOUT(end_time);
> + int err;
> +
> + ce = intel_context_create(engine);
> + if (IS_ERR(ce))
> + return PTR_ERR(ce);
> +
> + st_engine_heartbeat_disable(engine);
> + set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> + count = 0;
> + do {
> + struct i915_request *last = NULL;
> + int i;
> +
> + if (!wait_for_idle(engine)) {
> + pr_err("%s failed to idle before reset\n",
> + engine->name);
> + err = -EIO;
> + break;
> + }
> +
> + for (i = 0; i < 16; i++) {
> + struct i915_request *rq;
> +
> + rq = intel_context_create_request(ce);
> + if (IS_ERR(rq)) {
> + struct drm_printer p =
> + drm_info_printer(gt->i915->drm.dev);
> + intel_engine_dump(engine, &p,
> + "%s(%s): failed to submit request\n",
> + __func__,
> + engine->name);
> +
> + GEM_TRACE("%s(%s): failed to submit request\n",
> + __func__,
> + engine->name);
> + GEM_TRACE_DUMP();
> +
> + intel_gt_set_wedged(gt);
> + if (last)
> + i915_request_put(last);
> +
> + err = PTR_ERR(rq);
> + goto out;
> + }
> +
> + if (last)
> + i915_request_put(last);
> + last = i915_request_get(rq);
> + i915_request_add(rq);
> + }
> +
> + if (count & 1) {
> + err = intel_engine_reset(engine, NULL);
> + if (err) {
> + GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
> + engine->name, err);
> + GEM_TRACE_DUMP();
> + break;
> + }
> + } else {
> + force_reset_timeout(engine);
> + err = intel_engine_reset(engine, NULL);
Don't we promote to a global reset here if the engine reset fails?
If not, what mechanism then guarantees that the requests complete?
-Mika
> + cancel_reset_timeout(engine);
> + if (err != -ETIMEDOUT) {
> + pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
> + engine->name, err);
> + break;
> + }
> + }
> +
> + err = 0;
> + if (i915_request_wait(last, 0, HZ /2) < 0) {
> + struct drm_printer p =
> + drm_info_printer(gt->i915->drm.dev);
> +
> + intel_engine_dump(engine, &p,
> + "%s(%s): failed to complete request\n",
> + __func__,
> + engine->name);
> +
> + GEM_TRACE("%s(%s): failed to complete request\n",
> + __func__,
> + engine->name);
> + GEM_TRACE_DUMP();
> +
> + err = -EIO;
> + }
> + i915_request_put(last);
> + count++;
> + } while (err == 0 && time_before(jiffies, end_time));
> +out:
> + clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> + st_engine_heartbeat_enable(engine);
> +
> + pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
> +
> + intel_context_put(ce);
> + if (igt_flush_test(gt->i915))
> + err = -EIO;
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +
> static int __igt_reset_engine(struct intel_gt *gt, bool active)
> {
> struct i915_gpu_error *global = &gt->i915->gpu_error;
> @@ -1694,6 +1826,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> SUBTEST(igt_reset_nop_engine),
> SUBTEST(igt_reset_idle_engine),
> SUBTEST(igt_reset_active_engine),
> + SUBTEST(igt_reset_fail_engine),
> SUBTEST(igt_reset_engines),
> SUBTEST(igt_reset_engines_atomic),
> SUBTEST(igt_reset_queue),
> --
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the Intel-gfx
mailing list