[Intel-gfx] [PATCH 3/3] drm/i915/selftests: Fix up igt_reset_engine
Michel Thierry
michel.thierry at intel.com
Mon Dec 18 21:50:17 UTC 2017
On 17/12/17 05:28, Chris Wilson wrote:
> Now that we skip a per-engine reset on an idle engine, we need to update
> the selftest to take that into account. In the process, we find that we
> were not stressing the per-engine reset very hard, so add those missing
> active resets.
>
> v2: Actually test i915_reset_engine() by loading it with requests.
>
> Fixes: f6ba181ada55 ("drm/i915: Skip an engine reset if it recovered before our preparations")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Reviewed-by: Michel Thierry <michel.thierry at intel.com>
All these subtests also passed with and without GuC on SKL.
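
A note for anyone reading along: the relocated wait_for_hang() helper below
is a two-stage poll on the hangcheck status page, a ~10us busy-wait for the
common case where the spinner batch starts almost immediately, then a
sleeping wait of up to 1s as the fallback. Written out without the
wait_for_us()/wait_for() macros it is roughly the following; hws_seqno(),
i915_seqno_passed() and the timeouts are taken from the patch, everything
else is only an illustrative sketch, not what the driver compiles:

/*
 * Illustrative sketch only; the patch itself uses wait_for_us()/wait_for().
 * The 10us busy-poll and 1s sleeping fallback match the values in the patch.
 */
static bool wait_for_hang_sketch(struct hang *h,
				 struct drm_i915_gem_request *rq)
{
	const unsigned long end = jiffies + msecs_to_jiffies(1000);
	const ktime_t fast = ktime_add_us(ktime_get_raw(), 10);

	/* Fast path: the spinner usually reports its seqno within a few us. */
	do {
		if (i915_seqno_passed(hws_seqno(h, rq), rq->fence.seqno))
			return true;
		cpu_relax();
	} while (ktime_before(ktime_get_raw(), fast));

	/* Slow path: back off and sleep while the request gets scheduled. */
	do {
		if (i915_seqno_passed(hws_seqno(h, rq), rq->fence.seqno))
			return true;
		usleep_range(1000, 2000);
	} while (time_before(jiffies, end));

	return false;
}

Either way it returns true as soon as the batch has written its seqno to the
HWS, i.e. the request is confirmed running on the engine before the active
subtests fake the hang and call i915_reset_engine().
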
> ---
> drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 314 ++++++++++++++++++-----
> 1 file changed, 250 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index f98546b8a7fa..c8a756e2139f 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h,
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = upper_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> *batch++ = lower_32_bits(vma->node.start);
> *batch++ = upper_32_bits(vma->node.start);
> @@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h,
> *batch++ = 0;
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
> *batch++ = lower_32_bits(vma->node.start);
> } else if (INTEL_GEN(i915) >= 4) {
> @@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h,
> *batch++ = 0;
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
> *batch++ = lower_32_bits(vma->node.start);
> } else {
> *batch++ = MI_STORE_DWORD_IMM;
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
> *batch++ = lower_32_bits(vma->node.start);
> }
> @@ -234,6 +258,16 @@ static void hang_fini(struct hang *h)
> i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
> }
>
> +static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
> +{
> + return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
> + rq->fence.seqno),
> + 10) &&
> + wait_for(i915_seqno_passed(hws_seqno(h, rq),
> + rq->fence.seqno),
> + 1000));
> +}
> +
> static int igt_hang_sanitycheck(void *arg)
> {
> struct drm_i915_private *i915 = arg;
> @@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915)
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
>
> + pr_debug("%s: current gpu_error=%08lx\n",
> + __func__, i915->gpu_error.flags);
> +
> while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
> wait_event(i915->gpu_error.reset_queue,
> !test_bit(I915_RESET_BACKOFF,
> @@ -353,54 +390,127 @@ static int igt_global_reset(void *arg)
> return err;
> }
>
> -static int igt_reset_engine(void *arg)
> +static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
> {
> - struct drm_i915_private *i915 = arg;
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> - unsigned int reset_count, reset_engine_count;
> + struct hang h;
> int err = 0;
>
> - /* Check that we can issue a global GPU and engine reset */
> + /* Check that we can issue an engine reset on an idle engine (no-op) */
>
> if (!intel_has_reset_engine(i915))
> return 0;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + err = hang_init(&h, i915);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (err)
> + return err;
> + }
> +
> for_each_engine(engine, i915, id) {
> - set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
> + unsigned int reset_count, reset_engine_count;
> + IGT_TIMEOUT(end_time);
> +
> + if (active && !intel_engine_can_store_dword(engine))
> + continue;
> +
> reset_count = i915_reset_count(&i915->gpu_error);
> reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
> engine);
>
> - err = i915_reset_engine(engine, I915_RESET_QUIET);
> - if (err) {
> - pr_err("i915_reset_engine failed\n");
> - break;
> - }
> + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> + do {
> + if (active) {
> + struct drm_i915_gem_request *rq;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + rq = hang_create_request(&h, engine,
> + i915->kernel_context);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + break;
> + }
> +
> + i915_gem_request_get(rq);
> + __i915_add_request(rq, true);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + if (!wait_for_hang(&h, rq)) {
> + struct drm_printer p = drm_info_printer(i915->drm.dev);
> +
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> + intel_engine_dump(engine, &p,
> + "%s\n", engine->name);
> +
> + i915_gem_request_put(rq);
> + err = -EIO;
> + break;
> + }
>
> - if (i915_reset_count(&i915->gpu_error) != reset_count) {
> - pr_err("Full GPU reset recorded! (engine reset expected)\n");
> - err = -EINVAL;
> - break;
> - }
> + i915_gem_request_put(rq);
> + }
> +
> + engine->hangcheck.stalled = true;
> + engine->hangcheck.seqno =
> + intel_engine_get_seqno(engine);
> +
> + err = i915_reset_engine(engine, I915_RESET_QUIET);
> + if (err) {
> + pr_err("i915_reset_engine failed\n");
> + break;
> + }
> +
> + if (i915_reset_count(&i915->gpu_error) != reset_count) {
> + pr_err("Full GPU reset recorded! (engine reset expected)\n");
> + err = -EINVAL;
> + break;
> + }
> +
> + reset_engine_count += active;
> + if (i915_reset_engine_count(&i915->gpu_error, engine) !=
> + reset_engine_count) {
> + pr_err("%s engine reset %srecorded!\n",
> + engine->name, active ? "not " : "");
> + err = -EINVAL;
> + break;
> + }
> +
> + engine->hangcheck.stalled = false;
> + } while (time_before(jiffies, end_time));
> + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
>
> - if (i915_reset_engine_count(&i915->gpu_error, engine) ==
> - reset_engine_count) {
> - pr_err("No %s engine reset recorded!\n", engine->name);
> - err = -EINVAL;
> + if (err)
> break;
> - }
>
> - clear_bit(I915_RESET_ENGINE + engine->id,
> - &i915->gpu_error.flags);
> + cond_resched();
> }
>
> if (i915_terminally_wedged(&i915->gpu_error))
> err = -EIO;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + hang_fini(&h);
> + mutex_unlock(&i915->drm.struct_mutex);
> + }
> +
> return err;
> }
>
> +static int igt_reset_idle_engine(void *arg)
> +{
> + return __igt_reset_engine(arg, false);
> +}
> +
> +static int igt_reset_active_engine(void *arg)
> +{
> + return __igt_reset_engine(arg, true);
> +}
> +
> static int active_engine(void *data)
> {
> struct intel_engine_cs *engine = data;
> @@ -462,11 +572,12 @@ static int active_engine(void *data)
> return err;
> }
>
> -static int igt_reset_active_engines(void *arg)
> +static int __igt_reset_engine_others(struct drm_i915_private *i915,
> + bool active)
> {
> - struct drm_i915_private *i915 = arg;
> - struct intel_engine_cs *engine, *active;
> + struct intel_engine_cs *engine, *other;
> enum intel_engine_id id, tmp;
> + struct hang h;
> int err = 0;
>
> /* Check that issuing a reset on one engine does not interfere
> @@ -476,24 +587,36 @@ static int igt_reset_active_engines(void *arg)
> if (!intel_has_reset_engine(i915))
> return 0;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + err = hang_init(&h, i915);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (err)
> + return err;
> + }
> +
> for_each_engine(engine, i915, id) {
> - struct task_struct *threads[I915_NUM_ENGINES];
> + struct task_struct *threads[I915_NUM_ENGINES] = {};
> unsigned long resets[I915_NUM_ENGINES];
> unsigned long global = i915_reset_count(&i915->gpu_error);
> + unsigned long count = 0;
> IGT_TIMEOUT(end_time);
>
> + if (active && !intel_engine_can_store_dword(engine))
> + continue;
> +
> memset(threads, 0, sizeof(threads));
> - for_each_engine(active, i915, tmp) {
> + for_each_engine(other, i915, tmp) {
> struct task_struct *tsk;
>
> - if (active == engine)
> - continue;
> -
> resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
> - active);
> + other);
>
> - tsk = kthread_run(active_engine, active,
> - "igt/%s", active->name);
> + if (other == engine)
> + continue;
> +
> + tsk = kthread_run(active_engine, other,
> + "igt/%s", other->name);
> if (IS_ERR(tsk)) {
> err = PTR_ERR(tsk);
> goto unwind;
> @@ -503,20 +626,70 @@ static int igt_reset_active_engines(void *arg)
> get_task_struct(tsk);
> }
>
> - set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
> + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> do {
> + if (active) {
> + struct drm_i915_gem_request *rq;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + rq = hang_create_request(&h, engine,
> + i915->kernel_context);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + mutex_unlock(&i915->drm.struct_mutex);
> + break;
> + }
> +
> + i915_gem_request_get(rq);
> + __i915_add_request(rq, true);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + if (!wait_for_hang(&h, rq)) {
> + struct drm_printer p = drm_info_printer(i915->drm.dev);
> +
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> + intel_engine_dump(engine, &p,
> + "%s\n", engine->name);
> +
> + i915_gem_request_put(rq);
> + err = -EIO;
> + break;
> + }
> +
> + i915_gem_request_put(rq);
> + }
> +
> + engine->hangcheck.stalled = true;
> + engine->hangcheck.seqno =
> + intel_engine_get_seqno(engine);
> +
> err = i915_reset_engine(engine, I915_RESET_QUIET);
> if (err) {
> - pr_err("i915_reset_engine(%s) failed, err=%d\n",
> - engine->name, err);
> + pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
> + engine->name, active ? "active" : "idle", err);
> break;
> }
> +
> + engine->hangcheck.stalled = false;
> + count++;
> } while (time_before(jiffies, end_time));
> - clear_bit(I915_RESET_ENGINE + engine->id,
> - &i915->gpu_error.flags);
> + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> + pr_info("i915_reset_engine(%s:%s): %lu resets\n",
> + engine->name, active ? "active" : "idle", count);
> +
> + if (i915_reset_engine_count(&i915->gpu_error, engine) -
> + resets[engine->id] != (active ? count : 0)) {
> + pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
> + engine->name, active ? "active" : "idle", count,
> + i915_reset_engine_count(&i915->gpu_error,
> + engine) - resets[engine->id]);
> + if (!err)
> + err = -EINVAL;
> + }
>
> unwind:
> - for_each_engine(active, i915, tmp) {
> + for_each_engine(other, i915, tmp) {
> int ret;
>
> if (!threads[tmp])
> @@ -524,27 +697,29 @@ static int igt_reset_active_engines(void *arg)
>
> ret = kthread_stop(threads[tmp]);
> if (ret) {
> - pr_err("kthread for active engine %s failed, err=%d\n",
> - active->name, ret);
> + pr_err("kthread for other engine %s failed, err=%d\n",
> + other->name, ret);
> if (!err)
> err = ret;
> }
> put_task_struct(threads[tmp]);
>
> if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
> - active)) {
> + other)) {
> pr_err("Innocent engine %s was reset (count=%ld)\n",
> - active->name,
> + other->name,
> i915_reset_engine_count(&i915->gpu_error,
> - active) - resets[tmp]);
> - err = -EIO;
> + other) - resets[tmp]);
> + if (!err)
> + err = -EINVAL;
> }
> }
>
> if (global != i915_reset_count(&i915->gpu_error)) {
> pr_err("Global reset (count=%ld)!\n",
> i915_reset_count(&i915->gpu_error) - global);
> - err = -EIO;
> + if (!err)
> + err = -EINVAL;
> }
>
> if (err)
> @@ -556,9 +731,25 @@ static int igt_reset_active_engines(void *arg)
> if (i915_terminally_wedged(&i915->gpu_error))
> err = -EIO;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + hang_fini(&h);
> + mutex_unlock(&i915->drm.struct_mutex);
> + }
> +
> return err;
> }
>
> +static int igt_reset_idle_engine_others(void *arg)
> +{
> + return __igt_reset_engine_others(arg, false);
> +}
> +
> +static int igt_reset_active_engine_others(void *arg)
> +{
> + return __igt_reset_engine_others(arg, true);
> +}
> +
> static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
> {
> u32 reset_count;
> @@ -574,16 +765,6 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
> return reset_count;
> }
>
> -static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
> -{
> - return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
> - rq->fence.seqno),
> - 10) &&
> - wait_for(i915_seqno_passed(hws_seqno(h, rq),
> - rq->fence.seqno),
> - 1000));
> -}
> -
> static int igt_wait_reset(void *arg)
> {
> struct drm_i915_private *i915 = arg;
> @@ -617,8 +798,8 @@ static int igt_wait_reset(void *arg)
> if (!wait_for_hang(&h, rq)) {
> struct drm_printer p = drm_info_printer(i915->drm.dev);
>
> - pr_err("Failed to start request %x, at %x\n",
> - rq->fence.seqno, hws_seqno(&h, rq));
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
>
> i915_reset(i915, 0);
> @@ -712,8 +893,8 @@ static int igt_reset_queue(void *arg)
> if (!wait_for_hang(&h, prev)) {
> struct drm_printer p = drm_info_printer(i915->drm.dev);
>
> - pr_err("Failed to start request %x, at %x\n",
> - prev->fence.seqno, hws_seqno(&h, prev));
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, prev->fence.seqno, hws_seqno(&h, prev));
> intel_engine_dump(prev->engine, &p,
> "%s\n", prev->engine->name);
>
> @@ -819,8 +1000,8 @@ static int igt_handle_error(void *arg)
> if (!wait_for_hang(&h, rq)) {
> struct drm_printer p = drm_info_printer(i915->drm.dev);
>
> - pr_err("Failed to start request %x, at %x\n",
> - rq->fence.seqno, hws_seqno(&h, rq));
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
>
> i915_reset(i915, 0);
> @@ -864,21 +1045,26 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> static const struct i915_subtest tests[] = {
> SUBTEST(igt_global_reset), /* attempt to recover GPU first */
> SUBTEST(igt_hang_sanitycheck),
> - SUBTEST(igt_reset_engine),
> - SUBTEST(igt_reset_active_engines),
> + SUBTEST(igt_reset_idle_engine),
> + SUBTEST(igt_reset_active_engine),
> + SUBTEST(igt_reset_idle_engine_others),
> + SUBTEST(igt_reset_active_engine_others),
> SUBTEST(igt_wait_reset),
> SUBTEST(igt_reset_queue),
> SUBTEST(igt_handle_error),
> };
> + bool saved_hangcheck;
> int err;
>
> if (!intel_has_gpu_reset(i915))
> return 0;
>
> intel_runtime_pm_get(i915);
> + saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
>
> err = i915_subtests(tests, i915);
>
> + i915_modparams.enable_hangcheck = saved_hangcheck;
> intel_runtime_pm_put(i915);
>
> return err;
>