[Intel-gfx] [PATCH 3/3] drm/i915/selftests: Fix up igt_reset_engine
Michel Thierry
michel.thierry at intel.com
Mon Dec 18 21:50:17 UTC 2017
On 17/12/17 05:28, Chris Wilson wrote:
> Now that we skip a per-engine reset on an idle engine, we need to update
> the selftest to take that into account. In the process, we find that we
> were not stressing the per-engine reset very hard, so add those missing
> active resets.
>
> v2: Actually test i915_reset_engine() by loading it with requests.
>
> Fixes: f6ba181ada55 ("drm/i915: Skip an engine reset if it recovered before our preparations")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Reviewed-by: Michel Thierry <michel.thierry at intel.com>
All these subtests also passed with and without GuC on SKL.
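
A note for anyone reading along: the relocated wait_for_hang() helper below
is a two-stage poll on the hangcheck status page, a ~10us busy-wait for the
common case where the spinner batch starts almost immediately, then a
sleeping wait of up to 1s as the fallback. Written out without the
wait_for_us()/wait_for() macros it is roughly the following; hws_seqno(),
i915_seqno_passed() and the timeouts are taken from the patch, everything
else is only an illustrative sketch, not what the driver compiles:

/*
 * Illustrative sketch only; the patch itself uses wait_for_us()/wait_for().
 * The 10us busy-poll and 1s sleeping fallback match the values in the patch.
 */
static bool wait_for_hang_sketch(struct hang *h,
				 struct drm_i915_gem_request *rq)
{
	const unsigned long end = jiffies + msecs_to_jiffies(1000);
	const ktime_t fast = ktime_add_us(ktime_get_raw(), 10);

	/* Fast path: the spinner usually reports its seqno within a few us. */
	do {
		if (i915_seqno_passed(hws_seqno(h, rq), rq->fence.seqno))
			return true;
		cpu_relax();
	} while (ktime_before(ktime_get_raw(), fast));

	/* Slow path: back off and sleep while the request gets scheduled. */
	do {
		if (i915_seqno_passed(hws_seqno(h, rq), rq->fence.seqno))
			return true;
		usleep_range(1000, 2000);
	} while (time_before(jiffies, end));

	return false;
}

Either way it returns true as soon as the batch has written its seqno to the
HWS, i.e. the request is confirmed running on the engine before the active
subtests fake the hang and call i915_reset_engine().
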
> ---
> drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 314 ++++++++++++++++++-----
> 1 file changed, 250 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index f98546b8a7fa..c8a756e2139f 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h,
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = upper_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> *batch++ = lower_32_bits(vma->node.start);
> *batch++ = upper_32_bits(vma->node.start);
> @@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h,
> *batch++ = 0;
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
> *batch++ = lower_32_bits(vma->node.start);
> } else if (INTEL_GEN(i915) >= 4) {
> @@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h,
> *batch++ = 0;
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
> *batch++ = lower_32_bits(vma->node.start);
> } else {
> *batch++ = MI_STORE_DWORD_IMM;
> *batch++ = lower_32_bits(hws_address(hws, rq));
> *batch++ = rq->fence.seqno;
> + *batch++ = MI_ARB_CHECK;
> +
> + memset(batch, 0, 1024);
> + batch += 1024 / sizeof(*batch);
> +
> + *batch++ = MI_ARB_CHECK;
> *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
> *batch++ = lower_32_bits(vma->node.start);
> }
> @@ -234,6 +258,16 @@ static void hang_fini(struct hang *h)
> i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
> }
>
> +static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
> +{
> + return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
> + rq->fence.seqno),
> + 10) &&
> + wait_for(i915_seqno_passed(hws_seqno(h, rq),
> + rq->fence.seqno),
> + 1000));
> +}
> +
> static int igt_hang_sanitycheck(void *arg)
> {
> struct drm_i915_private *i915 = arg;
> @@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915)
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
>
> + pr_debug("%s: current gpu_error=%08lx\n",
> + __func__, i915->gpu_error.flags);
> +
> while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
> wait_event(i915->gpu_error.reset_queue,
> !test_bit(I915_RESET_BACKOFF,
> @@ -353,54 +390,127 @@ static int igt_global_reset(void *arg)
> return err;
> }
>
> -static int igt_reset_engine(void *arg)
> +static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
> {
> - struct drm_i915_private *i915 = arg;
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> - unsigned int reset_count, reset_engine_count;
> + struct hang h;
> int err = 0;
>
> - /* Check that we can issue a global GPU and engine reset */
> + /* Check that we can issue an engine reset on an idle engine (no-op) */
>
> if (!intel_has_reset_engine(i915))
> return 0;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + err = hang_init(&h, i915);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (err)
> + return err;
> + }
> +
> for_each_engine(engine, i915, id) {
> - set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
> + unsigned int reset_count, reset_engine_count;
> + IGT_TIMEOUT(end_time);
> +
> + if (active && !intel_engine_can_store_dword(engine))
> + continue;
> +
> reset_count = i915_reset_count(&i915->gpu_error);
> reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
> engine);
>
> - err = i915_reset_engine(engine, I915_RESET_QUIET);
> - if (err) {
> - pr_err("i915_reset_engine failed\n");
> - break;
> - }
> + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> + do {
> + if (active) {
> + struct drm_i915_gem_request *rq;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + rq = hang_create_request(&h, engine,
> + i915->kernel_context);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + break;
> + }
> +
> + i915_gem_request_get(rq);
> + __i915_add_request(rq, true);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + if (!wait_for_hang(&h, rq)) {
> + struct drm_printer p = drm_info_printer(i915->drm.dev);
> +
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> + intel_engine_dump(engine, &p,
> + "%s\n", engine->name);
> +
> + i915_gem_request_put(rq);
> + err = -EIO;
> + break;
> + }
>
> - if (i915_reset_count(&i915->gpu_error) != reset_count) {
> - pr_err("Full GPU reset recorded! (engine reset expected)\n");
> - err = -EINVAL;
> - break;
> - }
> + i915_gem_request_put(rq);
> + }
> +
> + engine->hangcheck.stalled = true;
> + engine->hangcheck.seqno =
> + intel_engine_get_seqno(engine);
> +
> + err = i915_reset_engine(engine, I915_RESET_QUIET);
> + if (err) {
> + pr_err("i915_reset_engine failed\n");
> + break;
> + }
> +
> + if (i915_reset_count(&i915->gpu_error) != reset_count) {
> + pr_err("Full GPU reset recorded! (engine reset expected)\n");
> + err = -EINVAL;
> + break;
> + }
> +
> + reset_engine_count += active;
> + if (i915_reset_engine_count(&i915->gpu_error, engine) !=
> + reset_engine_count) {
> + pr_err("%s engine reset %srecorded!\n",
> + engine->name, active ? "not " : "");
> + err = -EINVAL;
> + break;
> + }
> +
> + engine->hangcheck.stalled = false;
> + } while (time_before(jiffies, end_time));
> + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
>
> - if (i915_reset_engine_count(&i915->gpu_error, engine) ==
> - reset_engine_count) {
> - pr_err("No %s engine reset recorded!\n", engine->name);
> - err = -EINVAL;
> + if (err)
> break;
> - }
>
> - clear_bit(I915_RESET_ENGINE + engine->id,
> - &i915->gpu_error.flags);
> + cond_resched();
> }
>
> if (i915_terminally_wedged(&i915->gpu_error))
> err = -EIO;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + hang_fini(&h);
> + mutex_unlock(&i915->drm.struct_mutex);
> + }
> +
> return err;
> }
>
> +static int igt_reset_idle_engine(void *arg)
> +{
> + return __igt_reset_engine(arg, false);
> +}
> +
> +static int igt_reset_active_engine(void *arg)
> +{
> + return __igt_reset_engine(arg, true);
> +}
> +
> static int active_engine(void *data)
> {
> struct intel_engine_cs *engine = data;
> @@ -462,11 +572,12 @@ static int active_engine(void *data)
> return err;
> }
>
> -static int igt_reset_active_engines(void *arg)
> +static int __igt_reset_engine_others(struct drm_i915_private *i915,
> + bool active)
> {
> - struct drm_i915_private *i915 = arg;
> - struct intel_engine_cs *engine, *active;
> + struct intel_engine_cs *engine, *other;
> enum intel_engine_id id, tmp;
> + struct hang h;
> int err = 0;
>
> /* Check that issuing a reset on one engine does not interfere
> @@ -476,24 +587,36 @@ static int igt_reset_active_engines(void *arg)
> if (!intel_has_reset_engine(i915))
> return 0;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + err = hang_init(&h, i915);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (err)
> + return err;
> + }
> +
> for_each_engine(engine, i915, id) {
> - struct task_struct *threads[I915_NUM_ENGINES];
> + struct task_struct *threads[I915_NUM_ENGINES] = {};
> unsigned long resets[I915_NUM_ENGINES];
> unsigned long global = i915_reset_count(&i915->gpu_error);
> + unsigned long count = 0;
> IGT_TIMEOUT(end_time);
>
> + if (active && !intel_engine_can_store_dword(engine))
> + continue;
> +
> memset(threads, 0, sizeof(threads));
> - for_each_engine(active, i915, tmp) {
> + for_each_engine(other, i915, tmp) {
> struct task_struct *tsk;
>
> - if (active == engine)
> - continue;
> -
> resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
> - active);
> + other);
>
> - tsk = kthread_run(active_engine, active,
> - "igt/%s", active->name);
> + if (other == engine)
> + continue;
> +
> + tsk = kthread_run(active_engine, other,
> + "igt/%s", other->name);
> if (IS_ERR(tsk)) {
> err = PTR_ERR(tsk);
> goto unwind;
> @@ -503,20 +626,70 @@ static int igt_reset_active_engines(void *arg)
> get_task_struct(tsk);
> }
>
> - set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
> + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> do {
> + if (active) {
> + struct drm_i915_gem_request *rq;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + rq = hang_create_request(&h, engine,
> + i915->kernel_context);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + mutex_unlock(&i915->drm.struct_mutex);
> + break;
> + }
> +
> + i915_gem_request_get(rq);
> + __i915_add_request(rq, true);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + if (!wait_for_hang(&h, rq)) {
> + struct drm_printer p = drm_info_printer(i915->drm.dev);
> +
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> + intel_engine_dump(engine, &p,
> + "%s\n", engine->name);
> +
> + i915_gem_request_put(rq);
> + err = -EIO;
> + break;
> + }
> +
> + i915_gem_request_put(rq);
> + }
> +
> + engine->hangcheck.stalled = true;
> + engine->hangcheck.seqno =
> + intel_engine_get_seqno(engine);
> +
> err = i915_reset_engine(engine, I915_RESET_QUIET);
> if (err) {
> - pr_err("i915_reset_engine(%s) failed, err=%d\n",
> - engine->name, err);
> + pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
> + engine->name, active ? "active" : "idle", err);
> break;
> }
> +
> + engine->hangcheck.stalled = false;
> + count++;
> } while (time_before(jiffies, end_time));
> - clear_bit(I915_RESET_ENGINE + engine->id,
> - &i915->gpu_error.flags);
> + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> + pr_info("i915_reset_engine(%s:%s): %lu resets\n",
> + engine->name, active ? "active" : "idle", count);
> +
> + if (i915_reset_engine_count(&i915->gpu_error, engine) -
> + resets[engine->id] != (active ? count : 0)) {
> + pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
> + engine->name, active ? "active" : "idle", count,
> + i915_reset_engine_count(&i915->gpu_error,
> + engine) - resets[engine->id]);
> + if (!err)
> + err = -EINVAL;
> + }
>
> unwind:
> - for_each_engine(active, i915, tmp) {
> + for_each_engine(other, i915, tmp) {
> int ret;
>
> if (!threads[tmp])
> @@ -524,27 +697,29 @@ static int igt_reset_active_engines(void *arg)
>
> ret = kthread_stop(threads[tmp]);
> if (ret) {
> - pr_err("kthread for active engine %s failed, err=%d\n",
> - active->name, ret);
> + pr_err("kthread for other engine %s failed, err=%d\n",
> + other->name, ret);
> if (!err)
> err = ret;
> }
> put_task_struct(threads[tmp]);
>
> if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
> - active)) {
> + other)) {
> pr_err("Innocent engine %s was reset (count=%ld)\n",
> - active->name,
> + other->name,
> i915_reset_engine_count(&i915->gpu_error,
> - active) - resets[tmp]);
> - err = -EIO;
> + other) - resets[tmp]);
> + if (!err)
> + err = -EINVAL;
> }
> }
>
> if (global != i915_reset_count(&i915->gpu_error)) {
> pr_err("Global reset (count=%ld)!\n",
> i915_reset_count(&i915->gpu_error) - global);
> - err = -EIO;
> + if (!err)
> + err = -EINVAL;
> }
>
> if (err)
> @@ -556,9 +731,25 @@ static int igt_reset_active_engines(void *arg)
> if (i915_terminally_wedged(&i915->gpu_error))
> err = -EIO;
>
> + if (active) {
> + mutex_lock(&i915->drm.struct_mutex);
> + hang_fini(&h);
> + mutex_unlock(&i915->drm.struct_mutex);
> + }
> +
> return err;
> }
>
> +static int igt_reset_idle_engine_others(void *arg)
> +{
> + return __igt_reset_engine_others(arg, false);
> +}
> +
> +static int igt_reset_active_engine_others(void *arg)
> +{
> + return __igt_reset_engine_others(arg, true);
> +}
> +
> static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
> {
> u32 reset_count;
> @@ -574,16 +765,6 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
> return reset_count;
> }
>
> -static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
> -{
> - return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
> - rq->fence.seqno),
> - 10) &&
> - wait_for(i915_seqno_passed(hws_seqno(h, rq),
> - rq->fence.seqno),
> - 1000));
> -}
> -
> static int igt_wait_reset(void *arg)
> {
> struct drm_i915_private *i915 = arg;
> @@ -617,8 +798,8 @@ static int igt_wait_reset(void *arg)
> if (!wait_for_hang(&h, rq)) {
> struct drm_printer p = drm_info_printer(i915->drm.dev);
>
> - pr_err("Failed to start request %x, at %x\n",
> - rq->fence.seqno, hws_seqno(&h, rq));
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
>
> i915_reset(i915, 0);
> @@ -712,8 +893,8 @@ static int igt_reset_queue(void *arg)
> if (!wait_for_hang(&h, prev)) {
> struct drm_printer p = drm_info_printer(i915->drm.dev);
>
> - pr_err("Failed to start request %x, at %x\n",
> - prev->fence.seqno, hws_seqno(&h, prev));
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, prev->fence.seqno, hws_seqno(&h, prev));
> intel_engine_dump(prev->engine, &p,
> "%s\n", prev->engine->name);
>
> @@ -819,8 +1000,8 @@ static int igt_handle_error(void *arg)
> if (!wait_for_hang(&h, rq)) {
> struct drm_printer p = drm_info_printer(i915->drm.dev);
>
> - pr_err("Failed to start request %x, at %x\n",
> - rq->fence.seqno, hws_seqno(&h, rq));
> + pr_err("%s: Failed to start request %x, at %x\n",
> + __func__, rq->fence.seqno, hws_seqno(&h, rq));
> intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
>
> i915_reset(i915, 0);
> @@ -864,21 +1045,26 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> static const struct i915_subtest tests[] = {
> SUBTEST(igt_global_reset), /* attempt to recover GPU first */
> SUBTEST(igt_hang_sanitycheck),
> - SUBTEST(igt_reset_engine),
> - SUBTEST(igt_reset_active_engines),
> + SUBTEST(igt_reset_idle_engine),
> + SUBTEST(igt_reset_active_engine),
> + SUBTEST(igt_reset_idle_engine_others),
> + SUBTEST(igt_reset_active_engine_others),
> SUBTEST(igt_wait_reset),
> SUBTEST(igt_reset_queue),
> SUBTEST(igt_handle_error),
> };
> + bool saved_hangcheck;
> int err;
>
> if (!intel_has_gpu_reset(i915))
> return 0;
>
> intel_runtime_pm_get(i915);
> + saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
>
> err = i915_subtests(tests, i915);
>
> + i915_modparams.enable_hangcheck = saved_hangcheck;
> intel_runtime_pm_put(i915);
>
> return err;
>