[Intel-gfx] [PATCH 17/20] drm/i915/selftest: Refactor reset locking

Mon Jul 24 19:25:52 UTC 2017

On 7/21/2017 5:32 AM, Chris Wilson wrote:
> Extract the common barrier that stops rogue hangchecks from disrupting our
> direct testing of resets, and in the process expand the lock to include
> the per-engine reset shortcuts.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Michel Thierry <michel.thierry at intel.com>

I don't know how you trigger that, but

Reviewed-by: Michel Thierry <michel.thierry at intel.com>

> ---
>   drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 58 ++++++++++++++++++------
>   1 file changed, 43 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index dbfcb31ba9f4..3ece564e4d96 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -297,6 +297,37 @@ static int igt_hang_sanitycheck(void *arg)
>   	return err;
>   }
>   
> +static void global_reset_lock(struct drm_i915_private *i915)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
> +		wait_event(i915->gpu_error.reset_queue,
> +			   !test_bit(I915_RESET_BACKOFF,
> +				     &i915->gpu_error.flags));
> +
> +	for_each_engine(engine, i915, id) {
> +		while (test_and_set_bit(I915_RESET_ENGINE + id,
> +					&i915->gpu_error.flags))
> +			wait_on_bit(&i915->gpu_error.flags,
> +				    I915_RESET_ENGINE + id,
> +				    TASK_UNINTERRUPTIBLE);
> +	}
> +}
> +
> +static void global_reset_unlock(struct drm_i915_private *i915)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	for_each_engine(engine, i915, id)
> +		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
> +
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	wake_up_all(&i915->gpu_error.reset_queue);
> +}
> +
>   static int igt_global_reset(void *arg)
>   {
>   	struct drm_i915_private *i915 = arg;
> @@ -305,7 +336,7 @@ static int igt_global_reset(void *arg)
>   
>   	/* Check that we can issue a global GPU reset */
>   
> -	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	global_reset_lock(i915);
>   	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
>   
>   	mutex_lock(&i915->drm.struct_mutex);
> @@ -320,8 +351,7 @@ static int igt_global_reset(void *arg)
>   	mutex_unlock(&i915->drm.struct_mutex);
>   
>   	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> -	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	global_reset_unlock(i915);
>   
>   	if (i915_terminally_wedged(&i915->gpu_error))
>   		err = -EIO;
> @@ -572,7 +602,7 @@ static int igt_wait_reset(void *arg)
>   
>   	/* Check that we detect a stuck waiter and issue a reset */
>   
> -	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	global_reset_lock(i915);
>   
>   	mutex_lock(&i915->drm.struct_mutex);
>   	err = hang_init(&h, i915);
> @@ -617,8 +647,7 @@ static int igt_wait_reset(void *arg)
>   	hang_fini(&h);
>   unlock:
>   	mutex_unlock(&i915->drm.struct_mutex);
> -	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	global_reset_unlock(i915);
>   
>   	if (i915_terminally_wedged(&i915->gpu_error))
>   		return -EIO;
> @@ -639,7 +668,8 @@ static int igt_reset_queue(void *arg)
>   	if (!igt_can_mi_store_dword_imm(i915))
>   		return 0;
>   
> -	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	global_reset_lock(i915);
> +
>   	mutex_lock(&i915->drm.struct_mutex);
>   	err = hang_init(&h, i915);
>   	if (err)
> @@ -733,8 +763,7 @@ static int igt_reset_queue(void *arg)
>   	hang_fini(&h);
>   unlock:
>   	mutex_unlock(&i915->drm.struct_mutex);
> -	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	global_reset_unlock(i915);
>   
>   	if (i915_terminally_wedged(&i915->gpu_error))
>   		return -EIO;
> @@ -756,7 +785,8 @@ static int igt_render_engine_reset_fallback(void *arg)
>   	if (!intel_has_reset_engine(i915))
>   		return 0;
>   
> -	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	global_reset_lock(i915);
> +
>   	mutex_lock(&i915->drm.struct_mutex);
>   
>   	err = hang_init(&h, i915);
> @@ -786,8 +816,7 @@ static int igt_render_engine_reset_fallback(void *arg)
>   
>   	/* unlock since we'll call handle_error */
>   	mutex_unlock(&i915->drm.struct_mutex);
> -	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	global_reset_unlock(i915);
>   
>   	i915_handle_error(i915, intel_engine_flag(engine), "live test");
>   
> @@ -809,7 +838,7 @@ static int igt_render_engine_reset_fallback(void *arg)
>   	 * more full reset to re-enable the hw.
>   	 */
>   	if (i915_terminally_wedged(&i915->gpu_error)) {
> -		set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +		global_reset_lock(i915);
>   		rq->fence.error = 0;
>   
>   		mutex_lock(&i915->drm.struct_mutex);
> @@ -830,8 +859,7 @@ static int igt_render_engine_reset_fallback(void *arg)
>   	i915_gem_request_put(rq);
>   	hang_fini(&h);
>   out_backoff:
> -	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	global_reset_unlock(i915);
>   
>   	if (i915_terminally_wedged(&i915->gpu_error))
>   		return -EIO;
>