[Intel-gfx] [PATCH] drm/i915: Make i915_gem_context_mark_guilty() safe for unlocked updates

Mika Kuoppala mika.kuoppala at linux.intel.com
Mon Jul 10 14:14:19 UTC 2017


Chris Wilson <chris at chris-wilson.co.uk> writes:

> Since we make call i915_gem_context_mark_guilty() concurrently when
> resetting different engines in parallel, we need to make sure that our
> updates are safe for the unlocked access.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>

And we dont ever clear the context bannable flag so
this should be fine.

One improvement would be that only the triggering
ban would spew the message. But now thinking about it,
the dual message would reveal the concurrent ban and
is more informative.

Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>

> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c         | 32 ++++++++++++++++++--------------
>  drivers/gpu/drm/i915/i915_gem_context.c |  6 +++---
>  drivers/gpu/drm/i915/i915_gem_context.h |  6 +++---
>  drivers/gpu/drm/i915/i915_gem_request.c |  3 +--
>  drivers/gpu/drm/i915/i915_gpu_error.c   |  8 ++++----
>  6 files changed, 30 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 81cd21ecfa7d..62208b3bf417 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -596,7 +596,7 @@ struct drm_i915_file_private {
>   * to limit the badly behaving clients access to gpu.
>   */
>  #define I915_MAX_CLIENT_CONTEXT_BANS 3
> -	int context_bans;
> +	atomic_t context_bans;
>  };
>  
>  /* Used by dp and fdi links */
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1564cadda94d..cbe70a94b663 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2739,34 +2739,38 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
>  	return 0;
>  }
>  
> -static bool ban_context(const struct i915_gem_context *ctx)
> +static bool ban_context(const struct i915_gem_context *ctx,
> +			unsigned int score)
>  {
>  	return (i915_gem_context_is_bannable(ctx) &&
> -		ctx->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD);
> +		score >= CONTEXT_SCORE_BAN_THRESHOLD);
>  }
>  
>  static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>  {
> -	ctx->guilty_count++;
> -	ctx->ban_score += CONTEXT_SCORE_GUILTY;
> -	if (ban_context(ctx))
> -		i915_gem_context_set_banned(ctx);
> +	unsigned int score;
> +	bool banned;
>  
> -	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
> -			 ctx->name, ctx->ban_score,
> -			 yesno(i915_gem_context_is_banned(ctx)));
> +	atomic_inc(&ctx->guilty_count);
>  
> -	if (!i915_gem_context_is_banned(ctx) || IS_ERR_OR_NULL(ctx->file_priv))
> +	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> +	banned = ban_context(ctx, score);
> +	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
> +			 ctx->name, score, yesno(banned));
> +	if (!banned)
>  		return;
>  
> -	ctx->file_priv->context_bans++;
> -	DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
> -			 ctx->name, ctx->file_priv->context_bans);
> +	i915_gem_context_set_banned(ctx);
> +	if (!IS_ERR_OR_NULL(ctx->file_priv)) {
> +		atomic_inc(&ctx->file_priv->context_bans);
> +		DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
> +				 ctx->name, atomic_read(&ctx->file_priv->context_bans));
> +	}
>  }
>  
>  static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
>  {
> -	ctx->active_count++;
> +	atomic_inc(&ctx->active_count);
>  }
>  
>  struct drm_i915_gem_request *
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 1a87d04e7937..ed91ac8ca832 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -977,7 +977,7 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv)
>  
>  static bool client_is_banned(struct drm_i915_file_private *file_priv)
>  {
> -	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
> +	return atomic_read(&file_priv->context_bans) > I915_MAX_CLIENT_CONTEXT_BANS;
>  }
>  
>  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
> @@ -1179,8 +1179,8 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
>  	else
>  		args->reset_count = 0;
>  
> -	args->batch_active = READ_ONCE(ctx->guilty_count);
> -	args->batch_pending = READ_ONCE(ctx->active_count);
> +	args->batch_active = atomic_read(&ctx->guilty_count);
> +	args->batch_pending = atomic_read(&ctx->active_count);
>  
>  	ret = 0;
>  out:
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 04320f80f9f4..2d02918a449e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -191,17 +191,17 @@ struct i915_gem_context {
>  	u32 desc_template;
>  
>  	/** guilty_count: How many times this context has caused a GPU hang. */
> -	unsigned int guilty_count;
> +	atomic_t guilty_count;
>  	/**
>  	 * @active_count: How many times this context was active during a GPU
>  	 * hang, but did not cause it.
>  	 */
> -	unsigned int active_count;
> +	atomic_t active_count;
>  
>  #define CONTEXT_SCORE_GUILTY		10
>  #define CONTEXT_SCORE_BAN_THRESHOLD	40
>  	/** ban_score: Accumulated score of all hangs caused by this context. */
> -	int ban_score;
> +	atomic_t ban_score;
>  
>  	/** remap_slice: Bitmask of cache lines that need remapping */
>  	u8 remap_slice;
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 483af8921060..8ba89abe5441 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -370,8 +370,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
>  	i915_gem_request_remove_from_client(request);
>  
>  	/* Retirement decays the ban score as it is a sign of ctx progress */
> -	if (request->ctx->ban_score > 0)
> -		request->ctx->ban_score--;
> +	atomic_dec_if_positive(&request->ctx->ban_score);
>  
>  	/* The backing object for the context is done after switching to the
>  	 * *next* context. Therefore we cannot retire the previous context until
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index ae70283470a6..ed5a1eb839ad 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1266,7 +1266,7 @@ static void record_request(struct drm_i915_gem_request *request,
>  			   struct drm_i915_error_request *erq)
>  {
>  	erq->context = request->ctx->hw_id;
> -	erq->ban_score = request->ctx->ban_score;
> +	erq->ban_score = atomic_read(&request->ctx->ban_score);
>  	erq->seqno = request->global_seqno;
>  	erq->jiffies = request->emitted_jiffies;
>  	erq->head = request->head;
> @@ -1357,9 +1357,9 @@ static void record_context(struct drm_i915_error_context *e,
>  
>  	e->handle = ctx->user_handle;
>  	e->hw_id = ctx->hw_id;
> -	e->ban_score = ctx->ban_score;
> -	e->guilty = ctx->guilty_count;
> -	e->active = ctx->active_count;
> +	e->ban_score = atomic_read(&ctx->ban_score);
> +	e->guilty = atomic_read(&ctx->guilty_count);
> +	e->active = atomic_read(&ctx->active_count);
>  }
>  
>  static void request_record_user_bo(struct drm_i915_gem_request *request,
> -- 
> 2.13.2


More information about the Intel-gfx mailing list