[Intel-gfx] [PATCH 5/6] drm/i915: Add per client max context ban limit

Mika Kuoppala mika.kuoppala at linux.intel.com
Tue Nov 15 17:12:23 UTC 2016


Chris Wilson <chris at chris-wilson.co.uk> writes:

> On Tue, Nov 15, 2016 at 04:36:35PM +0200, Mika Kuoppala wrote:
>> If we have a bad client submitting unfavourably across different
>> contexts, creating new ones, the per context scoring of badness
>> doesn't remove the root cause, the offending client.
>> To counter, keep track of per client context bans. Deny access if
>> client is responsible for more than 3 context bans in
>> it's lifetime.
>> 
>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h            |  4 ++++
>>  drivers/gpu/drm/i915/i915_gem.c            | 28 +++++++++++++++++++++-------
>>  drivers/gpu/drm/i915/i915_gem_context.c    | 16 ++++++++++++++++
>>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 ++++++--
>>  drivers/gpu/drm/i915/i915_gpu_error.c      | 10 ++++++----
>>  5 files changed, 53 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 5af1c38..92c5284 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -411,6 +411,9 @@ struct drm_i915_file_private {
>>  	} rps;
>>  
>>  	unsigned int bsd_engine;
>> +
>> +#define I915_MAX_CLIENT_CONTEXT_BANS 3
>> +	int context_bans;
>
> 3 feels a little too small. But possibly ok since this is at the context
> level. Still webgl...
>

3 bans is a significant amount on hangs. 12 consecutive hangs without
single succesful request in between to reach this.

>>  };
>>  
>>  /* Used by dp and fdi links */
>> @@ -854,6 +857,7 @@ struct drm_i915_error_state {
>>  
>>  		pid_t pid;
>>  		char comm[TASK_COMM_LEN];
>> +		int context_bans;
>>  	} engine[I915_NUM_ENGINES];
>>  
>>  	struct drm_i915_error_buffer {
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 40a9e10..f56230f 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2636,19 +2636,33 @@ static bool i915_context_is_banned(const struct i915_gem_context *ctx)
>>  	return false;
>>  }
>>  
>> -static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>> +static void i915_gem_request_mark_guilty(struct drm_i915_gem_request *request)
>>  {
>> -	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
>> +	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
>>  
>>  	hs->ban_score += 10;
>>  
>> -	hs->banned = i915_context_is_banned(ctx);
>> +	hs->banned = i915_context_is_banned(request->ctx);
>>  	hs->batch_active++;
>> +
>> +	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
>> +			 request->ctx->name, hs->ban_score, yesno(hs->banned));
>> +
>> +	if (!request->file_priv)
>> +		return;
>> +
>> +	if (hs->banned) {
>> +		request->file_priv->context_bans++;
>> +
>> +		DRM_DEBUG_DRIVER("client %s has has %d context banned\n",
>> +				 request->ctx->name,
>> +				 request->file_priv->context_bans);
>> +	}
>>  }
>>  
>> -static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
>> +static void i915_gem_request_mark_innocent(struct drm_i915_gem_request *request)
>>  {
>> -	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
>> +	struct i915_ctx_hang_stats *hs = &request->ctx->hang_stats;
>>  
>>  	hs->batch_pending++;
>>  }
>> @@ -2719,9 +2733,9 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
>>  	}
>>  
>>  	if (ring_hung)
>> -		i915_gem_context_mark_guilty(request->ctx);
>> +		i915_gem_request_mark_guilty(request);
>>  	else
>> -		i915_gem_context_mark_innocent(request->ctx);
>> +		i915_gem_request_mark_innocent(request);
>
> Why? Just use the file_priv on the ctx.
>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index 9abaae4..4ddd044 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -347,6 +347,14 @@ __create_hw_context(struct drm_device *dev,
>>  	return ERR_PTR(ret);
>>  }
>>  
>> +static bool client_is_banned(struct drm_i915_file_private *file_priv)
>> +{
>> +	if (file_priv->context_bans <= I915_MAX_CLIENT_CONTEXT_BANS)
>> +		return false;
>
> 	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
>> +
>> +	return true;
>> +}
>> +
>>  /**
>>   * The default context needs to exist per ring that uses contexts. It stores the
>>   * context state of the GPU for applications that don't utilize HW contexts, as
>> @@ -360,6 +368,14 @@ i915_gem_create_context(struct drm_device *dev,
>>  
>>  	lockdep_assert_held(&dev->struct_mutex);
>>  
>> +	if (file_priv && client_is_banned(file_priv)) {
>
> So this belongs to context_craate_ioctl
>

Ah yes.

>> +		DRM_DEBUG("client %s[%d] banned from creating ctx\n",
>> +			  current->comm,
>> +			  pid_nr(get_task_pid(current, PIDTYPE_PID)));
>> +
>> +		return ERR_PTR(-EIO);
>> +	}
>> +
>>  	ctx = __create_hw_context(dev, file_priv);
>>  	if (IS_ERR(ctx))
>>  		return ctx;
>> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> index e804cb2..21beaf0 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> @@ -1231,16 +1231,20 @@ static struct i915_gem_context *
>>  i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
>>  			  struct intel_engine_cs *engine, const u32 ctx_id)
>>  {
>> +	struct drm_i915_file_private *file_priv = file->driver_priv;
>>  	struct i915_gem_context *ctx;
>>  	struct i915_ctx_hang_stats *hs;
>>  
>> -	ctx = i915_gem_context_lookup(file->driver_priv, ctx_id);
>> +	ctx = i915_gem_context_lookup(file_priv, ctx_id);
>>  	if (IS_ERR(ctx))
>>  		return ctx;
>>  
>>  	hs = &ctx->hang_stats;
>>  	if (hs->banned) {
>> -		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
>> +		DRM_DEBUG("Client %s banned from submitting (%d:%d)\n",
>> +			  ctx->name,
>> +			  file_priv->context_bans,
>> +			  hs->ban_score);
>
> Context bans prevents creating new contexts. What is its relevance here?
>

No relevanse, just upgraded the ctx->name. Should not be in this patch.

-Mika

> (We should start using pr_debug for these user aides, at least something
> more suitable than DRM_DEBUG).
> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre


More information about the Intel-gfx mailing list