[Intel-gfx] [PATCH 2/3] drm/i915: Seek only one guilty batch per hanged ring

Fri Jan 17 15:50:25 CET 2014

Mika Kuoppala <mika.kuoppala at linux.intel.com> writes:

> Instead of going through all the requests to find a batch that
> hanged the machine, use hangcheck score and the fact that
> first noncompleted request on hanged ring is, with great
> probability, the guilty one. This also ensures that we get one
> guilty batch per hang instead of possibly more (for each ring).
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652

Suggested-by: Chris Wilson <chris at chris-wilson.co.uk>

is missing here.
> Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |   19 ++++++++++---------
>  drivers/gpu/drm/i915/i915_irq.c         |    3 +--
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 ++
>  3 files changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d270351..27a97c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2322,20 +2322,17 @@ static bool i915_context_is_banned(const struct i915_ctx_hang_stats *hs)
>  
>  static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
> -				  u32 acthd)
> +				  u32 acthd, const bool guilty)
>  {
>  	struct i915_ctx_hang_stats *hs = NULL;
> -	bool inside, guilty;
> +	bool inside;
>  	unsigned long offset = 0;
>  
> -	/* Innocent until proven guilty */
> -	guilty = false;
> -
>  	if (request->batch_obj)
>  		offset = i915_gem_obj_offset(request->batch_obj,
>  					     request_to_vm(request));
>  
> -	if (ring->hangcheck.action != HANGCHECK_WAIT &&
> +	if (guilty &&
>  	    i915_request_guilty(request, acthd, &inside)) {
>  		DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
>  			  ring->name,
> @@ -2343,8 +2340,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  			  offset,
>  			  request->ctx ? request->ctx->id : 0,
>  			  acthd);
> -
> -		guilty = true;
>  	}
>  
>  	/* If contexts are disabled or this is the default context, use
> @@ -2383,12 +2378,18 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>  	u32 completed_seqno = ring->get_seqno(ring, false);
>  	u32 acthd = intel_ring_get_active_head(ring);
>  	struct drm_i915_gem_request *request;
> +	bool guilty = false;
>  
>  	list_for_each_entry(request, &ring->request_list, list) {
>  		if (i915_seqno_passed(completed_seqno, request->seqno))
>  			continue;
>  
> -		i915_set_reset_status(ring, request, acthd);
> +		if (!guilty && ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
> +			guilty = true;
> +			i915_set_reset_status(ring, request, acthd, true);
> +		} else {
> +			i915_set_reset_status(ring, request, acthd, false);
> +		}
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6d11e25..e24f9ef 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2473,7 +2473,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  #define BUSY 1
>  #define KICK 5
>  #define HUNG 20
> -#define FIRE 30
>  
>  	if (!i915_enable_hangcheck)
>  		return;
> @@ -2557,7 +2556,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  	}
>  
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score > FIRE) {
> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_GUILTY) {
>  			DRM_INFO("%s on %s\n",
>  				 stuck[i] ? "stuck" : "no progress",
>  				 ring->name);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 71a73f4..6018793 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
>  	HANGCHECK_HUNG,
>  };
>  
> +#define HANGCHECK_SCORE_GUILTY 31
> +
>  struct intel_ring_hangcheck {
>  	bool deadlock;
>  	u32 seqno;
> -- 
> 1.7.9.5