[Intel-gfx] [PATCH 4/5] drm/i915: detect hang using per ring hangcheck_score

Mon May 27 03:34:48 CEST 2013

On Mon, May 13, 2013 at 04:32:12PM +0300, Mika Kuoppala wrote:
> Keep track of ring seqno progress and if there are no
> progress detected, declare hang. Use actual head (acthd)
> to distinguish between ring stuck and batchbuffer looping
> situation. Stuck ring will be kicked to trigger progress.
> 
> v2: use atchd to detect stuck ring from loop (Ben Widawsky)
> 
> v3: Use acthd to check when ring needs kicking.
> Declare hang on third time in order to give time for
> kick_ring to take effect.
> 
> Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
>
>
>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         |   80 ++++++++++++++++++-------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
>  2 files changed, 49 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6efe3b0..5dde61f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -681,7 +681,6 @@ static void notify_ring(struct drm_device *dev,
>  
>  	wake_up_all(&ring->irq_queue);
>  	if (i915_enable_hangcheck) {
> -		dev_priv->gpu_error.hangcheck_count = 0;
>  		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>  			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
>  	}
> @@ -2381,61 +2380,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
>  
>  /**
>   * This is called when the chip hasn't reported back with completed
> - * batchbuffers in a long time. The first time this is called we simply record
> - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
> - * again, we assume the chip is wedged and try to fix it.
> + * batchbuffers in a long time. We keep track per ring seqno progress and
> + * if there are no progress, hangcheck score for that ring is increased.
> + * Further, acthd is inspected to see if the ring is stuck. On stuck case
> + * we kick the ring. If we see no progress on three subsequent calls
> + * we assume chip is wedged and try to fix it by resetting the chip.
>   */
>  void i915_hangcheck_elapsed(unsigned long data)
>  {
>  	struct drm_device *dev = (struct drm_device *)data;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
>  	struct intel_ring_buffer *ring;
> -	bool err = false, idle;
>  	int i;
> -	u32 seqno[I915_NUM_RINGS];
> -	bool work_done;
> +	int busy_count = 0, rings_hung = 0;
> +	bool stuck[I915_NUM_RINGS];
>  
>  	if (!i915_enable_hangcheck)
>  		return;
>  
> -	idle = true;
>  	for_each_ring(ring, dev_priv, i) {
> -		seqno[i] = ring->get_seqno(ring, false);
> -		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
> -	}
> +		u32 seqno, acthd;
> +		bool idle, err = false;
> +
> +		seqno = ring->get_seqno(ring, false);
> +		acthd = intel_ring_get_active_head(ring);
> +		idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> +		stuck[i] = ring->hangcheck.acthd == acthd;
> +
> +		if (idle) {
> +			if (err)
> +				ring->hangcheck.score += 2;
> +			else
> +				ring->hangcheck.score = 0;
> +		} else {
> +			busy_count++;
>  
> -	/* If all work is done then ACTHD clearly hasn't advanced. */
> -	if (idle) {
> -		if (err) {
> -			if (i915_hangcheck_hung(dev))
> -				return;
> +			if (ring->hangcheck.seqno == seqno) {
> +				ring->hangcheck.score++;
>  
> -			goto repeat;
> +				/* Kick ring if stuck*/
> +				if (stuck[i])
> +					i915_hangcheck_ring_hung(ring);
> +			} else {
> +				ring->hangcheck.score = 0;
> +			}
>  		}
>  
> -		dev_priv->gpu_error.hangcheck_count = 0;
> -		return;
> +		ring->hangcheck.seqno = seqno;
> +		ring->hangcheck.acthd = acthd;
>  	}
>  
> -	work_done = false;
>  	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.seqno != seqno[i]) {
> -			work_done = true;
> -			ring->hangcheck.seqno = seqno[i];
> +		if (ring->hangcheck.score > 2) {
> +			rings_hung++;
> +			DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
> +				  stuck[i] ? "stuck" : "no progress",
> +				  stuck[i] ? "addr" : "seqno",
> +				  stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
> +				  ring->hangcheck.seqno);
>  		}
>  	}
>  
> -	if (!work_done) {
> -		if (i915_hangcheck_hung(dev))
> -			return;
> -	} else {
> -		dev_priv->gpu_error.hangcheck_count = 0;
> -	}
> +	if (rings_hung)
> +		return i915_handle_error(dev, true);
>  
> -repeat:
> -	/* Reset timer case chip hangs without another request being added */
> -	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +	if (busy_count)
> +		/* Reset timer case chip hangs without another request
> +		 * being added */
> +		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> +			  round_jiffies_up(jiffies +
> +					   DRM_I915_HANGCHECK_JIFFIES));
>  }
>  
>  /* drm_dma.h hooks
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index ef374a8..5886667 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -39,6 +39,8 @@ struct  intel_hw_status_page {
>  
>  struct intel_ring_hangcheck {
>  	u32 seqno;
> +	u32 acthd;
> +	int score;
>  };
>  
>  struct  intel_ring_buffer {
>

Maybe I'm just stating the functional changes of the patch, but in case
they were unintended here is what I see as potential issues:

1. If ring B is waiting on ring A via semaphore, and ring A is making
   progress, albeit slowly - the hangcheck will fire. The check will
   determine that A is moving, however ring B will appear hung because
   the ACTHD doesn't move. I honestly can't say if that's actually a
   realistic problem to hit it probably implies the timeout value is too
   low.

2. There's also another corner case on the kick. If the seqno = 2
   (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
   we try to kick it... we don't actually try to find out if the kick
   helped.

#2 doesn't worry me much, just wanted to point it out. The question then
is, did I make #1 up? If not, what should we do?

-- 
Ben Widawsky, Intel Open Source Technology Center