[Intel-gfx] [PATCH 2/3] drm/i915: check acthd for all rings

Daniel Vetter daniel at ffwll.ch
Fri Oct 7 15:59:16 CEST 2011


On Tue, Oct 04, 2011 at 02:11:52PM -0700, Ben Widawsky wrote:
> On Gen6+ we have other rings which may be in use. We haven't hung if the
> blit or media ring is still going
> 
> Before rebase:
> Reviewed-by: Daniel Vetter <daniel.vetter at ffwll.ch>
> Signed-off-by: Ben Widawsky <ben at bwidawsk.net>

I've spotted another goof-up, besides the instdone_stuck && acthd_stuck
one that I overlooked last time around.

> ---
>  drivers/gpu/drm/i915/i915_drv.h |    5 +-
>  drivers/gpu/drm/i915/i915_irq.c |  143 +++++++++++++++++++++++++++------------
>  2 files changed, 102 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 567275c..edfa8be 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -323,9 +323,8 @@ typedef struct drm_i915_private {
>  #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
>  	struct timer_list hangcheck_timer;
>  	int hangcheck_count;
> -	uint32_t last_acthd;
> -	uint32_t last_instdone;
> -	uint32_t last_instdone1;
> +	uint32_t last_acthd[I915_NUM_RINGS];
> +	uint64_t last_instdone[I915_NUM_RINGS];
>  
>  	unsigned long cfb_size;
>  	unsigned int cfb_fb;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 97e338b..6b6abe1 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -948,7 +948,7 @@ static void i915_capture_error_state(struct drm_device *dev)
>  		error->instdone[RCS] = I915_READ(INSTDONE_I965);
>  		error->instps[RCS] = I915_READ(INSTPS);
>  		error->instdone1 = I915_READ(INSTDONE1);
> -		error->acthd = I915_READ(ACTHD_I965);
> +		error->acthd[RCS] = I915_READ(ACTHD_I965);
>  		error->bbaddr = I915_READ64(BB_ADDR);
>  	} else {
>  		error->ipeir[RCS] = I915_READ(IPEIR);
> @@ -1666,6 +1666,85 @@ static bool kick_ring(struct intel_ring_buffer *ring)
>  	return false;
>  }
>  
> +static bool
> +instdone_stuck(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	uint64_t instdone = 0, instdone1 = 0, vcs_instdone = 0, bcs_instdone = 0;
> +	bool stuck;
> +
> +	switch (INTEL_INFO(dev)->gen) {
> +	case 7:
> +	case 6:
> +		bcs_instdone = I915_READ(BCS_INSTDONE);
> +	case 5:
> +		vcs_instdone = I915_READ(VCS_INSTDONE);
> +	case 4:
> +		instdone = I915_READ(INSTDONE_I965);
> +		instdone1 = I915_READ(INSTDONE1);
> +		break;
> +	case 3:
> +	case 2:
> +		instdone = I915_READ(INSTDONE);
> +		break;
> +	}
> +
> +	stuck =
> +	    (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
> +	    (dev_priv->last_instdone[VCS] == vcs_instdone) &&
> +	    (dev_priv->last_instdone[BCS] == bcs_instdone);
> +
> +	dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
> +	dev_priv->last_instdone[VCS] = vcs_instdone;
> +	dev_priv->last_instdone[BCS] = bcs_instdone;
> +
> +	return stuck;
> +}
> +
> +static bool
> +acthd_stuck(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
> +	bool stuck = false;
> +
> +	switch (INTEL_INFO(dev)->gen) {
> +	case 7:
> +	case 6:
> +		bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
> +	case 5:
> +		vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
> +	case 4:
> +	case 3:
> +	case 2:
> +		acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
> +		break;
> +	}
> +
> +	stuck =
> +	    (dev_priv->last_acthd[RCS] == acthd) &&
> +	    (dev_priv->last_acthd[VCS] == vcs_acthd) &&
> +	    (dev_priv->last_acthd[BCS] == bcs_acthd);
> +
> +	dev_priv->last_acthd[RCS] = acthd;
> +	dev_priv->last_acthd[VCS] = vcs_acthd;
> +	dev_priv->last_acthd[BCS] = bcs_acthd;
> +
> +	return stuck;
> +}
> +
> +static bool gpu_stuck(struct drm_device *dev)
> +{
> +	#define NUM_HANGCHECKS_TO_RESET 1
> +
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	if (dev_priv->hangcheck_count++ < NUM_HANGCHECKS_TO_RESET)
> +		return false;
> +
> +	return acthd_stuck(dev) && instdone_stuck(dev);

First check whether the GPU is stuck, then increment the hangcheck_count.
If the GPU is not stuck, we also need to clear the hangcheck_count again.

> +}
> +
>  /**
>   * This is called when the chip hasn't reported back with completed
>   * batchbuffers in a long time. The first time this is called we simply record
> @@ -1676,13 +1755,11 @@ void i915_hangcheck_elapsed(unsigned long data)
>  {
>  	struct drm_device *dev = (struct drm_device *)data;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
> -	uint32_t acthd, instdone, instdone1;
>  	bool err = false;
>  
>  	if (!i915_enable_hangcheck)
>  		return;
>  
> -	/* If all work is done then ACTHD clearly hasn't advanced. */
>  	if (i915_hangcheck_ring_idle(&dev_priv->ring[RCS], &err) &&
>  	    i915_hangcheck_ring_idle(&dev_priv->ring[VCS], &err) &&
>  	    i915_hangcheck_ring_idle(&dev_priv->ring[BCS], &err)) {
> @@ -1692,50 +1769,30 @@ void i915_hangcheck_elapsed(unsigned long data)
>  		return;
>  	}
>  
> -	if (INTEL_INFO(dev)->gen < 4) {
> -		acthd = I915_READ(ACTHD);
> -		instdone = I915_READ(INSTDONE);
> -		instdone1 = 0;
> -	} else {
> -		acthd = I915_READ(ACTHD_I965);
> -		instdone = I915_READ(INSTDONE_I965);
> -		instdone1 = I915_READ(INSTDONE1);
> -	}
> -
> -	if (dev_priv->last_acthd == acthd &&
> -	    dev_priv->last_instdone == instdone &&
> -	    dev_priv->last_instdone1 == instdone1) {
> -		if (dev_priv->hangcheck_count++ > 1) {
> -			DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
> +	if (gpu_stuck(dev)) {
> +		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
>  
> -			if (!IS_GEN2(dev)) {
> -				/* Is the chip hanging on a WAIT_FOR_EVENT?
> -				 * If so we can simply poke the RB_WAIT bit
> -				 * and break the hang. This should work on
> -				 * all but the second generation chipsets.
> -				 */
> -
> -				if (kick_ring(&dev_priv->ring[RCS]))
> -					goto repeat;
> +		if (!IS_GEN2(dev)) {
> +			/* Is the chip hanging on a WAIT_FOR_EVENT?
> +			 * If so we can simply poke the RB_WAIT bit
> +			 * and break the hang. This should work on
> +			 * all but the second generation chipsets.
> +			 */
>  
> -				if (HAS_BSD(dev) &&
> -				    kick_ring(&dev_priv->ring[VCS]))
> -					goto repeat;
> +			if (kick_ring(&dev_priv->ring[RCS]))
> +				goto repeat;
>  
> -				if (HAS_BLT(dev) &&
> -				    kick_ring(&dev_priv->ring[BCS]))
> -					goto repeat;
> -			}
> +			if (HAS_BSD(dev) &&
> +			    kick_ring(&dev_priv->ring[VCS]))
> +				goto repeat;
>  
> -			i915_handle_error(dev, true);
> -			return;
> +			if (HAS_BLT(dev) &&
> +			    kick_ring(&dev_priv->ring[BCS]))
> +				goto repeat;
>  		}
> -	} else {
> -		dev_priv->hangcheck_count = 0;
>  
> -		dev_priv->last_acthd = acthd;
> -		dev_priv->last_instdone = instdone;
> -		dev_priv->last_instdone1 = instdone1;
> +		i915_handle_error(dev, true);
> +		return;
>  	}
>  
>  repeat:
> -- 
> 1.7.6.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Mail: daniel at ffwll.ch
Mobile: +41 (0)79 365 57 48



More information about the Intel-gfx mailing list