[Intel-gfx] [PATCH] drm/i915: Include HW status page in error capture

Ville Syrjälä ville.syrjala at linux.intel.com
Fri Jan 24 12:25:58 CET 2014


On Thu, Jan 23, 2014 at 10:40:36PM +0000, Chris Wilson wrote:
> Many times in the past we have concluded that the cause of the GPU hang
> has been that the hw status page was stale, usually because the GPU and
> CPU disagreed over the address of the page. Having stumbled across yet
> another issue that seems to be related to the HWSP, it is time to
> include that information in the GPU error dump.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

We now have two copies of the page->str code and the HWS PGA code,
so a bit of refactoring might be warranted. But I won't insist on
it, so:

Reviewed-by: Ville Syrjälä <ville.syrjala at linux.intel.com>

> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  3 ++-
>  drivers/gpu/drm/i915/i915_gpu_error.c | 50 +++++++++++++++++++++++++++++++++++
>  2 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 64a1aca7804d..b6887a88424d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -309,6 +309,7 @@ struct drm_i915_error_state {
>  	u32 tail[I915_NUM_RINGS];
>  	u32 head[I915_NUM_RINGS];
>  	u32 ctl[I915_NUM_RINGS];
> +	u32 hws[I915_NUM_RINGS];
>  	u32 ipeir[I915_NUM_RINGS];
>  	u32 ipehr[I915_NUM_RINGS];
>  	u32 instdone[I915_NUM_RINGS];
> @@ -339,7 +340,7 @@ struct drm_i915_error_state {
>  			int page_count;
>  			u32 gtt_offset;
>  			u32 *pages[0];
> -		} *ringbuffer, *batchbuffer, *ctx;
> +		} *ringbuffer, *batchbuffer, *ctx, *hws;
>  		struct drm_i915_error_request {
>  			long jiffies;
>  			u32 seqno;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index e2af1d490f8d..a6b100b1dff5 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -247,6 +247,7 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
>  	err_printf(m, "  HEAD: 0x%08x\n", error->head[ring]);
>  	err_printf(m, "  TAIL: 0x%08x\n", error->tail[ring]);
>  	err_printf(m, "  CTL: 0x%08x\n", error->ctl[ring]);
> +	err_printf(m, "  HWS: 0x%08x\n", error->hws[ring]);
>  	err_printf(m, "  ACTHD: 0x%08x\n", error->acthd[ring]);
>  	err_printf(m, "  IPEIR: 0x%08x\n", error->ipeir[ring]);
>  	err_printf(m, "  IPEHR: 0x%08x\n", error->ipehr[ring]);
> @@ -390,6 +391,22 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  			}
>  		}
>  
> +		if ((obj = error->ring[i].hws)) {
> +			err_printf(m, "%s --- HW Status = 0x%08x\n",
> +				   dev_priv->ring[i].name,
> +				   obj->gtt_offset);
> +			offset = 0;
> +			for (elt = 0; elt < PAGE_SIZE/16; elt += 4) {
> +				err_printf(m, "[%04x] %08x %08x %08x %08x\n",
> +					   offset,
> +					   obj->pages[0][elt],
> +					   obj->pages[0][elt+1],
> +					   obj->pages[0][elt+2],
> +					   obj->pages[0][elt+3]);
> +					offset += 16;
> +			}
> +		}
> +
>  		if ((obj = error->ring[i].ctx)) {
>  			err_printf(m, "%s --- HW Context = 0x%08x\n",
>  				   dev_priv->ring[i].name,
> @@ -472,6 +489,7 @@ static void i915_error_state_free(struct kref *error_ref)
>  	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
>  		i915_error_object_free(error->ring[i].batchbuffer);
>  		i915_error_object_free(error->ring[i].ringbuffer);
> +		i915_error_object_free(error->ring[i].hws);
>  		i915_error_object_free(error->ring[i].ctx);
>  		kfree(error->ring[i].requests);
>  	}
> @@ -793,6 +811,35 @@ static void i915_record_ring_state(struct drm_device *dev,
>  	error->tail[ring->id] = I915_READ_TAIL(ring);
>  	error->ctl[ring->id] = I915_READ_CTL(ring);
>  
> +	if (I915_NEED_GFX_HWS(dev)) {
> +		int mmio;
> +
> +		if (IS_GEN7(dev)) {
> +			switch (ring->id) {
> +			default:
> +			case RCS:
> +				mmio = RENDER_HWS_PGA_GEN7;
> +				break;
> +			case BCS:
> +				mmio = BLT_HWS_PGA_GEN7;
> +				break;
> +			case VCS:
> +				mmio = BSD_HWS_PGA_GEN7;
> +				break;
> +			case VECS:
> +				mmio = VEBOX_HWS_PGA_GEN7;
> +				break;
> +			}
> +		} else if (IS_GEN6(ring->dev)) {
> +			mmio = RING_HWS_PGA_GEN6(ring->mmio_base);
> +		} else {
> +			/* XXX: gen8 returns to sanity */
> +			mmio = RING_HWS_PGA(ring->mmio_base);
> +		}
> +
> +		error->hws[ring->id] = I915_READ(mmio);
> +	}
> +
>  	error->cpu_ring_head[ring->id] = ring->head;
>  	error->cpu_ring_tail[ring->id] = ring->tail;
>  
> @@ -846,6 +893,9 @@ static void i915_gem_record_rings(struct drm_device *dev,
>  		error->ring[i].ringbuffer =
>  			i915_error_ggtt_object_create(dev_priv, ring->obj);
>  
> +		if (ring->status_page.obj)
> +			error->ring[i].hws =
> +				i915_error_ggtt_object_create(dev_priv, ring->status_page.obj);
>  
>  		i915_gem_record_active_context(ring, error, &error->ring[i]);
>  
> -- 
> 1.8.5.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC



More information about the Intel-gfx mailing list