[Intel-gfx] [PATCH 3/4] drm/i915: Record batch buffer following GPU error

Jesse Barnes jbarnes at virtuousgeek.org
Mon Jan 4 20:32:28 CET 2010


On Mon,  4 Jan 2010 18:57:58 +0000
Chris Wilson <chris at chris-wilson.co.uk> wrote:

> In order to improve our diagnostic capabilities following a GPU hang
> and subsequent reset, we need to record the batch buffer that
> triggered the error. We assume that the current batch buffer, plus a
> few details about what else is on the active list, will be sufficient
> -- at the very least an improvement over nothing.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |   32 ++++++++++
>  drivers/gpu/drm/i915/i915_drv.h     |   11 +++
>  drivers/gpu/drm/i915/i915_irq.c     |  118 +++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_reg.h     |    1 +
>  4 files changed, 162 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c
> b/drivers/gpu/drm/i915/i915_debugfs.c index 463e8d0..6521c83 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -380,6 +380,38 @@ static int i915_error_state(struct seq_file *m,
> void *unused) seq_printf(m, "  INSTDONE1: 0x%08x\n",
> error->instdone1); }
>  
> +	if (error->active_bo_count) {
> +		int i;
> +
> +		seq_printf(m, "Buffers [%d]:\n",
> error->active_bo_count); +
> +		for (i = 0; i < error->active_bo_count; i++) {
> +			seq_printf(m, "  %08x %8zd %08x %08x",
> +				   error->active_bo[i].gtt_offset,
> +				   error->active_bo[i].size,
> +				   error->active_bo[i].read_domains,
> +				   error->active_bo[i].write_domain);
> +
> +			if (error->active_bo[i].name)
> +				seq_printf(m, " (name: %d)",
> error->active_bo[i].name);
> +			if (error->active_bo[i].fence_reg !=
> I915_FENCE_REG_NONE)
> +				seq_printf(m, " (fence: %d)",
> error->active_bo[i].fence_reg); +
> +			seq_printf(m, "\n");
> +		}
> +	}
> +
> +	if (error->batchbuffer &&
> +	    i915_gem_object_get_pages(error->batchbuffer) == 0) {
> +		struct drm_gem_object *obj = error->batchbuffer;
> +		struct drm_i915_gem_object *obj_priv =
> obj->driver_private; +
> +		seq_printf(m, "--- gtt_offset = 0x%08x\n",
> obj_priv->gtt_offset);
> +		i915_dump_pages(m, obj_priv->pages, obj->size /
> PAGE_SIZE); +
> +		i915_gem_object_put_pages(obj);
> +	}
> +
>  out:
>  	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h
> b/drivers/gpu/drm/i915/i915_drv.h index 29dd676..7b7ea9e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -150,7 +150,18 @@ struct drm_i915_error_state {
>  	u32 instps;
>  	u32 instdone1;
>  	u32 seqno;
> +	u64 bbaddr;
>  	struct timeval time;
> +	struct drm_gem_object *batchbuffer;
> +	struct drm_i915_error_buffer {
> +		size_t size;
> +		u32 name;
> +		u32 gtt_offset;
> +		u32 read_domains;
> +		u32 write_domain;
> +		u32 fence_reg;
> +	} *active_bo;
> +	u32 active_bo_count;
>  };
>  
>  struct drm_i915_display_funcs {
> diff --git a/drivers/gpu/drm/i915/i915_irq.c
> b/drivers/gpu/drm/i915/i915_irq.c index 7cd8110..86b2f53 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -370,6 +370,51 @@ static void i915_error_work_func(struct
> work_struct *work) }
>  }
>  
> +static struct drm_gem_object
> +*clone_obj(struct drm_device *dev,
> +	   struct drm_gem_object *src)
> +{
> +	struct drm_gem_object *dst;
> +	struct drm_i915_gem_object *src_priv, *dst_priv;
> +	int page, page_count;
> +
> +	dst = drm_gem_object_alloc(dev, src->size);
> +	if (dst == NULL)
> +		return NULL;
> +
> +	if (i915_gem_object_get_pages(src))
> +		goto error_unref;
> +
> +	if (i915_gem_object_get_pages(dst))
> +		goto error_src;
> +
> +	src_priv = src->driver_private;
> +	dst_priv = dst->driver_private;
> +
> +	page_count = src->size / PAGE_SIZE;
> +	for (page = 0; page < page_count; page++) {
> +		memcpy(kmap_atomic(dst_priv->pages[page], KM_USER1),
> +		       kmap_atomic(src_priv->pages[page], KM_USER0),
> +		       PAGE_SIZE);
> +		kunmap_atomic(dst_priv->pages[page], KM_USER1);
> +		kunmap_atomic(src_priv->pages[page], KM_USER0);
> +	}
> +
> +	i915_gem_object_put_pages(dst);
> +	i915_gem_object_put_pages(src);
> +
> +	/* We lie here, but it makes later analysis easier. */
> +	dst_priv->gtt_offset = src_priv->gtt_offset;
> +
> +	return dst;
> +
> +error_src:
> +	i915_gem_object_put_pages(src);
> +error_unref:
> +	drm_gem_object_unreference(dst);
> +	return NULL;
> +}
> +
>  /**
>   * i915_capture_error_state - capture an error record for later
> analysis
>   * @dev: drm device
> @@ -382,8 +427,10 @@ static void i915_error_work_func(struct
> work_struct *work) static void i915_capture_error_state(struct
> drm_device *dev) {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_i915_gem_object *obj_priv;
>  	struct drm_i915_error_state *error;
>  	unsigned long flags;
> +	int count;
>  
>  	spin_lock_irqsave(&dev_priv->error_lock, flags);
>  	if (dev_priv->first_error)
> @@ -405,6 +452,7 @@ static void i915_capture_error_state(struct
> drm_device *dev) error->ipehr = I915_READ(IPEHR);
>  		error->instdone = I915_READ(INSTDONE);
>  		error->acthd = I915_READ(ACTHD);
> +		error->bbaddr = 0; /* XXX ? */
>  	} else {
>  		error->ipeir = I915_READ(IPEIR_I965);
>  		error->ipehr = I915_READ(IPEHR_I965);
> @@ -412,7 +460,53 @@ static void i915_capture_error_state(struct
> drm_device *dev) error->instps = I915_READ(INSTPS);
>  		error->instdone1 = I915_READ(INSTDONE1);
>  		error->acthd = I915_READ(ACTHD_I965);
> +		error->bbaddr = I915_READ64(BB_ADDR);
> +	}
> +
> +	/* Grab the current batchbuffer, most likely to have
> crashed. */
> +	error->batchbuffer = NULL;
> +	spin_lock(&dev_priv->mm.active_list_lock);
> +	list_for_each_entry(obj_priv, &dev_priv->mm.active_list,
> list) {
> +		struct drm_gem_object *obj = obj_priv->obj;
> +
> +		if (error->bbaddr >= obj_priv->gtt_offset &&
> +		    error->bbaddr < obj_priv->gtt_offset +
> obj->size) {
> +			/* We need to copy this to an anonymous buffer as
> +			 * the simplest method to avoid being overwritten
> +			 * by userspace.
> +			 */
> +			error->batchbuffer = clone_obj(dev, obj);
> +			break;
> +		}
> +	}
> +
> +	/* Record buffers on the active list. */
> +	error->active_bo = NULL;
> +	error->active_bo_count = 0;
> +
> +	count = 0;
> +	list_for_each_entry(obj_priv, &dev_priv->mm.active_list,
> list)
> +		count++;
> +
> +	if (count)
> +		error->active_bo =
> kmalloc(sizeof(*error->active_bo)*count, GFP_ATOMIC); +
> +	if (error->active_bo) {
> +		count = 0;
> +		list_for_each_entry(obj_priv,
> &dev_priv->mm.active_list, list) {
> +			struct drm_gem_object *obj = obj_priv->obj;
> +
> +			error->active_bo[count].size = obj->size;
> +			error->active_bo[count].name = obj->name;
> +			error->active_bo[count].gtt_offset =
> obj_priv->gtt_offset;
> +			error->active_bo[count].read_domains =
> obj->read_domains;
> +			error->active_bo[count].write_domain =
> obj->write_domain;
> +			error->active_bo[count].fence_reg =
> obj_priv->fence_reg;
> +			count++;
> +		}
> +		error->active_bo_count = count;
>  	}
> +	spin_unlock(&dev_priv->mm.active_list_lock);
>  
>  	do_gettimeofday(&error->time);
>  
> @@ -422,6 +516,30 @@ out:
>  	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
>  }
>  
> +void i915_destroy_error_state(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_i915_error_state *error;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&dev_priv->error_lock, flags);
> +	error = dev_priv->first_error;
> +	dev_priv->first_error = NULL;
> +	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
> +
> +	if (error == NULL)
> +		return;
> +
> +	if (error->batchbuffer) {
> +		mutex_lock(&dev->struct_mutex);
> +		drm_gem_object_unreference(error->batchbuffer);
> +		mutex_unlock(&dev->struct_mutex);
> +	}
> +
> +	kfree(error->active_bo);
> +	kfree(error);
> +}
> +
>  /**
>   * i915_handle_error - handle an error interrupt
>   * @dev: drm device
> diff --git a/drivers/gpu/drm/i915/i915_reg.h
> b/drivers/gpu/drm/i915/i915_reg.h index f79b133..e0ee576 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -324,6 +324,7 @@
>  #define   CM0_COLOR_EVICT_DISABLE (1<<3)
>  #define   CM0_DEPTH_WRITE_DISABLE (1<<1)
>  #define   CM0_RC_OP_FLUSH_DISABLE (1<<0)
> +#define BB_ADDR		0x02140 /* 8 bytes */
>  #define GFX_FLSH_CNTL	0x02170 /* 915+ only */

Nice.  Assuming we can actually export the whole batch from the
error_state file, this looks great.

Reviewed-by: Jesse Barnes <jbarnes at virtuousgeek.org>

-- 
Jesse Barnes, Intel Open Source Technology Center



More information about the Intel-gfx mailing list