[Intel-gfx] [PATCH] drm/i915: Record batch buffer following GPU error

Chris Wilson chris at chris-wilson.co.uk
Thu Feb 11 15:15:34 CET 2010


In order to improve our diagnostic capabilities following a GPU hang
and subsequent reset, we need to record the batch buffer that triggered
the error. We assume that the current batch buffer, plus a few details
about what else is on the active list, will be sufficient -- at the very
least an improvement over nothing.

v2: Avoid excessive work under spinlocks.
v3: Include ringbuffer for later analysis.
v4: Use kunmap correctly and record more buffer state.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   77 +++++++++++++++
 drivers/gpu/drm/i915/i915_dma.c     |    2 +
 drivers/gpu/drm/i915/i915_drv.h     |   17 ++++
 drivers/gpu/drm/i915/i915_irq.c     |  180 +++++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_reg.h     |    1 +
 5 files changed, 270 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 55340de..e5bfbf3 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -350,6 +350,36 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
 	return 0;
 }
 
+/* " P" for a positive pin state, " p" for a negative one, "" for zero. */
+static const char *pin_flag(int pinned)
+{
+	const char *flag = "";
+
+	if (pinned != 0)
+		flag = pinned > 0 ? " P" : " p";
+	return flag;
+}
+
+/* " X" / " Y" for X- or Y-tiled buffers, "" for any other tiling value. */
+static const char *tiling_flag(int tiling)
+{
+	if (tiling == I915_TILING_X)
+		return " X";
+	if (tiling == I915_TILING_Y)
+		return " Y";
+	return "";
+}
+
+static const char *dirty_flag(int dirty)
+{
+	return dirty == 0 ? "" : " dirty";	/* suffix only when set */
+}
+
+static const char *purgeable_flag(int purgeable)
+{
+	return purgeable == 0 ? "" : " purgeable";	/* suffix only when set */
+}
+
 static int i915_error_state(struct seq_file *m, void *unused)
 {
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
@@ -380,6 +410,53 @@ static int i915_error_state(struct seq_file *m, void *unused)
 		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
 	}
 
+	if (error->active_bo_count) {
+		int i;
+
+		seq_printf(m, "Buffers [%d]:\n", error->active_bo_count);
+
+		for (i = 0; i < error->active_bo_count; i++) {
+			seq_printf(m, "  %08x %8zd %08x %08x %s%s%s%s",
+				   error->active_bo[i].gtt_offset,
+				   error->active_bo[i].size,
+				   error->active_bo[i].read_domains,
+				   error->active_bo[i].write_domain,
+				   pin_flag(error->active_bo[i].pinned),
+				   tiling_flag(error->active_bo[i].tiling),
+				   dirty_flag(error->active_bo[i].dirty),
+				   purgeable_flag(error->active_bo[i].purgeable));
+
+			if (error->active_bo[i].name)
+				seq_printf(m, " (name: %d)", error->active_bo[i].name);
+			if (error->active_bo[i].fence_reg != I915_FENCE_REG_NONE)
+				seq_printf(m, " (fence: %d)", error->active_bo[i].fence_reg);
+
+			seq_printf(m, "\n");
+		}
+	}
+
+	if (error->batchbuffer &&
+	    i915_gem_object_get_pages(error->batchbuffer, 0) == 0) {
+		struct drm_gem_object *obj = error->batchbuffer;
+		struct drm_i915_gem_object *obj_priv = obj->driver_private;
+
+		seq_printf(m, "--- gtt_offset = 0x%08x\n", obj_priv->gtt_offset);
+		i915_dump_pages(m, obj_priv->pages, obj->size / PAGE_SIZE);
+
+		i915_gem_object_put_pages(obj);
+	}
+
+	if (error->ringbuffer &&
+	    i915_gem_object_get_pages(error->ringbuffer, 0) == 0) {
+		struct drm_gem_object *obj = error->ringbuffer;
+		struct drm_i915_gem_object *obj_priv = obj->driver_private;
+
+		seq_printf(m, "--- ringbuffer = 0x%08x\n", obj_priv->gtt_offset);
+		i915_dump_pages(m, obj_priv->pages, obj->size / PAGE_SIZE);
+
+		i915_gem_object_put_pages(obj);
+	}
+
 out:
 	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
 
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 6a17c60..c9b243a 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1640,6 +1640,8 @@ int i915_driver_unload(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	i915_destroy_error_state(dev);
+
 	destroy_workqueue(dev_priv->wq);
 	del_timer_sync(&dev_priv->hangcheck_timer);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bd83ce9..1521d8b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -150,7 +150,23 @@ struct drm_i915_error_state {
 	u32 instps;
 	u32 instdone1;
 	u32 seqno;
+	u64 bbaddr;
 	struct timeval time;
+	struct drm_gem_object *ringbuffer;
+	struct drm_gem_object *batchbuffer;
+	struct drm_i915_error_buffer {	/* summary of one active-list buffer */
+		size_t size;
+		u32 name;
+		u32 gtt_offset;
+		u32 read_domains;
+		u32 write_domain;
+		u32 fence_reg;
+		s32 pinned:2;	/* 1 = pinned, -1 = user pinned, 0 = neither */
+		u32 tiling:2;	/* must hold NONE, X and Y: three values */
+		u32 dirty:1;
+		u32 purgeable:1;
+	} *active_bo;
+	u32 active_bo_count;
 };
 
 struct drm_i915_display_funcs {
@@ -757,6 +773,7 @@ extern int i965_reset(struct drm_device *dev, u8 flags);
 
 /* i915_irq.c */
 void i915_hangcheck_elapsed(unsigned long data);
+void i915_destroy_error_state(struct drm_device *dev);
 extern int i915_irq_emit(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int i915_irq_wait(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 462c7f8..5776081 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -432,6 +432,65 @@ static void i915_error_work_func(struct work_struct *work)
 	}
 }
 
+/* Copy @src page by page into a new, equally sized object; NULL on failure. */
+static struct drm_gem_object *
+clone_obj(struct drm_device *dev, struct drm_gem_object *src)
+{
+	struct drm_i915_gem_object *src_priv, *dst_priv;
+	struct drm_gem_object *dst;
+	int i, npages;
+
+	dst = drm_gem_object_alloc(dev, src->size);
+	if (!dst)
+		return NULL;
+
+	if (i915_gem_object_get_pages(src, 0) != 0)
+		goto error_unref;
+
+	if (i915_gem_object_get_pages(dst, 0) != 0)
+		goto error_src;
+
+	src_priv = src->driver_private;
+	dst_priv = dst->driver_private;
+	npages = src->size / PAGE_SIZE;
+
+	for (i = 0; i < npages; i++) {
+		void *from = kmap_atomic(src_priv->pages[i], KM_USER0);
+		void *to = kmap_atomic(dst_priv->pages[i], KM_USER1);
+
+		memcpy(to, from, PAGE_SIZE);
+		kunmap_atomic(to, KM_USER1);
+		kunmap_atomic(from, KM_USER0);
+	}
+
+	i915_gem_object_put_pages(dst);
+	i915_gem_object_put_pages(src);
+
+	/* Untrue for the copy, but reporting the source offset aids analysis. */
+	dst_priv->gtt_offset = src_priv->gtt_offset;
+	return dst;
+
+error_src:
+	i915_gem_object_put_pages(src);
+error_unref:
+	drm_gem_object_unreference(dst);
+	return NULL;
+}
+
+/* Drop both captured buffer objects under struct_mutex, then free @error. */
+static void i915_error_state_free(struct drm_device *dev,
+				  struct drm_i915_error_state *error)
+{
+	if (error->batchbuffer || error->ringbuffer) {
+		mutex_lock(&dev->struct_mutex);	/* NOTE(review): can sleep */
+		drm_gem_object_unreference(error->ringbuffer);
+		drm_gem_object_unreference(error->batchbuffer);
+		mutex_unlock(&dev->struct_mutex);
+	}
+	kfree(error->active_bo);
+	kfree(error);
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -444,17 +503,22 @@ static void i915_error_work_func(struct work_struct *work)
 static void i915_capture_error_state(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_gem_object *obj_priv;
 	struct drm_i915_error_state *error;
 	unsigned long flags;
+	u32 bbaddr;
+	int count;
 
 	spin_lock_irqsave(&dev_priv->error_lock, flags);
-	if (dev_priv->first_error)
-		goto out;
+	error = dev_priv->first_error;
+	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+	if (error)
+		return;
 
 	error = kmalloc(sizeof(*error), GFP_ATOMIC);
 	if (!error) {
-		DRM_DEBUG_DRIVER("out ot memory, not capturing error state\n");
-		goto out;
+		DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
+		return;
 	}
 
 	error->eir = I915_READ(EIR);
@@ -467,6 +531,9 @@ static void i915_capture_error_state(struct drm_device *dev)
 		error->ipehr = I915_READ(IPEHR);
 		error->instdone = I915_READ(INSTDONE);
 		error->acthd = I915_READ(ACTHD);
+		error->bbaddr = 0;
+
+		bbaddr = error->acthd;
 	} else {
 		error->ipeir = I915_READ(IPEIR_I965);
 		error->ipehr = I915_READ(IPEHR_I965);
@@ -474,14 +541,113 @@ static void i915_capture_error_state(struct drm_device *dev)
 		error->instps = I915_READ(INSTPS);
 		error->instdone1 = I915_READ(INSTDONE1);
 		error->acthd = I915_READ(ACTHD_I965);
+		error->bbaddr = I915_READ64(BB_ADDR);
+
+		bbaddr = error->bbaddr;
 	}
 
-	do_gettimeofday(&error->time);
+	/* Grab the current batchbuffer, most likely to have crashed. */
+	error->batchbuffer = NULL;
+	count = 0;
+	spin_lock_irqsave(&dev_priv->mm.active_list_lock, flags);
+	list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+		struct drm_gem_object *obj = obj_priv->obj;
+
+		if (error->batchbuffer == NULL &&
+		    bbaddr >= obj_priv->gtt_offset &&
+		    bbaddr < obj_priv->gtt_offset + obj->size)
+			error->batchbuffer = obj;
+
+		count++;
+	}
+	if (error->batchbuffer == NULL) {
+		/* Current batch buffer not found, capture the most recent
+		 * one instead...
+		 */
+		list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+			struct drm_gem_object *obj = obj_priv->obj;
+
+			if (obj->read_domains & I915_GEM_DOMAIN_COMMAND) {
+				error->batchbuffer = obj;
+				break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&dev_priv->mm.active_list_lock, flags);
+
+	if (error->batchbuffer) {
+		/* We need to copy this to an anonymous buffer as
+		 * the simplest method to avoid being overwritten
+		 * by userspace.
+		 */
+		error->batchbuffer = clone_obj(dev, error->batchbuffer);
+	}
+
+	/* Record the ringbuffer */
+	error->ringbuffer = clone_obj(dev, dev_priv->ring.ring_obj);
+
+	/* Record buffers on the active list. */
+	error->active_bo = NULL;
+	error->active_bo_count = 0;
+
+	if (count)
+		error->active_bo = kmalloc(sizeof(*error->active_bo)*count, GFP_ATOMIC);
+
+	if (error->active_bo) {
+		int i;
+
+		i = 0;
+		spin_lock_irqsave(&dev_priv->mm.active_list_lock, flags);
+		list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+			struct drm_gem_object *obj = obj_priv->obj;
+
+			error->active_bo[i].size = obj->size;
+			error->active_bo[i].name = obj->name;
+			error->active_bo[i].gtt_offset = obj_priv->gtt_offset;
+			error->active_bo[i].read_domains = obj->read_domains;
+			error->active_bo[i].write_domain = obj->write_domain;
+			error->active_bo[i].fence_reg = obj_priv->fence_reg;
+			error->active_bo[i].pinned = 0;
+			if (obj_priv->pin_count > 0)
+				error->active_bo[i].pinned = 1;
+			if (obj_priv->user_pin_count > 0)
+				error->active_bo[i].pinned = -1;
+			error->active_bo[i].tiling = obj_priv->tiling_mode;
+			error->active_bo[i].dirty = obj_priv->dirty;
+			error->active_bo[i].purgeable = obj_priv->madv == __I915_MADV_PURGED;
+
+			if (++i == count)
+				break;
+		}
+		spin_unlock_irqrestore(&dev_priv->mm.active_list_lock, flags);
+		error->active_bo_count = i;
+	}
 
-	dev_priv->first_error = error;
+	do_gettimeofday(&error->time);
 
-out:
+	spin_lock_irqsave(&dev_priv->error_lock, flags);
+	if (dev_priv->first_error == NULL) {
+		dev_priv->first_error = error;
+		error = NULL;
+	}
 	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+	if (error)
+		i915_error_state_free(dev, error);
+}
+
+void i915_destroy_error_state(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_error_state *error;
+	unsigned long flags;	/* IRQ state saved while holding error_lock */
+
+	spin_lock_irqsave(&dev_priv->error_lock, flags);
+	error = dev_priv->first_error;
+	dev_priv->first_error = NULL;	/* detach so nobody else sees it */
+	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+	if (error)	/* free outside the spinlock */
+		i915_error_state_free(dev, error);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 487b818..af7dd2f 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -325,6 +325,7 @@
 #define   CM0_COLOR_EVICT_DISABLE (1<<3)
 #define   CM0_DEPTH_WRITE_DISABLE (1<<1)
 #define   CM0_RC_OP_FLUSH_DISABLE (1<<0)
+#define BB_ADDR		0x02140 /* 8 bytes */
 #define GFX_FLSH_CNTL	0x02170 /* 915+ only */
 
 
-- 
1.6.6.1




More information about the Intel-gfx mailing list