[Intel-gfx] [PATCH] drm/i915: Record batch buffer following GPU error
Chris Wilson
chris at chris-wilson.co.uk
Sat Feb 13 21:33:10 CET 2010
In order to improve our diagnostic capabilities following a GPU hang
and subsequent reset, we need to record the batch buffer that triggered
the error. We assume that the current batch buffer, plus a few details
about what else is on the active list, will be sufficient -- at the very
least an improvement over nothing.
v2: Avoid excessive work under spinlocks.
v3: Include ringbuffer for later analysis.
v4: Use kunmap correctly and record more buffer state.
v5: Search ringbuffer for current batch buffer
v6: Use a work fn for the impossible IRQ error case.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_debugfs.c | 83 ++++++++++++
drivers/gpu/drm/i915/i915_dma.c | 2 +
drivers/gpu/drm/i915/i915_drv.h | 21 +++
drivers/gpu/drm/i915/i915_irq.c | 234 +++++++++++++++++++++++++++++++++-
drivers/gpu/drm/i915/i915_reg.h | 1 +
5 files changed, 334 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 55340de..9854c76 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -350,6 +350,36 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
return 0;
}
+static const char *pin_flag(int pinned)
+{
+ if (pinned > 0)
+ return " P";
+ else if (pinned < 0)
+ return " p";
+ else
+ return "";
+}
+
+static const char *tiling_flag(int tiling)
+{
+ switch (tiling) {
+ default:
+ case I915_TILING_NONE: return "";
+ case I915_TILING_X: return " X";
+ case I915_TILING_Y: return " Y";
+ }
+}
+
+static const char *dirty_flag(int dirty)
+{
+ return dirty ? " dirty" : "";
+}
+
+static const char *purgeable_flag(int purgeable)
+{
+ return purgeable ? " purgeable" : "";
+}
+
static int i915_error_state(struct seq_file *m, void *unused)
{
struct drm_info_node *node = (struct drm_info_node *) m->private;
@@ -357,6 +387,7 @@ static int i915_error_state(struct seq_file *m, void *unused)
drm_i915_private_t *dev_priv = dev->dev_private;
struct drm_i915_error_state *error;
unsigned long flags;
+ int i;
spin_lock_irqsave(&dev_priv->error_lock, flags);
if (!dev_priv->first_error) {
@@ -368,6 +399,7 @@ static int i915_error_state(struct seq_file *m, void *unused)
seq_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
error->time.tv_usec);
+ seq_printf(m, "PCI ID: 0x%04x\n", dev->pci_device);
seq_printf(m, "EIR: 0x%08x\n", error->eir);
seq_printf(m, " PGTBL_ER: 0x%08x\n", error->pgtbl_er);
seq_printf(m, " INSTPM: 0x%08x\n", error->instpm);
@@ -379,6 +411,57 @@ static int i915_error_state(struct seq_file *m, void *unused)
seq_printf(m, " INSTPS: 0x%08x\n", error->instps);
seq_printf(m, " INSTDONE1: 0x%08x\n", error->instdone1);
}
+ seq_printf(m, "seqno: 0x%08x\n", error->seqno);
+
+ if (error->active_bo_count) {
+ int i;
+
+ seq_printf(m, "Buffers [%d]:\n", error->active_bo_count);
+
+ for (i = 0; i < error->active_bo_count; i++) {
+ seq_printf(m, " %08x %8zd %08x %08x %08x%s%s%s%s",
+ error->active_bo[i].gtt_offset,
+ error->active_bo[i].size,
+ error->active_bo[i].read_domains,
+ error->active_bo[i].write_domain,
+ error->active_bo[i].seqno,
+ pin_flag(error->active_bo[i].pinned),
+ tiling_flag(error->active_bo[i].tiling),
+ dirty_flag(error->active_bo[i].dirty),
+ purgeable_flag(error->active_bo[i].purgeable));
+
+ if (error->active_bo[i].name)
+ seq_printf(m, " (name: %d)", error->active_bo[i].name);
+ if (error->active_bo[i].fence_reg != I915_FENCE_REG_NONE)
+ seq_printf(m, " (fence: %d)", error->active_bo[i].fence_reg);
+
+ seq_printf(m, "\n");
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(error->batchbuffer); i++) {
+ if (error->batchbuffer[i] &&
+ i915_gem_object_get_pages(error->batchbuffer[i], 0) == 0) {
+ struct drm_gem_object *obj = error->batchbuffer[i];
+ struct drm_i915_gem_object *obj_priv = obj->driver_private;
+
+ seq_printf(m, "--- gtt_offset = 0x%08x\n", obj_priv->gtt_offset);
+ i915_dump_pages(m, obj_priv->pages, obj->size / PAGE_SIZE);
+
+ i915_gem_object_put_pages(obj);
+ }
+ }
+
+ if (error->ringbuffer &&
+ i915_gem_object_get_pages(error->ringbuffer, 0) == 0) {
+ struct drm_gem_object *obj = error->ringbuffer;
+ struct drm_i915_gem_object *obj_priv = obj->driver_private;
+
+ seq_printf(m, "--- ringbuffer = 0x%08x\n", obj_priv->gtt_offset);
+ i915_dump_pages(m, obj_priv->pages, obj->size / PAGE_SIZE);
+
+ i915_gem_object_put_pages(obj);
+ }
out:
spin_unlock_irqrestore(&dev_priv->error_lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 3afe361..6228dad 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1642,6 +1642,8 @@ int i915_driver_unload(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
+ i915_destroy_error_state(dev);
+
destroy_workqueue(dev_priv->wq);
del_timer_sync(&dev_priv->hangcheck_timer);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2e493ec..c8a1143 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -138,6 +138,9 @@ struct sdvo_device_mapping {
};
struct drm_i915_error_state {
+ struct work_struct work;
+ struct drm_device *dev;
+
u32 eir;
u32 pgtbl_er;
u32 pipeastat;
@@ -150,7 +153,24 @@ struct drm_i915_error_state {
u32 instps;
u32 instdone1;
u32 seqno;
+ u64 bbaddr;
struct timeval time;
+ struct drm_gem_object *ringbuffer;
+ struct drm_gem_object *batchbuffer[2];
+ struct drm_i915_error_buffer {
+ size_t size;
+ u32 name;
+ u32 seqno;
+ u32 gtt_offset;
+ u32 read_domains;
+ u32 write_domain;
+ u32 fence_reg;
+ s32 pinned:2;
+ u32 tiling:2;
+ u32 dirty:1;
+ u32 purgeable:1;
+ } *active_bo;
+ u32 active_bo_count;
};
struct drm_i915_display_funcs {
@@ -768,6 +788,7 @@ extern int i965_reset(struct drm_device *dev, u8 flags);
/* i915_irq.c */
void i915_hangcheck_elapsed(unsigned long data);
+void i915_destroy_error_state(struct drm_device *dev);
extern int i915_irq_emit(struct drm_device *dev, void *data,
struct drm_file *file_priv);
extern int i915_irq_wait(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 8b35f5e..3f7e85a 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -432,6 +432,121 @@ static void i915_error_work_func(struct work_struct *work)
}
}
+static struct drm_gem_object *
+clone_obj(struct drm_device *dev,
+ struct drm_gem_object *src)
+{
+ struct drm_gem_object *dst;
+ struct drm_i915_gem_object *src_priv, *dst_priv;
+ int page, page_count;
+
+ dst = drm_gem_object_alloc(dev, src->size);
+ if (dst == NULL)
+ return NULL;
+
+ if (i915_gem_object_get_pages(src, 0))
+ goto error_unref;
+
+ if (i915_gem_object_get_pages(dst, 0))
+ goto error_src;
+
+ src_priv = src->driver_private;
+ dst_priv = dst->driver_private;
+
+ page_count = src->size / PAGE_SIZE;
+ for (page = 0; page < page_count; page++) {
+ void *s = kmap_atomic(src_priv->pages[page], KM_USER0);
+ void *d = kmap_atomic(dst_priv->pages[page], KM_USER1);
+ memcpy(d, s, PAGE_SIZE);
+ kunmap_atomic(d, KM_USER1);
+ kunmap_atomic(s, KM_USER0);
+ }
+
+ i915_gem_object_put_pages(dst);
+ i915_gem_object_put_pages(src);
+
+ /* We lie here, but it makes later analysis easier. */
+ dst_priv->gtt_offset = src_priv->gtt_offset;
+
+ return dst;
+
+error_src:
+ i915_gem_object_put_pages(src);
+error_unref:
+ drm_gem_object_unreference(dst);
+ return NULL;
+}
+
+static void i915_error_state_free(struct drm_device *dev,
+ struct drm_i915_error_state *error)
+{
+ if (error->batchbuffer[0] || error->batchbuffer[1] || error->ringbuffer) {
+ mutex_lock(&dev->struct_mutex);
+ drm_gem_object_unreference(error->ringbuffer);
+ drm_gem_object_unreference(error->batchbuffer[0]);
+ drm_gem_object_unreference(error->batchbuffer[1]);
+ mutex_unlock(&dev->struct_mutex);
+ }
+
+ kfree(error->active_bo);
+ kfree(error);
+}
+
+static u32
+i915_get_bbaddr(struct drm_device *dev, u32 *ring)
+{
+ u32 cmd;
+
+ if (IS_I830(dev) || IS_845G(dev))
+ cmd = MI_BATCH_BUFFER;
+ else if (IS_I965G(dev))
+ cmd = (MI_BATCH_BUFFER_START | (2 << 6) |
+ MI_BATCH_NON_SECURE_I965);
+ else
+ cmd = (MI_BATCH_BUFFER_START | (2 << 6));
+
+ return ring[0] == cmd ? ring[1] : 0;
+}
+
+static u32
+i915_ringbuffer_last_batch(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ u32 head, bbaddr;
+ u32 *ring;
+
+ /* Locate the current position in the ringbuffer and walk back
+ * to find the most recently dispatched batch buffer.
+ */
+ bbaddr = 0;
+ head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
+ ring = (u32 *)(dev_priv->ring.virtual_start + head);
+
+ while (--ring > (u32 *)dev_priv->ring.virtual_start) {
+ bbaddr = i915_get_bbaddr(dev, ring);
+ if (bbaddr)
+ break;
+ }
+
+ if (bbaddr == 0) {
+ ring = (u32 *)(dev_priv->ring.virtual_start + dev_priv->ring.Size);
+ while (--ring > (u32 *)dev_priv->ring.virtual_start) {
+ bbaddr = i915_get_bbaddr(dev, ring);
+ if (bbaddr)
+ break;
+ }
+ }
+
+ return bbaddr;
+}
+
+static void i915_error_state_free_work_fn(struct work_struct *work)
+{
+ struct drm_i915_error_state *error =
+ container_of(work, struct drm_i915_error_state, work);
+ i915_error_state_free(error->dev, error);
+}
+
/**
* i915_capture_error_state - capture an error record for later analysis
* @dev: drm device
@@ -444,19 +559,26 @@ static void i915_error_work_func(struct work_struct *work)
static void i915_capture_error_state(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
+ struct drm_i915_gem_object *obj_priv;
struct drm_i915_error_state *error;
unsigned long flags;
+ u32 bbaddr;
+ int count;
spin_lock_irqsave(&dev_priv->error_lock, flags);
- if (dev_priv->first_error)
- goto out;
+ error = dev_priv->first_error;
+ spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+ if (error)
+ return;
error = kmalloc(sizeof(*error), GFP_ATOMIC);
if (!error) {
- DRM_DEBUG_DRIVER("out ot memory, not capturing error state\n");
- goto out;
+ DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
+ return;
}
+ error->dev = dev;
+ error->seqno = i915_get_gem_seqno(dev);
error->eir = I915_READ(EIR);
error->pgtbl_er = I915_READ(PGTBL_ER);
error->pipeastat = I915_READ(PIPEASTAT);
@@ -467,6 +589,7 @@ static void i915_capture_error_state(struct drm_device *dev)
error->ipehr = I915_READ(IPEHR);
error->instdone = I915_READ(INSTDONE);
error->acthd = I915_READ(ACTHD);
+ error->bbaddr = 0;
} else {
error->ipeir = I915_READ(IPEIR_I965);
error->ipehr = I915_READ(IPEHR_I965);
@@ -474,14 +597,111 @@ static void i915_capture_error_state(struct drm_device *dev)
error->instps = I915_READ(INSTPS);
error->instdone1 = I915_READ(INSTDONE1);
error->acthd = I915_READ(ACTHD_I965);
+ error->bbaddr = I915_READ64(BB_ADDR);
}
- do_gettimeofday(&error->time);
+ bbaddr = i915_ringbuffer_last_batch(dev);
+
+ /* Grab the current batchbuffer, most likely to have crashed. */
+ error->batchbuffer[0] = NULL;
+ error->batchbuffer[1] = NULL;
+ count = 0;
+ spin_lock_irqsave(&dev_priv->mm.active_list_lock, flags);
+ list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+ struct drm_gem_object *obj = obj_priv->obj;
+
+ if (error->batchbuffer[0] == NULL &&
+ bbaddr >= obj_priv->gtt_offset &&
+ bbaddr < obj_priv->gtt_offset + obj->size)
+ error->batchbuffer[0] = obj;
+
+ if (error->batchbuffer[1] == NULL &&
+ error->acthd >= obj_priv->gtt_offset &&
+ error->acthd < obj_priv->gtt_offset + obj->size &&
+ error->batchbuffer[0] != obj)
+ error->batchbuffer[1] = obj;
+
+ count++;
+ }
+ spin_unlock_irqrestore(&dev_priv->mm.active_list_lock, flags);
+
+ /* We need to copy these to an anonymous buffer as the simplest
+ * method to avoid being overwritten by userpace.
+ */
+ if (error->batchbuffer[0])
+ error->batchbuffer[0] = clone_obj(dev, error->batchbuffer[0]);
+ if (error->batchbuffer[1])
+ error->batchbuffer[1] = clone_obj(dev, error->batchbuffer[1]);
+
+ /* Record the ringbuffer */
+ error->ringbuffer = clone_obj(dev, dev_priv->ring.ring_obj);
+
+ /* Record buffers on the active list. */
+ error->active_bo = NULL;
+ error->active_bo_count = 0;
+
+ if (count)
+ error->active_bo = kmalloc(sizeof(*error->active_bo)*count, GFP_ATOMIC);
+
+ if (error->active_bo) {
+ int i;
+
+ i = 0;
+ spin_lock_irqsave(&dev_priv->mm.active_list_lock, flags);
+ list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+ struct drm_gem_object *obj = obj_priv->obj;
+
+ error->active_bo[i].size = obj->size;
+ error->active_bo[i].name = obj->name;
+ error->active_bo[i].seqno = obj_priv->last_rendering_seqno;
+ error->active_bo[i].gtt_offset = obj_priv->gtt_offset;
+ error->active_bo[i].read_domains = obj->read_domains;
+ error->active_bo[i].write_domain = obj->write_domain;
+ error->active_bo[i].fence_reg = obj_priv->fence_reg;
+ error->active_bo[i].pinned = 0;
+ if (obj_priv->pin_count > 0)
+ error->active_bo[i].pinned = 1;
+ if (obj_priv->user_pin_count > 0)
+ error->active_bo[i].pinned = -1;
+ error->active_bo[i].tiling = obj_priv->tiling_mode;
+ error->active_bo[i].dirty = obj_priv->dirty;
+ error->active_bo[i].purgeable = obj_priv->madv != I915_MADV_WILLNEED;
+
+ if (++i == count)
+ break;
+ }
+ spin_unlock_irqrestore(&dev_priv->mm.active_list_lock, flags);
+ error->active_bo_count = i;
+ }
- dev_priv->first_error = error;
+ do_gettimeofday(&error->time);
-out:
+ spin_lock_irqsave(&dev_priv->error_lock, flags);
+ if (dev_priv->first_error == NULL) {
+ dev_priv->first_error = error;
+ error = NULL;
+ }
spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+ if (error) {
+ /* as we are currently in IRQ context we need to free from a workqueue */
+ INIT_WORK(&error->work, i915_error_state_free_work_fn);
+ queue_work(dev_priv->wq, &error->work);
+ }
+}
+
+void i915_destroy_error_state(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ struct drm_i915_error_state *error;
+
+ spin_lock(&dev_priv->error_lock);
+ error = dev_priv->first_error;
+ dev_priv->first_error = NULL;
+ spin_unlock(&dev_priv->error_lock);
+
+ if (error)
+ i915_error_state_free(dev, error);
}
/**
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 3ecb00b..2ad81bd 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -325,6 +325,7 @@
#define CM0_COLOR_EVICT_DISABLE (1<<3)
#define CM0_DEPTH_WRITE_DISABLE (1<<1)
#define CM0_RC_OP_FLUSH_DISABLE (1<<0)
+#define BB_ADDR 0x02140 /* 8 bytes */
#define GFX_FLSH_CNTL 0x02170 /* 915+ only */
--
1.6.6.2
More information about the Intel-gfx
mailing list