[Intel-gfx] [PATCH 1/1] drm/i915: track first and last processes that touch gem objects

Eugeni Dodonov eugeni.dodonov at intel.com
Fri Feb 3 16:31:39 CET 2012


This hopefully makes it possible to find out who was responsible for the GPU death.
We record the 1st and last process to touch each object, to keep track of
the process which created the object originally and the last process to
touch it.

To simplify post-mortem analysis, we also look up the process names
when gathering the i915_error_state and when peeking at the list of active
gem objects in debugfs. This is not perfect for tracking all the
processes, as they can quit or die before their batchbuffers get executed,
but tracking them during the entire object lifetime would be
excessively memcpy hungry.

v2: also track objects accessed via mmap or pwrite.

CC: Konstantin Belousov <kostikbel at gmail.com>
CC: Eric Anholt <eric at anholt.net>
CC: Daniel Vetter <daniel at ffwll.ch>
CC: Ben Widawsky <ben at bwidawsk.net>
Signed-off-by: Eugeni Dodonov <eugeni.dodonov at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |   45 ++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h            |    5 +++
 drivers/gpu/drm/i915/i915_gem.c            |   12 +++++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    6 ++++
 drivers/gpu/drm/i915/i915_irq.c            |   22 +++++++++++++
 5 files changed, 90 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a017b98..2eb28d2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -121,6 +121,7 @@ static const char *cache_level_str(int type)
 static void
 describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 {
+	int i;
 	seq_printf(m, "%p: %s%s %8zdKiB %04x %04x %d %d%s%s%s",
 		   &obj->base,
 		   get_pin_flag(obj),
@@ -151,6 +152,28 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 	}
 	if (obj->ring != NULL)
 		seq_printf(m, " (%s)", obj->ring->name);
+
+	/* Describe 1st and last process to touch the object */
+	for (i=0; i < 2; i++) {
+		struct pid *p;
+		struct task_struct *tsk = NULL;
+
+		/* Skip objects that have no associated pid */
+		if (!obj->pid[i])
+			continue;
+
+		p = find_get_pid(obj->pid[i]);
+		if (p) {
+			tsk = get_pid_task(p, PIDTYPE_PID);
+			put_pid(p);
+		}
+
+		seq_printf(m, " (pid_%s: %5d [%s])",
+				(i==0) ? "first" : "last",
+				obj->pid[i],
+				(tsk) ? tsk->comm : "unknown");
+
+	}
 }
 
 static int i915_gem_object_list_info(struct seq_file *m, void *data)
@@ -710,6 +733,7 @@ static void print_error_buffers(struct seq_file *m,
 				struct drm_i915_error_buffer *err,
 				int count)
 {
+	int i;
 	seq_printf(m, "%s [%d]:\n", name, count);
 
 	while (count--) {
@@ -731,6 +755,27 @@ static void print_error_buffers(struct seq_file *m,
 		if (err->fence_reg != I915_FENCE_REG_NONE)
 			seq_printf(m, " (fence: %d)", err->fence_reg);
 
+		/* Describe 1st and last process to touch the object */
+		for (i=0; i < 2; i++) {
+			struct pid *p;
+			struct task_struct *tsk = NULL;
+
+			/* Skip objects that have no associated pid */
+			if (!err->pid[i])
+				continue;
+
+			p = find_get_pid(err->pid[i]);
+			if (p)
+				tsk = get_pid_task(p, PIDTYPE_PID);
+
+			seq_printf(m, " (pid_%s: %5d [%s])",
+					(i==0) ? "first" : "last",
+					err->pid[i],
+					(tsk) ? tsk->comm : "unknown");
+
+			put_pid(p);
+		}
+
 		seq_printf(m, "\n");
 		err++;
 	}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9689ca3..9711ff0a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -193,6 +193,8 @@ struct drm_i915_error_state {
 		u32 purgeable:1;
 		u32 ring:4;
 		u32 cache_level:2;
+		u32 pid[2];
+		char comm[2][TASK_COMM_LEN];
 	} *active_bo, *pinned_bo;
 	u32 active_bo_count, pinned_bo_count;
 	struct intel_overlay_error_state *overlay;
@@ -891,6 +893,9 @@ struct drm_i915_gem_object {
 	/** for phy allocated objects */
 	struct drm_i915_gem_phys_object *phys_obj;
 
+	/** pid of first and last process to touch the object */
+	uint32_t pid[2];
+
 	/**
 	 * Number of crtcs where this object is currently the fb, but
 	 * will be page flipped away on the next vblank.  When it
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e55badb..135b387 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -988,6 +988,11 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 		goto out;
 	}
 
+	/* Discover pid of caller process */
+	if (!obj->pid[0])
+		obj->pid[0] = file->pid;
+	obj->pid[1] = file->pid;
+
 	trace_i915_gem_object_pwrite(obj, args->offset, args->size);
 
 	/* We can only do the GTT pwrite on untiled buffers, as otherwise
@@ -1144,6 +1149,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_mmap *args = data;
 	struct drm_gem_object *obj;
+	struct drm_i915_gem_object *i915_obj;
 	unsigned long addr;
 
 	if (!(dev->driver->driver_features & DRIVER_GEM))
@@ -1158,6 +1164,12 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 		return -E2BIG;
 	}
 
+	/* Discover pid of caller process */
+	i915_obj = to_intel_bo(obj);
+	if (!i915_obj->pid[0])
+		i915_obj->pid[0] = file->pid;
+	i915_obj->pid[1] = file->pid;
+
 	down_write(&current->mm->mmap_sem);
 	addr = do_mmap(obj->filp, 0, args->size,
 		       PROT_READ | PROT_WRITE, MAP_SHARED,
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 65e1f00..188893d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1140,6 +1140,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		list_add_tail(&obj->exec_list, &objects);
 		obj->exec_handle = exec[i].handle;
 		obj->exec_entry = &exec[i];
+
+		/* Discover pid of caller process */
+		if (!obj->pid[0])
+			obj->pid[0] = file->pid;
+		obj->pid[1] = file->pid;
+
 		eb_add_object(eb, obj);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5bd4361..4b6fb56 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -789,8 +789,30 @@ static u32 capture_bo_list(struct drm_i915_error_buffer *err,
 	int i = 0;
 
 	list_for_each_entry(obj, head, mm_list) {
+		struct pid *p = NULL;
+		struct task_struct *tsk = NULL;
+		int j;
+
 		err->size = obj->base.size;
 		err->name = obj->base.name;
+
+		/* Record processes which touched this object and collect their
+		 * names to simplify further analysis.
+		 */
+		for (j=0; j < 2; j++) {
+			err->pid[j] = obj->pid[j];
+
+			if (err->pid[j])
+				p = find_get_pid(err->pid[j]);
+			if (p) {
+				tsk = get_pid_task(p, PIDTYPE_PID);
+				put_pid(p);
+			}
+
+			snprintf(err->comm[j], TASK_COMM_LEN,
+					(tsk) ? tsk->comm : "unknown");
+		}
+
 		err->seqno = obj->last_rendering_seqno;
 		err->gtt_offset = obj->gtt_offset;
 		err->read_domains = obj->base.read_domains;
-- 
1.7.8.1




More information about the Intel-gfx mailing list