[Intel-gfx] [PATCH 2/3] drm/i915: Record pid/comm of hanging task
Mika Kuoppala
mika.kuoppala at linux.intel.com
Mon Feb 10 15:30:50 CET 2014
From: Chris Wilson <chris at chris-wilson.co.uk>
After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.
This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.
v2: Also capture the workaround buffer for gen2, so that we can compare
its contents against the intended batch for the active request.
v3: Rebase (Mika)
Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk> (v2)
Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com> (v3)
Cc: Ben Widawsky <ben at bwidawsk.net>
---
drivers/gpu/drm/i915/i915_drv.h | 6 +-
drivers/gpu/drm/i915/i915_gem.c | 1 +
drivers/gpu/drm/i915/i915_gpu_error.c | 121 ++++++++++++++++++---------------
3 files changed, 74 insertions(+), 54 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index eb260bc..2a61a29 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -356,7 +356,7 @@ struct drm_i915_error_state {
int page_count;
u32 gtt_offset;
u32 *pages[0];
- } *ringbuffer, *batchbuffer, *ctx, *hws_page;
+ } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
struct drm_i915_error_request {
long jiffies;
@@ -371,6 +371,9 @@ struct drm_i915_error_state {
u32 pp_dir_base;
};
} vm_info;
+
+ pid_t pid;
+ char comm[TASK_COMM_LEN];
} ring[I915_NUM_RINGS];
struct drm_i915_error_buffer {
u32 size;
@@ -1786,6 +1789,7 @@ struct drm_i915_gem_request {
struct drm_i915_file_private {
struct drm_i915_private *dev_priv;
+ struct drm_file *file;
struct {
spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 20e55ef..efbd1dc 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4883,6 +4883,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
file->driver_priv = file_priv;
file_priv->dev_priv = dev->dev_private;
+ file_priv->file = file;
spin_lock_init(&file_priv->mm.lock);
INIT_LIST_HEAD(&file_priv->mm.request_list);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5bd075a..a90971a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -301,13 +301,28 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
va_end(args);
}
+static void print_error_obj(struct drm_i915_error_state_buf *m,
+ struct drm_i915_error_object *obj)
+{
+ int page, offset, elt;
+
+ for (page = offset = 0; page < obj->page_count; page++) {
+ for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+ err_printf(m, "%08x : %08x\n", offset,
+ obj->pages[page][elt]);
+ offset += 4;
+ }
+ }
+}
+
int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
const struct i915_error_state_file_priv *error_priv)
{
struct drm_device *dev = error_priv->dev;
drm_i915_private_t *dev_priv = dev->dev_private;
struct drm_i915_error_state *error = error_priv->error;
- int i, j, page, offset, elt;
+ int i, j, offset, elt;
+ int max_hangcheck_score;
if (!error) {
err_printf(m, "no error state collected\n");
@@ -317,6 +332,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
error->time.tv_usec);
err_printf(m, "Kernel: " UTS_RELEASE "\n");
+ max_hangcheck_score = 0;
+ for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+ if (error->ring[i].hangcheck_score > max_hangcheck_score)
+ max_hangcheck_score = error->ring[i].hangcheck_score;
+ }
+ for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+ if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+ error->ring[i].pid != -1) {
+ err_printf(m, "Active process (on ring %s): %s [%d]\n",
+ ring_str(i),
+ error->ring[i].comm,
+ error->ring[i].pid);
+ }
+ }
err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
err_printf(m, "EIR: 0x%08x\n", error->eir);
err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -360,17 +389,19 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
struct drm_i915_error_object *obj;
if ((obj = error->ring[i].batchbuffer)) {
- err_printf(m, "%s --- gtt_offset = 0x%08x\n",
- dev_priv->ring[i].name,
+ err_puts(m, dev_priv->ring[i].name);
+ if (error->ring[i].pid != -1)
+ err_printf(m, " (submitted by %s [%d])",
+ error->ring[i].comm, error->ring[i].pid);
+ err_printf(m, " --- gtt_offset = 0x%08x\n",
obj->gtt_offset);
- offset = 0;
- for (page = 0; page < obj->page_count; page++) {
- for (elt = 0; elt < PAGE_SIZE/4; elt++) {
- err_printf(m, "%08x : %08x\n", offset,
- obj->pages[page][elt]);
- offset += 4;
- }
- }
+ print_error_obj(m, obj);
+ }
+
+ if ((obj = error->ring[i].wa_batchbuffer)) {
+ err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n",
+ dev_priv->ring[i].name, obj->gtt_offset);
+ print_error_obj(m, obj);
}
if (error->ring[i].num_requests) {
@@ -389,15 +420,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
err_printf(m, "%s --- ringbuffer = 0x%08x\n",
dev_priv->ring[i].name,
obj->gtt_offset);
- offset = 0;
- for (page = 0; page < obj->page_count; page++) {
- for (elt = 0; elt < PAGE_SIZE/4; elt++) {
- err_printf(m, "%08x : %08x\n",
- offset,
- obj->pages[page][elt]);
- offset += 4;
- }
- }
+ print_error_obj(m, obj);
}
if ((obj = error->ring[i].hws_page)) {
@@ -713,37 +736,6 @@ static void i915_gem_record_fences(struct drm_device *dev,
}
}
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
- struct intel_ring_buffer *ring)
-{
- struct drm_i915_gem_request *request;
-
- if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
- struct drm_i915_gem_object *obj;
- u32 acthd = I915_READ(ACTHD);
-
- if (WARN_ON(ring->id != RCS))
- return NULL;
-
- obj = ring->scratch.obj;
- if (obj != NULL &&
- acthd >= i915_gem_obj_ggtt_offset(obj) &&
- acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
- return i915_error_ggtt_object_create(dev_priv, obj);
- }
-
- request = i915_gem_find_active_request(ring);
- if (request) {
- /* We need to copy these to an anonymous buffer as the simplest
- * method to avoid being overwritten by userspace.
- */
- return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
- }
-
- return NULL;
-}
-
static void i915_record_ring_state(struct drm_device *dev,
struct intel_ring_buffer *ring,
struct drm_i915_error_ring *ering)
@@ -892,8 +884,31 @@ static void i915_gem_record_rings(struct drm_device *dev,
i915_record_ring_state(dev, ring, &error->ring[i]);
- error->ring[i].batchbuffer =
- i915_error_first_batchbuffer(dev_priv, ring);
+ error->ring[i].pid = -1;
+ request = i915_gem_find_active_request(ring);
+ if (request) {
+ /* We need to copy these to an anonymous buffer as the simplest
+ * method to avoid being overwritten by userspace.
+ */
+ error->ring[i].batchbuffer =
+ i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+
+ if (HAS_BROKEN_CS_TLB(dev_priv->dev) && ring->scratch.obj)
+ error->ring[i].wa_batchbuffer =
+ i915_error_ggtt_object_create(dev_priv, ring->scratch.obj);
+
+ if (request->file_priv) {
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = pid_task(request->file_priv->file->pid, PIDTYPE_PID);
+ if (task) {
+ strcpy(error->ring[i].comm, task->comm);
+ error->ring[i].pid = task->pid;
+ }
+ rcu_read_unlock();
+ }
+ }
error->ring[i].ringbuffer =
i915_error_ggtt_object_create(dev_priv, ring->obj);
--
1.7.9.5
More information about the Intel-gfx
mailing list