[PATCH 1/2] drm/i915: Drop rq->ring->vma peeking from error capture

Fri Apr 24 19:38:29 UTC 2020

We only hold the active spinlock while dumping the error state, and this
does not prevent another thread from retiring the request -- as it is
quite possible that despite us capturing the current state, the GPU has
completed the request. As such, it is dangerous to dereference state
below the request as it may already be freed, and the simplest way to
avoid the danger is not include it in the error state.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1788
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Cc: Andi Shyti <andi.shyti at intel.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 18 ++++++++++++------
 drivers/gpu/drm/i915/i915_gpu_error.h |  1 -
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 424ad975a360..6dd2fc0b0d47 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -467,14 +467,14 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
 	if (!erq->seqno)
 		return;
 
-	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, start %08x, head %08x, tail %08x\n",
+	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, head %08x, tail %08x\n",
 		   prefix, erq->pid, erq->context, erq->seqno,
 		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 			    &erq->flags) ? "!" : "",
 		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
 			    &erq->flags) ? "+" : "",
 		   erq->sched_attr.priority,
-		   erq->start, erq->head, erq->tail);
+		   erq->head, erq->tail);
 }
 
 static void error_print_context(struct drm_i915_error_state_buf *m,
@@ -1204,16 +1204,18 @@ static void engine_record_registers(struct intel_engine_coredump *ee)
 	}
 }
 
-static void record_request(const struct i915_request *request,
+static bool record_request(const struct i915_request *request,
 			   struct i915_request_coredump *erq)
 {
 	const struct i915_gem_context *ctx;
 
+	if (i915_request_completed(request))
+		return false;
+
 	erq->flags = request->fence.flags;
 	erq->context = request->fence.context;
 	erq->seqno = request->fence.seqno;
 	erq->sched_attr = request->sched.attr;
-	erq->start = i915_ggtt_offset(request->ring->vma);
 	erq->head = request->head;
 	erq->tail = request->tail;
 
@@ -1223,6 +1225,8 @@ static void record_request(const struct i915_request *request,
 	if (ctx)
 		erq->pid = pid_nr(ctx->pid);
 	rcu_read_unlock();
+
+	return true;
 }
 
 static void engine_record_execlists(struct intel_engine_coredump *ee)
@@ -1231,8 +1235,10 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
 	struct i915_request * const *port = el->active;
 	unsigned int n = 0;
 
-	while (*port)
-		record_request(*port++, &ee->execlist[n++]);
+	while (*port) {
+		if (record_request(*port++, &ee->execlist[n]))
+			n++;
+	}
 
 	ee->num_ports = n;
 }
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 0d1f6c8ff355..fa2d82a6de04 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -50,7 +50,6 @@ struct i915_request_coredump {
 	pid_t pid;
 	u32 context;
 	u32 seqno;
-	u32 start;
 	u32 head;
 	u32 tail;
 	struct i915_sched_attr sched_attr;
-- 
2.20.1