[Intel-gfx] [PATCH 1/2] drm/i915: Allow error capture without a request

Tue Dec 13 01:52:23 UTC 2022

On Tue, Nov 29, 2022 at 01:12:52PM -0800, John.C.Harrison at Intel.com wrote:
>From: John Harrison <John.C.Harrison at Intel.com>
>
>There was a report of error captures occurring without any hung
>context being indicated despite the capture being initiated by a 'hung
>context notification' from GuC. The problem was not reproducible.
>However, it is possible to happen if the context in question has no
>active requests. For example, if the hang was in the context switch
>itself then the breadcrumb write would have occurred and the KMD would
>see an idle context.
>
>In the interests of attempting to provide as much information as
>possible about a hang, it seems wise to include the engine info
>regardless of whether a request was found or not. As opposed to just
>prentending there was no hang at all.
>
>So update the error capture code to always record engine information
>if an engine is given. Which means updating record_context() to take a
>context instead of a request (which it only ever used to find the
>context anyway). And split the request agnostic parts of
>intel_engine_coredump_add_request() out into a seaprate function.
>
>Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>---
> drivers/gpu/drm/i915/i915_gpu_error.c | 55 +++++++++++++++++++--------
> 1 file changed, 40 insertions(+), 15 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>index 9d5d5a397b64e..2ed1c84c9fab4 100644
>--- a/drivers/gpu/drm/i915/i915_gpu_error.c
>+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>@@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
> }
>
> static bool record_context(struct i915_gem_context_coredump *e,
>-			   const struct i915_request *rq)
>+			   struct intel_context *ce)
> {
> 	struct i915_gem_context *ctx;
> 	struct task_struct *task;
> 	bool simulated;
>
> 	rcu_read_lock();
>-	ctx = rcu_dereference(rq->context->gem_context);
>+	ctx = rcu_dereference(ce->gem_context);
> 	if (ctx && !kref_get_unless_zero(&ctx->ref))
> 		ctx = NULL;
> 	rcu_read_unlock();
>@@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
> 	e->guilty = atomic_read(&ctx->guilty_count);
> 	e->active = atomic_read(&ctx->active_count);
>
>-	e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
>-	e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
>+	e->total_runtime = intel_context_get_total_runtime_ns(ce);
>+	e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>
> 	simulated = i915_gem_context_no_error_capture(ctx);
>
>@@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
> 	return ee;
> }
>
>+static struct intel_engine_capture_vma *
>+engine_coredump_add_context(struct intel_engine_coredump *ee,
>+			    struct intel_context *ce,
>+			    gfp_t gfp)
>+{
>+	struct intel_engine_capture_vma *vma = NULL;
>+
>+	ee->simulated |= record_context(&ee->context, ce);
>+	if (ee->simulated)
>+		return NULL;
>+
>+	/*
>+	 * We need to copy these to an anonymous buffer
>+	 * as the simplest method to avoid being overwritten
>+	 * by userspace.
>+	 */
>+	vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>+	vma = capture_vma(vma, ce->state, "HW context", gfp);
>+
>+	return vma;
>+}
>+
> struct intel_engine_capture_vma *
> intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
> 				  struct i915_request *rq,
> 				  gfp_t gfp)
> {
>-	struct intel_engine_capture_vma *vma = NULL;
>+	struct intel_engine_capture_vma *vma;
>
>-	ee->simulated |= record_context(&ee->context, rq);
>-	if (ee->simulated)
>+	vma = engine_coredump_add_context(ee, rq->context, gfp);
>+	if (!vma)
> 		return NULL;
>
> 	/*
>@@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
> 	 */
> 	vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
> 	vma = capture_user(vma, rq, gfp);
>-	vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>-	vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>
> 	ee->rq_head = rq->head;
> 	ee->rq_post = rq->postfix;
>@@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine,
> 	if (ce) {
> 		intel_engine_clear_hung_context(engine);
> 		rq = intel_context_find_active_request(ce);
>-		if (!rq || !i915_request_started(rq))
>-			goto no_request_capture;
>+		if (rq && !i915_request_started(rq)) {
>+			drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",
>+				 engine->name);
>+			rq = NULL;
>+		}
> 	} else {
> 		/*
> 		 * Getting here with GuC enabled means it is a forced error capture
>@@ -1625,12 +1648,14 @@ capture_engine(struct intel_engine_cs *engine,
> 	if (rq)
> 		rq = i915_request_get_rcu(rq);
>
>-	if (!rq)
>-		goto no_request_capture;
>+	if (rq)
>+		capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);

2 back-to-back if (rq) could merge together,


otherwise, lgtm

Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>

Umesh
>+	else if (ce)
>+		capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>
>-	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> 	if (!capture) {
>-		i915_request_put(rq);
>+		if (rq)
>+			i915_request_put(rq);
> 		goto no_request_capture;
> 	}
> 	if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>-- 
>2.37.3
>