[Intel-gfx] [PATCH 1/2] drm/i915: Allow error capture without a request
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Tue Dec 13 01:52:23 UTC 2022
On Tue, Nov 29, 2022 at 01:12:52PM -0800, John.C.Harrison at Intel.com wrote:
>From: John Harrison <John.C.Harrison at Intel.com>
>
>There was a report of error captures occurring without any hung
>context being indicated despite the capture being initiated by a 'hung
>context notification' from GuC. The problem was not reproducible.
>However, it is possible to happen if the context in question has no
>active requests. For example, if the hang was in the context switch
>itself then the breadcrumb write would have occurred and the KMD would
>see an idle context.
>
>In the interests of attempting to provide as much information as
>possible about a hang, it seems wise to include the engine info
>regardless of whether a request was found or not. As opposed to just
>prentending there was no hang at all.
>
>So update the error capture code to always record engine information
>if an engine is given. Which means updating record_context() to take a
>context instead of a request (which it only ever used to find the
>context anyway). And split the request agnostic parts of
>intel_engine_coredump_add_request() out into a seaprate function.
>
>Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>---
> drivers/gpu/drm/i915/i915_gpu_error.c | 55 +++++++++++++++++++--------
> 1 file changed, 40 insertions(+), 15 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>index 9d5d5a397b64e..2ed1c84c9fab4 100644
>--- a/drivers/gpu/drm/i915/i915_gpu_error.c
>+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>@@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
> }
>
> static bool record_context(struct i915_gem_context_coredump *e,
>- const struct i915_request *rq)
>+ struct intel_context *ce)
> {
> struct i915_gem_context *ctx;
> struct task_struct *task;
> bool simulated;
>
> rcu_read_lock();
>- ctx = rcu_dereference(rq->context->gem_context);
>+ ctx = rcu_dereference(ce->gem_context);
> if (ctx && !kref_get_unless_zero(&ctx->ref))
> ctx = NULL;
> rcu_read_unlock();
>@@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
> e->guilty = atomic_read(&ctx->guilty_count);
> e->active = atomic_read(&ctx->active_count);
>
>- e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
>- e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
>+ e->total_runtime = intel_context_get_total_runtime_ns(ce);
>+ e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>
> simulated = i915_gem_context_no_error_capture(ctx);
>
>@@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
> return ee;
> }
>
>+static struct intel_engine_capture_vma *
>+engine_coredump_add_context(struct intel_engine_coredump *ee,
>+ struct intel_context *ce,
>+ gfp_t gfp)
>+{
>+ struct intel_engine_capture_vma *vma = NULL;
>+
>+ ee->simulated |= record_context(&ee->context, ce);
>+ if (ee->simulated)
>+ return NULL;
>+
>+ /*
>+ * We need to copy these to an anonymous buffer
>+ * as the simplest method to avoid being overwritten
>+ * by userspace.
>+ */
>+ vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>+ vma = capture_vma(vma, ce->state, "HW context", gfp);
>+
>+ return vma;
>+}
>+
> struct intel_engine_capture_vma *
> intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
> struct i915_request *rq,
> gfp_t gfp)
> {
>- struct intel_engine_capture_vma *vma = NULL;
>+ struct intel_engine_capture_vma *vma;
>
>- ee->simulated |= record_context(&ee->context, rq);
>- if (ee->simulated)
>+ vma = engine_coredump_add_context(ee, rq->context, gfp);
>+ if (!vma)
> return NULL;
>
> /*
>@@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
> */
> vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
> vma = capture_user(vma, rq, gfp);
>- vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>- vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>
> ee->rq_head = rq->head;
> ee->rq_post = rq->postfix;
>@@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs *engine,
> if (ce) {
> intel_engine_clear_hung_context(engine);
> rq = intel_context_find_active_request(ce);
>- if (!rq || !i915_request_started(rq))
>- goto no_request_capture;
>+ if (rq && !i915_request_started(rq)) {
>+ drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",
>+ engine->name);
>+ rq = NULL;
>+ }
> } else {
> /*
> * Getting here with GuC enabled means it is a forced error capture
>@@ -1625,12 +1648,14 @@ capture_engine(struct intel_engine_cs *engine,
> if (rq)
> rq = i915_request_get_rcu(rq);
>
>- if (!rq)
>- goto no_request_capture;
>+ if (rq)
>+ capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
2 back-to-back if (rq) could merge together,
otherwise, lgtm
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
Umesh
>+ else if (ce)
>+ capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>
>- capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> if (!capture) {
>- i915_request_put(rq);
>+ if (rq)
>+ i915_request_put(rq);
> goto no_request_capture;
> }
> if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>--
>2.37.3
>
More information about the dri-devel
mailing list