[Intel-gfx] [PATCH v3 3/6] drm/i915: Allow error capture without a request
Ceraolo Spurio, Daniele
daniele.ceraolospurio at intel.com
Fri Jan 20 01:54:55 UTC 2023
On 1/18/2023 10:49 PM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> There was a report of error captures occurring without any hung
> context being indicated despite the capture being initiated by a 'hung
> context notification' from GuC. The problem was not reproducible.
> However, it is possible to happen if the context in question has no
> active requests. For example, if the hang was in the context switch
> itself then the breadcrumb write would have occurred and the KMD would
> see an idle context.
>
> In the interests of attempting to provide as much information as
> possible about a hang, it seems wise to include the engine info
> regardless of whether a request was found or not. As opposed to just
> prentending there was no hang at all.
>
> So update the error capture code to always record engine information
> if a context is given. Which means updating record_context() to take a
> context instead of a request (which it only ever used to find the
> context anyway). And split the request agnostic parts of
> intel_engine_coredump_add_request() out into a seaprate function.
>
> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
> pointer.
> v3: Tidy up request locking code flow (Tvrtko)
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> Acked-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gpu_error.c | 70 ++++++++++++++++++---------
> 1 file changed, 48 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 78cf95e4dd230..743614fff5472 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct intel_engine_coredump *ee)
> }
>
> static bool record_context(struct i915_gem_context_coredump *e,
> - const struct i915_request *rq)
> + struct intel_context *ce)
> {
> struct i915_gem_context *ctx;
> struct task_struct *task;
> bool simulated;
>
> rcu_read_lock();
> - ctx = rcu_dereference(rq->context->gem_context);
> + ctx = rcu_dereference(ce->gem_context);
> if (ctx && !kref_get_unless_zero(&ctx->ref))
> ctx = NULL;
> rcu_read_unlock();
> @@ -1396,8 +1396,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
> e->guilty = atomic_read(&ctx->guilty_count);
> e->active = atomic_read(&ctx->active_count);
>
> - e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
> - e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
> + e->total_runtime = intel_context_get_total_runtime_ns(ce);
> + e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>
> simulated = i915_gem_context_no_error_capture(ctx);
>
> @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_
> return ee;
> }
>
> +static struct intel_engine_capture_vma *
> +engine_coredump_add_context(struct intel_engine_coredump *ee,
> + struct intel_context *ce,
> + gfp_t gfp)
> +{
> + struct intel_engine_capture_vma *vma = NULL;
> +
> + ee->simulated |= record_context(&ee->context, ce);
> + if (ee->simulated)
> + return NULL;
> +
> + /*
> + * We need to copy these to an anonymous buffer
> + * as the simplest method to avoid being overwritten
> + * by userspace.
> + */
> + vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
> + vma = capture_vma(vma, ce->state, "HW context", gfp);
> +
> + return vma;
> +}
> +
> struct intel_engine_capture_vma *
> intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
> struct i915_request *rq,
> gfp_t gfp)
> {
> - struct intel_engine_capture_vma *vma = NULL;
> + struct intel_engine_capture_vma *vma;
>
> - ee->simulated |= record_context(&ee->context, rq);
> - if (ee->simulated)
> + vma = engine_coredump_add_context(ee, rq->context, gfp);
> + if (!vma)
> return NULL;
>
> /*
> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
> */
> vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
> vma = capture_user(vma, rq, gfp);
> - vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
> - vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>
> ee->rq_head = rq->head;
> ee->rq_post = rq->postfix;
> @@ -1604,25 +1624,31 @@ capture_engine(struct intel_engine_cs *engine,
> return NULL;
>
> intel_get_hung_entity(engine, &ce, &rq);
> - if (!rq || !i915_request_started(rq))
> - goto no_request_capture;
> + if (rq && !i915_request_started(rq)) {
> + drm_info(&engine->gt->i915->drm, "Got hung context on %s with no active request!\n",
> + engine->name);
Shouldn't this print be inside the "else if" case below? otherwise if we
don't have a rq at all we won't see it.
> + i915_request_put(rq);
> + rq = NULL;
> + }
> +
> + if (rq) {
> + capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> + i915_request_put(rq);
> + } else if (ce) {
> + capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
> + }
>
> - capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
> - if (!capture)
> - goto no_request_capture;
> if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
> intel_guc_capture_get_matching_node(engine->gt, ee, ce);
Are you keeping this outside the "if (capture)" below to make sure we
consume the GuC engine capture even if we fail to produce the error
state? if so, a comment might be useful.
Daniele
>
> - intel_engine_coredump_add_vma(ee, capture, compress);
> - i915_request_put(rq);
> + if (capture) {
> + intel_engine_coredump_add_vma(ee, capture, compress);
> + } else {
> + kfree(ee);
> + ee = NULL;
> + }
>
> return ee;
> -
> -no_request_capture:
> - if (rq)
> - i915_request_put(rq);
> - kfree(ee);
> - return NULL;
> }
>
> static void
More information about the dri-devel
mailing list