[Intel-gfx] [PATCH v4 3/7] drm/i915: Allow error capture without a request

John Harrison john.c.harrison at intel.com
Wed Jan 25 00:56:49 UTC 2023


On 1/24/2023 16:39, Ceraolo Spurio, Daniele wrote:
> On 1/20/2023 3:28 PM, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> There was a report of error captures occurring without any hung
>> context being indicated despite the capture being initiated by a 'hung
>> context notification' from GuC. The problem was not reproducible.
>> However, it is possible to happen if the context in question has no
>> active requests. For example, if the hang was in the context switch
>> itself then the breadcrumb write would have occurred and the KMD would
>> see an idle context.
>>
>> In the interests of attempting to provide as much information as
>> possible about a hang, it seems wise to include the engine info
>> regardless of whether a request was found or not. As opposed to just
>> prentending there was no hang at all.
>>
>> So update the error capture code to always record engine information
>> if a context is given. Which means updating record_context() to take a
>> context instead of a request (which it only ever used to find the
>> context anyway). And split the request agnostic parts of
>> intel_engine_coredump_add_request() out into a seaprate function.
>>
>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of a null
>> pointer.
>> v3: Tidy up request locking code flow (Tvrtko)
>> v4: Pull in improved info message from next patch and fix up potential
>> leak of GuC register state (Daniele)
>
> In the very unlikely case that the capture fails, we're leaving the 
> data inside the GuC buffer. This is not new with this patch and not a 
> bug (that buffer is a ring and the stale data will be overwritten if 
> it gets full), but maybe something that can be improved as a follow-up.
Correct and correct.

John.

>
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>
> Daniele
>
>>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> Acked-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gpu_error.c | 74 ++++++++++++++++++---------
>>   1 file changed, 50 insertions(+), 24 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>> index b20bd6365615b..225f1b11a6b93 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> @@ -1370,14 +1370,14 @@ static void engine_record_execlists(struct 
>> intel_engine_coredump *ee)
>>   }
>>     static bool record_context(struct i915_gem_context_coredump *e,
>> -               const struct i915_request *rq)
>> +               struct intel_context *ce)
>>   {
>>       struct i915_gem_context *ctx;
>>       struct task_struct *task;
>>       bool simulated;
>>         rcu_read_lock();
>> -    ctx = rcu_dereference(rq->context->gem_context);
>> +    ctx = rcu_dereference(ce->gem_context);
>>       if (ctx && !kref_get_unless_zero(&ctx->ref))
>>           ctx = NULL;
>>       rcu_read_unlock();
>> @@ -1396,8 +1396,8 @@ static bool record_context(struct 
>> i915_gem_context_coredump *e,
>>       e->guilty = atomic_read(&ctx->guilty_count);
>>       e->active = atomic_read(&ctx->active_count);
>>   -    e->total_runtime = 
>> intel_context_get_total_runtime_ns(rq->context);
>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>         simulated = i915_gem_context_no_error_capture(ctx);
>>   @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct 
>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>       return ee;
>>   }
>>   +static struct intel_engine_capture_vma *
>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>> +                struct intel_context *ce,
>> +                gfp_t gfp)
>> +{
>> +    struct intel_engine_capture_vma *vma = NULL;
>> +
>> +    ee->simulated |= record_context(&ee->context, ce);
>> +    if (ee->simulated)
>> +        return NULL;
>> +
>> +    /*
>> +     * We need to copy these to an anonymous buffer
>> +     * as the simplest method to avoid being overwritten
>> +     * by userspace.
>> +     */
>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>> +
>> +    return vma;
>> +}
>> +
>>   struct intel_engine_capture_vma *
>>   intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
>>                     struct i915_request *rq,
>>                     gfp_t gfp)
>>   {
>> -    struct intel_engine_capture_vma *vma = NULL;
>> +    struct intel_engine_capture_vma *vma;
>>   -    ee->simulated |= record_context(&ee->context, rq);
>> -    if (ee->simulated)
>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>> +    if (!vma)
>>           return NULL;
>>         /*
>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct 
>> intel_engine_coredump *ee,
>>        */
>>       vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
>>       vma = capture_user(vma, rq, gfp);
>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>> -    vma = capture_vma(vma, rq->context->state, "HW context", gfp);
>>         ee->rq_head = rq->head;
>>       ee->rq_post = rq->postfix;
>> @@ -1604,25 +1624,31 @@ capture_engine(struct intel_engine_cs *engine,
>>           return NULL;
>>         intel_engine_get_hung_entity(engine, &ce, &rq);
>> -    if (!rq || !i915_request_started(rq))
>> -        goto no_request_capture;
>> +    if (rq && !i915_request_started(rq)) {
>> +        drm_info(&engine->gt->i915->drm, "Got hung context on %s 
>> with active request %lld:%lld [0x%04X] not yet started\n",
>> +             engine->name, rq->fence.context, rq->fence.seqno, 
>> ce->guc_id.id);
>> +        i915_request_put(rq);
>> +        rq = NULL;
>> +    }
>>   -    capture = intel_engine_coredump_add_request(ee, rq, 
>> ATOMIC_MAYFAIL);
>> -    if (!capture)
>> -        goto no_request_capture;
>> -    if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>> -        intel_guc_capture_get_matching_node(engine->gt, ee, ce);
>> +    if (rq) {
>> +        capture = intel_engine_coredump_add_request(ee, rq, 
>> ATOMIC_MAYFAIL);
>> +        i915_request_put(rq);
>> +    } else if (ce) {
>> +        capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
>> +    }
>>   -    intel_engine_coredump_add_vma(ee, capture, compress);
>> -    i915_request_put(rq);
>> +    if (capture) {
>> +        intel_engine_coredump_add_vma(ee, capture, compress);
>>   -    return ee;
>> +        if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>> +            intel_guc_capture_get_matching_node(engine->gt, ee, ce);
>> +    } else {
>> +        kfree(ee);
>> +        ee = NULL;
>> +    }
>>   -no_request_capture:
>> -    if (rq)
>> -        i915_request_put(rq);
>> -    kfree(ee);
>> -    return NULL;
>> +    return ee;
>>   }
>>     static void
>



More information about the dri-devel mailing list