[Intel-gfx] [PATCH 1/4] drm/i915: Allow error capture without a request

Fri Jan 13 21:29:33 UTC 2023

On 1/13/2023 09:46, Hellstrom, Thomas wrote:
> On Fri, 2023-01-13 at 09:51 +0000, Tvrtko Ursulin wrote:
>> On 12/01/2023 20:40, John Harrison wrote:
>>> On 1/12/2023 02:01, Tvrtko Ursulin wrote:
>>>> On 12/01/2023 02:53, John.C.Harrison at Intel.com wrote:
>>>>> From: John Harrison <John.C.Harrison at Intel.com>
>>>>>
>>>>> There was a report of error captures occurring without any hung
>>>>> context being indicated despite the capture being initiated by
>>>>> a 'hung
>>>>> context notification' from GuC. The problem was not
>>>>> reproducible.
>>>>> However, it is possible to happen if the context in question
>>>>> has no
>>>>> active requests. For example, if the hang was in the context
>>>>> switch
>>>>> itself then the breadcrumb write would have occurred and the
>>>>> KMD would
>>>>> see an idle context.
>>>>>
>>>>> In the interests of attempting to provide as much information
>>>>> as
>>>>> possible about a hang, it seems wise to include the engine info
>>>>> regardless of whether a request was found or not. As opposed to
>>>>> just
>>>>> prentending there was no hang at all.
>>>>>
>>>>> So update the error capture code to always record engine
>>>>> information
>>>>> if an engine is given. Which means updating record_context() to
>>>>> take a
>>>>> context instead of a request (which it only ever used to find
>>>>> the
>>>>> context anyway). And split the request agnostic parts of
>>>>> intel_engine_coredump_add_request() out into a seaprate
>>>>> function.
>>>>>
>>>>> v2: Remove a duplicate 'if' statement (Umesh) and fix a put of
>>>>> a null
>>>>> pointer.
>>>>>
>>>>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>>>>> Reviewed-by: Umesh Nerlige Ramappa
>>>>> <umesh.nerlige.ramappa at intel.com>
>>>>> ---
>>>>>    drivers/gpu/drm/i915/i915_gpu_error.c | 61
>>>>> +++++++++++++++++++--------
>>>>>    1 file changed, 43 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> index 9d5d5a397b64e..bd2cf7d235df0 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>>>> @@ -1370,14 +1370,14 @@ static void
>>>>> engine_record_execlists(struct
>>>>> intel_engine_coredump *ee)
>>>>>    }
>>>>>      static bool record_context(struct i915_gem_context_coredump
>>>>> *e,
>>>>> -               const struct i915_request *rq)
>>>>> +               struct intel_context *ce)
>>>>>    {
>>>>>        struct i915_gem_context *ctx;
>>>>>        struct task_struct *task;
>>>>>        bool simulated;
>>>>>          rcu_read_lock();
>>>>> -    ctx = rcu_dereference(rq->context->gem_context);
>>>>> +    ctx = rcu_dereference(ce->gem_context);
>>>>>        if (ctx && !kref_get_unless_zero(&ctx->ref))
>>>>>            ctx = NULL;
>>>>>        rcu_read_unlock();
>>>>> @@ -1396,8 +1396,8 @@ static bool record_context(struct
>>>>> i915_gem_context_coredump *e,
>>>>>        e->guilty = atomic_read(&ctx->guilty_count);
>>>>>        e->active = atomic_read(&ctx->active_count);
>>>>>    -    e->total_runtime =
>>>>> intel_context_get_total_runtime_ns(rq->context);
>>>>> -    e->avg_runtime = intel_context_get_avg_runtime_ns(rq-
>>>>>> context);
>>>>> +    e->total_runtime = intel_context_get_total_runtime_ns(ce);
>>>>> +    e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
>>>>>          simulated = i915_gem_context_no_error_capture(ctx);
>>>>>    @@ -1532,15 +1532,37 @@ intel_engine_coredump_alloc(struct
>>>>> intel_engine_cs *engine, gfp_t gfp, u32 dump_
>>>>>        return ee;
>>>>>    }
>>>>>    +static struct intel_engine_capture_vma *
>>>>> +engine_coredump_add_context(struct intel_engine_coredump *ee,
>>>>> +                struct intel_context *ce,
>>>>> +                gfp_t gfp)
>>>>> +{
>>>>> +    struct intel_engine_capture_vma *vma = NULL;
>>>>> +
>>>>> +    ee->simulated |= record_context(&ee->context, ce);
>>>>> +    if (ee->simulated)
>>>>> +        return NULL;
>>>>> +
>>>>> +    /*
>>>>> +     * We need to copy these to an anonymous buffer
>>>>> +     * as the simplest method to avoid being overwritten
>>>>> +     * by userspace.
>>>>> +     */
>>>>> +    vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
>>>>> +    vma = capture_vma(vma, ce->state, "HW context", gfp);
>>>>> +
>>>>> +    return vma;
>>>>> +}
>>>>> +
>>>>>    struct intel_engine_capture_vma *
>>>>>    intel_engine_coredump_add_request(struct
>>>>> intel_engine_coredump *ee,
>>>>>                      struct i915_request *rq,
>>>>>                      gfp_t gfp)
>>>>>    {
>>>>> -    struct intel_engine_capture_vma *vma = NULL;
>>>>> +    struct intel_engine_capture_vma *vma;
>>>>>    -    ee->simulated |= record_context(&ee->context, rq);
>>>>> -    if (ee->simulated)
>>>>> +    vma = engine_coredump_add_context(ee, rq->context, gfp);
>>>>> +    if (!vma)
>>>>>            return NULL;
>>>>>          /*
>>>>> @@ -1550,8 +1572,6 @@ intel_engine_coredump_add_request(struct
>>>>> intel_engine_coredump *ee,
>>>>>         */
>>>>>        vma = capture_vma_snapshot(vma, rq->batch_res, gfp,
>>>>> "batch");
>>>>>        vma = capture_user(vma, rq, gfp);
>>>>> -    vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
>>>>> -    vma = capture_vma(vma, rq->context->state, "HW context",
>>>>> gfp);
>>>>>          ee->rq_head = rq->head;
>>>>>        ee->rq_post = rq->postfix;
>>>>> @@ -1608,8 +1628,11 @@ capture_engine(struct intel_engine_cs
>>>>> *engine,
>>>>>        if (ce) {
>>>>>            intel_engine_clear_hung_context(engine);
>>>>>            rq = intel_context_find_active_request(ce);
>>>>> -        if (!rq || !i915_request_started(rq))
>>>>> -            goto no_request_capture;
>>>>> +        if (rq && !i915_request_started(rq)) {
>>>>> +            drm_info(&engine->gt->i915->drm, "Got hung context
>>>>> on %s
>>>>> with no active request!\n",
>>>> Suggest s/active/started/ since we have both i915_request_active
>>>> and
>>>> i915_request_started, so to align the terminology.
>>> The message text was based on the intent of the activity not the
>>> naming
>>> of some internal helper function. Can change it if you really want
>>> but
>>> "with no started request" just reads like bad English to me. Plus
>>> it
>>> gets removed in the next patch anyway...
>>>
>>>
>>>>> +                 engine->name);
>>>>> +            rq = NULL;
>>>>> +        }
>>>>>        } else {
>>>>>            /*
>>>>>             * Getting here with GuC enabled means it is a forced
>>>>> error
>>>>> capture
>>>>> @@ -1622,22 +1645,24 @@ capture_engine(struct intel_engine_cs
>>>>> *engine,
>>>>>                               flags);
>>>>>            }
>>>>>        }
>>>>> -    if (rq)
>>>>> +    if (rq) {
>>>>>            rq = i915_request_get_rcu(rq);
>>>>> +        capture = intel_engine_coredump_add_request(ee, rq,
>>>>> ATOMIC_MAYFAIL);
>>>>> +    } else if (ce) {
>>>>> +        capture = engine_coredump_add_context(ee, ce,
>>>>> ATOMIC_MAYFAIL);
>>>>> +    }
>>>>>    -    if (!rq)
>>>>> -        goto no_request_capture;
>>>>> -
>>>>> -    capture = intel_engine_coredump_add_request(ee, rq,
>>>>> ATOMIC_MAYFAIL);
>>>>>        if (!capture) {
>>>>> -        i915_request_put(rq);
>>>>> +        if (rq)
>>>>> +            i915_request_put(rq);
>>>>>            goto no_request_capture;
>>>>>        }
>>>>>        if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
>>>>>            intel_guc_capture_get_matching_node(engine->gt, ee,
>>>>> ce);
>>>> This step requires non-NULL ce, so if you move it under the "else
>>>> if
>>>> (ce)" above then I *think* exit from the function can be
>>>> consolidated
>>>> to just:
>>>>
>>>> if (capture) {
>>>>      intel_engine_coredump_add_vma(ee, capture, compress);
>>>>      if (rq)
>>>>          i915_request_put(rq);
>>> Is there any reason the rq ref needs to be held during the add_vma
>>> call?
>>> Can it now just be moved earlier to be:
>>>       if (rq) {
>>>           rq = i915_request_get_rcu(rq);
>>>           capture = intel_engine_coredump_add_request(ee, rq,
>>> ATOMIC_MAYFAIL);
>>>           i915_request_put(rq);
>>>       }
>>>
>>> The internals of the request object are only touched in the above
>>> _add_request() code. The later _add_vma() call fiddles around with
>>> vmas
>>> that pulled from the request but the capture_vma code inside
>>> _add_request() has already copied everything, hasn't it? Or rather,
>>> it
>>> has grabbed its own private vma resource locks. So there is no
>>> requirement to keep the request itself around still?
> That sounds correct. It was some time ago since I worked with this code
> but when i started IIRC KASAN told me the request along with the whole
> capture list could disappear under us due to a parallel capture.
>
> So the request reference added then might cover a bit too much now that
> we also hold references on vma resources, which it looks like we do in
> intel_engine_coredump_add_vma().
So that means we end up with:
     rq = intel_context_find_active_request(ce);
     ...
     [test stuff like i915_request_started(rq)]
     ...
      if (rq) {
         rq = i915_request_get_rcu(rq);
         capture = intel_engine_coredump_add_request(ee, rq, 
ATOMIC_MAYFAIL);
         i915_request_put(rq);
     }

What is special about coredump_add_request() that it needs the request 
to be extra locked for that call and only that call? If the request can 
magically vanish after being found then what protects the _started() 
query? For that matter, what stops the request_get_rcu() itself being 
called on a pointer that is no longer valid? And if we do actually have 
sufficient locking in place to prevent that, why doesn't that cover the 
coredump_add_request() usage?

John.

>
> Another thing which is crappy with the current error capture code is
> that the request capture list needs to be freed with the request and
> not when the request signals (We can't block request signalling in the
> capture code to keep the capture list around). There might be many
> signaled requests hanging around in non-pruned dma_resv objects and
> thus many unused capture lists with many unused vma resources. :/
>
> /Thomas
>
>
>> Don't know.. it is a question if changes from 60dc43d1190d
>> ("drm/i915:
>> Use struct vma_resource instead of struct vma_snapshot") removed the
>> need for holding the rq reference that "long" I guess? Adding Thomas
>> and
>> Matt to perhaps comment.
>>
>> Regards,
>>
>> Tvrtko
>>
>>
>>> John.
>>>
>>>
>>>> } else {
>>>>      kfree(ee);
>>>>      ee = NULL;
>>>> }
>>>>
>>>> return ee;
>>>>
>>>> No "if (rq) i915_request_put()" twice, and goto label can be
>>>> completely removed.
>>>>
>>>> Regards,
>>>>
>>>> Tvrtko
>>>>
>>>>>          intel_engine_coredump_add_vma(ee, capture, compress);
>>>>> -    i915_request_put(rq);
>>>>> +    if (rq)
>>>>> +        i915_request_put(rq);
>>>>>          return ee;