[Intel-xe] [PATCH v2] drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages

Fri Dec 1 00:28:56 UTC 2023

On 11/30/2023 1:19 PM, Daniele Ceraolo Spurio wrote:
>
>
> On 11/30/2023 12:59 PM, Michal Wajdeczko wrote:
>>
>> On 16.11.2023 23:16, Daniele Ceraolo Spurio wrote:
>>> We're currently sending non-blocking H2G messages using the EVENT type,
>>> which suppresses all CTB protocol replies from the GuC, including the
>>> failure cases. This might cause errors to slip through and manifest as
>>> unexpected behavior (e.g. a context state might not be what the driver
>>> thinks it is because the state change command was silently rejected by
>>> the GuC). To avoid this kind of problems, we can use the FAST_REQUEST
>>> type instead, which suppresses the reply only on success; this way we
>>> still get the advantage of not having to wait for an ack from the GuC
>>> (i.e. the H2G is still non-blocking) while still detecting errors.
>>> Since we can't escalate to the caller when a non-blocking message
>>> fails, we need to escalate to GT reset instead.
>>>
>>> Note that FAST_REQUEST failures are NOT expected and are usually a sign
>>> that the H2G was either malformed or requested an illegal operation.
>>>
>>> v2: assign fence values to FAST_REQUEST messages, fix abi doc, use 
>>> xe_gt
>>> printers (Michal).
>>>
>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>>> Cc: John Harrison <John.C.Harrison at Intel.com>
>>> Cc: Matthew Brost <matthew.brost at intel.com>
>>> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/abi/guc_messages_abi.h |  2 +
>>>   drivers/gpu/drm/xe/xe_guc_ct.c            | 58 
>>> ++++++++++++++++++++---
>>>   2 files changed, 54 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h 
>>> b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
>>> index 3d199016cf88..6544205cf40d 100644
>>> --- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
>>> +++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
>>> @@ -24,6 +24,7 @@
>>>    *  |   | 30:28 | **TYPE** - message 
>>> type                                      |
>>>    *  |   |       |   - _`GUC_HXG_TYPE_REQUEST` = 
>>> 0                              |
>>>    *  |   |       |   - _`GUC_HXG_TYPE_EVENT` = 
>>> 1                                |
>>> + *  |   |       |   - _`GUC_HXG_TYPE_FAST_REQUEST` = 
>>> 2                                |
>> -----------------------------------------------------------------------------------------^ 
>>
>> is this aligned correctly ?
>>
>>>    *  |   |       |   - _`GUC_HXG_TYPE_NO_RESPONSE_BUSY` = 
>>> 3                     |
>>>    *  |   |       |   - _`GUC_HXG_TYPE_NO_RESPONSE_RETRY` = 
>>> 5                    |
>>>    *  |   |       |   - _`GUC_HXG_TYPE_RESPONSE_FAILURE` = 
>>> 6                     |
>>> @@ -46,6 +47,7 @@
>>>   #define GUC_HXG_MSG_0_TYPE            (0x7 << 28)
>>>   #define   GUC_HXG_TYPE_REQUEST            0u
>>>   #define   GUC_HXG_TYPE_EVENT            1u
>>> +#define   GUC_HXG_TYPE_FAST_REQUEST        2u
>>>   #define   GUC_HXG_TYPE_NO_RESPONSE_BUSY        3u
>>>   #define   GUC_HXG_TYPE_NO_RESPONSE_RETRY    5u
>>>   #define   GUC_HXG_TYPE_RESPONSE_FAILURE        6u
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c 
>>> b/drivers/gpu/drm/xe/xe_guc_ct.c
>>> index a84e111bb36a..04e443c6fd8b 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>>> @@ -15,6 +15,7 @@
>>>   #include "xe_device.h"
>>>   #include "xe_gt.h"
>>>   #include "xe_gt_pagefault.h"
>>> +#include "xe_gt_printk.h"
>>>   #include "xe_gt_tlb_invalidation.h"
>>>   #include "xe_guc.h"
>>>   #include "xe_guc_submit.h"
>>> @@ -448,7 +449,7 @@ static int h2g_write(struct xe_guc_ct *ct, const 
>>> u32 *action, u32 len,
>>>                      GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>>>       } else {
>>>           cmd[1] =
>>> -            FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
>>> +            FIELD_PREP(GUC_HXG_MSG_0_TYPE, 
>>> GUC_HXG_TYPE_FAST_REQUEST) |
>>>               FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
>>>                      GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>>>       }
>>> @@ -475,11 +476,31 @@ static int h2g_write(struct xe_guc_ct *ct, 
>>> const u32 *action, u32 len,
>>>       return 0;
>>>   }
>>>   +/*
>>> + * The CT protocol accepts a 16 bits fence. This field is fully 
>>> owned by the
>>> + * driver, the GuC will just copy it to the reply message. Since we 
>>> need to
>>> + * be able to distinguish between replies to REQUEST and 
>>> FAST_REQUEST messages,
>>> + * we use one bit of the seqno as an indicator for that and a 
>>> rolling counter
>>> + * for the remaining 15 bits.
>>> + */
>>> +#define CT_SEQNO_MASK GENMASK(14, 0)
>>> +#define CT_SEQNO_IS_UNTRACKED BIT(15)
>>> +static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
>>> +{
>>> +    u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;
>>> +
>>> +    if (!is_g2h_fence)
>>> +        seqno |= CT_SEQNO_IS_UNTRACKED;
>>> +
>>> +    return seqno;
>>> +}
>>> +
>>>   static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 
>>> *action,
>>>                   u32 len, u32 g2h_len, u32 num_g2h,
>>>                   struct g2h_fence *g2h_fence)
>>>   {
>>>       struct xe_device *xe = ct_to_xe(ct);
>>> +    u16 seqno;
>>>       int ret;
>>>         xe_assert(xe, !g2h_len || !g2h_fence);
>>> @@ -505,7 +526,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct 
>>> *ct, const u32 *action,
>>>           if (g2h_fence_needs_alloc(g2h_fence)) {
>>>               void *ptr;
>>>   -            g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
>>> +            g2h_fence->seqno = next_ct_seqno(ct, true);
>>>               ptr = xa_store(&ct->fence_lookup,
>>>                          g2h_fence->seqno,
>>>                          g2h_fence, GFP_ATOMIC);
>>> @@ -514,6 +535,10 @@ static int __guc_ct_send_locked(struct 
>>> xe_guc_ct *ct, const u32 *action,
>>>                   goto out;
>>>               }
>>>           }
>>> +
>>> +        seqno = g2h_fence->seqno;
>>> +    } else {
>>> +        seqno = next_ct_seqno(ct, false);
>> maybe instead of calling next_ct_seqno() twice we can set it earlier as:
>>
>>     seqno = next_ct_seqno(ct, !!g2h_fence);
>>
>>     if (g2h_fence_needs_alloc(g2h_fence)) {
>>         g2h_fence->seqno = seqno;
>>         ...

Missed this comment in the first reply. The reason I didn't do it this 
way is that I didn't want to leave holes in the assigned seqno sequence 
by calling next_ct_seqno() when g2h_fence_needs_alloc() is false.

Daniele

>>
>>>       }
>>>         if (g2h_len)
>>> @@ -523,8 +548,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct 
>>> *ct, const u32 *action,
>>>       if (unlikely(ret))
>>>           goto out_unlock;
>>>   -    ret = h2g_write(ct, action, len, g2h_fence ? g2h_fence->seqno 
>>> : 0,
>>> -            !!g2h_fence);
>>> +    ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
>>>       if (unlikely(ret)) {
>>>           if (ret == -EAGAIN)
>>>               goto retry;
>>> @@ -786,7 +810,8 @@ static int parse_g2h_event(struct xe_guc_ct *ct, 
>>> u32 *msg, u32 len)
>>>     static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, 
>>> u32 len)
>>>   {
>>> -    struct xe_device *xe = ct_to_xe(ct);
>>> +    struct xe_gt *gt =  ct_to_gt(ct);
>>> +    struct xe_device *xe = gt_to_xe(gt);
>>>       u32 response_len = len - GUC_CTB_MSG_MIN_LEN;
>>>       u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
>>>       u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
>>> @@ -794,10 +819,31 @@ static int parse_g2h_response(struct xe_guc_ct 
>>> *ct, u32 *msg, u32 len)
>>>         lockdep_assert_held(&ct->lock);
>>>   +    /*
>>> +     * Fences for FAST_REQUEST messages are not tracked in 
>>> ct->fence_lookup.
>>> +     * Those messages should never fail, so if we do get an error 
>>> back it
>>> +     * means we're likely doing an illegal operation and the GuC is
>>> +     * rejecting it. We have no way to inform the code that 
>>> submitted the
>>> +     * H2G that the message was rejected, so we need to escalate the
>>> +     * failure to trigger a reset.
>> if we trigger a reset then we might lost GuC log where additional
>> information about why the call was rejected could be included
>>
>> maybe WARN or taint will be sufficient ?
>
> Since we don't know which message it was that failed, we don't know if 
> we need to release memory in the G2H buffer and in case how much, so 
> the only safe option is to do a reset and start fresh. The GuC log 
> should be included as part of the error capture, like we do with i915.
>
>>
>>> +     */
>>> +    if (fence & CT_SEQNO_IS_UNTRACKED) {
>>> +        if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
>>> +            xe_gt_err(gt, "FAST_REQ H2G 0x%x failed! e=0x%x, h=%u\n",
>> maybe add "fence" as one could interpret that hex as action code, while
>> code below us using %u to print a fence
>>
>>     "FAST_REQ H2G (fence %u) failed! ...
>>
>>> +                  fence,
>>> +                  FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
>>> +                  FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
>>> +        else
>>> +            xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G 
>>> 0x%x!\n",
>>> +                  fence, type);
>> wrong params order ?
>
> D'oh!
>
>>
>>> +
>>> +        return -EPROTO;
>>> +    }
>>> +
>>>       g2h_fence = xa_erase(&ct->fence_lookup, fence);
>>>       if (unlikely(!g2h_fence)) {
>>>           /* Don't tear down channel, as send could've timed out */
>>> -        drm_warn(&xe->drm, "G2H fence (%u) not found!\n", fence);
>>> +        xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
>> nit: it's a shame that we even don't try to dump unexpected messages
>> (those could be legitimate but late SUCCESS, but equally could be spam
>> that we will be just tracking as unknown fence
>>
>>>           g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
>> btw, I'm not sure if this is correct as we reserved different len, no?
>
> No, when we use a g2h_fence we always reserve GUC_CTB_HXG_MSG_MAX_LEN. 
> I had the same doubt when coding this and confirmed it is correct.
>
> Daniele
>
>>
>>>           return 0;
>>>       }
>