[Intel-xe] [PATCH v2] drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Fri Dec 1 00:28:56 UTC 2023
On 11/30/2023 1:19 PM, Daniele Ceraolo Spurio wrote:
>
>
> On 11/30/2023 12:59 PM, Michal Wajdeczko wrote:
>>
>> On 16.11.2023 23:16, Daniele Ceraolo Spurio wrote:
>>> We're currently sending non-blocking H2G messages using the EVENT type,
>>> which suppresses all CTB protocol replies from the GuC, including the
>>> failure cases. This might cause errors to slip through and manifest as
>>> unexpected behavior (e.g. a context state might not be what the driver
>>> thinks it is because the state change command was silently rejected by
>>> the GuC). To avoid this kind of problems, we can use the FAST_REQUEST
>>> type instead, which suppresses the reply only on success; this way we
>>> still get the advantage of not having to wait for an ack from the GuC
>>> (i.e. the H2G is still non-blocking) while still detecting errors.
>>> Since we can't escalate to the caller when a non-blocking message
>>> fails, we need to escalate to GT reset instead.
>>>
>>> Note that FAST_REQUEST failures are NOT expected and are usually a sign
>>> that the H2G was either malformed or requested an illegal operation.
>>>
>>> v2: assign fence values to FAST_REQUEST messages, fix abi doc, use
>>> xe_gt
>>> printers (Michal).
>>>
>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>>> Cc: John Harrison <John.C.Harrison at Intel.com>
>>> Cc: Matthew Brost <matthew.brost at intel.com>
>>> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/abi/guc_messages_abi.h | 2 +
>>> drivers/gpu/drm/xe/xe_guc_ct.c | 58
>>> ++++++++++++++++++++---
>>> 2 files changed, 54 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
>>> b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
>>> index 3d199016cf88..6544205cf40d 100644
>>> --- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
>>> +++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
>>> @@ -24,6 +24,7 @@
>>> * | | 30:28 | **TYPE** - message
>>> type |
>>> * | | | - _`GUC_HXG_TYPE_REQUEST` =
>>> 0 |
>>> * | | | - _`GUC_HXG_TYPE_EVENT` =
>>> 1 |
>>> + * | | | - _`GUC_HXG_TYPE_FAST_REQUEST` =
>>> 2 |
>> -----------------------------------------------------------------------------------------^
>>
>> is this aligned correctly ?
>>
>>> * | | | - _`GUC_HXG_TYPE_NO_RESPONSE_BUSY` =
>>> 3 |
>>> * | | | - _`GUC_HXG_TYPE_NO_RESPONSE_RETRY` =
>>> 5 |
>>> * | | | - _`GUC_HXG_TYPE_RESPONSE_FAILURE` =
>>> 6 |
>>> @@ -46,6 +47,7 @@
>>> #define GUC_HXG_MSG_0_TYPE (0x7 << 28)
>>> #define GUC_HXG_TYPE_REQUEST 0u
>>> #define GUC_HXG_TYPE_EVENT 1u
>>> +#define GUC_HXG_TYPE_FAST_REQUEST 2u
>>> #define GUC_HXG_TYPE_NO_RESPONSE_BUSY 3u
>>> #define GUC_HXG_TYPE_NO_RESPONSE_RETRY 5u
>>> #define GUC_HXG_TYPE_RESPONSE_FAILURE 6u
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c
>>> b/drivers/gpu/drm/xe/xe_guc_ct.c
>>> index a84e111bb36a..04e443c6fd8b 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>>> @@ -15,6 +15,7 @@
>>> #include "xe_device.h"
>>> #include "xe_gt.h"
>>> #include "xe_gt_pagefault.h"
>>> +#include "xe_gt_printk.h"
>>> #include "xe_gt_tlb_invalidation.h"
>>> #include "xe_guc.h"
>>> #include "xe_guc_submit.h"
>>> @@ -448,7 +449,7 @@ static int h2g_write(struct xe_guc_ct *ct, const
>>> u32 *action, u32 len,
>>> GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>>> } else {
>>> cmd[1] =
>>> - FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
>>> + FIELD_PREP(GUC_HXG_MSG_0_TYPE,
>>> GUC_HXG_TYPE_FAST_REQUEST) |
>>> FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
>>> GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>>> }
>>> @@ -475,11 +476,31 @@ static int h2g_write(struct xe_guc_ct *ct,
>>> const u32 *action, u32 len,
>>> return 0;
>>> }
>>> +/*
>>> + * The CT protocol accepts a 16 bits fence. This field is fully
>>> owned by the
>>> + * driver, the GuC will just copy it to the reply message. Since we
>>> need to
>>> + * be able to distinguish between replies to REQUEST and
>>> FAST_REQUEST messages,
>>> + * we use one bit of the seqno as an indicator for that and a
>>> rolling counter
>>> + * for the remaining 15 bits.
>>> + */
>>> +#define CT_SEQNO_MASK GENMASK(14, 0)
>>> +#define CT_SEQNO_IS_UNTRACKED BIT(15)
>>> +static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
>>> +{
>>> + u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;
>>> +
>>> + if (!is_g2h_fence)
>>> + seqno |= CT_SEQNO_IS_UNTRACKED;
>>> +
>>> + return seqno;
>>> +}
>>> +
>>> static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32
>>> *action,
>>> u32 len, u32 g2h_len, u32 num_g2h,
>>> struct g2h_fence *g2h_fence)
>>> {
>>> struct xe_device *xe = ct_to_xe(ct);
>>> + u16 seqno;
>>> int ret;
>>> xe_assert(xe, !g2h_len || !g2h_fence);
>>> @@ -505,7 +526,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct
>>> *ct, const u32 *action,
>>> if (g2h_fence_needs_alloc(g2h_fence)) {
>>> void *ptr;
>>> - g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
>>> + g2h_fence->seqno = next_ct_seqno(ct, true);
>>> ptr = xa_store(&ct->fence_lookup,
>>> g2h_fence->seqno,
>>> g2h_fence, GFP_ATOMIC);
>>> @@ -514,6 +535,10 @@ static int __guc_ct_send_locked(struct
>>> xe_guc_ct *ct, const u32 *action,
>>> goto out;
>>> }
>>> }
>>> +
>>> + seqno = g2h_fence->seqno;
>>> + } else {
>>> + seqno = next_ct_seqno(ct, false);
>> maybe instead of calling next_ct_seqno() twice we can set it earlier as:
>>
>> seqno = next_ct_seqno(ct, !!g2h_fence);
>>
>> if (g2h_fence_needs_alloc(g2h_fence)) {
>> g2h_fence->seqno = seqno;
>> ...
Missed this comment in the first reply. The reason I didn't do it this
way is that I didn't want to leave holes in the assigned seqno sequence
by calling next_ct_seqno() when g2h_fence_needs_alloc() is false.
Daniele
>>
>>> }
>>> if (g2h_len)
>>> @@ -523,8 +548,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct
>>> *ct, const u32 *action,
>>> if (unlikely(ret))
>>> goto out_unlock;
>>> - ret = h2g_write(ct, action, len, g2h_fence ? g2h_fence->seqno
>>> : 0,
>>> - !!g2h_fence);
>>> + ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
>>> if (unlikely(ret)) {
>>> if (ret == -EAGAIN)
>>> goto retry;
>>> @@ -786,7 +810,8 @@ static int parse_g2h_event(struct xe_guc_ct *ct,
>>> u32 *msg, u32 len)
>>> static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg,
>>> u32 len)
>>> {
>>> - struct xe_device *xe = ct_to_xe(ct);
>>> + struct xe_gt *gt = ct_to_gt(ct);
>>> + struct xe_device *xe = gt_to_xe(gt);
>>> u32 response_len = len - GUC_CTB_MSG_MIN_LEN;
>>> u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
>>> u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
>>> @@ -794,10 +819,31 @@ static int parse_g2h_response(struct xe_guc_ct
>>> *ct, u32 *msg, u32 len)
>>> lockdep_assert_held(&ct->lock);
>>> + /*
>>> + * Fences for FAST_REQUEST messages are not tracked in
>>> ct->fence_lookup.
>>> + * Those messages should never fail, so if we do get an error
>>> back it
>>> + * means we're likely doing an illegal operation and the GuC is
>>> + * rejecting it. We have no way to inform the code that
>>> submitted the
>>> + * H2G that the message was rejected, so we need to escalate the
>>> + * failure to trigger a reset.
>> if we trigger a reset then we might lost GuC log where additional
>> information about why the call was rejected could be included
>>
>> maybe WARN or taint will be sufficient ?
>
> Since we don't know which message it was that failed, we don't know if
> we need to release memory in the G2H buffer and in case how much, so
> the only safe option is to do a reset and start fresh. The GuC log
> should be included as part of the error capture, like we do with i915.
>
>>
>>> + */
>>> + if (fence & CT_SEQNO_IS_UNTRACKED) {
>>> + if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
>>> + xe_gt_err(gt, "FAST_REQ H2G 0x%x failed! e=0x%x, h=%u\n",
>> maybe add "fence" as one could interpret that hex as action code, while
>> code below us using %u to print a fence
>>
>> "FAST_REQ H2G (fence %u) failed! ...
>>
>>> + fence,
>>> + FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
>>> + FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
>>> + else
>>> + xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G
>>> 0x%x!\n",
>>> + fence, type);
>> wrong params order ?
>
> D'oh!
>
>>
>>> +
>>> + return -EPROTO;
>>> + }
>>> +
>>> g2h_fence = xa_erase(&ct->fence_lookup, fence);
>>> if (unlikely(!g2h_fence)) {
>>> /* Don't tear down channel, as send could've timed out */
>>> - drm_warn(&xe->drm, "G2H fence (%u) not found!\n", fence);
>>> + xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
>> nit: it's a shame that we even don't try to dump unexpected messages
>> (those could be legitimate but late SUCCESS, but equally could be spam
>> that we will be just tracking as unknown fence
>>
>>> g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
>> btw, I'm not sure if this is correct as we reserved different len, no?
>
> No, when we use a g2h_fence we always reserve GUC_CTB_HXG_MSG_MAX_LEN.
> I had the same doubt when coding this and confirmed it is correct.
>
> Daniele
>
>>
>>> return 0;
>>> }
>
More information about the Intel-xe
mailing list