[PATCH] drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages
Michal Wajdeczko
michal.wajdeczko at intel.com
Fri Dec 15 16:44:10 UTC 2023
On 06.12.2023 22:14, Daniele Ceraolo Spurio wrote:
> We're currently sending non-blocking H2G messages using the EVENT type,
> which suppresses all CTB protocol replies from the GuC, including the
> failure cases. This might cause errors to slip through and manifest as
> unexpected behavior (e.g. a context state might not be what the driver
> thinks it is because the state change command was silently rejected by
> the GuC). To avoid this kind of problems, we can use the FAST_REQUEST
> type instead, which suppresses the reply only on success; this way we
> still get the advantage of not having to wait for an ack from the GuC
> (i.e. the H2G is still non-blocking) while still detecting errors.
> Since we can't escalate to the caller when a non-blocking message
> fails, we need to escalate to GT reset instead.
>
> Note that FAST_REQUEST failures are NOT expected and are usually a sign
> that the H2G was either malformed or requested an illegal operation.
>
> v2: assign fence values to FAST_REQUEST messages, fix abi doc, use xe_gt
> printers (Michal).
>
> v3: fix doc alignment, fix and improve prints (Michal)
>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: John Harrison <John.C.Harrison at Intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Reviewed-by: Matthew Brost <matthew.brost at intel.com> #v2
> ---
> drivers/gpu/drm/xe/abi/guc_messages_abi.h | 2 +
> drivers/gpu/drm/xe/xe_guc_ct.c | 58 ++++++++++++++++++++---
> 2 files changed, 54 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> index 3d199016cf88..ff888d16bd4f 100644
> --- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> +++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> @@ -24,6 +24,7 @@
> * | | 30:28 | **TYPE** - message type |
> * | | | - _`GUC_HXG_TYPE_REQUEST` = 0 |
> * | | | - _`GUC_HXG_TYPE_EVENT` = 1 |
> + * | | | - _`GUC_HXG_TYPE_FAST_REQUEST` = 2 |
> * | | | - _`GUC_HXG_TYPE_NO_RESPONSE_BUSY` = 3 |
> * | | | - _`GUC_HXG_TYPE_NO_RESPONSE_RETRY` = 5 |
> * | | | - _`GUC_HXG_TYPE_RESPONSE_FAILURE` = 6 |
> @@ -46,6 +47,7 @@
> #define GUC_HXG_MSG_0_TYPE (0x7 << 28)
> #define GUC_HXG_TYPE_REQUEST 0u
> #define GUC_HXG_TYPE_EVENT 1u
> +#define GUC_HXG_TYPE_FAST_REQUEST 2u
> #define GUC_HXG_TYPE_NO_RESPONSE_BUSY 3u
> #define GUC_HXG_TYPE_NO_RESPONSE_RETRY 5u
> #define GUC_HXG_TYPE_RESPONSE_FAILURE 6u
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index 24a33fa36496..2fb9be5ea132 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -17,6 +17,7 @@
> #include "xe_device.h"
> #include "xe_gt.h"
> #include "xe_gt_pagefault.h"
> +#include "xe_gt_printk.h"
> #include "xe_gt_tlb_invalidation.h"
> #include "xe_guc.h"
> #include "xe_guc_submit.h"
> @@ -448,7 +449,7 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
> GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
> } else {
> cmd[1] =
> - FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> + FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
> FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
> GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
> }
> @@ -475,11 +476,31 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
> return 0;
> }
>
> +/*
> + * The CT protocol accepts a 16 bits fence. This field is fully owned by the
> + * driver, the GuC will just copy it to the reply message. Since we need to
> + * be able to distinguish between replies to REQUEST and FAST_REQUEST messages,
> + * we use one bit of the seqno as an indicator for that and a rolling counter
> + * for the remaining 15 bits.
> + */
> +#define CT_SEQNO_MASK GENMASK(14, 0)
> +#define CT_SEQNO_IS_UNTRACKED BIT(15)
nit: maybe s/CT_SEQNO_IS_UNTRACKED/CT_SEQNO_UNTRACKED
> +static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
> +{
> + u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;
> +
> + if (!is_g2h_fence)
> + seqno |= CT_SEQNO_IS_UNTRACKED;
> +
> + return seqno;
> +}
> +
> static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
> u32 len, u32 g2h_len, u32 num_g2h,
> struct g2h_fence *g2h_fence)
> {
> struct xe_device *xe = ct_to_xe(ct);
> + u16 seqno;
> int ret;
>
> xe_assert(xe, !g2h_len || !g2h_fence);
> @@ -505,7 +526,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
> if (g2h_fence_needs_alloc(g2h_fence)) {
> void *ptr;
>
> - g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
> + g2h_fence->seqno = next_ct_seqno(ct, true);
> ptr = xa_store(&ct->fence_lookup,
> g2h_fence->seqno,
> g2h_fence, GFP_ATOMIC);
> @@ -514,6 +535,10 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
> goto out;
> }
> }
> +
> + seqno = g2h_fence->seqno;
> + } else {
> + seqno = next_ct_seqno(ct, false);
> }
>
> if (g2h_len)
> @@ -523,8 +548,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
> if (unlikely(ret))
> goto out_unlock;
>
> - ret = h2g_write(ct, action, len, g2h_fence ? g2h_fence->seqno : 0,
> - !!g2h_fence);
> + ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
> if (unlikely(ret)) {
> if (ret == -EAGAIN)
> goto retry;
> @@ -786,7 +810,8 @@ static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len)
>
> static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
> {
> - struct xe_device *xe = ct_to_xe(ct);
> + struct xe_gt *gt = ct_to_gt(ct);
> + struct xe_device *xe = gt_to_xe(gt);
> u32 response_len = len - GUC_CTB_MSG_MIN_LEN;
> u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
> u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
> @@ -794,10 +819,31 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>
> lockdep_assert_held(&ct->lock);
>
> + /*
> + * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup.
> + * Those messages should never fail, so if we do get an error back it
> + * means we're likely doing an illegal operation and the GuC is
> + * rejecting it. We have no way to inform the code that submitted the
> + * H2G that the message was rejected, so we need to escalate the
> + * failure to trigger a reset.
> + */
> + if (fence & CT_SEQNO_IS_UNTRACKED) {
> + if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
> + xe_gt_err(gt, "FAST_REQ H2G fence 0x%x failed! e=0x%x, h=%u\n",
nit: you can drop explicit "0x" prefix in "0x%x" by just using "%#x"
> + fence,
> + FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
> + FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
> + else
> + xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n",
> + type, fence);
> +
> + return -EPROTO;
> + }
> +
> g2h_fence = xa_erase(&ct->fence_lookup, fence);
> if (unlikely(!g2h_fence)) {
> /* Don't tear down channel, as send could've timed out */
> - drm_warn(&xe->drm, "G2H fence (%u) not found!\n", fence);
> + xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
nit: if this is a response kind of message then this fence in the
message is host seqno aka H2G, no?
> g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
> return 0;
> }
few nits, but LGTM so,
Reviewed-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
More information about the Intel-xe
mailing list