[Intel-xe] [PATCH] drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages

Thu Nov 16 17:12:23 UTC 2023

On 16.11.2023 00:40, Daniele Ceraolo Spurio wrote:
> We're currently sending non-blocking H2G messages using the EVENT type,
> which suppresses all CTB protocol replies from the GuC, including the
> failure cases. This might cause errors to slip through and manifest as
> unexpected behavior (e.g. a context state might not be what the driver
> thinks it is because the state change command was silently rejected by
> the GuC). To avoid this kind of problems, we can use the FAST_REQUEST
> type instead, which suppresses the reply only on success; this way we
> still get the advantage of not having to wait for an ack from the GuC
> (i.e. the H2G is still non-blocking) while still detecting errors.
> Since we don't track the non-blocking messages, we don't exactly know
> which one of them failed and therefore can't attempt to recover and need
> to escalate to GT reset instead.
> 
> Note that FAST_REQUEST failures are NOT expected and are usually a sign
> that the H2G was either malformed or requested an illegal operation.
> 
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: John Harrison <John.C.Harrison at Intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
> 
> In the testing I did locally I didn't see any failure responses, but I
> didn't run the full CI suite. If failure responses do come out from the
> CI runs, I'll try to fix them before merging this patch.
> 
>  drivers/gpu/drm/xe/abi/guc_messages_abi.h |  1 +
>  drivers/gpu/drm/xe/xe_guc_ct.c            | 28 +++++++++++++++++++++--
>  drivers/gpu/drm/xe/xe_guc_ct_types.h      |  2 +-
>  3 files changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> index 3d199016cf88..e517e2a740d2 100644
> --- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> +++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> @@ -46,6 +46,7 @@
>  #define GUC_HXG_MSG_0_TYPE			(0x7 << 28)
>  #define   GUC_HXG_TYPE_REQUEST			0u
>  #define   GUC_HXG_TYPE_EVENT			1u
> +#define   GUC_HXG_TYPE_FAST_REQUEST		2u

you should also update doc above (bits 30:28)

>  #define   GUC_HXG_TYPE_NO_RESPONSE_BUSY		3u
>  #define   GUC_HXG_TYPE_NO_RESPONSE_RETRY	5u
>  #define   GUC_HXG_TYPE_RESPONSE_FAILURE		6u
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index a84e111bb36a..81d727b72709 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -448,7 +448,7 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
>  				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>  	} else {
>  		cmd[1] =
> -			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> +			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
>  			FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
>  				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>  	}
> @@ -505,7 +505,11 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
>  		if (g2h_fence_needs_alloc(g2h_fence)) {
>  			void *ptr;
>  
> -			g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
> +			/* we reserve fence 0 to indicate unset */

why ? it's not required by the HXG protocol

> +			g2h_fence->seqno = ct->fence_seqno++;
> +			if (unlikely(!g2h_fence->seqno))
> +				g2h_fence->seqno = ct->fence_seqno++;
> +
>  			ptr = xa_store(&ct->fence_lookup,
>  				       g2h_fence->seqno,
>  				       g2h_fence, GFP_ATOMIC);
> @@ -792,6 +796,26 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>  	u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
>  	struct g2h_fence *g2h_fence;
>  
> +	/*
> +	 * FAST_REQUEST messages don't have a fence and therefore are
> +	 * not tracked in ct->fence_lookup. Note that those messages should

but why we don't use fences for FAST_REQUEST ?

even if we don't expect any failures, from the protocol POV this is
another separate flow, and we shouldn't reuse same 0 (unset) fence for
multiple requests sent one by one.

it also will make any debugging much harder (if we get such failure
response) to figure out and match that error with actual request
(without deep analysis of GuC log, that might be lost)

> +	 * never fail, so if we do get an error back it means we're likely
> +	 * doing an illegal operation and the GuC is rejecting it. Since we
> +	 * don't know which action failed and therefore can't attempt to
> +	 * recover, we need to escalate the failure to trigger a reset.
> +	 */
> +	if (!fence) {
> +		if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
> +			drm_err(&xe->drm, "FAST_REQUEST H2G failed! e=0x%x, h=%u\n",
> +				FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
> +				FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
> +		else
> +			drm_err(&xe->drm, "unexpected response %u for FAST_REQUEST H2G!\n",
> +				type);

we should at least use xe_gt_err() to get the GT id in the log

> +
> +		return -EPROTO;
> +	}
> +
>  	lockdep_assert_held(&ct->lock);
>  
>  	g2h_fence = xa_erase(&ct->fence_lookup, fence);
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h
> index d814d4ee3fc6..62f98dcaeaf1 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h
> @@ -99,7 +99,7 @@ struct xe_guc_ct {
>  	/** @enabled: CT enabled */
>  	bool enabled;
>  	/** @fence_seqno: G2H fence seqno - 16 bits used by CT */
> -	u32 fence_seqno;
> +	u16 fence_seqno;
>  	/** @fence_lookup: G2H fence lookup */
>  	struct xarray fence_lookup;
>  	/** @wq: wait queue used for reliable CT sends and freeing G2H credits */