[Intel-xe] [PATCH] drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages

Thu Nov 16 09:36:31 UTC 2023

On Wed, Nov 15, 2023 at 03:40:43PM -0800, Daniele Ceraolo Spurio wrote:
> We're currently sending non-blocking H2G messages using the EVENT type,
> which suppresses all CTB protocol replies from the GuC, including the
> failure cases. This might cause errors to slip through and manifest as
> unexpected behavior (e.g. a context state might not be what the driver
> thinks it is because the state change command was silently rejected by
> the GuC). To avoid this kind of problems, we can use the FAST_REQUEST
> type instead, which suppresses the reply only on success; this way we
> still get the advantage of not having to wait for an ack from the GuC
> (i.e. the H2G is still non-blocking) while still detecting errors.
> Since we don't track the non-blocking messages, we don't exactly know
> which one of them failed and therefore can't attempt to recover and need
> to escalate to GT reset instead.
> 
> Note that FAST_REQUEST failures are NOT expected and are usually a sign
> that the H2G was either malformed or requested an illegal operation.
> 
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: John Harrison <John.C.Harrison at Intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> ---
> 
> In the testing I did locally I didn't see any failure responses, but I
> didn't run the full CI suite. If failure responses do come out from the
> CI runs, I'll try to fix them before merging this patch.
> 
>  drivers/gpu/drm/xe/abi/guc_messages_abi.h |  1 +
>  drivers/gpu/drm/xe/xe_guc_ct.c            | 28 +++++++++++++++++++++--
>  drivers/gpu/drm/xe/xe_guc_ct_types.h      |  2 +-
>  3 files changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> index 3d199016cf88..e517e2a740d2 100644
> --- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> +++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
> @@ -46,6 +46,7 @@
>  #define GUC_HXG_MSG_0_TYPE			(0x7 << 28)
>  #define   GUC_HXG_TYPE_REQUEST			0u
>  #define   GUC_HXG_TYPE_EVENT			1u
> +#define   GUC_HXG_TYPE_FAST_REQUEST		2u
>  #define   GUC_HXG_TYPE_NO_RESPONSE_BUSY		3u
>  #define   GUC_HXG_TYPE_NO_RESPONSE_RETRY	5u
>  #define   GUC_HXG_TYPE_RESPONSE_FAILURE		6u
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index a84e111bb36a..81d727b72709 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -448,7 +448,7 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
>  				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>  	} else {
>  		cmd[1] =
> -			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
> +			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
>  			FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
>  				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
>  	}
> @@ -505,7 +505,11 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
>  		if (g2h_fence_needs_alloc(g2h_fence)) {
>  			void *ptr;
>  
> -			g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
> +			/* we reserve fence 0 to indicate unset */
> +			g2h_fence->seqno = ct->fence_seqno++;
> +			if (unlikely(!g2h_fence->seqno))
> +				g2h_fence->seqno = ct->fence_seqno++;

Good cleanup but looks unrelated this patch.

> +
>  			ptr = xa_store(&ct->fence_lookup,
>  				       g2h_fence->seqno,
>  				       g2h_fence, GFP_ATOMIC);
> @@ -792,6 +796,26 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
>  	u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
>  	struct g2h_fence *g2h_fence;
>  
> +	/*
> +	 * FAST_REQUEST messages don't have a fence and therefore are
> +	 * not tracked in ct->fence_lookup. Note that those messages should
> +	 * never fail, so if we do get an error back it means we're likely
> +	 * doing an illegal operation and the GuC is rejecting it. Since we
> +	 * don't know which action failed and therefore can't attempt to
> +	 * recover, we need to escalate the failure to trigger a reset.
> +	 */
> +	if (!fence) {
> +		if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
> +			drm_err(&xe->drm, "FAST_REQUEST H2G failed! e=0x%x, h=%u\n",
> +				FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
> +				FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
> +		else
> +			drm_err(&xe->drm, "unexpected response %u for FAST_REQUEST H2G!\n",
> +				type);
> +
> +		return -EPROTO;
> +	}
> +

Was this intentional to before the lockdep assert? I'd put this new code after the lockdep assert.

Matt

>  	lockdep_assert_held(&ct->lock);
>  
>  	g2h_fence = xa_erase(&ct->fence_lookup, fence);
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h
> index d814d4ee3fc6..62f98dcaeaf1 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h
> @@ -99,7 +99,7 @@ struct xe_guc_ct {
>  	/** @enabled: CT enabled */
>  	bool enabled;
>  	/** @fence_seqno: G2H fence seqno - 16 bits used by CT */
> -	u32 fence_seqno;
> +	u16 fence_seqno;
>  	/** @fence_lookup: G2H fence lookup */
>  	struct xarray fence_lookup;
>  	/** @wq: wait queue used for reliable CT sends and freeing G2H credits */
> -- 
> 2.41.0
>