[Intel-xe] [PATCH] drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages

Wed Nov 15 23:40:43 UTC 2023

We're currently sending non-blocking H2G messages using the EVENT type,
which suppresses all CTB protocol replies from the GuC, including the
failure cases. This might cause errors to slip through and manifest as
unexpected behavior (e.g. a context state might not be what the driver
thinks it is because the state change command was silently rejected by
the GuC). To avoid this kind of problems, we can use the FAST_REQUEST
type instead, which suppresses the reply only on success; this way we
still get the advantage of not having to wait for an ack from the GuC
(i.e. the H2G is still non-blocking) while still detecting errors.
Since we don't track the non-blocking messages, we don't exactly know
which one of them failed and therefore can't attempt to recover and need
to escalate to GT reset instead.

Note that FAST_REQUEST failures are NOT expected and are usually a sign
that the H2G was either malformed or requested an illegal operation.

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Cc: John Harrison <John.C.Harrison at Intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
---

In the testing I did locally I didn't see any failure responses, but I
didn't run the full CI suite. If failure responses do come out from the
CI runs, I'll try to fix them before merging this patch.

 drivers/gpu/drm/xe/abi/guc_messages_abi.h |  1 +
 drivers/gpu/drm/xe/xe_guc_ct.c            | 28 +++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_guc_ct_types.h      |  2 +-
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
index 3d199016cf88..e517e2a740d2 100644
--- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
@@ -46,6 +46,7 @@
 #define GUC_HXG_MSG_0_TYPE			(0x7 << 28)
 #define   GUC_HXG_TYPE_REQUEST			0u
 #define   GUC_HXG_TYPE_EVENT			1u
+#define   GUC_HXG_TYPE_FAST_REQUEST		2u
 #define   GUC_HXG_TYPE_NO_RESPONSE_BUSY		3u
 #define   GUC_HXG_TYPE_NO_RESPONSE_RETRY	5u
 #define   GUC_HXG_TYPE_RESPONSE_FAILURE		6u
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index a84e111bb36a..81d727b72709 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -448,7 +448,7 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
 				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
 	} else {
 		cmd[1] =
-			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
+			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
 			FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
 				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
 	}
@@ -505,7 +505,11 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
 		if (g2h_fence_needs_alloc(g2h_fence)) {
 			void *ptr;
 
-			g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
+			/* we reserve fence 0 to indicate unset */
+			g2h_fence->seqno = ct->fence_seqno++;
+			if (unlikely(!g2h_fence->seqno))
+				g2h_fence->seqno = ct->fence_seqno++;
+
 			ptr = xa_store(&ct->fence_lookup,
 				       g2h_fence->seqno,
 				       g2h_fence, GFP_ATOMIC);
@@ -792,6 +796,26 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
 	u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
 	struct g2h_fence *g2h_fence;
 
+	/*
+	 * FAST_REQUEST messages don't have a fence and therefore are
+	 * not tracked in ct->fence_lookup. Note that those messages should
+	 * never fail, so if we do get an error back it means we're likely
+	 * doing an illegal operation and the GuC is rejecting it. Since we
+	 * don't know which action failed and therefore can't attempt to
+	 * recover, we need to escalate the failure to trigger a reset.
+	 */
+	if (!fence) {
+		if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
+			drm_err(&xe->drm, "FAST_REQUEST H2G failed! e=0x%x, h=%u\n",
+				FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
+				FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
+		else
+			drm_err(&xe->drm, "unexpected response %u for FAST_REQUEST H2G!\n",
+				type);
+
+		return -EPROTO;
+	}
+
 	lockdep_assert_held(&ct->lock);
 
 	g2h_fence = xa_erase(&ct->fence_lookup, fence);
diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h
index d814d4ee3fc6..62f98dcaeaf1 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h
@@ -99,7 +99,7 @@ struct xe_guc_ct {
 	/** @enabled: CT enabled */
 	bool enabled;
 	/** @fence_seqno: G2H fence seqno - 16 bits used by CT */
-	u32 fence_seqno;
+	u16 fence_seqno;
 	/** @fence_lookup: G2H fence lookup */
 	struct xarray fence_lookup;
 	/** @wq: wait queue used for reliable CT sends and freeing G2H credits */
-- 
2.41.0