[Intel-xe] [PATCH v2] drm/xe/guc: Use FAST_REQUEST for non-blocking H2G messages

Daniele Ceraolo Spurio daniele.ceraolospurio at intel.com
Thu Nov 16 22:16:37 UTC 2023


We're currently sending non-blocking H2G messages using the EVENT type,
which suppresses all CTB protocol replies from the GuC, including the
failure cases. This might cause errors to slip through and manifest as
unexpected behavior (e.g. a context state might not be what the driver
thinks it is because the state change command was silently rejected by
the GuC). To avoid this kind of problems, we can use the FAST_REQUEST
type instead, which suppresses the reply only on success; this way we
still get the advantage of not having to wait for an ack from the GuC
(i.e. the H2G is still non-blocking) while still detecting errors.
Since we can't escalate to the caller when a non-blocking message
fails, we need to escalate to GT reset instead.

Note that FAST_REQUEST failures are NOT expected and are usually a sign
that the H2G was either malformed or requested an illegal operation.

v2: assign fence values to FAST_REQUEST messages, fix abi doc, use xe_gt
printers (Michal).

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Cc: John Harrison <John.C.Harrison at Intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
---
 drivers/gpu/drm/xe/abi/guc_messages_abi.h |  2 +
 drivers/gpu/drm/xe/xe_guc_ct.c            | 58 ++++++++++++++++++++---
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
index 3d199016cf88..6544205cf40d 100644
--- a/drivers/gpu/drm/xe/abi/guc_messages_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h
@@ -24,6 +24,7 @@
  *  |   | 30:28 | **TYPE** - message type                                      |
  *  |   |       |   - _`GUC_HXG_TYPE_REQUEST` = 0                              |
  *  |   |       |   - _`GUC_HXG_TYPE_EVENT` = 1                                |
+ *  |   |       |   - _`GUC_HXG_TYPE_FAST_REQUEST` = 2                                |
  *  |   |       |   - _`GUC_HXG_TYPE_NO_RESPONSE_BUSY` = 3                     |
  *  |   |       |   - _`GUC_HXG_TYPE_NO_RESPONSE_RETRY` = 5                    |
  *  |   |       |   - _`GUC_HXG_TYPE_RESPONSE_FAILURE` = 6                     |
@@ -46,6 +47,7 @@
 #define GUC_HXG_MSG_0_TYPE			(0x7 << 28)
 #define   GUC_HXG_TYPE_REQUEST			0u
 #define   GUC_HXG_TYPE_EVENT			1u
+#define   GUC_HXG_TYPE_FAST_REQUEST		2u
 #define   GUC_HXG_TYPE_NO_RESPONSE_BUSY		3u
 #define   GUC_HXG_TYPE_NO_RESPONSE_RETRY	5u
 #define   GUC_HXG_TYPE_RESPONSE_FAILURE		6u
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index a84e111bb36a..04e443c6fd8b 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -15,6 +15,7 @@
 #include "xe_device.h"
 #include "xe_gt.h"
 #include "xe_gt_pagefault.h"
+#include "xe_gt_printk.h"
 #include "xe_gt_tlb_invalidation.h"
 #include "xe_guc.h"
 #include "xe_guc_submit.h"
@@ -448,7 +449,7 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
 				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
 	} else {
 		cmd[1] =
-			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) |
+			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
 			FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
 				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
 	}
@@ -475,11 +476,31 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
 	return 0;
 }
 
+/*
+ * The CT protocol accepts a 16 bits fence. This field is fully owned by the
+ * driver, the GuC will just copy it to the reply message. Since we need to
+ * be able to distinguish between replies to REQUEST and FAST_REQUEST messages,
+ * we use one bit of the seqno as an indicator for that and a rolling counter
+ * for the remaining 15 bits.
+ */
+#define CT_SEQNO_MASK GENMASK(14, 0)
+#define CT_SEQNO_IS_UNTRACKED BIT(15)
+static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
+{
+	u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;
+
+	if (!is_g2h_fence)
+		seqno |= CT_SEQNO_IS_UNTRACKED;
+
+	return seqno;
+}
+
 static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
 				u32 len, u32 g2h_len, u32 num_g2h,
 				struct g2h_fence *g2h_fence)
 {
 	struct xe_device *xe = ct_to_xe(ct);
+	u16 seqno;
 	int ret;
 
 	xe_assert(xe, !g2h_len || !g2h_fence);
@@ -505,7 +526,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
 		if (g2h_fence_needs_alloc(g2h_fence)) {
 			void *ptr;
 
-			g2h_fence->seqno = (ct->fence_seqno++ & 0xffff);
+			g2h_fence->seqno = next_ct_seqno(ct, true);
 			ptr = xa_store(&ct->fence_lookup,
 				       g2h_fence->seqno,
 				       g2h_fence, GFP_ATOMIC);
@@ -514,6 +535,10 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
 				goto out;
 			}
 		}
+
+		seqno = g2h_fence->seqno;
+	} else {
+		seqno = next_ct_seqno(ct, false);
 	}
 
 	if (g2h_len)
@@ -523,8 +548,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
 	if (unlikely(ret))
 		goto out_unlock;
 
-	ret = h2g_write(ct, action, len, g2h_fence ? g2h_fence->seqno : 0,
-			!!g2h_fence);
+	ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
 	if (unlikely(ret)) {
 		if (ret == -EAGAIN)
 			goto retry;
@@ -786,7 +810,8 @@ static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len)
 
 static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
 {
-	struct xe_device *xe = ct_to_xe(ct);
+	struct xe_gt *gt =  ct_to_gt(ct);
+	struct xe_device *xe = gt_to_xe(gt);
 	u32 response_len = len - GUC_CTB_MSG_MIN_LEN;
 	u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
 	u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
@@ -794,10 +819,31 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
 
 	lockdep_assert_held(&ct->lock);
 
+	/*
+	 * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup.
+	 * Those messages should never fail, so if we do get an error back it
+	 * means we're likely doing an illegal operation and the GuC is
+	 * rejecting it. We have no way to inform the code that submitted the
+	 * H2G that the message was rejected, so we need to escalate the
+	 * failure to trigger a reset.
+	 */
+	if (fence & CT_SEQNO_IS_UNTRACKED) {
+		if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
+			xe_gt_err(gt, "FAST_REQ H2G 0x%x failed! e=0x%x, h=%u\n",
+				  fence,
+				  FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]),
+				  FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]));
+		else
+			xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G 0x%x!\n",
+				  fence, type);
+
+		return -EPROTO;
+	}
+
 	g2h_fence = xa_erase(&ct->fence_lookup, fence);
 	if (unlikely(!g2h_fence)) {
 		/* Don't tear down channel, as send could've timed out */
-		drm_warn(&xe->drm, "G2H fence (%u) not found!\n", fence);
+		xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
 		g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
 		return 0;
 	}
-- 
2.41.0



More information about the Intel-xe mailing list