[PATCH v4 2/2] drm/xe/guc: Port over the slow GuC loading support from i915

Thu Apr 4 19:19:06 UTC 2024

On Tue, Feb 27, 2024 at 05:09:56PM -0800, John.C.Harrison at Intel.com wrote:
>From: John Harrison <John.C.Harrison at Intel.com>
>
>GuC loading can take longer than it is supposed to for various
>reasons. So add in the code to cope with that and to report it when it
>happens. There are also many different reasons why GuC loading can
>fail, so add in the code for checking for those and for reporting
>issues in a meaningful manner rather than just hitting a timeout and
>saying 'fail: status = %x'.
>
>Also, remove the 'FIXME' comment about an i915 bug that has never been
>applicable to Xe!
>
>v2: Actually report the requested and granted frequencies rather than
>showing granted twice (review feedback from Badal).
>v3: Locally code all the timeout and end condition handling because a
>helper function is not allowed (review feedback from Lucas/Rodrigo).
>
>Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>---
> drivers/gpu/drm/xe/abi/guc_errors_abi.h |  26 ++-
> drivers/gpu/drm/xe/regs/xe_guc_regs.h   |   2 +
> drivers/gpu/drm/xe/xe_guc.c             | 226 +++++++++++++++++++-----
> drivers/gpu/drm/xe/xe_mmio.c            |  61 +++++++
> drivers/gpu/drm/xe/xe_mmio.h            |   2 +
> 5 files changed, 274 insertions(+), 43 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/abi/guc_errors_abi.h b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>index ec83551bf9c0..d0b5fed6876f 100644
>--- a/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>+++ b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>@@ -7,8 +7,12 @@
> #define _ABI_GUC_ERRORS_ABI_H
>
> enum xe_guc_response_status {
>-	XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>-	XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>+	XE_GUC_RESPONSE_STATUS_SUCCESS                      = 0x0,
>+	XE_GUC_RESPONSE_NOT_SUPPORTED                       = 0x20,
>+	XE_GUC_RESPONSE_NO_ATTRIBUTE_TABLE                  = 0x201,
>+	XE_GUC_RESPONSE_NO_DECRYPTION_KEY                   = 0x202,
>+	XE_GUC_RESPONSE_DECRYPTION_FAILED                   = 0x204,
>+	XE_GUC_RESPONSE_STATUS_GENERIC_FAIL                 = 0xF000,
> };
>
> enum xe_guc_load_status {
>@@ -17,6 +21,9 @@ enum xe_guc_load_status {
> 	XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH       = 0x02,
> 	XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH       = 0x03,
> 	XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE      = 0x04,
>+	XE_GUC_LOAD_STATUS_HWCONFIG_START                   = 0x05,
>+	XE_GUC_LOAD_STATUS_HWCONFIG_DONE                    = 0x06,
>+	XE_GUC_LOAD_STATUS_HWCONFIG_ERROR                   = 0x07,
> 	XE_GUC_LOAD_STATUS_GDT_DONE                         = 0x10,
> 	XE_GUC_LOAD_STATUS_IDT_DONE                         = 0x20,
> 	XE_GUC_LOAD_STATUS_LAPIC_DONE                       = 0x30,
>@@ -34,4 +41,19 @@ enum xe_guc_load_status {
> 	XE_GUC_LOAD_STATUS_READY                            = 0xF0,
> };
>
>+enum xe_bootrom_load_status {
>+	XE_BOOTROM_STATUS_NO_KEY_FOUND                      = 0x13,
>+	XE_BOOTROM_STATUS_AES_PROD_KEY_FOUND                = 0x1A,
>+	XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE            = 0x2B,
>+	XE_BOOTROM_STATUS_RSA_FAILED                        = 0x50,
>+	XE_BOOTROM_STATUS_PAVPC_FAILED                      = 0x73,
>+	XE_BOOTROM_STATUS_WOPCM_FAILED                      = 0x74,
>+	XE_BOOTROM_STATUS_LOADLOC_FAILED                    = 0x75,
>+	XE_BOOTROM_STATUS_JUMP_PASSED                       = 0x76,
>+	XE_BOOTROM_STATUS_JUMP_FAILED                       = 0x77,
>+	XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED               = 0x79,
>+	XE_BOOTROM_STATUS_MPUMAP_INCORRECT                  = 0x7A,
>+	XE_BOOTROM_STATUS_EXCEPTION                         = 0x7E,
>+};
>+
> #endif
>diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>index 4e7f809d2b00..2e54c9e6a4fe 100644
>--- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>+++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>@@ -40,6 +40,8 @@
> #define   GS_BOOTROM_JUMP_PASSED		REG_FIELD_PREP(GS_BOOTROM_MASK, 0x76)
> #define   GS_MIA_IN_RESET			REG_BIT(0)
>
>+#define GUC_HEADER_INFO				XE_REG(0xc014)
>+
> #define GUC_WOPCM_SIZE				XE_REG(0xc050)
> #define   GUC_WOPCM_SIZE_MASK			REG_GENMASK(31, 12)
> #define   GUC_WOPCM_SIZE_LOCKED			REG_BIT(0)
>diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>index 0d2a2dd13f11..771971110600 100644
>--- a/drivers/gpu/drm/xe/xe_guc.c
>+++ b/drivers/gpu/drm/xe/xe_guc.c
>@@ -17,6 +17,7 @@
> #include "xe_device.h"
> #include "xe_force_wake.h"
> #include "xe_gt.h"
>+#include "xe_gt_throttle.h"
> #include "xe_guc_ads.h"
> #include "xe_guc_ct.h"
> #include "xe_guc_hwconfig.h"
>@@ -461,58 +462,201 @@ static int guc_xfer_rsa(struct xe_guc *guc)
> 	return 0;
> }
>
>-static int guc_wait_ucode(struct xe_guc *guc)
>+/*
>+ * Check a previously read GuC status register (GUC_STATUS) looking for
>+ * known terminal states (either completion or failure) of either the
>+ * microkernel status field or the boot ROM status field. Returns +1 for
>+ * successful completion, -1 for failure and 0 for any intermediate state.
>+ */
>+static int guc_load_done(u32 status)
> {
>-	struct xe_device *xe = guc_to_xe(guc);
>-	u32 status;
>-	int ret;
>+	u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, status);
>+	u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>+
>+	switch (uk_val) {
>+	case XE_GUC_LOAD_STATUS_READY:
>+		return 1;
>+
>+	case XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH:
>+	case XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH:
>+	case XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE:
>+	case XE_GUC_LOAD_STATUS_HWCONFIG_ERROR:
>+	case XE_GUC_LOAD_STATUS_DPC_ERROR:
>+	case XE_GUC_LOAD_STATUS_EXCEPTION:
>+	case XE_GUC_LOAD_STATUS_INIT_DATA_INVALID:
>+	case XE_GUC_LOAD_STATUS_MPU_DATA_INVALID:
>+	case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID:
>+		return -1;
>+	}
>+
>+	switch (br_val) {
>+	case XE_BOOTROM_STATUS_NO_KEY_FOUND:
>+	case XE_BOOTROM_STATUS_RSA_FAILED:
>+	case XE_BOOTROM_STATUS_PAVPC_FAILED:
>+	case XE_BOOTROM_STATUS_WOPCM_FAILED:
>+	case XE_BOOTROM_STATUS_LOADLOC_FAILED:
>+	case XE_BOOTROM_STATUS_JUMP_FAILED:
>+	case XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED:
>+	case XE_BOOTROM_STATUS_MPUMAP_INCORRECT:
>+	case XE_BOOTROM_STATUS_EXCEPTION:
>+	case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE:
>+		return -1;
>+	}
>+
>+	return 0;
>+}
>+
>+static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
>+{
>+	u32 freq;
>+	int ret = xe_guc_pc_get_cur_freq(guc_pc, &freq);
>+
>+	return ret ? ret : freq;
>+}
>+
>+/*
>+ * Wait for the GuC to start up.
>+ *
>+ * Measurements indicate this should take no more than 20ms (assuming the GT
>+ * clock is at maximum frequency). However, thermal throttling and other issues
>+ * can prevent the clock hitting max and thus making the load take significantly
>+ * longer. Indeed, if the GT is clamped to minimum frequency then the load times
>+ * can be in the seconds range. As, there is a limit on how long an individual

				    ^ that comma seems misplaced

>+ * usleep_range() can wait for, the wait is wrapped in a loop. The loop count
>+ * is increased for debug builds so that problems can be detected and analysed.
>+ * For release builds, the timeout is kept short so that user's don't wait
>+ * forever to find out there is a problem. In either case, if the load took longer
>+ * than is reasonable even with some 'sensible' throttling, then flag a warning
>+ * because something is not right.
>+ *
>+ * Note that the only reason an end user should hit the timeout is in case of
>+ * extreme thermal throttling. And a system that is that hot during boot is
>+ * probably dead anyway!
>+ */
>+#if defined(CONFIG_DRM_XE_DEBUG)
>+#define GUC_LOAD_RETRY_LIMIT	20
>+#else
>+#define GUC_LOAD_RETRY_LIMIT	3
>+#endif

just sent a comment on this reasoning in the previous version.
Continuing from here.

>+#define GUC_LOAD_TIME_WARN      200

let's also add the unit, _MSEC

Also, comment above says 20, but here we warn on 200. Is that 10x
difference a typo or intended?

>
>+static int guc_wait_ucode(struct xe_guc *guc)
>+{
>+	struct xe_gt *gt = guc_to_gt(guc);
>+	struct xe_guc_pc *guc_pc = &gt->uc.guc.pc;
>+	ktime_t before, after, delta;
>+	int load_done;
>+	u32 status = 0;
>+	int ret, count;
>+	u64 delta_ms;
>+	u32 before_freq;
>+
>+	before_freq = xe_guc_pc_get_act_freq(guc_pc);
>+	before = ktime_get();
> 	/*
>-	 * Wait for the GuC to start up.
>-	 * NB: Docs recommend not using the interrupt for completion.
>-	 * Measurements indicate this should take no more than 20ms
>-	 * (assuming the GT clock is at maximum frequency). So, a
>-	 * timeout here indicates that the GuC has failed and is unusable.
>-	 * (Higher levels of the driver may decide to reset the GuC and
>-	 * attempt the ucode load again if this happens.)
>-	 *
>-	 * FIXME: There is a known (but exceedingly unlikely) race condition
>-	 * where the asynchronous frequency management code could reduce
>-	 * the GT clock while a GuC reload is in progress (during a full
>-	 * GT reset). A fix is in progress but there are complex locking
>-	 * issues to be resolved. In the meantime bump the timeout to
>-	 * 200ms. Even at slowest clock, this should be sufficient. And
>-	 * in the working case, a larger timeout makes no difference.
>+	 * Note, can't use any kind of timing information from the call to xe_mmio_wait.
>+	 * It could return a thousand intermediate stages at random times. Instead, must
>+	 * manually track the total time taken and locally implement the timeout.

that's something we may improve in future with a variant with a better
variant of this function to be used on loops.

> 	 */
>-	ret = xe_mmio_wait32(guc_to_gt(guc), GUC_STATUS, GS_UKERNEL_MASK,
>-			     FIELD_PREP(GS_UKERNEL_MASK, XE_GUC_LOAD_STATUS_READY),
>-			     200000, &status, false);
>+	do {
>+		u32 last_status = status & (GS_UKERNEL_MASK | GS_BOOTROM_MASK);
>
>-	if (ret) {
>-		struct drm_device *drm = &xe->drm;
>-
>-		drm_info(drm, "GuC load failed: status = 0x%08X\n", status);
>-		drm_info(drm, "GuC load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>-			 REG_FIELD_GET(GS_MIA_IN_RESET, status),
>-			 REG_FIELD_GET(GS_BOOTROM_MASK, status),
>-			 REG_FIELD_GET(GS_UKERNEL_MASK, status),
>-			 REG_FIELD_GET(GS_MIA_MASK, status),
>-			 REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>-
>-		if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
>-			drm_info(drm, "GuC firmware signature verification failed\n");
>+		/*
>+		 * Wait for any change (intermediate or terminal) in the status register.
>+		 * Note, the return value is a don't care. The only failure code is timeout
>+		 * but the timeouts need to be accumulated over all the intermediate partial
>+		 * timeouts rather than allowing a huge timeout each time. So basically, need
>+		 * to treat a timeout no different to a value change.
>+		 */
>+		xe_mmio_wait32_not(gt, GUC_STATUS, GS_UKERNEL_MASK | GS_BOOTROM_MASK,
>+				   last_status, 1000 * 1000, &status, false);

if I understood correctly the meaning of this function, it would better
be called xe_mmio_wait32_mask() or xe_mmio_wait32_field()?

i.e. you provide the mask (GS_UKERNEL_MASK | GS_BOOTROM_MASK)
and the previous register value and you wait for a change
in that field of the register. Is that the correct meaning of the
function?

ugh... this bool value to show atomic or otherwise shouldn't be there
(but it's pre-existent issue).

Lucas De Marchi