[PATCH v4 2/2] drm/xe/guc: Port over the slow GuC loading support from i915
Lucas De Marchi
lucas.demarchi at intel.com
Thu Apr 4 21:07:40 UTC 2024
On Thu, Apr 04, 2024 at 01:11:34PM -0700, John Harrison wrote:
>On 4/4/2024 13:03, Lucas De Marchi wrote:
>>On Thu, Apr 04, 2024 at 12:52:45PM -0700, John Harrison wrote:
>>>On 4/4/2024 12:19, Lucas De Marchi wrote:
>>>>On Tue, Feb 27, 2024 at 05:09:56PM -0800,
>>>>John.C.Harrison at Intel.com wrote:
>>>>>From: John Harrison <John.C.Harrison at Intel.com>
>>>>>
>>>>>GuC loading can take longer than it is supposed to for various
>>>>>reasons. So add in the code to cope with that and to report it when it
>>>>>happens. There are also many different reasons why GuC loading can
>>>>>fail, so add in the code for checking for those and for reporting
>>>>>issues in a meaningful manner rather than just hitting a timeout and
>>>>>saying 'fail: status = %x'.
>>>>>
>>>>>Also, remove the 'FIXME' comment about an i915 bug that has never been
>>>>>applicable to Xe!
>>>>>
>>>>>v2: Actually report the requested and granted frequencies rather than
>>>>>showing granted twice (review feedback from Badal).
>>>>>v3: Locally code all the timeout and end condition handling because a
>>>>>helper function is not allowed (review feedback from Lucas/Rodrigo).
>>>>>
>>>>>Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>>>>>---
>>>>>drivers/gpu/drm/xe/abi/guc_errors_abi.h | 26 ++-
>>>>>drivers/gpu/drm/xe/regs/xe_guc_regs.h | 2 +
>>>>>drivers/gpu/drm/xe/xe_guc.c | 226 +++++++++++++++++++-----
>>>>>drivers/gpu/drm/xe/xe_mmio.c | 61 +++++++
>>>>>drivers/gpu/drm/xe/xe_mmio.h | 2 +
>>>>>5 files changed, 274 insertions(+), 43 deletions(-)
>>>>>
>>>>>diff --git a/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>>>b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>>>index ec83551bf9c0..d0b5fed6876f 100644
>>>>>--- a/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>>>+++ b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>>>@@ -7,8 +7,12 @@
>>>>>#define _ABI_GUC_ERRORS_ABI_H
>>>>>
>>>>>enum xe_guc_response_status {
>>>>>- XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>>>>>- XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>>>>>+ XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>>>>>+ XE_GUC_RESPONSE_NOT_SUPPORTED = 0x20,
>>>>>+ XE_GUC_RESPONSE_NO_ATTRIBUTE_TABLE = 0x201,
>>>>>+ XE_GUC_RESPONSE_NO_DECRYPTION_KEY = 0x202,
>>>>>+ XE_GUC_RESPONSE_DECRYPTION_FAILED = 0x204,
>>>>>+ XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>>>>>};
>>>>>
>>>>>enum xe_guc_load_status {
>>>>>@@ -17,6 +21,9 @@ enum xe_guc_load_status {
>>>>> XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH = 0x02,
>>>>> XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH = 0x03,
>>>>> XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE = 0x04,
>>>>>+ XE_GUC_LOAD_STATUS_HWCONFIG_START = 0x05,
>>>>>+ XE_GUC_LOAD_STATUS_HWCONFIG_DONE = 0x06,
>>>>>+ XE_GUC_LOAD_STATUS_HWCONFIG_ERROR = 0x07,
>>>>> XE_GUC_LOAD_STATUS_GDT_DONE = 0x10,
>>>>> XE_GUC_LOAD_STATUS_IDT_DONE = 0x20,
>>>>> XE_GUC_LOAD_STATUS_LAPIC_DONE = 0x30,
>>>>>@@ -34,4 +41,19 @@ enum xe_guc_load_status {
>>>>> XE_GUC_LOAD_STATUS_READY = 0xF0,
>>>>>};
>>>>>
>>>>>+enum xe_bootrom_load_status {
>>>>>+ XE_BOOTROM_STATUS_NO_KEY_FOUND = 0x13,
>>>>>+ XE_BOOTROM_STATUS_AES_PROD_KEY_FOUND = 0x1A,
>>>>>+ XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE = 0x2B,
>>>>>+ XE_BOOTROM_STATUS_RSA_FAILED = 0x50,
>>>>>+ XE_BOOTROM_STATUS_PAVPC_FAILED = 0x73,
>>>>>+ XE_BOOTROM_STATUS_WOPCM_FAILED = 0x74,
>>>>>+ XE_BOOTROM_STATUS_LOADLOC_FAILED = 0x75,
>>>>>+ XE_BOOTROM_STATUS_JUMP_PASSED = 0x76,
>>>>>+ XE_BOOTROM_STATUS_JUMP_FAILED = 0x77,
>>>>>+ XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED = 0x79,
>>>>>+ XE_BOOTROM_STATUS_MPUMAP_INCORRECT = 0x7A,
>>>>>+ XE_BOOTROM_STATUS_EXCEPTION = 0x7E,
>>>>>+};
>>>>>+
>>>>>#endif
>>>>>diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>>>b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>>>index 4e7f809d2b00..2e54c9e6a4fe 100644
>>>>>--- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>>>+++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>>>@@ -40,6 +40,8 @@
>>>>>#define GS_BOOTROM_JUMP_PASSED REG_FIELD_PREP(GS_BOOTROM_MASK, 0x76)
>>>>>#define GS_MIA_IN_RESET REG_BIT(0)
>>>>>
>>>>>+#define GUC_HEADER_INFO XE_REG(0xc014)
>>>>>+
>>>>>#define GUC_WOPCM_SIZE XE_REG(0xc050)
>>>>>#define GUC_WOPCM_SIZE_MASK REG_GENMASK(31, 12)
>>>>>#define GUC_WOPCM_SIZE_LOCKED REG_BIT(0)
>>>>>diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>>>>index 0d2a2dd13f11..771971110600 100644
>>>>>--- a/drivers/gpu/drm/xe/xe_guc.c
>>>>>+++ b/drivers/gpu/drm/xe/xe_guc.c
>>>>>@@ -17,6 +17,7 @@
>>>>>#include "xe_device.h"
>>>>>#include "xe_force_wake.h"
>>>>>#include "xe_gt.h"
>>>>>+#include "xe_gt_throttle.h"
>>>>>#include "xe_guc_ads.h"
>>>>>#include "xe_guc_ct.h"
>>>>>#include "xe_guc_hwconfig.h"
>>>>>@@ -461,58 +462,201 @@ static int guc_xfer_rsa(struct xe_guc *guc)
>>>>> return 0;
>>>>>}
>>>>>
>>>>>-static int guc_wait_ucode(struct xe_guc *guc)
>>>>>+/*
>>>>>+ * Check a previously read GuC status register (GUC_STATUS)
>>>>>looking for
>>>>>+ * known terminal states (either completion or failure) of either the
>>>>>+ * microkernel status field or the boot ROM status field.
>>>>>Returns +1 for
>>>>>+ * successful completion, -1 for failure and 0 for any
>>>>>intermediate state.
>>>>>+ */
>>>>>+static int guc_load_done(u32 status)
>>>>>{
>>>>>- struct xe_device *xe = guc_to_xe(guc);
>>>>>- u32 status;
>>>>>- int ret;
>>>>>+ u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, status);
>>>>>+ u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>>>>>+
>>>>>+ switch (uk_val) {
>>>>>+ case XE_GUC_LOAD_STATUS_READY:
>>>>>+ return 1;
>>>>>+
>>>>>+ case XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH:
>>>>>+ case XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH:
>>>>>+ case XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE:
>>>>>+ case XE_GUC_LOAD_STATUS_HWCONFIG_ERROR:
>>>>>+ case XE_GUC_LOAD_STATUS_DPC_ERROR:
>>>>>+ case XE_GUC_LOAD_STATUS_EXCEPTION:
>>>>>+ case XE_GUC_LOAD_STATUS_INIT_DATA_INVALID:
>>>>>+ case XE_GUC_LOAD_STATUS_MPU_DATA_INVALID:
>>>>>+ case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID:
>>>>>+ return -1;
>>>>>+ }
>>>>>+
>>>>>+ switch (br_val) {
>>>>>+ case XE_BOOTROM_STATUS_NO_KEY_FOUND:
>>>>>+ case XE_BOOTROM_STATUS_RSA_FAILED:
>>>>>+ case XE_BOOTROM_STATUS_PAVPC_FAILED:
>>>>>+ case XE_BOOTROM_STATUS_WOPCM_FAILED:
>>>>>+ case XE_BOOTROM_STATUS_LOADLOC_FAILED:
>>>>>+ case XE_BOOTROM_STATUS_JUMP_FAILED:
>>>>>+ case XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED:
>>>>>+ case XE_BOOTROM_STATUS_MPUMAP_INCORRECT:
>>>>>+ case XE_BOOTROM_STATUS_EXCEPTION:
>>>>>+ case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE:
>>>>>+ return -1;
>>>>>+ }
>>>>>+
>>>>>+ return 0;
>>>>>+}
>>>>>+
>>>>>+static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
>>>>>+{
>>>>>+ u32 freq;
>>>>>+ int ret = xe_guc_pc_get_cur_freq(guc_pc, &freq);
>>>>>+
>>>>>+ return ret ? ret : freq;
>>>>>+}
>>>>>+
>>>>>+/*
>>>>>+ * Wait for the GuC to start up.
>>>>>+ *
>>>>>+ * Measurements indicate this should take no more than 20ms
>>>>>(assuming the GT
>>>>>+ * clock is at maximum frequency). However, thermal
>>>>>throttling and other issues
>>>>>+ * can prevent the clock hitting max and thus making the load
>>>>>take significantly
>>>>>+ * longer. Indeed, if the GT is clamped to minimum frequency
>>>>>then the load times
>>>>>+ * can be in the seconds range. As, there is a limit on how
>>>>>long an individual
>>>>
>>>> ^ that comma seems misplaced
>>>>
>>>>>+ * usleep_range() can wait for, the wait is wrapped in a
>>>>>loop. The loop count
>>>>>+ * is increased for debug builds so that problems can be
>>>>>detected and analysed.
>>>>>+ * For release builds, the timeout is kept short so that
>>>>>user's don't wait
>>>>>+ * forever to find out there is a problem. In either case, if
>>>>>the load took longer
>>>>>+ * than is reasonable even with some 'sensible' throttling,
>>>>>then flag a warning
>>>>>+ * because something is not right.
>>>>>+ *
>>>>>+ * Note that the only reason an end user should hit the
>>>>>timeout is in case of
>>>>>+ * extreme thermal throttling. And a system that is that hot
>>>>>during boot is
>>>>>+ * probably dead anyway!
>>>>>+ */
>>>>>+#if defined(CONFIG_DRM_XE_DEBUG)
>>>>>+#define GUC_LOAD_RETRY_LIMIT 20
>>>>>+#else
>>>>>+#define GUC_LOAD_RETRY_LIMIT 3
>>>>>+#endif
>>>>
>>>>just sent a comment on this reasoning in the previous version.
>>>>Continuing from here.
>>>>
>>>>>+#define GUC_LOAD_TIME_WARN 200
>>>>
>>>>let's also add the unit, _MSEC
>>>>
>>>>Also, comment above says 20, but here we warn on 200. Is that 10x
>>>>difference a typo or intended?
>>>It's a safety margin to allow for situations such as a system that
>>>just rebooted due to thermal shutdown and is still running hot and
>>>therefore slow. I'll update the comment above to explain.
>>>
>>>>
>>>>>
>>>>>+static int guc_wait_ucode(struct xe_guc *guc)
>>>>>+{
>>>>>+ struct xe_gt *gt = guc_to_gt(guc);
>>>>>+ struct xe_guc_pc *guc_pc = >->uc.guc.pc;
>>>>>+ ktime_t before, after, delta;
>>>>>+ int load_done;
>>>>>+ u32 status = 0;
>>>>>+ int ret, count;
>>>>>+ u64 delta_ms;
>>>>>+ u32 before_freq;
>>>>>+
>>>>>+ before_freq = xe_guc_pc_get_act_freq(guc_pc);
>>>>>+ before = ktime_get();
>>>>> /*
>>>>>- * Wait for the GuC to start up.
>>>>>- * NB: Docs recommend not using the interrupt for completion.
>>>>>- * Measurements indicate this should take no more than 20ms
>>>>>- * (assuming the GT clock is at maximum frequency). So, a
>>>>>- * timeout here indicates that the GuC has failed and is
>>>>>unusable.
>>>>>- * (Higher levels of the driver may decide to reset the GuC and
>>>>>- * attempt the ucode load again if this happens.)
>>>>>- *
>>>>>- * FIXME: There is a known (but exceedingly unlikely)
>>>>>race condition
>>>>>- * where the asynchronous frequency management code could reduce
>>>>>- * the GT clock while a GuC reload is in progress (during a full
>>>>>- * GT reset). A fix is in progress but there are complex locking
>>>>>- * issues to be resolved. In the meantime bump the timeout to
>>>>>- * 200ms. Even at slowest clock, this should be sufficient. And
>>>>>- * in the working case, a larger timeout makes no difference.
>>>>>+ * Note, can't use any kind of timing information from
>>>>>the call to xe_mmio_wait.
>>>>>+ * It could return a thousand intermediate stages at
>>>>>random times. Instead, must
>>>>>+ * manually track the total time taken and locally
>>>>>implement the timeout.
>>>>
>>>>that's something we may improve in future with a variant with a better
>>>>variant of this function to be used on loops.
>>>
>>>Or maybe a variant which allows a custom comparison function to be
>>>passed in and keeps all the time tracking internal...
>>
>>yep, that would be ok I think. Do you want to give this a shot?
>>
>>Lucas De Marchi
>
>Um. That's exactly what it was using to start with and exactly what
>got shot down as not being allowed at all ever under any
>circumstances.
no, that was not the version 1. Pasting here from version 1:
diff --git a/drivers/gpu/drm/xe/xe_macros.h b/drivers/gpu/drm/xe/xe_macros.h
index daf56c846d03..eac8f2c9fba5 100644
--- a/drivers/gpu/drm/xe/xe_macros.h
+++ b/drivers/gpu/drm/xe/xe_macros.h
@@ -15,4 +15,36 @@
"Ioctl argument check failed at %s:%d: %s", \
__FILE__, __LINE__, #cond), 1))
+/*
+ * xe_wait_for - magic wait macro
+ *
+ * Macro to help avoid open coding check/wait/timeout patterns. Note that it's
+ * important that we check the condition again after having timed out, since the
+ * timeout could be due to preemption or similar and we've never had a chance to
+ * check the condition before the timeout.
+ */
+#define xe_wait_for(COND, US) ({ \
What I'm telling is to have a xe_mmio_wait() variant that is in the
proper namespace, wait on a mmio, etc. They can even share part of the implementation
inside xe_mmio.c. Currently the only wait condition is == value. Having
a different wait condition as return from a functor is not the same thing
as starting a kitchen-sink-header.h. That's what the push back was on.
Lucas De Marchi
>
>John.
>
>>
>>>
>>>>
>>>>> */
>>>>>- ret = xe_mmio_wait32(guc_to_gt(guc), GUC_STATUS, GS_UKERNEL_MASK,
>>>>>- FIELD_PREP(GS_UKERNEL_MASK,
>>>>>XE_GUC_LOAD_STATUS_READY),
>>>>>- 200000, &status, false);
>>>>>+ do {
>>>>>+ u32 last_status = status & (GS_UKERNEL_MASK |
>>>>>GS_BOOTROM_MASK);
>>>>>
>>>>>- if (ret) {
>>>>>- struct drm_device *drm = &xe->drm;
>>>>>-
>>>>>- drm_info(drm, "GuC load failed: status = 0x%08X\n", status);
>>>>>- drm_info(drm, "GuC load failed: status: Reset = %d,
>>>>>BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth =
>>>>>0x%02X\n",
>>>>>- REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>>>>- REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>>>>- REG_FIELD_GET(GS_UKERNEL_MASK, status),
>>>>>- REG_FIELD_GET(GS_MIA_MASK, status),
>>>>>- REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>>>>>-
>>>>>- if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
>>>>>- drm_info(drm, "GuC firmware signature
>>>>>verification failed\n");
>>>>>+ /*
>>>>>+ * Wait for any change (intermediate or terminal) in
>>>>>the status register.
>>>>>+ * Note, the return value is a don't care. The only
>>>>>failure code is timeout
>>>>>+ * but the timeouts need to be accumulated over all
>>>>>the intermediate partial
>>>>>+ * timeouts rather than allowing a huge timeout each
>>>>>time. So basically, need
>>>>>+ * to treat a timeout no different to a value change.
>>>>>+ */
>>>>>+ xe_mmio_wait32_not(gt, GUC_STATUS, GS_UKERNEL_MASK |
>>>>>GS_BOOTROM_MASK,
>>>>>+ last_status, 1000 * 1000, &status, false);
>>>>
>>>>if I understood correctly the meaning of this function, it would better
>>>>be called xe_mmio_wait32_mask() or xe_mmio_wait32_field()?
>>>>
>>>>i.e. you provide the mask (GS_UKERNEL_MASK | GS_BOOTROM_MASK)
>>>>and the previous register value and you wait for a change
>>>>in that field of the register. Is that the correct meaning of the
>>>>function?
>>>The test is 'read != given'. That sounds like a not to me. A _mask
>>>or _field suffix (to me) would imply just that the regular wait
>>>semantics are applied with a mask or to a specific field (which is
>>>already the case, isn't it?). I.e. wait until '(read & mask) ==
>>>given' or even '(read & mask) == (given << shift)' for a field
>>>version. This is the negative version of that test. It must wait
>>>until the value has changed, i.e. is not equal to the given
>>>target.
>>>
>>>>
>>>>ugh... this bool value to show atomic or otherwise shouldn't be there
>>>>(but it's pre-existent issue).
>>>Feel free to fix the baseline xe_mmio_wait32 function. I'm just
>>>copying that and trying to stay as close as possible but just
>>>inverting the comparison logic.
>>>
>>>John.
>>>
>>>>
>>>>Lucas De Marchi
>>>
>
More information about the Intel-xe
mailing list