[PATCH v4 2/2] drm/xe/guc: Port over the slow GuC loading support from i915
John Harrison
john.c.harrison at intel.com
Thu Apr 4 20:11:34 UTC 2024
On 4/4/2024 13:03, Lucas De Marchi wrote:
> On Thu, Apr 04, 2024 at 12:52:45PM -0700, John Harrison wrote:
>> On 4/4/2024 12:19, Lucas De Marchi wrote:
>>> On Tue, Feb 27, 2024 at 05:09:56PM -0800, John.C.Harrison at Intel.com
>>> wrote:
>>>> From: John Harrison <John.C.Harrison at Intel.com>
>>>>
>>>> GuC loading can take longer than it is supposed to for various
>>>> reasons. So add in the code to cope with that and to report it when it
>>>> happens. There are also many different reasons why GuC loading can
>>>> fail, so add in the code for checking for those and for reporting
>>>> issues in a meaningful manner rather than just hitting a timeout and
>>>> saying 'fail: status = %x'.
>>>>
>>>> Also, remove the 'FIXME' comment about an i915 bug that has never been
>>>> applicable to Xe!
>>>>
>>>> v2: Actually report the requested and granted frequencies rather than
>>>> showing granted twice (review feedback from Badal).
>>>> v3: Locally code all the timeout and end condition handling because a
>>>> helper function is not allowed (review feedback from Lucas/Rodrigo).
>>>>
>>>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>>>> ---
>>>> drivers/gpu/drm/xe/abi/guc_errors_abi.h | 26 ++-
>>>> drivers/gpu/drm/xe/regs/xe_guc_regs.h | 2 +
>>>> drivers/gpu/drm/xe/xe_guc.c | 226 +++++++++++++++++++-----
>>>> drivers/gpu/drm/xe/xe_mmio.c | 61 +++++++
>>>> drivers/gpu/drm/xe/xe_mmio.h | 2 +
>>>> 5 files changed, 274 insertions(+), 43 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>> b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>> index ec83551bf9c0..d0b5fed6876f 100644
>>>> --- a/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>> +++ b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>> @@ -7,8 +7,12 @@
>>>> #define _ABI_GUC_ERRORS_ABI_H
>>>>
>>>> enum xe_guc_response_status {
>>>> - XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>>>> - XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>>>> + XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>>>> + XE_GUC_RESPONSE_NOT_SUPPORTED = 0x20,
>>>> + XE_GUC_RESPONSE_NO_ATTRIBUTE_TABLE = 0x201,
>>>> + XE_GUC_RESPONSE_NO_DECRYPTION_KEY = 0x202,
>>>> + XE_GUC_RESPONSE_DECRYPTION_FAILED = 0x204,
>>>> + XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>>>> };
>>>>
>>>> enum xe_guc_load_status {
>>>> @@ -17,6 +21,9 @@ enum xe_guc_load_status {
>>>> XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH = 0x02,
>>>> XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH = 0x03,
>>>> XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE = 0x04,
>>>> + XE_GUC_LOAD_STATUS_HWCONFIG_START = 0x05,
>>>> + XE_GUC_LOAD_STATUS_HWCONFIG_DONE = 0x06,
>>>> + XE_GUC_LOAD_STATUS_HWCONFIG_ERROR = 0x07,
>>>> XE_GUC_LOAD_STATUS_GDT_DONE = 0x10,
>>>> XE_GUC_LOAD_STATUS_IDT_DONE = 0x20,
>>>> XE_GUC_LOAD_STATUS_LAPIC_DONE = 0x30,
>>>> @@ -34,4 +41,19 @@ enum xe_guc_load_status {
>>>> XE_GUC_LOAD_STATUS_READY = 0xF0,
>>>> };
>>>>
>>>> +enum xe_bootrom_load_status {
>>>> + XE_BOOTROM_STATUS_NO_KEY_FOUND = 0x13,
>>>> + XE_BOOTROM_STATUS_AES_PROD_KEY_FOUND = 0x1A,
>>>> + XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE = 0x2B,
>>>> + XE_BOOTROM_STATUS_RSA_FAILED = 0x50,
>>>> + XE_BOOTROM_STATUS_PAVPC_FAILED = 0x73,
>>>> + XE_BOOTROM_STATUS_WOPCM_FAILED = 0x74,
>>>> + XE_BOOTROM_STATUS_LOADLOC_FAILED = 0x75,
>>>> + XE_BOOTROM_STATUS_JUMP_PASSED = 0x76,
>>>> + XE_BOOTROM_STATUS_JUMP_FAILED = 0x77,
>>>> + XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED = 0x79,
>>>> + XE_BOOTROM_STATUS_MPUMAP_INCORRECT = 0x7A,
>>>> + XE_BOOTROM_STATUS_EXCEPTION = 0x7E,
>>>> +};
>>>> +
>>>> #endif
>>>> diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>> b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>> index 4e7f809d2b00..2e54c9e6a4fe 100644
>>>> --- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>> +++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>> @@ -40,6 +40,8 @@
>>>> #define GS_BOOTROM_JUMP_PASSED REG_FIELD_PREP(GS_BOOTROM_MASK, 0x76)
>>>> #define GS_MIA_IN_RESET REG_BIT(0)
>>>>
>>>> +#define GUC_HEADER_INFO XE_REG(0xc014)
>>>> +
>>>> #define GUC_WOPCM_SIZE XE_REG(0xc050)
>>>> #define GUC_WOPCM_SIZE_MASK REG_GENMASK(31, 12)
>>>> #define GUC_WOPCM_SIZE_LOCKED REG_BIT(0)
>>>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>>> index 0d2a2dd13f11..771971110600 100644
>>>> --- a/drivers/gpu/drm/xe/xe_guc.c
>>>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>>>> @@ -17,6 +17,7 @@
>>>> #include "xe_device.h"
>>>> #include "xe_force_wake.h"
>>>> #include "xe_gt.h"
>>>> +#include "xe_gt_throttle.h"
>>>> #include "xe_guc_ads.h"
>>>> #include "xe_guc_ct.h"
>>>> #include "xe_guc_hwconfig.h"
>>>> @@ -461,58 +462,201 @@ static int guc_xfer_rsa(struct xe_guc *guc)
>>>> return 0;
>>>> }
>>>>
>>>> -static int guc_wait_ucode(struct xe_guc *guc)
>>>> +/*
>>>> + * Check a previously read GuC status register (GUC_STATUS)
>>>> looking for
>>>> + * known terminal states (either completion or failure) of either the
>>>> + * microkernel status field or the boot ROM status field. Returns
>>>> +1 for
>>>> + * successful completion, -1 for failure and 0 for any
>>>> intermediate state.
>>>> + */
>>>> +static int guc_load_done(u32 status)
>>>> {
>>>> - struct xe_device *xe = guc_to_xe(guc);
>>>> - u32 status;
>>>> - int ret;
>>>> + u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, status);
>>>> + u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>>>> +
>>>> + switch (uk_val) {
>>>> + case XE_GUC_LOAD_STATUS_READY:
>>>> + return 1;
>>>> +
>>>> + case XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH:
>>>> + case XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH:
>>>> + case XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE:
>>>> + case XE_GUC_LOAD_STATUS_HWCONFIG_ERROR:
>>>> + case XE_GUC_LOAD_STATUS_DPC_ERROR:
>>>> + case XE_GUC_LOAD_STATUS_EXCEPTION:
>>>> + case XE_GUC_LOAD_STATUS_INIT_DATA_INVALID:
>>>> + case XE_GUC_LOAD_STATUS_MPU_DATA_INVALID:
>>>> + case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID:
>>>> + return -1;
>>>> + }
>>>> +
>>>> + switch (br_val) {
>>>> + case XE_BOOTROM_STATUS_NO_KEY_FOUND:
>>>> + case XE_BOOTROM_STATUS_RSA_FAILED:
>>>> + case XE_BOOTROM_STATUS_PAVPC_FAILED:
>>>> + case XE_BOOTROM_STATUS_WOPCM_FAILED:
>>>> + case XE_BOOTROM_STATUS_LOADLOC_FAILED:
>>>> + case XE_BOOTROM_STATUS_JUMP_FAILED:
>>>> + case XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED:
>>>> + case XE_BOOTROM_STATUS_MPUMAP_INCORRECT:
>>>> + case XE_BOOTROM_STATUS_EXCEPTION:
>>>> + case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE:
>>>> + return -1;
>>>> + }
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
>>>> +{
>>>> + u32 freq;
>>>> + int ret = xe_guc_pc_get_cur_freq(guc_pc, &freq);
>>>> +
>>>> + return ret ? ret : freq;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Wait for the GuC to start up.
>>>> + *
>>>> + * Measurements indicate this should take no more than 20ms
>>>> (assuming the GT
>>>> + * clock is at maximum frequency). However, thermal throttling and
>>>> other issues
>>>> + * can prevent the clock hitting max and thus making the load take
>>>> significantly
>>>> + * longer. Indeed, if the GT is clamped to minimum frequency then
>>>> the load times
>>>> + * can be in the seconds range. As, there is a limit on how long
>>>> an individual
>>>
>>> ^ that comma seems misplaced
>>>
>>>> + * usleep_range() can wait for, the wait is wrapped in a loop. The
>>>> loop count
>>>> + * is increased for debug builds so that problems can be detected
>>>> and analysed.
>>>> + * For release builds, the timeout is kept short so that user's
>>>> don't wait
>>>> + * forever to find out there is a problem. In either case, if the
>>>> load took longer
>>>> + * than is reasonable even with some 'sensible' throttling, then
>>>> flag a warning
>>>> + * because something is not right.
>>>> + *
>>>> + * Note that the only reason an end user should hit the timeout is
>>>> in case of
>>>> + * extreme thermal throttling. And a system that is that hot
>>>> during boot is
>>>> + * probably dead anyway!
>>>> + */
>>>> +#if defined(CONFIG_DRM_XE_DEBUG)
>>>> +#define GUC_LOAD_RETRY_LIMIT 20
>>>> +#else
>>>> +#define GUC_LOAD_RETRY_LIMIT 3
>>>> +#endif
>>>
>>> just sent a comment on this reasoning in the previous version.
>>> Continuing from here.
>>>
>>>> +#define GUC_LOAD_TIME_WARN 200
>>>
>>> let's also add the unit, _MSEC
>>>
>>> Also, comment above says 20, but here we warn on 200. Is that 10x
>>> difference a typo or intended?
>> It's a safety margin to allow for situations such as a system that
>> just rebooted due to thermal shutdown and is still running hot and
>> therefore slow. I'll update the comment above to explain.
>>
>>>
>>>>
>>>> +static int guc_wait_ucode(struct xe_guc *guc)
>>>> +{
>>>> + struct xe_gt *gt = guc_to_gt(guc);
>>>> + struct xe_guc_pc *guc_pc = >->uc.guc.pc;
>>>> + ktime_t before, after, delta;
>>>> + int load_done;
>>>> + u32 status = 0;
>>>> + int ret, count;
>>>> + u64 delta_ms;
>>>> + u32 before_freq;
>>>> +
>>>> + before_freq = xe_guc_pc_get_act_freq(guc_pc);
>>>> + before = ktime_get();
>>>> /*
>>>> - * Wait for the GuC to start up.
>>>> - * NB: Docs recommend not using the interrupt for completion.
>>>> - * Measurements indicate this should take no more than 20ms
>>>> - * (assuming the GT clock is at maximum frequency). So, a
>>>> - * timeout here indicates that the GuC has failed and is
>>>> unusable.
>>>> - * (Higher levels of the driver may decide to reset the GuC and
>>>> - * attempt the ucode load again if this happens.)
>>>> - *
>>>> - * FIXME: There is a known (but exceedingly unlikely) race
>>>> condition
>>>> - * where the asynchronous frequency management code could reduce
>>>> - * the GT clock while a GuC reload is in progress (during a full
>>>> - * GT reset). A fix is in progress but there are complex locking
>>>> - * issues to be resolved. In the meantime bump the timeout to
>>>> - * 200ms. Even at slowest clock, this should be sufficient. And
>>>> - * in the working case, a larger timeout makes no difference.
>>>> + * Note, can't use any kind of timing information from the
>>>> call to xe_mmio_wait.
>>>> + * It could return a thousand intermediate stages at random
>>>> times. Instead, must
>>>> + * manually track the total time taken and locally implement
>>>> the timeout.
>>>
>>> that's something we may improve in future with a variant with a better
>>> variant of this function to be used on loops.
>>
>> Or maybe a variant which allows a custom comparison function to be
>> passed in and keeps all the time tracking internal...
>
> yep, that would be ok I think. Do you want to give this a shot?
>
> Lucas De Marchi
Um. That's exactly what it was using to start with and exactly what got
shot down as not being allowed at all ever under any circumstances.
John.
>
>>
>>>
>>>> */
>>>> - ret = xe_mmio_wait32(guc_to_gt(guc), GUC_STATUS, GS_UKERNEL_MASK,
>>>> - FIELD_PREP(GS_UKERNEL_MASK,
>>>> XE_GUC_LOAD_STATUS_READY),
>>>> - 200000, &status, false);
>>>> + do {
>>>> + u32 last_status = status & (GS_UKERNEL_MASK |
>>>> GS_BOOTROM_MASK);
>>>>
>>>> - if (ret) {
>>>> - struct drm_device *drm = &xe->drm;
>>>> -
>>>> - drm_info(drm, "GuC load failed: status = 0x%08X\n", status);
>>>> - drm_info(drm, "GuC load failed: status: Reset = %d,
>>>> BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>>>> - REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>>> - REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>>> - REG_FIELD_GET(GS_UKERNEL_MASK, status),
>>>> - REG_FIELD_GET(GS_MIA_MASK, status),
>>>> - REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>>>> -
>>>> - if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
>>>> - drm_info(drm, "GuC firmware signature verification
>>>> failed\n");
>>>> + /*
>>>> + * Wait for any change (intermediate or terminal) in the
>>>> status register.
>>>> + * Note, the return value is a don't care. The only
>>>> failure code is timeout
>>>> + * but the timeouts need to be accumulated over all the
>>>> intermediate partial
>>>> + * timeouts rather than allowing a huge timeout each time.
>>>> So basically, need
>>>> + * to treat a timeout no different to a value change.
>>>> + */
>>>> + xe_mmio_wait32_not(gt, GUC_STATUS, GS_UKERNEL_MASK |
>>>> GS_BOOTROM_MASK,
>>>> + last_status, 1000 * 1000, &status, false);
>>>
>>> if I understood correctly the meaning of this function, it would better
>>> be called xe_mmio_wait32_mask() or xe_mmio_wait32_field()?
>>>
>>> i.e. you provide the mask (GS_UKERNEL_MASK | GS_BOOTROM_MASK)
>>> and the previous register value and you wait for a change
>>> in that field of the register. Is that the correct meaning of the
>>> function?
>> The test is 'read != given'. That sounds like a not to me. A _mask or
>> _field suffix (to me) would imply just that the regular wait
>> semantics are applied with a mask or to a specific field (which is
>> already the case, isn't it?). I.e. wait until '(read & mask) ==
>> given' or even '(read & mask) == (given << shift)' for a field
>> version. This is the negative version of that test. It must wait
>> until the value has changed, i.e. is not equal to the given target.
>>
>>>
>>> ugh... this bool value to show atomic or otherwise shouldn't be there
>>> (but it's pre-existent issue).
>> Feel free to fix the baseline xe_mmio_wait32 function. I'm just
>> copying that and trying to stay as close as possible but just
>> inverting the comparison logic.
>>
>> John.
>>
>>>
>>> Lucas De Marchi
>>
More information about the Intel-xe
mailing list