[PATCH v4 2/2] drm/xe/guc: Port over the slow GuC loading support from i915

Mon Apr 8 23:48:02 UTC 2024

On 4/4/2024 14:07, Lucas De Marchi wrote:
> On Thu, Apr 04, 2024 at 01:11:34PM -0700, John Harrison wrote:
>> On 4/4/2024 13:03, Lucas De Marchi wrote:
>>> On Thu, Apr 04, 2024 at 12:52:45PM -0700, John Harrison wrote:
>>>> On 4/4/2024 12:19, Lucas De Marchi wrote:
>>>>> On Tue, Feb 27, 2024 at 05:09:56PM -0800, 
>>>>> John.C.Harrison at Intel.com wrote:
>>>>>> From: John Harrison <John.C.Harrison at Intel.com>
>>>>>>
>>>>>> GuC loading can take longer than it is supposed to for various
>>>>>> reasons. So add in the code to cope with that and to report it 
>>>>>> when it
>>>>>> happens. There are also many different reasons why GuC loading can
>>>>>> fail, so add in the code for checking for those and for reporting
>>>>>> issues in a meaningful manner rather than just hitting a timeout and
>>>>>> saying 'fail: status = %x'.
>>>>>>
>>>>>> Also, remove the 'FIXME' comment about an i915 bug that has never 
>>>>>> been
>>>>>> applicable to Xe!
>>>>>>
>>>>>> v2: Actually report the requested and granted frequencies rather 
>>>>>> than
>>>>>> showing granted twice (review feedback from Badal).
>>>>>> v3: Locally code all the timeout and end condition handling 
>>>>>> because a
>>>>>> helper function is not allowed (review feedback from Lucas/Rodrigo).
>>>>>>
>>>>>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>>>>>> ---
>>>>>> drivers/gpu/drm/xe/abi/guc_errors_abi.h |  26 ++-
>>>>>> drivers/gpu/drm/xe/regs/xe_guc_regs.h   |   2 +
>>>>>> drivers/gpu/drm/xe/xe_guc.c             | 226 
>>>>>> +++++++++++++++++++-----
>>>>>> drivers/gpu/drm/xe/xe_mmio.c            |  61 +++++++
>>>>>> drivers/gpu/drm/xe/xe_mmio.h            |   2 +
>>>>>> 5 files changed, 274 insertions(+), 43 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/xe/abi/guc_errors_abi.h 
>>>>>> b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>>>> index ec83551bf9c0..d0b5fed6876f 100644
>>>>>> --- a/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>>>> +++ b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>>>>> @@ -7,8 +7,12 @@
>>>>>> #define _ABI_GUC_ERRORS_ABI_H
>>>>>>
>>>>>> enum xe_guc_response_status {
>>>>>> -    XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>>>>>> -    XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>>>>>> +    XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>>>>>> +    XE_GUC_RESPONSE_NOT_SUPPORTED = 0x20,
>>>>>> +    XE_GUC_RESPONSE_NO_ATTRIBUTE_TABLE = 0x201,
>>>>>> +    XE_GUC_RESPONSE_NO_DECRYPTION_KEY = 0x202,
>>>>>> +    XE_GUC_RESPONSE_DECRYPTION_FAILED = 0x204,
>>>>>> +    XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>>>>>> };
>>>>>>
>>>>>> enum xe_guc_load_status {
>>>>>> @@ -17,6 +21,9 @@ enum xe_guc_load_status {
>>>>>>     XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH = 0x02,
>>>>>>     XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH = 0x03,
>>>>>>     XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE = 0x04,
>>>>>> +    XE_GUC_LOAD_STATUS_HWCONFIG_START = 0x05,
>>>>>> +    XE_GUC_LOAD_STATUS_HWCONFIG_DONE = 0x06,
>>>>>> +    XE_GUC_LOAD_STATUS_HWCONFIG_ERROR = 0x07,
>>>>>>     XE_GUC_LOAD_STATUS_GDT_DONE = 0x10,
>>>>>>     XE_GUC_LOAD_STATUS_IDT_DONE = 0x20,
>>>>>>     XE_GUC_LOAD_STATUS_LAPIC_DONE = 0x30,
>>>>>> @@ -34,4 +41,19 @@ enum xe_guc_load_status {
>>>>>>     XE_GUC_LOAD_STATUS_READY = 0xF0,
>>>>>> };
>>>>>>
>>>>>> +enum xe_bootrom_load_status {
>>>>>> +    XE_BOOTROM_STATUS_NO_KEY_FOUND = 0x13,
>>>>>> +    XE_BOOTROM_STATUS_AES_PROD_KEY_FOUND = 0x1A,
>>>>>> +    XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE = 0x2B,
>>>>>> +    XE_BOOTROM_STATUS_RSA_FAILED = 0x50,
>>>>>> +    XE_BOOTROM_STATUS_PAVPC_FAILED = 0x73,
>>>>>> +    XE_BOOTROM_STATUS_WOPCM_FAILED = 0x74,
>>>>>> +    XE_BOOTROM_STATUS_LOADLOC_FAILED = 0x75,
>>>>>> +    XE_BOOTROM_STATUS_JUMP_PASSED = 0x76,
>>>>>> +    XE_BOOTROM_STATUS_JUMP_FAILED = 0x77,
>>>>>> +    XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED = 0x79,
>>>>>> +    XE_BOOTROM_STATUS_MPUMAP_INCORRECT = 0x7A,
>>>>>> +    XE_BOOTROM_STATUS_EXCEPTION = 0x7E,
>>>>>> +};
>>>>>> +
>>>>>> #endif
>>>>>> diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h 
>>>>>> b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>>>> index 4e7f809d2b00..2e54c9e6a4fe 100644
>>>>>> --- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>>>> +++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>>>>> @@ -40,6 +40,8 @@
>>>>>> #define   GS_BOOTROM_JUMP_PASSED REG_FIELD_PREP(GS_BOOTROM_MASK, 
>>>>>> 0x76)
>>>>>> #define   GS_MIA_IN_RESET            REG_BIT(0)
>>>>>>
>>>>>> +#define GUC_HEADER_INFO                XE_REG(0xc014)
>>>>>> +
>>>>>> #define GUC_WOPCM_SIZE                XE_REG(0xc050)
>>>>>> #define   GUC_WOPCM_SIZE_MASK            REG_GENMASK(31, 12)
>>>>>> #define   GUC_WOPCM_SIZE_LOCKED            REG_BIT(0)
>>>>>> diff --git a/drivers/gpu/drm/xe/xe_guc.c 
>>>>>> b/drivers/gpu/drm/xe/xe_guc.c
>>>>>> index 0d2a2dd13f11..771971110600 100644
>>>>>> --- a/drivers/gpu/drm/xe/xe_guc.c
>>>>>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>>>>>> @@ -17,6 +17,7 @@
>>>>>> #include "xe_device.h"
>>>>>> #include "xe_force_wake.h"
>>>>>> #include "xe_gt.h"
>>>>>> +#include "xe_gt_throttle.h"
>>>>>> #include "xe_guc_ads.h"
>>>>>> #include "xe_guc_ct.h"
>>>>>> #include "xe_guc_hwconfig.h"
>>>>>> @@ -461,58 +462,201 @@ static int guc_xfer_rsa(struct xe_guc *guc)
>>>>>>     return 0;
>>>>>> }
>>>>>>
>>>>>> -static int guc_wait_ucode(struct xe_guc *guc)
>>>>>> +/*
>>>>>> + * Check a previously read GuC status register (GUC_STATUS) 
>>>>>> looking for
>>>>>> + * known terminal states (either completion or failure) of 
>>>>>> either the
>>>>>> + * microkernel status field or the boot ROM status field. 
>>>>>> Returns +1 for
>>>>>> + * successful completion, -1 for failure and 0 for any 
>>>>>> intermediate state.
>>>>>> + */
>>>>>> +static int guc_load_done(u32 status)
>>>>>> {
>>>>>> -    struct xe_device *xe = guc_to_xe(guc);
>>>>>> -    u32 status;
>>>>>> -    int ret;
>>>>>> +    u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, status);
>>>>>> +    u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>>>>>> +
>>>>>> +    switch (uk_val) {
>>>>>> +    case XE_GUC_LOAD_STATUS_READY:
>>>>>> +        return 1;
>>>>>> +
>>>>>> +    case XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH:
>>>>>> +    case XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH:
>>>>>> +    case XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE:
>>>>>> +    case XE_GUC_LOAD_STATUS_HWCONFIG_ERROR:
>>>>>> +    case XE_GUC_LOAD_STATUS_DPC_ERROR:
>>>>>> +    case XE_GUC_LOAD_STATUS_EXCEPTION:
>>>>>> +    case XE_GUC_LOAD_STATUS_INIT_DATA_INVALID:
>>>>>> +    case XE_GUC_LOAD_STATUS_MPU_DATA_INVALID:
>>>>>> +    case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID:
>>>>>> +        return -1;
>>>>>> +    }
>>>>>> +
>>>>>> +    switch (br_val) {
>>>>>> +    case XE_BOOTROM_STATUS_NO_KEY_FOUND:
>>>>>> +    case XE_BOOTROM_STATUS_RSA_FAILED:
>>>>>> +    case XE_BOOTROM_STATUS_PAVPC_FAILED:
>>>>>> +    case XE_BOOTROM_STATUS_WOPCM_FAILED:
>>>>>> +    case XE_BOOTROM_STATUS_LOADLOC_FAILED:
>>>>>> +    case XE_BOOTROM_STATUS_JUMP_FAILED:
>>>>>> +    case XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED:
>>>>>> +    case XE_BOOTROM_STATUS_MPUMAP_INCORRECT:
>>>>>> +    case XE_BOOTROM_STATUS_EXCEPTION:
>>>>>> +    case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE:
>>>>>> +        return -1;
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
>>>>>> +{
>>>>>> +    u32 freq;
>>>>>> +    int ret = xe_guc_pc_get_cur_freq(guc_pc, &freq);
>>>>>> +
>>>>>> +    return ret ? ret : freq;
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * Wait for the GuC to start up.
>>>>>> + *
>>>>>> + * Measurements indicate this should take no more than 20ms 
>>>>>> (assuming the GT
>>>>>> + * clock is at maximum frequency). However, thermal throttling 
>>>>>> and other issues
>>>>>> + * can prevent the clock hitting max and thus making the load 
>>>>>> take significantly
>>>>>> + * longer. Indeed, if the GT is clamped to minimum frequency 
>>>>>> then the load times
>>>>>> + * can be in the seconds range. As, there is a limit on how long 
>>>>>> an individual
>>>>>
>>>>>                     ^ that comma seems misplaced
>>>>>
>>>>>> + * usleep_range() can wait for, the wait is wrapped in a loop. 
>>>>>> The loop count
>>>>>> + * is increased for debug builds so that problems can be 
>>>>>> detected and analysed.
>>>>>> + * For release builds, the timeout is kept short so that user's 
>>>>>> don't wait
>>>>>> + * forever to find out there is a problem. In either case, if 
>>>>>> the load took longer
>>>>>> + * than is reasonable even with some 'sensible' throttling, then 
>>>>>> flag a warning
>>>>>> + * because something is not right.
>>>>>> + *
>>>>>> + * Note that the only reason an end user should hit the timeout 
>>>>>> is in case of
>>>>>> + * extreme thermal throttling. And a system that is that hot 
>>>>>> during boot is
>>>>>> + * probably dead anyway!
>>>>>> + */
>>>>>> +#if defined(CONFIG_DRM_XE_DEBUG)
>>>>>> +#define GUC_LOAD_RETRY_LIMIT    20
>>>>>> +#else
>>>>>> +#define GUC_LOAD_RETRY_LIMIT    3
>>>>>> +#endif
>>>>>
>>>>> just sent a comment on this reasoning in the previous version.
>>>>> Continuing from here.
>>>>>
>>>>>> +#define GUC_LOAD_TIME_WARN 200
>>>>>
>>>>> let's also add the unit, _MSEC
>>>>>
>>>>> Also, comment above says 20, but here we warn on 200. Is that 10x
>>>>> difference a typo or intended?
>>>> It's a safety margin to allow for situations such as a system that 
>>>> just rebooted due to thermal shutdown and is still running hot and 
>>>> therefore slow. I'll update the comment above to explain.
>>>>
>>>>>
>>>>>>
>>>>>> +static int guc_wait_ucode(struct xe_guc *guc)
>>>>>> +{
>>>>>> +    struct xe_gt *gt = guc_to_gt(guc);
>>>>>> +    struct xe_guc_pc *guc_pc = &gt->uc.guc.pc;
>>>>>> +    ktime_t before, after, delta;
>>>>>> +    int load_done;
>>>>>> +    u32 status = 0;
>>>>>> +    int ret, count;
>>>>>> +    u64 delta_ms;
>>>>>> +    u32 before_freq;
>>>>>> +
>>>>>> +    before_freq = xe_guc_pc_get_act_freq(guc_pc);
>>>>>> +    before = ktime_get();
>>>>>>     /*
>>>>>> -     * Wait for the GuC to start up.
>>>>>> -     * NB: Docs recommend not using the interrupt for completion.
>>>>>> -     * Measurements indicate this should take no more than 20ms
>>>>>> -     * (assuming the GT clock is at maximum frequency). So, a
>>>>>> -     * timeout here indicates that the GuC has failed and is 
>>>>>> unusable.
>>>>>> -     * (Higher levels of the driver may decide to reset the GuC and
>>>>>> -     * attempt the ucode load again if this happens.)
>>>>>> -     *
>>>>>> -     * FIXME: There is a known (but exceedingly unlikely) race 
>>>>>> condition
>>>>>> -     * where the asynchronous frequency management code could 
>>>>>> reduce
>>>>>> -     * the GT clock while a GuC reload is in progress (during a 
>>>>>> full
>>>>>> -     * GT reset). A fix is in progress but there are complex 
>>>>>> locking
>>>>>> -     * issues to be resolved. In the meantime bump the timeout to
>>>>>> -     * 200ms. Even at slowest clock, this should be sufficient. And
>>>>>> -     * in the working case, a larger timeout makes no difference.
>>>>>> +     * Note, can't use any kind of timing information from the 
>>>>>> call to xe_mmio_wait.
>>>>>> +     * It could return a thousand intermediate stages at random 
>>>>>> times. Instead, must
>>>>>> +     * manually track the total time taken and locally implement 
>>>>>> the timeout.
>>>>>
>>>>> that's something we may improve in future with a variant with a 
>>>>> better
>>>>> variant of this function to be used on loops.
>>>>
>>>> Or maybe a variant which allows a custom comparison function to be 
>>>> passed in and keeps all the time tracking internal...
>>>
>>> yep, that would be ok I think. Do you want to give this a shot?
>>>
>>> Lucas De Marchi
>>
>> Um. That's exactly what it was using to start with and exactly what 
>> got shot down as not being allowed at all ever under any circumstances.
>
>
> no, that was not the version 1. Pasting here from version 1:
>
>     diff --git a/drivers/gpu/drm/xe/xe_macros.h 
> b/drivers/gpu/drm/xe/xe_macros.h
>     index daf56c846d03..eac8f2c9fba5 100644
>     --- a/drivers/gpu/drm/xe/xe_macros.h
>     +++ b/drivers/gpu/drm/xe/xe_macros.h
>     @@ -15,4 +15,36 @@
>                     "Ioctl argument check failed at %s:%d: %s", \
>                     __FILE__, __LINE__, #cond), 1))
>
>     +/*
>     + * xe_wait_for - magic wait macro
>     + *
>     + * Macro to help avoid open coding check/wait/timeout patterns. 
> Note that it's
>     + * important that we check the condition again after having timed 
> out, since the
>     + * timeout could be due to preemption or similar and we've never 
> had a chance to
>     + * check the condition before the timeout.
>     + */
>     +#define xe_wait_for(COND, US) ({ \
>
> What I'm telling is to have a xe_mmio_wait() variant that is in the
> proper namespace, wait on a mmio, etc. They can even share part of the 
> implementation
> inside xe_mmio.c. Currently the only wait condition is == value. Having
> a different wait condition as return from a functor is not the same thing
> as starting a kitchen-sink-header.h. That's what the push back was on.
Not seeing the difference. As soon as you pass in a callback function, 
it is a kitchen sink helper. The callback can go off and do whatever it 
likes. The only difference is that it enforces a register read before 
calling the callback. You could just pass in a timestamp register and do 
whatever totally unrelated test in the callback. So why bother enforcing 
a register read at all? The only purpose/requirement of the helper is to 
manage the timing. I don't see how restricting a wait helper to only 
wait on register reads is making the driver cleaner or simpler. It just 
makes it more difficult to code up what the task actually requires which 
makes increases the maintenance burden down the line.

Feel free to rewrite this how you think it should be done later, but I 
just do not see any difference so I'm going to keep this as is.

John.

>
> Lucas De Marchi
>
>>
>> John.
>>
>>>
>>>>
>>>>>
>>>>>>      */
>>>>>> -    ret = xe_mmio_wait32(guc_to_gt(guc), GUC_STATUS, 
>>>>>> GS_UKERNEL_MASK,
>>>>>> -                 FIELD_PREP(GS_UKERNEL_MASK, 
>>>>>> XE_GUC_LOAD_STATUS_READY),
>>>>>> -                 200000, &status, false);
>>>>>> +    do {
>>>>>> +        u32 last_status = status & (GS_UKERNEL_MASK | 
>>>>>> GS_BOOTROM_MASK);
>>>>>>
>>>>>> -    if (ret) {
>>>>>> -        struct drm_device *drm = &xe->drm;
>>>>>> -
>>>>>> -        drm_info(drm, "GuC load failed: status = 0x%08X\n", 
>>>>>> status);
>>>>>> -        drm_info(drm, "GuC load failed: status: Reset = %d, 
>>>>>> BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>>>>>> -             REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>>>>> -             REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>>>>> -             REG_FIELD_GET(GS_UKERNEL_MASK, status),
>>>>>> -             REG_FIELD_GET(GS_MIA_MASK, status),
>>>>>> -             REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>>>>>> -
>>>>>> -        if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
>>>>>> -            drm_info(drm, "GuC firmware signature verification 
>>>>>> failed\n");
>>>>>> +        /*
>>>>>> +         * Wait for any change (intermediate or terminal) in the 
>>>>>> status register.
>>>>>> +         * Note, the return value is a don't care. The only 
>>>>>> failure code is timeout
>>>>>> +         * but the timeouts need to be accumulated over all the 
>>>>>> intermediate partial
>>>>>> +         * timeouts rather than allowing a huge timeout each 
>>>>>> time. So basically, need
>>>>>> +         * to treat a timeout no different to a value change.
>>>>>> +         */
>>>>>> +        xe_mmio_wait32_not(gt, GUC_STATUS, GS_UKERNEL_MASK | 
>>>>>> GS_BOOTROM_MASK,
>>>>>> +                   last_status, 1000 * 1000, &status, false);
>>>>>
>>>>> if I understood correctly the meaning of this function, it would 
>>>>> better
>>>>> be called xe_mmio_wait32_mask() or xe_mmio_wait32_field()?
>>>>>
>>>>> i.e. you provide the mask (GS_UKERNEL_MASK | GS_BOOTROM_MASK)
>>>>> and the previous register value and you wait for a change
>>>>> in that field of the register. Is that the correct meaning of the
>>>>> function?
>>>> The test is 'read != given'. That sounds like a not to me. A _mask 
>>>> or _field suffix (to me) would imply just that the regular wait 
>>>> semantics are applied with a mask or to a specific field (which is 
>>>> already the case, isn't it?). I.e. wait until '(read & mask) == 
>>>> given' or even '(read & mask) == (given << shift)' for a field 
>>>> version. This is the negative version of that test. It must wait 
>>>> until the value has changed, i.e. is not equal to the given target.
>>>>
>>>>>
>>>>> ugh... this bool value to show atomic or otherwise shouldn't be there
>>>>> (but it's pre-existent issue).
>>>> Feel free to fix the baseline xe_mmio_wait32 function. I'm just 
>>>> copying that and trying to stay as close as possible but just 
>>>> inverting the comparison logic.
>>>>
>>>> John.
>>>>
>>>>>
>>>>> Lucas De Marchi
>>>>
>>