[PATCH v2 2/2] drm/xe/guc: Port over the slow GuC loading support from i915

Wed Feb 14 05:39:25 UTC 2024


On 14-02-2024 07:44, John Harrison wrote:
> On 2/12/2024 21:17, Nilawar, Badal wrote:
>> On 13-02-2024 06:04, John.C.Harrison at Intel.com wrote:
>>> From: John Harrison <John.C.Harrison at Intel.com>
>>>
>>> GuC loading can take longer than it is supposed to for various
>>> reasons. So add in the code to cope with that and to report it when it
>>> happens. There are also many different reasons why GuC loading can
>>> fail, so add in the code for checking for those and for reporting
>>> issues in a meaningful manner rather than just hitting a timeout and
>>> saying 'fail: status = %x'.
>>>
>>> Also, remove the 'FIXME' comment about an i915 bug that has never been
>>> applicable to Xe!
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/abi/guc_errors_abi.h |  26 +++-
>>>   drivers/gpu/drm/xe/regs/xe_guc_regs.h   |   2 +
>>>   drivers/gpu/drm/xe/xe_guc.c             | 197 +++++++++++++++++++-----
>>>   drivers/gpu/drm/xe/xe_macros.h          |  32 ++++
>>>   4 files changed, 214 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/abi/guc_errors_abi.h 
>>> b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>> index ec83551bf9c0..d0b5fed6876f 100644
>>> --- a/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>> +++ b/drivers/gpu/drm/xe/abi/guc_errors_abi.h
>>> @@ -7,8 +7,12 @@
>>>   #define _ABI_GUC_ERRORS_ABI_H
>>>     enum xe_guc_response_status {
>>> -    XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0,
>>> -    XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000,
>>> +    XE_GUC_RESPONSE_STATUS_SUCCESS                      = 0x0,
>>> +    XE_GUC_RESPONSE_NOT_SUPPORTED                       = 0x20,
>>> +    XE_GUC_RESPONSE_NO_ATTRIBUTE_TABLE                  = 0x201,
>>> +    XE_GUC_RESPONSE_NO_DECRYPTION_KEY                   = 0x202,
>>> +    XE_GUC_RESPONSE_DECRYPTION_FAILED                   = 0x204,
>>> +    XE_GUC_RESPONSE_STATUS_GENERIC_FAIL                 = 0xF000,
>>>   };
>>>     enum xe_guc_load_status {
>>> @@ -17,6 +21,9 @@ enum xe_guc_load_status {
>>>       XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH       = 0x02,
>>>       XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH       = 0x03,
>>>       XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE      = 0x04,
>>> +    XE_GUC_LOAD_STATUS_HWCONFIG_START                   = 0x05,
>>> +    XE_GUC_LOAD_STATUS_HWCONFIG_DONE                    = 0x06,
>>> +    XE_GUC_LOAD_STATUS_HWCONFIG_ERROR                   = 0x07,
>>>       XE_GUC_LOAD_STATUS_GDT_DONE                         = 0x10,
>>>       XE_GUC_LOAD_STATUS_IDT_DONE                         = 0x20,
>>>       XE_GUC_LOAD_STATUS_LAPIC_DONE                       = 0x30,
>>> @@ -34,4 +41,19 @@ enum xe_guc_load_status {
>>>       XE_GUC_LOAD_STATUS_READY                            = 0xF0,
>>>   };
>>>   +enum xe_bootrom_load_status {
>>> +    XE_BOOTROM_STATUS_NO_KEY_FOUND                      = 0x13,
>>> +    XE_BOOTROM_STATUS_AES_PROD_KEY_FOUND                = 0x1A,
>>> +    XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE            = 0x2B,
>>> +    XE_BOOTROM_STATUS_RSA_FAILED                        = 0x50,
>>> +    XE_BOOTROM_STATUS_PAVPC_FAILED                      = 0x73,
>>> +    XE_BOOTROM_STATUS_WOPCM_FAILED                      = 0x74,
>>> +    XE_BOOTROM_STATUS_LOADLOC_FAILED                    = 0x75,
>>> +    XE_BOOTROM_STATUS_JUMP_PASSED                       = 0x76,
>>> +    XE_BOOTROM_STATUS_JUMP_FAILED                       = 0x77,
>>> +    XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED               = 0x79,
>>> +    XE_BOOTROM_STATUS_MPUMAP_INCORRECT                  = 0x7A,
>>> +    XE_BOOTROM_STATUS_EXCEPTION                         = 0x7E,
>>> +};
>>> +
>>>   #endif
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h 
>>> b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>> index 92320bbc9d3d..a30e179e662e 100644
>>> --- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>> +++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
>>> @@ -40,6 +40,8 @@
>>>   #define   GS_BOOTROM_JUMP_PASSED REG_FIELD_PREP(GS_BOOTROM_MASK, 0x76)
>>>   #define   GS_MIA_IN_RESET            REG_BIT(0)
>>>   +#define GUC_HEADER_INFO                XE_REG(0xc014)
>>> +
>>>   #define GUC_WOPCM_SIZE                XE_REG(0xc050)
>>>   #define   GUC_WOPCM_SIZE_MASK            REG_GENMASK(31, 12)
>>>   #define   GUC_WOPCM_SIZE_LOCKED            REG_BIT(0)
>>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>> index 868208a39829..82514d395704 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>>> @@ -16,6 +16,7 @@
>>>   #include "xe_device.h"
>>>   #include "xe_force_wake.h"
>>>   #include "xe_gt.h"
>>> +#include "xe_gt_freq.h"
>>>   #include "xe_guc_ads.h"
>>>   #include "xe_guc_ct.h"
>>>   #include "xe_guc_hwconfig.h"
>>> @@ -427,58 +428,172 @@ static int guc_xfer_rsa(struct xe_guc *guc)
>>>       return 0;
>>>   }
>>>   +/*
>>> + * Read the GuC status register (GUC_STATUS) and store it in the
>>> + * specified location; then return a boolean indicating whether
>>> + * the value matches either completion or a known failure code.
>>> + *
>>> + * This is used for polling the GuC status in an xe_wait_for()
>>> + * loop below.
>>> + */
>>> +static inline bool guc_load_done(struct xe_gt *gt, u32 *status, bool 
>>> *success)
>>> +{
>>> +    u32 val = xe_mmio_read32(gt, GUC_STATUS);
>>> +    u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, val);
>>> +    u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, val);
>>> +
>>> +    *status = val;
>>> +    switch (uk_val) {
>>> +    case XE_GUC_LOAD_STATUS_READY:
>>> +        *success = true;
>>> +        return true;
>>> +
>>> +    case XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH:
>>> +    case XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH:
>>> +    case XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE:
>>> +    case XE_GUC_LOAD_STATUS_HWCONFIG_ERROR:
>>> +    case XE_GUC_LOAD_STATUS_DPC_ERROR:
>>> +    case XE_GUC_LOAD_STATUS_EXCEPTION:
>>> +    case XE_GUC_LOAD_STATUS_INIT_DATA_INVALID:
>>> +    case XE_GUC_LOAD_STATUS_MPU_DATA_INVALID:
>>> +    case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID:
>>> +        *success = false;
>>> +        return true;
>>> +    }
>>> +
>>> +    switch (br_val) {
>>> +    case XE_BOOTROM_STATUS_NO_KEY_FOUND:
>>> +    case XE_BOOTROM_STATUS_RSA_FAILED:
>>> +    case XE_BOOTROM_STATUS_PAVPC_FAILED:
>>> +    case XE_BOOTROM_STATUS_WOPCM_FAILED:
>>> +    case XE_BOOTROM_STATUS_LOADLOC_FAILED:
>>> +    case XE_BOOTROM_STATUS_JUMP_FAILED:
>>> +    case XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED:
>>> +    case XE_BOOTROM_STATUS_MPUMAP_INCORRECT:
>>> +    case XE_BOOTROM_STATUS_EXCEPTION:
>>> +    case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE:
>>> +        *success = false;
>>> +        return true;
>>> +    }
>>> +
>>> +    return false;
>>> +}
>>> +
>>> +/*
>>> + * Wait for the GuC to start up.
>>> + *
>>> + * Measurements indicate this should take no more than 20ms 
>>> (assuming the GT
>>> + * clock is at maximum frequency). However, thermal throttling and 
>>> other issues
>>> + * can prevent the clock hitting max and thus making the load take 
>>> significantly
>>> + * longer. Indeed, if the GT is clamped to minimum frequency then 
>>> the load times
>>> + * can be in the seconds range. As, there is a limit on how long an 
>>> individual
>>> + * usleep_range() can wait for, the wait is wrapped in a loop. The 
>>> loop count
>>> + * is increased for debug builds so that problems can be detected 
>>> and analysed.
>>> + * For release builds, the timeout is kept short so that user's 
>>> don't wait
>>> + * forever to find out there is a problem. In either case, if the 
>>> load took longer
>>> + * than is reasonable even with some 'sensible' throttling, then 
>>> flag a warning
>>> + * because something is not right.
>>> + *
>>> + * Note that the only reason an end user should hit the timeout is 
>>> in case of
>>> + * extreme thermal throttling. And a system that is that hot during 
>>> boot is
>>> + * probably dead anyway!
>>> + */
>>> +#if defined(CONFIG_DRM_XE_DEBUG)
>>> +#define GUC_LOAD_RETRY_LIMIT    20
>>> +#else
>>> +#define GUC_LOAD_RETRY_LIMIT    3
>>> +#endif
>>> +#define GUC_LOAD_TIME_WARN      200
>>> +
>>>   static int guc_wait_ucode(struct xe_guc *guc)
>>>   {
>>> -    struct xe_device *xe = guc_to_xe(guc);
>>> +    struct xe_gt *gt = guc_to_gt(guc);
>>> +    struct xe_guc_pc *guc_pc = &gt->uc.guc.pc;
>>> +    ktime_t before, after, delta;
>>> +    bool success;
>>>       u32 status;
>>> -    int ret;
>>> +    int ret, count;
>>> +    u64 delta_ms;
>>> +    u32 before_freq;
>>> +
>>> +    before_freq = xe_guc_pc_get_act_freq(guc_pc);
>>> +    before = ktime_get();
>>> +    for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
>>> +        ret = xe_wait_for(guc_load_done(gt, &status, &success), 1000 
>>> * 1000);
>>> +        if (!ret || !success)
>>> +            break;
>>> +
>>> +        xe_gt_dbg(gt, "load still in progress, count = %d, freq = 
>>> %dMHz (req %dMHz), status = 0x%08X [0x%02X/%02X]\n",
>>> +              count, xe_guc_pc_get_act_freq(guc_pc),
>>> +              xe_guc_pc_get_act_freq(guc_pc), status,
>> I think this should be current requested frequency xe_guc_pc_get_cur_freq
> No. The point is to report what the actual frequency was to see if that 
> explains why the load is running slowly. The requested frequency is 
> under driver control. That should be at maximum during driver load. The
Is requested freq set to maximum in resume path as well?
> granted frequency is not under driver control. That is the unknown that 
> needs to be reported to see why the system is not working as intended.
Agreed but in the expression "freq = %dMHz (req %dMHz)" actual frequency 
is being printed 2 times. What is significance of "(req %dMHz) here", I 
thought req stands for requested.

Badal
> 
> John.
> 
> 
> 
>>> + REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>> +              REG_FIELD_GET(GS_UKERNEL_MASK, status));
>>> +    }
>>> +    after = ktime_get();
>>> +    delta = ktime_sub(after, before);
>>> +    delta_ms = ktime_to_ms(delta);
>>> +    if (ret || !success) {
>>> +        u32 ukernel = REG_FIELD_GET(GS_UKERNEL_MASK, status);
>>> +        u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>>> +
>>> +        xe_gt_info(gt, "load failed: status = 0x%08X, time = %lldms, 
>>> freq = %dMHz (req %dMHz), ret = %d\n",
>>> +               status, delta_ms, xe_guc_pc_get_act_freq(guc_pc),
>>> +               xe_guc_pc_get_act_freq(guc_pc), ret);
>> Same as above.
>>
>> Regards,
>> Badal
>>> +        xe_gt_info(gt, "load failed: status: Reset = %d, BootROM = 
>>> 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>>> +               REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>> +               bootrom, ukernel,
>>> +               REG_FIELD_GET(GS_MIA_MASK, status),
>>> +               REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>>> +
>>> +        switch (bootrom) {
>>> +        case XE_BOOTROM_STATUS_NO_KEY_FOUND:
>>> +            xe_gt_info(gt, "invalid key requested, header = 0x%08X\n",
>>> +                   xe_mmio_read32(gt, GUC_HEADER_INFO));
>>> +            ret = -ENOEXEC;
>>> +            break;
>>>   -    /*
>>> -     * Wait for the GuC to start up.
>>> -     * NB: Docs recommend not using the interrupt for completion.
>>> -     * Measurements indicate this should take no more than 20ms
>>> -     * (assuming the GT clock is at maximum frequency). So, a
>>> -     * timeout here indicates that the GuC has failed and is unusable.
>>> -     * (Higher levels of the driver may decide to reset the GuC and
>>> -     * attempt the ucode load again if this happens.)
>>> -     *
>>> -     * FIXME: There is a known (but exceedingly unlikely) race 
>>> condition
>>> -     * where the asynchronous frequency management code could reduce
>>> -     * the GT clock while a GuC reload is in progress (during a full
>>> -     * GT reset). A fix is in progress but there are complex locking
>>> -     * issues to be resolved. In the meantime bump the timeout to
>>> -     * 200ms. Even at slowest clock, this should be sufficient. And
>>> -     * in the working case, a larger timeout makes no difference.
>>> -     */
>>> -    ret = xe_mmio_wait32(guc_to_gt(guc), GUC_STATUS, GS_UKERNEL_MASK,
>>> -                 FIELD_PREP(GS_UKERNEL_MASK, XE_GUC_LOAD_STATUS_READY),
>>> -                 200000, &status, false);
>>> +        case XE_BOOTROM_STATUS_RSA_FAILED:
>>> +            xe_gt_info(gt, "firmware signature verification failed\n");
>>> +            ret = -ENOEXEC;
>>> +            break;
>>>   -    if (ret) {
>>> -        struct drm_device *drm = &xe->drm;
>>> -
>>> -        drm_info(drm, "GuC load failed: status = 0x%08X\n", status);
>>> -        drm_info(drm, "GuC load failed: status: Reset = %d, BootROM 
>>> = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>>> -             REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>> -             REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>> -             REG_FIELD_GET(GS_UKERNEL_MASK, status),
>>> -             REG_FIELD_GET(GS_MIA_MASK, status),
>>> -             REG_FIELD_GET(GS_AUTH_STATUS_MASK, status));
>>> -
>>> -        if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
>>> -            drm_info(drm, "GuC firmware signature verification 
>>> failed\n");
>>> +        case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE:
>>> +            xe_gt_info(gt, "firmware production part check failure\n");
>>>               ret = -ENOEXEC;
>>> +            break;
>>>           }
>>>   -        if (REG_FIELD_GET(GS_UKERNEL_MASK, status) ==
>>> -            XE_GUC_LOAD_STATUS_EXCEPTION) {
>>> -            drm_info(drm, "GuC firmware exception. EIP: %#x\n",
>>> -                 xe_mmio_read32(guc_to_gt(guc),
>>> -                        SOFT_SCRATCH(13)));
>>> +        switch (ukernel) {
>>> +        case XE_GUC_LOAD_STATUS_EXCEPTION:
>>> +            xe_gt_info(gt, "firmware exception. EIP: %#x\n",
>>> +                   xe_mmio_read32(gt, SOFT_SCRATCH(13)));
>>>               ret = -ENXIO;
>>> +            break;
>>> +
>>> +        case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID:
>>> +            xe_gt_info(gt, "illegal register in save/restore 
>>> workaround list\n");
>>> +            ret = -EPERM;
>>> +            break;
>>> +
>>> +        case XE_GUC_LOAD_STATUS_HWCONFIG_START:
>>> +            xe_gt_info(gt, "still extracting hwconfig table.\n");
>>> +            ret = -ETIMEDOUT;
>>> +            break;
>>>           }
>>> +
>>> +        /* Uncommon/unexpected error, see earlier status code print 
>>> for details */
>>> +        if (ret == 0)
>>> +            ret = -ENXIO;
>>> +    } else if (delta_ms > GUC_LOAD_TIME_WARN) {
>>> +        xe_gt_warn(gt, "excessive init time: %lldms! [status = 
>>> 0x%08X, count = %d, ret = %d]\n",
>>> +               delta_ms, status, count, ret);
>>> +        xe_gt_warn(gt, "excessive init time: [freq = %dMHz, before = 
>>> %dMHz, perf_limit_reasons = 0x%08X]\n",
>>> +               xe_guc_pc_get_act_freq(guc_pc), before_freq,
>>> +               xe_read_perf_limit_reasons(gt));
>>>       } else {
>>> -        drm_dbg(&xe->drm, "GuC successfully loaded");
>>> +        xe_gt_dbg(gt, "init took %lldms, freq = %dMHz, before = 
>>> %dMHz, status = 0x%08X, count = %d, ret = %d\n",
>>> +              delta_ms, xe_guc_pc_get_act_freq(guc_pc),
>>> +              before_freq, status, count, ret);
>>>       }
>>>         return ret;
>>> diff --git a/drivers/gpu/drm/xe/xe_macros.h 
>>> b/drivers/gpu/drm/xe/xe_macros.h
>>> index daf56c846d03..eac8f2c9fba5 100644
>>> --- a/drivers/gpu/drm/xe/xe_macros.h
>>> +++ b/drivers/gpu/drm/xe/xe_macros.h
>>> @@ -15,4 +15,36 @@
>>>                   "Ioctl argument check failed at %s:%d: %s", \
>>>                   __FILE__, __LINE__, #cond), 1))
>>>   +/*
>>> + * xe_wait_for - magic wait macro
>>> + *
>>> + * Macro to help avoid open coding check/wait/timeout patterns. Note 
>>> that it's
>>> + * important that we check the condition again after having timed 
>>> out, since the
>>> + * timeout could be due to preemption or similar and we've never had 
>>> a chance to
>>> + * check the condition before the timeout.
>>> + */
>>> +#define xe_wait_for(COND, US) ({ \
>>> +    const ktime_t end__ = ktime_add_ns(ktime_get_raw(), 1000ll * 
>>> (US)); \
>>> +    long wait__ = 10; /* recommended min for usleep is 10 us */    \
>>> +    int ret__;                            \
>>> +    might_sleep();                            \
>>> +    for (;;) {                            \
>>> +        const bool expired__ = ktime_after(ktime_get_raw(), end__); \
>>> +        /* Guarantee COND check prior to timeout */        \
>>> +        barrier();                        \
>>> +        if (COND) {                        \
>>> +            ret__ = 0;                    \
>>> +            break;                        \
>>> +        }                            \
>>> +        if (expired__) {                    \
>>> +            ret__ = -ETIMEDOUT;                \
>>> +            break;                        \
>>> +        }                            \
>>> +        usleep_range(wait__, wait__ * 2);            \
>>> +        if (wait__ < (1000))                    \
>>> +            wait__ <<= 1;                    \
>>> +    }                                \
>>> +    ret__;                                \
>>> +})
>>> +
>>>   #endif
>