[RFC 5/9] drm/xe/xe_late_bind_fw: Load late binding firmware

Wed Jun 4 05:36:46 UTC 2025

On 08-05-2025 05:14, Daniele Ceraolo Spurio wrote:
>
>
> On 4/29/2025 9:09 AM, Badal Nilawar wrote:
>> Load late binding firmware
>>
>> Signed-off-by: Badal Nilawar <badal.nilawar at intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_device.c       |  2 +
>>   drivers/gpu/drm/xe/xe_late_bind_fw.c | 91 +++++++++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_late_bind_fw.h |  1 +
>>   3 files changed, 92 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device.c 
>> b/drivers/gpu/drm/xe/xe_device.c
>> index d83864e7189c..30a416323b37 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -894,6 +894,8 @@ int xe_device_probe(struct xe_device *xe)
>>         xe_late_bind_fw_init(&xe->late_bind);
>>   +    xe_late_bind_fw_load(&xe->late_bind);
>> +
>
> Why does this need to be a separated call from xe_late_bind_fw_init?
In subsequent patches xe_late_bind_fw_load() is also called during
S2idle/S3/rpm resume, so it has to be a separate call from
xe_late_bind_fw_init().
>
>>       err = xe_oa_init(xe);
>>       if (err)
>>           return err;
>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.c 
>> b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> index 297238fd3d16..7d2bc959027d 100644
>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> @@ -16,6 +16,16 @@
>>   #include "xe_late_bind_fw.h"
>>   #include "xe_pcode.h"
>>   #include "xe_pcode_api.h"
>> +#include "xe_pm.h"
>> +
>> +/*
>> + * The component should load quite quickly in most cases, but it 
>> could take
>> + * a bit. Using a very big timeout just to cover the worst case 
>> scenario
>> + */
>> +#define LB_INIT_TIMEOUT_MS 20000
>> +
>> +#define LB_FW_LOAD_RETRY_MAXCOUNT 40
>> +#define LB_FW_LOAD_RETRY_PAUSE_MS 50
>>     static const char * const fw_id_to_name[] = {
>>           [FAN_CONTROL_ID] = "fan_control",
>> @@ -45,6 +55,78 @@ static int late_bind_fw_num_fans(struct 
>> xe_late_bind *late_bind)
>>           return 0;
>>   }
>>   +static void late_bind_work(struct work_struct *work)
>> +{
>> +    struct xe_late_bind_fw *lbfw = container_of(work, struct 
>> xe_late_bind_fw, work);
>> +    struct xe_late_bind *late_bind = container_of(lbfw, struct 
>> xe_late_bind,
>> +                              late_bind_fw[lbfw->id]);
>> +    struct xe_device *xe = late_bind_to_xe(late_bind);
>> +    int retry = LB_FW_LOAD_RETRY_MAXCOUNT;
>> +    int ret;
>> +    int slept;
>> +
>> +    if (!late_bind->component_added)
>> +        return;
>> +
>> +    if (!lbfw->valid)
>> +        return;
>> +
>> +    /* we can queue this before the component is bound */
>> +    for (slept = 0; slept < LB_INIT_TIMEOUT_MS; slept += 100) {
>> +        if (late_bind->component)
>> +            break;
>> +        msleep(100);
>> +    }
>> +
>> +    xe_pm_runtime_get(xe);
>> +    mutex_lock(&late_bind->mutex);
>
> You're locking the mutex here but you're not checking that any of the 
> protected data is valid (late_bind->component can be NULL when we exit 
> from the above for loop, or it might have changed from valid to NULL 
> in between).
>
>> +    drm_dbg(&xe->drm, "Load %s firmware\n", fw_id_to_name[lbfw->id]);
>> +
>> +    do {
>> +        ret = 
>> late_bind->component->ops->push_config(late_bind->component->mei_dev,
>> +                                 lbfw->type, lbfw->flags,
>> +                                 lbfw->payload, lbfw->payload_size);
>> +        if (!ret)
>> +            break;
>> +        msleep(LB_FW_LOAD_RETRY_PAUSE_MS);
>> +    } while (--retry && ret == -EAGAIN);
>
> In which scenario can this call return -EAGAIN ? As far as I can see 
> the mei driver only returns that on a non-blocking call, which is not 
> what we're doing here. Am I missing a path somewhere?
I discussed this offline with Sasha: the retry condition should check for
-EBUSY instead of -EAGAIN here, as mei_cldev_enable() can return -EBUSY.
>
>> +
>> +    if (ret)
>> +        drm_err(&xe->drm, "Load %s firmware failed with err %d\n",
>> +            fw_id_to_name[lbfw->id], ret);
>> +    else
>> +        drm_dbg(&xe->drm, "Load %s firmware successful\n",
>> +            fw_id_to_name[lbfw->id]);
>> +
>> +    mutex_unlock(&late_bind->mutex);
>> +    xe_pm_runtime_put(xe);
>> +}
>> +
>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind)
>> +{
>> +    struct xe_device *xe = late_bind_to_xe(late_bind);
>> +    struct xe_late_bind_fw *lbfw;
>> +    int id;
>> +
>> +    if (!late_bind->component_added)
>> +        return -EINVAL;
>> +
>> +    for (id = 0; id < MAX_ID; id++) {
>> +        lbfw = &late_bind->late_bind_fw[id];
>> +        if (lbfw->valid) {
>> +            drm_dbg(&xe->drm, "Queue work: to load %s firmware\n",
>> +                fw_id_to_name[lbfw->id]);
>> +            queue_work(late_bind->wq, &lbfw->work);
>
> Do we need to flush this work before suspend to make sure it has 
> completed before the HW goes into D3Hot? Similarly, do we need to 
> flush it on driver unload to make sure it's done before we de-allocate 
> stuff?
Sure, I will flush this before unbind.
>
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>> +/**
>> + * late_bind_fw_init() - initialize late bind firmware
>> + *
>> + * Return: 0 if the initialization was successful, a negative errno 
>> otherwise.
>> + */
>>   static int late_bind_fw_init(struct xe_late_bind *late_bind, u32 id)
>>   {
>>       struct xe_device *xe = late_bind_to_xe(late_bind);
>> @@ -93,6 +175,7 @@ static int late_bind_fw_init(struct xe_late_bind 
>> *late_bind, u32 id)
>>         memcpy(lb_fw->payload, fw->data, lb_fw->payload_size);
>>       release_firmware(fw);
>> +    INIT_WORK(&lb_fw->work, late_bind_work);
>>       lb_fw->valid = true;
>>         return 0;
>> @@ -108,12 +191,17 @@ int xe_late_bind_fw_init(struct xe_late_bind 
>> *late_bind)
>>       int id;
>>       int ret;
>>   +    late_bind->wq = 
>> create_singlethread_workqueue("late-bind-ordered-wq");
>
> Where is this WQ destroyed? Also, I think that using 
> alloc_ordered_workqueue would be preferred.
I missed that, will fix it.
>
> Daniele
>
>> +    if (!late_bind->wq)
>> +        return -ENOMEM;
>> +
>>       for (id = 0; id < MAX_ID; id++) {
>>           ret = late_bind_fw_init(late_bind, id);
>>           if (ret)
>>               return ret;
>>       }
>> -    return ret;
>> +
>> +    return 0;
>>   }
>>     static int xe_late_bind_component_bind(struct device *xe_kdev,
>> @@ -179,7 +267,6 @@ int xe_late_bind_init(struct xe_late_bind 
>> *late_bind)
>>       }
>>         late_bind->component_added = true;
>> -    /* the component must be removed before unload, so can't use 
>> drmm for cleanup */
>>         return 0;
>>   }
>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.h 
>> b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> index e88c637b15a6..edd0e4c0650e 100644
>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> @@ -13,5 +13,6 @@ struct xe_late_bind;
>>   int xe_late_bind_init(struct xe_late_bind *late_bind);
>>   void xe_late_bind_remove(struct xe_late_bind *late_bind);
>>   int xe_late_bind_fw_init(struct xe_late_bind *late_bind);
>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind);
>>     #endif
>