[RFC 5/9] drm/xe/xe_late_bind_fw: Load late binding firmware
Nilawar, Badal
badal.nilawar at intel.com
Wed Jun 4 05:36:46 UTC 2025
On 08-05-2025 05:14, Daniele Ceraolo Spurio wrote:
>
>
> On 4/29/2025 9:09 AM, Badal Nilawar wrote:
>> Load late binding firmware
>>
>> Signed-off-by: Badal Nilawar <badal.nilawar at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_device.c | 2 +
>> drivers/gpu/drm/xe/xe_late_bind_fw.c | 91 +++++++++++++++++++++++++++-
>> drivers/gpu/drm/xe/xe_late_bind_fw.h | 1 +
>> 3 files changed, 92 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>> b/drivers/gpu/drm/xe/xe_device.c
>> index d83864e7189c..30a416323b37 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -894,6 +894,8 @@ int xe_device_probe(struct xe_device *xe)
>> xe_late_bind_fw_init(&xe->late_bind);
>> + xe_late_bind_fw_load(&xe->late_bind);
>> +
>
> Why does this need to be a separated call from xe_late_bind_fw_init?
In subsequent patches xe_late_bind_fw_load called during S2idle/S3/rpm
resume.
>
>> err = xe_oa_init(xe);
>> if (err)
>> return err;
>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> index 297238fd3d16..7d2bc959027d 100644
>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> @@ -16,6 +16,16 @@
>> #include "xe_late_bind_fw.h"
>> #include "xe_pcode.h"
>> #include "xe_pcode_api.h"
>> +#include "xe_pm.h"
>> +
>> +/*
>> + * The component should load quite quickly in most cases, but it
>> could take
>> + * a bit. Using a very big timeout just to cover the worst case
>> scenario
>> + */
>> +#define LB_INIT_TIMEOUT_MS 20000
>> +
>> +#define LB_FW_LOAD_RETRY_MAXCOUNT 40
>> +#define LB_FW_LOAD_RETRY_PAUSE_MS 50
>> static const char * const fw_id_to_name[] = {
>> [FAN_CONTROL_ID] = "fan_control",
>> @@ -45,6 +55,78 @@ static int late_bind_fw_num_fans(struct
>> xe_late_bind *late_bind)
>> return 0;
>> }
>> +static void late_bind_work(struct work_struct *work)
>> +{
>> + struct xe_late_bind_fw *lbfw = container_of(work, struct
>> xe_late_bind_fw, work);
>> + struct xe_late_bind *late_bind = container_of(lbfw, struct
>> xe_late_bind,
>> + late_bind_fw[lbfw->id]);
>> + struct xe_device *xe = late_bind_to_xe(late_bind);
>> + int retry = LB_FW_LOAD_RETRY_MAXCOUNT;
>> + int ret;
>> + int slept;
>> +
>> + if (!late_bind->component_added)
>> + return;
>> +
>> + if (!lbfw->valid)
>> + return;
>> +
>> + /* we can queue this before the component is bound */
>> + for (slept = 0; slept < LB_INIT_TIMEOUT_MS; slept += 100) {
>> + if (late_bind->component)
>> + break;
>> + msleep(100);
>> + }
>> +
>> + xe_pm_runtime_get(xe);
>> + mutex_lock(&late_bind->mutex);
>
> You're locking the mutex here but you're not checking that any of the
> protected data is valid (late_bind->component can be NULL when we exit
> from the above for loop, or it might have changed from valid to NULL
> in between).
>
>> + drm_dbg(&xe->drm, "Load %s firmware\n", fw_id_to_name[lbfw->id]);
>> +
>> + do {
>> + ret =
>> late_bind->component->ops->push_config(late_bind->component->mei_dev,
>> + lbfw->type, lbfw->flags,
>> + lbfw->payload, lbfw->payload_size);
>> + if (!ret)
>> + break;
>> + msleep(LB_FW_LOAD_RETRY_PAUSE_MS);
>> + } while (--retry && ret == -EAGAIN);
>
> In which scenario can this call return -EAGAIN ? As far as I can see
> the mei driver only returns that on a non-blocking call, which is not
> what we're doing here. Am I missing a path somewhere?
Discussed offline with Sasha, we should be using -EBUSY here as
mei_cldev_enable() can return -EBUSY.
>
>> +
>> + if (ret)
>> + drm_err(&xe->drm, "Load %s firmware failed with err %d\n",
>> + fw_id_to_name[lbfw->id], ret);
>> + else
>> + drm_dbg(&xe->drm, "Load %s firmware successful\n",
>> + fw_id_to_name[lbfw->id]);
>> +
>> + mutex_unlock(&late_bind->mutex);
>> + xe_pm_runtime_put(xe);
>> +}
>> +
>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind)
>> +{
>> + struct xe_device *xe = late_bind_to_xe(late_bind);
>> + struct xe_late_bind_fw *lbfw;
>> + int id;
>> +
>> + if (!late_bind->component_added)
>> + return -EINVAL;
>> +
>> + for (id = 0; id < MAX_ID; id++) {
>> + lbfw = &late_bind->late_bind_fw[id];
>> + if (lbfw->valid) {
>> + drm_dbg(&xe->drm, "Queue work: to load %s firmware\n",
>> + fw_id_to_name[lbfw->id]);
>> + queue_work(late_bind->wq, &lbfw->work);
>
> Do we need to flush this work before suspend to make sure it has
> completed before the HW goes into D3Hot? Similarly, do we need to
> flush it on driver unload to make sure it's done before we de-allocate
> stuff?
Sure, I will flush this before unbind.
>
>> + }
>> + }
>> + return 0;
>> +}
>> +
>> +/**
>> + * late_bind_fw_init() - initialize late bind firmware
>> + *
>> + * Return: 0 if the initialization was successful, a negative errno
>> otherwise.
>> + */
>> static int late_bind_fw_init(struct xe_late_bind *late_bind, u32 id)
>> {
>> struct xe_device *xe = late_bind_to_xe(late_bind);
>> @@ -93,6 +175,7 @@ static int late_bind_fw_init(struct xe_late_bind
>> *late_bind, u32 id)
>> memcpy(lb_fw->payload, fw->data, lb_fw->payload_size);
>> release_firmware(fw);
>> + INIT_WORK(&lb_fw->work, late_bind_work);
>> lb_fw->valid = true;
>> return 0;
>> @@ -108,12 +191,17 @@ int xe_late_bind_fw_init(struct xe_late_bind
>> *late_bind)
>> int id;
>> int ret;
>> + late_bind->wq =
>> create_singlethread_workqueue("late-bind-ordered-wq");
>
> Where is this WQ destroyed? Also, I think that using
> alloc_ordered_workqueue would be preferred.
I missed that, will fix it.
>
> Daniele
>
>> + if (!late_bind->wq)
>> + return -ENOMEM;
>> +
>> for (id = 0; id < MAX_ID; id++) {
>> ret = late_bind_fw_init(late_bind, id);
>> if (ret)
>> return ret;
>> }
>> - return ret;
>> +
>> + return 0;
>> }
>> static int xe_late_bind_component_bind(struct device *xe_kdev,
>> @@ -179,7 +267,6 @@ int xe_late_bind_init(struct xe_late_bind
>> *late_bind)
>> }
>> late_bind->component_added = true;
>> - /* the component must be removed before unload, so can't use
>> drmm for cleanup */
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> index e88c637b15a6..edd0e4c0650e 100644
>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> @@ -13,5 +13,6 @@ struct xe_late_bind;
>> int xe_late_bind_init(struct xe_late_bind *late_bind);
>> void xe_late_bind_remove(struct xe_late_bind *late_bind);
>> int xe_late_bind_fw_init(struct xe_late_bind *late_bind);
>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind);
>> #endif
>
More information about the Intel-xe
mailing list