[RFC 5/9] drm/xe/xe_late_bind_fw: Load late binding firmware

Wed May 7 23:44:18 UTC 2025

On 4/29/2025 9:09 AM, Badal Nilawar wrote:
> Load late binding firmware
>
> Signed-off-by: Badal Nilawar <badal.nilawar at intel.com>
> ---
>   drivers/gpu/drm/xe/xe_device.c       |  2 +
>   drivers/gpu/drm/xe/xe_late_bind_fw.c | 91 +++++++++++++++++++++++++++-
>   drivers/gpu/drm/xe/xe_late_bind_fw.h |  1 +
>   3 files changed, 92 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index d83864e7189c..30a416323b37 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -894,6 +894,8 @@ int xe_device_probe(struct xe_device *xe)
>   
>   	xe_late_bind_fw_init(&xe->late_bind);
>   
> +	xe_late_bind_fw_load(&xe->late_bind);
> +

Why does this need to be a separated call from xe_late_bind_fw_init?

>   	err = xe_oa_init(xe);
>   	if (err)
>   		return err;
> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.c b/drivers/gpu/drm/xe/xe_late_bind_fw.c
> index 297238fd3d16..7d2bc959027d 100644
> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.c
> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.c
> @@ -16,6 +16,16 @@
>   #include "xe_late_bind_fw.h"
>   #include "xe_pcode.h"
>   #include "xe_pcode_api.h"
> +#include "xe_pm.h"
> +
> +/*
> + * The component should load quite quickly in most cases, but it could take
> + * a bit. Using a very big timeout just to cover the worst case scenario
> + */
> +#define LB_INIT_TIMEOUT_MS 20000
> +
> +#define LB_FW_LOAD_RETRY_MAXCOUNT 40
> +#define LB_FW_LOAD_RETRY_PAUSE_MS 50
>   
>   static const char * const fw_id_to_name[] = {
>   		[FAN_CONTROL_ID] = "fan_control",
> @@ -45,6 +55,78 @@ static int late_bind_fw_num_fans(struct xe_late_bind *late_bind)
>   		return 0;
>   }
>   
> +static void late_bind_work(struct work_struct *work)
> +{
> +	struct xe_late_bind_fw *lbfw = container_of(work, struct xe_late_bind_fw, work);
> +	struct xe_late_bind *late_bind = container_of(lbfw, struct xe_late_bind,
> +						      late_bind_fw[lbfw->id]);
> +	struct xe_device *xe = late_bind_to_xe(late_bind);
> +	int retry = LB_FW_LOAD_RETRY_MAXCOUNT;
> +	int ret;
> +	int slept;
> +
> +	if (!late_bind->component_added)
> +		return;
> +
> +	if (!lbfw->valid)
> +		return;
> +
> +	/* we can queue this before the component is bound */
> +	for (slept = 0; slept < LB_INIT_TIMEOUT_MS; slept += 100) {
> +		if (late_bind->component)
> +			break;
> +		msleep(100);
> +	}
> +
> +	xe_pm_runtime_get(xe);
> +	mutex_lock(&late_bind->mutex);

You're locking the mutex here but you're not checking that any of the 
protected data is valid (late_bind->component can be NULL when we exit 
from the above for loop, or it might have changed from valid to NULL in 
between).

> +	drm_dbg(&xe->drm, "Load %s firmware\n", fw_id_to_name[lbfw->id]);
> +
> +	do {
> +		ret = late_bind->component->ops->push_config(late_bind->component->mei_dev,
> +							     lbfw->type, lbfw->flags,
> +							     lbfw->payload, lbfw->payload_size);
> +		if (!ret)
> +			break;
> +		msleep(LB_FW_LOAD_RETRY_PAUSE_MS);
> +	} while (--retry && ret == -EAGAIN);

In which scenario can this call return -EAGAIN ? As far as I can see the 
mei driver only returns that on a non-blocking call, which is not what 
we're doing here. Am I missing a path somewhere?

> +
> +	if (ret)
> +		drm_err(&xe->drm, "Load %s firmware failed with err %d\n",
> +			fw_id_to_name[lbfw->id], ret);
> +	else
> +		drm_dbg(&xe->drm, "Load %s firmware successful\n",
> +			fw_id_to_name[lbfw->id]);
> +
> +	mutex_unlock(&late_bind->mutex);
> +	xe_pm_runtime_put(xe);
> +}
> +
> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind)
> +{
> +	struct xe_device *xe = late_bind_to_xe(late_bind);
> +	struct xe_late_bind_fw *lbfw;
> +	int id;
> +
> +	if (!late_bind->component_added)
> +		return -EINVAL;
> +
> +	for (id = 0; id < MAX_ID; id++) {
> +		lbfw = &late_bind->late_bind_fw[id];
> +		if (lbfw->valid) {
> +			drm_dbg(&xe->drm, "Queue work: to load %s firmware\n",
> +				fw_id_to_name[lbfw->id]);
> +			queue_work(late_bind->wq, &lbfw->work);

Do we need to flush this work before suspend to make sure it has 
completed before the HW goes into D3Hot? Similarly, do we need to flush 
it on driver unload to make sure it's done before we de-allocate stuff?

> +		}
> +	}
> +	return 0;
> +}
> +
> +/**
> + * late_bind_fw_init() - initialize late bind firmware
> + *
> + * Return: 0 if the initialization was successful, a negative errno otherwise.
> + */
>   static int late_bind_fw_init(struct xe_late_bind *late_bind, u32 id)
>   {
>   	struct xe_device *xe = late_bind_to_xe(late_bind);
> @@ -93,6 +175,7 @@ static int late_bind_fw_init(struct xe_late_bind *late_bind, u32 id)
>   
>   	memcpy(lb_fw->payload, fw->data, lb_fw->payload_size);
>   	release_firmware(fw);
> +	INIT_WORK(&lb_fw->work, late_bind_work);
>   	lb_fw->valid = true;
>   
>   	return 0;
> @@ -108,12 +191,17 @@ int xe_late_bind_fw_init(struct xe_late_bind *late_bind)
>   	int id;
>   	int ret;
>   
> +	late_bind->wq = create_singlethread_workqueue("late-bind-ordered-wq");

Where is this WQ destroyed? Also, I think that using 
alloc_ordered_workqueue would be preferred.

Daniele

> +	if (!late_bind->wq)
> +		return -ENOMEM;
> +
>   	for (id = 0; id < MAX_ID; id++) {
>   		ret = late_bind_fw_init(late_bind, id);
>   		if (ret)
>   			return ret;
>   	}
> -	return ret;
> +
> +	return 0;
>   }
>   
>   static int xe_late_bind_component_bind(struct device *xe_kdev,
> @@ -179,7 +267,6 @@ int xe_late_bind_init(struct xe_late_bind *late_bind)
>   	}
>   
>   	late_bind->component_added = true;
> -	/* the component must be removed before unload, so can't use drmm for cleanup */
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.h b/drivers/gpu/drm/xe/xe_late_bind_fw.h
> index e88c637b15a6..edd0e4c0650e 100644
> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.h
> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.h
> @@ -13,5 +13,6 @@ struct xe_late_bind;
>   int xe_late_bind_init(struct xe_late_bind *late_bind);
>   void xe_late_bind_remove(struct xe_late_bind *late_bind);
>   int xe_late_bind_fw_init(struct xe_late_bind *late_bind);
> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind);
>   
>   #endif