[PATCH v4 01/13] drm/xe: Add callback support for driver remove

Rodrigo Vivi rodrigo.vivi at intel.com
Wed Feb 12 20:01:12 UTC 2025


On Wed, Feb 12, 2025 at 11:35:48AM -0800, Lucas De Marchi wrote:
> xe device probe uses devm cleanup in most places. However there are a
> few cases where this is not possible: when the driver interacts with
> component add/del. In that case, the resource group would be cleaned up
> while the entire device's resources are in the process of being cleaned up.  One
> example is the xe_gsc_proxy and display using that to interact with mei
> and audio.
> 
> Add a callback-based remove so the exception doesn't make the probe
> use multiple error handling styles.
> 
> v2: Change internal API to mimic the devm API. This will make it easier
>     to migrate in future when devm can be used.
> 
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device.c       | 79 ++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_device.h       |  4 ++
>  drivers/gpu/drm/xe/xe_device_types.h | 17 ++++++
>  drivers/gpu/drm/xe/xe_pci.c          |  4 +-
>  4 files changed, 103 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 36d7ffb3b4d90..69bde506ee87e 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -65,6 +65,12 @@
>  
>  #include <generated/xe_wa_oob.h>
>  
> +struct xe_device_remove_action {
> +	struct list_head node;
> +	xe_device_remove_action_t remove;
> +	void *data;
> +};
> +
>  static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>  {
>  	struct xe_device *xe = to_xe_device(dev);
> @@ -746,6 +752,9 @@ int xe_device_probe(struct xe_device *xe)
>  	u8 last_gt;
>  	u8 id;
>  
> +	xe->probing = true;
> +	INIT_LIST_HEAD(&xe->remove_action_list);
> +
>  	xe_pat_init_early(xe);
>  
>  	err = xe_sriov_init(xe);
> @@ -886,6 +895,8 @@ int xe_device_probe(struct xe_device *xe)
>  
>  	xe_vsec_init(xe);
>  
> +	xe->probing = false;
> +
>  	return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
>  
>  err_fini_display:
> @@ -907,6 +918,72 @@ int xe_device_probe(struct xe_device *xe)
>  	return err;
>  }
>  
> +/**
> + * xe_device_call_remove_actions - Call the remove actions
> + * @xe: xe device instance
> + *
> + * This is only to be used by xe_pci and xe_device to call the remove actions
> + * while removing the driver or handling probe failures.
> + */
> +void xe_device_call_remove_actions(struct xe_device *xe)
> +{
> +	struct xe_device_remove_action *ra, *tmp;
> +
> +	list_for_each_entry_safe(ra, tmp, &xe->remove_action_list, node) {
> +		ra->remove(xe, ra->data);
> +		list_del(&ra->node);
> +		kfree(ra);
> +	}
> +
> +	xe->probing = false;
> +}
> +
> +/**
> + * xe_device_add_action_or_reset - Add an action to run on driver removal
> + * @xe: xe device instance
> + * @ra: pointer to the object embedded into the object to cleanup
> + * @remove: function to execute. The @ra is passed as argument
> + *
> + * Example:
> + *
> + * .. code-block:: c
> + *
> + *	static void foo_remove(struct xe_device_remove_action *ra)
> + *	{
> + *		struct xe_foo *foo = container_of(ra, struct xe_foo, remove_action);
> + *		...
> + *	}
> + *
> + *	int xe_foo_init(struct xe_foo *foo)
> + *	{
> + *		...
> + *		xe_device_add_remove_action(xe, &foo->remove_action, foo_remove);
> + *		...
> + *		return 0;
> + *	};

I still believe we should add a note here to highlight that this is the
exception and that devm should be preferred. But that's up to you: the
explanation in the commit message makes more sense now and the patch
is right. I hope we can get some devm solution to handle this component
case. But let's move on:

Reviewed-by: Rodrigo Vivi <rodrigo.vivi at intel.com>

> + */
> +int xe_device_add_action_or_reset(struct xe_device *xe,
> +				  xe_device_remove_action_t action,
> +				  void *data)
> +{
> +	struct xe_device_remove_action *ra;
> +
> +	drm_WARN_ON(&xe->drm, !xe->probing);
> +
> +	ra = kmalloc(sizeof(*ra), GFP_KERNEL);
> +	if (!ra) {
> +		action(xe, data);
> +		return -ENOMEM;
> +	}
> +
> +	INIT_LIST_HEAD(&ra->node);
> +	ra->remove = action;
> +	ra->data = data;
> +	list_add(&ra->node, &xe->remove_action_list);
> +
> +	return 0;
> +}
> +
>  static void xe_device_remove_display(struct xe_device *xe)
>  {
>  	xe_display_unregister(xe);
> @@ -932,6 +1009,8 @@ void xe_device_remove(struct xe_device *xe)
>  
>  	for_each_gt(gt, xe, id)
>  		xe_gt_remove(gt);
> +
> +	xe_device_call_remove_actions(xe);
>  }
>  
>  void xe_device_shutdown(struct xe_device *xe)
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index 0bc3bc8e68030..a6fedf1ef3c7b 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -45,6 +45,10 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
>  				   const struct pci_device_id *ent);
>  int xe_device_probe_early(struct xe_device *xe);
>  int xe_device_probe(struct xe_device *xe);
> +int xe_device_add_action_or_reset(struct xe_device *xe,
> +				  xe_device_remove_action_t action,
> +				  void *data);
> +void xe_device_call_remove_actions(struct xe_device *xe);
>  void xe_device_remove(struct xe_device *xe);
>  void xe_device_shutdown(struct xe_device *xe);
>  
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 833c29fed3a37..b322d49c83c77 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -35,6 +35,7 @@
>  #include "intel_display_device.h"
>  #endif
>  
> +struct xe_device;
>  struct xe_ggtt;
>  struct xe_pat_ops;
>  struct xe_pxp;
> @@ -70,6 +71,8 @@ struct xe_pxp;
>  		 const struct xe_tile * : (const struct xe_device *)((tile__)->xe),	\
>  		 struct xe_tile * : (tile__)->xe)
>  
> +typedef void (*xe_device_remove_action_t)(struct xe_device *xe, void *data);
> +
>  /**
>   * struct xe_vram_region - memory region structure
>   * This is used to describe a memory region in xe
> @@ -428,6 +431,20 @@ struct xe_device {
>  	/** @tiles: device tiles */
>  	struct xe_tile tiles[XE_MAX_TILES_PER_DEVICE];
>  
> +	/**
> +	 * @remove_action_list: list of actions to execute on device remove.
> +	 * Use xe_device_add_remove_action() for that. Actions can only be added
> +	 * during probe and are executed during the call from PCI subsystem to
> +	 * remove the driver from the device.
> +	 */
> +	struct list_head remove_action_list;
> +
> +	/**
> +	 * @probing: cover the section in which @remove_action_list can be used
> +	 * to post cleaning actions
> +	 */
> +	bool probing;
> +
>  	/**
>  	 * @mem_access: keep track of memory access in the device, possibly
>  	 * triggering additional actions when they occur.
> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> index 6a8e82aff3853..70b697fde5b96 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -905,8 +905,10 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>  		return err;
>  
>  	err = xe_device_probe(xe);
> -	if (err)
> +	if (err) {
> +		xe_device_call_remove_actions(xe);
>  		return err;
> +	}
>  
>  	err = xe_pm_init(xe);
>  	if (err)
> -- 
> 2.48.1
> 


More information about the Intel-xe mailing list