[PATCH v4 01/13] drm/xe: Add callback support for driver remove

Lucas De Marchi lucas.demarchi at intel.com
Wed Feb 12 20:11:42 UTC 2025


On Wed, Feb 12, 2025 at 03:01:12PM -0500, Rodrigo Vivi wrote:
>On Wed, Feb 12, 2025 at 11:35:48AM -0800, Lucas De Marchi wrote:
>> xe device probe uses devm cleanup in most places. However there are a
>> few cases where this is not possible: when the driver interacts with
>> component add/del. In that case, the resource group would be cleanup
>> while the entire device resources are in the process of cleanup.  One
>> example is the xe_gsc_proxy and display using that to interact with mei
>> and audio.
>>
>> Add a callback-based remove so the exception doesn't make the probe
>> use multiple error handling styles.
>>
>> v2: Change internal API to mimic the devm API. This will make it easier
>>     to migrate in future when devm can be used.
>>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
>> ---
>>  drivers/gpu/drm/xe/xe_device.c       | 79 ++++++++++++++++++++++++++++
>>  drivers/gpu/drm/xe/xe_device.h       |  4 ++
>>  drivers/gpu/drm/xe/xe_device_types.h | 17 ++++++
>>  drivers/gpu/drm/xe/xe_pci.c          |  4 +-
>>  4 files changed, 103 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>> index 36d7ffb3b4d90..69bde506ee87e 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -65,6 +65,12 @@
>>
>>  #include <generated/xe_wa_oob.h>
>>
>> +struct xe_device_remove_action {
>> +	struct list_head node;
>> +	xe_device_remove_action_t remove;
>> +	void *data;
>> +};
>> +
>>  static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>>  {
>>  	struct xe_device *xe = to_xe_device(dev);
>> @@ -746,6 +752,9 @@ int xe_device_probe(struct xe_device *xe)
>>  	u8 last_gt;
>>  	u8 id;
>>
>> +	xe->probing = true;
>> +	INIT_LIST_HEAD(&xe->remove_action_list);
>> +
>>  	xe_pat_init_early(xe);
>>
>>  	err = xe_sriov_init(xe);
>> @@ -886,6 +895,8 @@ int xe_device_probe(struct xe_device *xe)
>>
>>  	xe_vsec_init(xe);
>>
>> +	xe->probing = false;
>> +
>>  	return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
>>
>>  err_fini_display:
>> @@ -907,6 +918,72 @@ int xe_device_probe(struct xe_device *xe)
>>  	return err;
>>  }
>>
>> +/**
>> + * xe_device_call_remove_actions - Call the remove actions
>> + * @xe: xe device instance
>> + *
>> + * This is only to be used by xe_pci and xe_device to call the remove actions
>> + * while removing the driver or handling probe failures.
>> + */
>> +void xe_device_call_remove_actions(struct xe_device *xe)
>> +{
>> +	struct xe_device_remove_action *ra, *tmp;
>> +
>> +	list_for_each_entry_safe(ra, tmp, &xe->remove_action_list, node) {
>> +		ra->remove(xe, ra->data);
>> +		list_del(&ra->node);
>> +		kfree(ra);
>> +	}
>> +
>> +	xe->probing = false;
>> +}
>> +
>> +/**
>> + * xe_device_add_action_or_reset - Add an action to run on driver removal
>> + * @xe: xe device instance
>> + * @ra: pointer to the object embedded into the object to cleanup
>> + * @remove: function to execute. The @ra is passed as argument
>> + *
>> + * Example:
>> + *
>> + * .. code-block:: c
>> + *
>> + *	static void foo_remove(struct xe_device_remove_action *ra)
>> + *	{
>> + *		struct xe_foo *foo = container_of(ra, struct xe_foo, remove_action);
>> + *		...
>> + *	}
>> + *
>> + *	int xe_foo_init(struct xe_foo *foo)
>> + *	{
>> + *		...
>> + *		xe_device_add_remove_action(xe, &foo->remove_action, foo_remove);
>> + *		...
>> + *		return 0;
>> + *	};
>
>I still believe we should add here a note here to highlight this is the
>exception and that devm should be preferred. But up to you, the
>explanation in the commit message makes more sense now and the patch
>is right. I hope we can get some devm solution to handle this component

arrgh, sorry. I was too entertained with the devm solution that I forgot
the update in this doc that you requested in previous version. My bad.

The devm part is here:
https://lore.kernel.org/dri-devel/20250212200542.515493-1-lucas.demarchi@intel.com/

In the end I could simplify it much more than I thought and what is
required on devres side is just one patch, 2 lines diff. Hopefully I
didn't simplify it too much. Let's see. If that one is accepted through
intel-xe tree, then we may even drop this first patch and go straight
with that solution.

Lucas De Marchi

>case. But let's move on:
>
>Reviewed-by: Rodrigo Vivi <rodrigo.vivi at intel.com>

I will wait some feedback on that other series to see how we are going
to proceed. If we keep this patch, then I will update this doc.

thanks
Lucas De Marchi

>
>> + */
>> +int xe_device_add_action_or_reset(struct xe_device *xe,
>> +				  xe_device_remove_action_t action,
>> +				  void *data)
>> +{
>> +	struct xe_device_remove_action *ra;
>> +
>> +	drm_WARN_ON(&xe->drm, !xe->probing);
>> +
>> +	ra = kmalloc(sizeof(*ra), GFP_KERNEL);
>> +	if (!ra) {
>> +		action(xe, data);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	INIT_LIST_HEAD(&ra->node);
>> +	ra->remove = action;
>> +	ra->data = data;
>> +	list_add(&ra->node, &xe->remove_action_list);
>> +
>> +	return 0;
>> +}
>> +
>>  static void xe_device_remove_display(struct xe_device *xe)
>>  {
>>  	xe_display_unregister(xe);
>> @@ -932,6 +1009,8 @@ void xe_device_remove(struct xe_device *xe)
>>
>>  	for_each_gt(gt, xe, id)
>>  		xe_gt_remove(gt);
>> +
>> +	xe_device_call_remove_actions(xe);
>>  }
>>
>>  void xe_device_shutdown(struct xe_device *xe)
>> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
>> index 0bc3bc8e68030..a6fedf1ef3c7b 100644
>> --- a/drivers/gpu/drm/xe/xe_device.h
>> +++ b/drivers/gpu/drm/xe/xe_device.h
>> @@ -45,6 +45,10 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
>>  				   const struct pci_device_id *ent);
>>  int xe_device_probe_early(struct xe_device *xe);
>>  int xe_device_probe(struct xe_device *xe);
>> +int xe_device_add_action_or_reset(struct xe_device *xe,
>> +				  xe_device_remove_action_t action,
>> +				  void *data);
>> +void xe_device_call_remove_actions(struct xe_device *xe);
>>  void xe_device_remove(struct xe_device *xe);
>>  void xe_device_shutdown(struct xe_device *xe);
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> index 833c29fed3a37..b322d49c83c77 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -35,6 +35,7 @@
>>  #include "intel_display_device.h"
>>  #endif
>>
>> +struct xe_device;
>>  struct xe_ggtt;
>>  struct xe_pat_ops;
>>  struct xe_pxp;
>> @@ -70,6 +71,8 @@ struct xe_pxp;
>>  		 const struct xe_tile * : (const struct xe_device *)((tile__)->xe),	\
>>  		 struct xe_tile * : (tile__)->xe)
>>
>> +typedef void (*xe_device_remove_action_t)(struct xe_device *xe, void *data);
>> +
>>  /**
>>   * struct xe_vram_region - memory region structure
>>   * This is used to describe a memory region in xe
>> @@ -428,6 +431,20 @@ struct xe_device {
>>  	/** @tiles: device tiles */
>>  	struct xe_tile tiles[XE_MAX_TILES_PER_DEVICE];
>>
>> +	/**
>> +	 * @remove_action_list: list of actions to execute on device remove.
>> +	 * Use xe_device_add_remove_action() for that. Actions can only be added
>> +	 * during probe and are executed during the call from PCI subsystem to
>> +	 * remove the driver from the device.
>> +	 */
>> +	struct list_head remove_action_list;
>> +
>> +	/**
>> +	 * @probing: cover the section in which @remove_action_list can be used
>> +	 * to post cleaning actions
>> +	 */
>> +	bool probing;
>> +
>>  	/**
>>  	 * @mem_access: keep track of memory access in the device, possibly
>>  	 * triggering additional actions when they occur.
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index 6a8e82aff3853..70b697fde5b96 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -905,8 +905,10 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>>  		return err;
>>
>>  	err = xe_device_probe(xe);
>> -	if (err)
>> +	if (err) {
>> +		xe_device_call_remove_actions(xe);
>>  		return err;
>> +	}
>>
>>  	err = xe_pm_init(xe);
>>  	if (err)
>> --
>> 2.48.1
>>


More information about the Intel-xe mailing list