[PATCH v3 06/13] drm/xe: Add callback support for driver remove
Lucas De Marchi
lucas.demarchi at intel.com
Wed Feb 12 05:45:38 UTC 2025
On Mon, Feb 10, 2025 at 12:41:46PM -0500, Rodrigo Vivi wrote:
>On Fri, Feb 07, 2025 at 02:19:38PM -0800, Lucas De Marchi wrote:
>> xe device probe uses devm cleanup in most places. However there are a
>> few that are not possible: when the driver interacts with other
>
>"few cases where this is not possible" ?!
>
>> subsystems that require the cleanup to happen before the device being
>> removed from the bus. One example is the component_* APIs used by
>> xe_gsc_proxy and display.
>>
>> Add a callback-based remove so the exceptions don't make the probe
>> use multiple error handling styles.
>>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_device.c | 59 ++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_device.h | 4 ++
>> drivers/gpu/drm/xe/xe_device_remove_action.h | 24 ++++++++
>> drivers/gpu/drm/xe/xe_device_types.h | 15 +++++
>> drivers/gpu/drm/xe/xe_pci.c | 4 +-
>> 5 files changed, 105 insertions(+), 1 deletion(-)
>> create mode 100644 drivers/gpu/drm/xe/xe_device_remove_action.h
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>> index 90275531653fe..5fc4e696262f9 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -747,6 +747,9 @@ int xe_device_probe(struct xe_device *xe)
>> u8 last_gt;
>> u8 id;
>>
>> + xe->probing = true;
>> + INIT_LIST_HEAD(&xe->remove_action_list);
>> +
>> xe_pat_init_early(xe);
>>
>> err = xe_sriov_init(xe);
>> @@ -892,6 +895,8 @@ int xe_device_probe(struct xe_device *xe)
>>
>> xe_vsec_init(xe);
>>
>> + xe->probing = false;
>> +
>> return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe);
>>
>> err_fini_display:
>> @@ -911,6 +916,58 @@ int xe_device_probe(struct xe_device *xe)
>> return err;
>> }
>>
>> +/**
>> + * xe_device_call_remove_actions - Call the remove actions
>> + * @xe: xe device instance
>> + *
>> + * This is only to be used by xe_pci and xe_device to call the remove actions
>> + * while removing the driver or handling probe failures.
>> + */
>> +void xe_device_call_remove_actions(struct xe_device *xe)
>> +{
>> + struct xe_device_remove_action *ra;
>> +
>> + list_for_each_entry(ra, &xe->remove_action_list, node)
>> + ra->remove(ra);
>> +
>> + xe->probing = false;
>> +}
>> +
>> +/**
>> + * xe_device_add_remove_action - Add an action to run on driver removal
>> + * @xe: xe device instance
>> + * @ra: pointer to the object embedded into the object to cleanup
>> + * @remove: function to execute. The @ra is passed as argument
>> + *
>> + * Example:
>> + *
>> + * .. code-block:: c
>> + *
>> + * static void foo_remove(struct xe_device_remove_action *ra)
>> + * {
>> + * struct xe_foo *foo = container_of(ra, struct xe_foo, remove_action);
>> + * ...
>> + * }
>> + *
>> + * int xe_foo_init(struct xe_foo *foo)
>> + * {
>> + * ...
>> + * xe_device_add_remove_action(xe, &foo->remove_action, foo_remove);
>> + * ...
>> + * return 0;
>> + * };
>
>Although the cover letter mentions that this should be the exception, the
>documentation here doesn't make that so clear.
>
>I believe we should be clearer about which cases this structure is aimed
>at, with some basic rules on when to use it instead of devm or drmm.
ok.. I went back to double-check the cases in which this is really
needed, and apparently the only one is the handling of the component
driver: devres currently doesn't handle well the removal of a devres
group while all resources are being removed. I have a patch for that, but
I think I will have to do it in parallel to what's being done here.
CI uncovered one more issue, since xe_display_fini() may also do
component_del() on ADL-P... I will move this patch to be the first one,
fix up the xe_display part and re-submit. In parallel I'm submitting the
improvements in drivers/base/{component,devres}.c to the relevant mailing
list.
>
>And probably even keep that comment where it is used with the GSC code.
>
>But other than that, the code and the approach looks good to me.
thanks
Lucas De Marchi
>
>> + */
>> +void xe_device_add_remove_action(struct xe_device *xe,
>> + struct xe_device_remove_action *ra,
>> + void (*remove)(struct xe_device_remove_action *ra))
>> +{
>> + drm_WARN_ON(&xe->drm, !xe->probing);
>> +
>> + INIT_LIST_HEAD(&ra->node);
>> + ra->remove = remove;
>> + list_add(&ra->node, &xe->remove_action_list);
>> +}
>> +
>> static void xe_device_remove_display(struct xe_device *xe)
>> {
>> xe_display_unregister(xe);
>> @@ -934,6 +991,8 @@ void xe_device_remove(struct xe_device *xe)
>>
>> for_each_gt(gt, xe, id)
>> xe_gt_remove(gt);
>> +
>> + xe_device_call_remove_actions(xe);
>> }
>>
>> void xe_device_shutdown(struct xe_device *xe)
>> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
>> index fc3c2af3fb7fd..3fecf865957b0 100644
>> --- a/drivers/gpu/drm/xe/xe_device.h
>> +++ b/drivers/gpu/drm/xe/xe_device.h
>> @@ -45,6 +45,10 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
>> const struct pci_device_id *ent);
>> int xe_device_probe_early(struct xe_device *xe);
>> int xe_device_probe(struct xe_device *xe);
>> +void xe_device_add_remove_action(struct xe_device *xe,
>> + struct xe_device_remove_action *ra,
>> + void (*remove)(struct xe_device_remove_action *ra));
>> +void xe_device_call_remove_actions(struct xe_device *xe);
>> void xe_device_remove(struct xe_device *xe);
>> void xe_device_shutdown(struct xe_device *xe);
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device_remove_action.h b/drivers/gpu/drm/xe/xe_device_remove_action.h
>> new file mode 100644
>> index 0000000000000..e0322c4660dda
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_device_remove_action.h
>> @@ -0,0 +1,24 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_DEVICE_REMOVE_ACTION_H_
>> +#define _XE_DEVICE_REMOVE_ACTION_H_
>> +
>> +#include <linux/list.h>
>> +
>> +/**
>> + * struct xe_device_remove_action - Action item to run on driver removal
>> + *
>> + * This should be used like a list_head, embedding it into structures of the
>> + * individual parts being initialized. Once the remove action is ready to be
>> + * added, call xe_device_add_remove_action() to initialize and use this struct.
>> + */
>> +struct xe_device_remove_action {
>> + /* private: */
>> + struct list_head node;
>> + void (*remove)(struct xe_device_remove_action *ra);
>> +};
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> index c0e886bac1831..4c902e0cb4ba9 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -13,6 +13,7 @@
>> #include <drm/ttm/ttm_device.h>
>>
>> #include "xe_devcoredump_types.h"
>> +#include "xe_device_remove_action.h"
>> #include "xe_heci_gsc.h"
>> #include "xe_lmtt_types.h"
>> #include "xe_memirq_types.h"
>> @@ -428,6 +429,20 @@ struct xe_device {
>> /** @tiles: device tiles */
>> struct xe_tile tiles[XE_MAX_TILES_PER_DEVICE];
>>
>> + /**
>> + * @remove_action_list: list of actions to execute on device remove.
>> + * Use xe_device_add_remove_action() for that. Actions can only be added
>> + * during probe and are executed during the call from PCI subsystem to
>> + * remove the driver from the device.
>> + */
>> + struct list_head remove_action_list;
>> +
>> + /**
>> + * @probing: cover the section in which @remove_action_list can be used
>> + * to post cleaning actions
>> + */
>> + bool probing;
>> +
>> /**
>> * @mem_access: keep track of memory access in the device, possibly
>> * triggering additional actions when they occur.
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index 6a8e82aff3853..70b697fde5b96 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -905,8 +905,10 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>> return err;
>>
>> err = xe_device_probe(xe);
>> - if (err)
>> + if (err) {
>> + xe_device_call_remove_actions(xe);
>> return err;
>> + }
>>
>> err = xe_pm_init(xe);
>> if (err)
>> --
>> 2.48.1
>>
More information about the Intel-xe
mailing list