[PATCH] drm/xe: remove sysfs entries on driver unbind

Andrzej Hajda andrzej.hajda at intel.com
Tue Apr 30 16:18:18 UTC 2024


On 30.04.2024 17:51, Andrzej Hajda wrote:
> After unbinding the driver from the device, the driver is responsible for
> leaving the device in its initial state, ready to bind to another driver
> or to rebind. In particular, device sysfs entries added by the driver
> should disappear. Removing them together with the drm device is too late,
> as its lifetime can be prolonged by userspace keeping a drm descriptor
> open. Removing them at driver unbind seems the proper solution.
> 
> Signed-off-by: Andrzej Hajda <andrzej.hajda at intel.com>
> ---
> Hi all,
> 
> The xe driver does not seem to be protected against the hot-rebind
> scenario - a driver re-bind while a drm descriptor is still open.
> This patch solves one of the issues - removal of sysfs entries - but
> apparently there are more:
> [  845.740016] ---[ end trace 0000000000000000 ]---
> [  845.771285] xe 0000:00:02.0: [drm] *ERROR* power well DC_off state mismatch (refcount 1/enabled 0)
> [  845.771309] xe 0000:00:02.0: [drm] *ERROR* power well PW_2 state mismatch (refcount 1/enabled 0)
> [  845.771324] xe 0000:00:02.0: [drm] *ERROR* power well PW_A state mismatch (refcount 1/enabled 0)
> [  845.771338] xe 0000:00:02.0: [drm] *ERROR* power well PW_B state mismatch (refcount 1/enabled 0)
> [  845.771351] xe 0000:00:02.0: [drm] *ERROR* power well PW_C state mismatch (refcount 1/enabled 0)
> [  845.771365] xe 0000:00:02.0: [drm] *ERROR* power well PW_D state mismatch (refcount 1/enabled 0)
> [  845.774682] ------------[ cut here ]------------
> [  845.774697] xe 0000:00:02.0: Missing outer runtime PM protection
> [  845.774743] WARNING: CPU: 16 PID: 2228 at drivers/gpu/drm/xe/xe_pm.c:550 xe_pm_runtime_get_noresume+0x141/0x1d0 [xe]
> ...
> [  845.777795] Call Trace:
> [  845.777801]  <TASK>
> [  845.777809]  ? show_regs+0x71/0x90
> [  845.777824]  ? __warn+0xce/0x300
> [  845.777835]  ? xe_pm_runtime_get_noresume+0x141/0x1d0 [xe]
> [  845.778002]  ? report_bug+0x2ad/0x300
> [  845.778015]  ? handle_bug+0x46/0x90
> [  845.778027]  ? exc_invalid_op+0x19/0x50
> [  845.778037]  ? asm_exc_invalid_op+0x1b/0x20
> [  845.778052]  ? xe_pm_runtime_get_noresume+0x141/0x1d0 [xe]
> [  845.778215]  xe_ggtt_remove_node+0x69/0x1c0 [xe]
> [  845.780754]  xe_ggtt_remove_bo+0x10a/0x650 [xe]
> [  845.780909]  ? __pfx_xe_ggtt_remove_bo+0x10/0x10 [xe]
> [  845.781064]  ? _raw_write_unlock+0x23/0x50
> [  845.781076]  ? drm_vma_offset_remove+0x72/0x90 [drm]
> [  845.781190]  xe_ttm_bo_destroy+0x351/0x770 [xe]
> [  845.781341]  ? ww_mutex_unlock+0x1a6/0x280
> [  845.781354]  ttm_bo_release+0x41d/0xae0 [ttm]
> [  845.781380]  ? __pfx_ttm_bo_release+0x10/0x10 [ttm]
> [  845.781402]  ? __pfx___mutex_unlock_slowpath+0x10/0x10
> [  845.781413]  ? __kasan_check_read+0x11/0x20
> [  845.781425]  ? do_raw_spin_unlock+0x5c/0x210
> [  845.781437]  ? _raw_spin_unlock+0x23/0x50
> [  845.781448]  ttm_bo_put+0x50/0x80 [ttm]
> [  845.781469]  xe_gem_object_free+0x88/0x190 [xe]
> [  845.781610]  drm_gem_object_free+0x59/0x90 [drm]
> [  845.781697]  __xe_bo_unpin_map_no_vm+0xcf/0x110 [xe]
> [  845.781839]  drm_managed_release+0x1a3/0x4e0 [drm]
> [  845.781925]  ? drm_client_dev_restore+0x1f4/0x280 [drm]
> [  845.782001]  drm_minor_release+0xd1/0x140 [drm]
> [  845.782079]  drm_release_noglobal+0xad/0x100 [drm]
> 
> I guess that after the device remove call the driver should not touch the
> hardware at all; it could only maintain objects allocated by userspace in
> a kind of zombie mode - no interactions with hw, and if userspace requests
> hw access, return an error.
> Question to the maintainers of xe/drm: how should this be implemented in
> the xe driver?
> Or maybe drm already provides a well-tested way of handling such cases?

There are drm helpers:
- drm_dev_unplug
- drm_dev_enter
- drm_dev_exit
Maybe we just need to use them widely in xe?

Regards
Andrzej


> 
> Signed-off-by: Andrzej Hajda <andrzej.hajda at intel.com>
> ---
>   drivers/gpu/drm/xe/xe_device_sysfs.c          |  4 ++--
>   drivers/gpu/drm/xe/xe_gt_ccs_mode.c           |  4 ++--
>   drivers/gpu/drm/xe/xe_gt_freq.c               |  4 ++--
>   drivers/gpu/drm/xe/xe_gt_idle.c               |  4 ++--
>   drivers/gpu/drm/xe/xe_gt_sysfs.c              |  4 ++--
>   drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c     |  4 ++--
>   drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c | 12 ++++++------
>   drivers/gpu/drm/xe/xe_tile_sysfs.c            |  4 ++--
>   drivers/gpu/drm/xe/xe_vram_freq.c             |  4 ++--
>   9 files changed, 22 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c
> index 21677b8cd977..8d096db750fa 100644
> --- a/drivers/gpu/drm/xe/xe_device_sysfs.c
> +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
> @@ -69,7 +69,7 @@ vram_d3cold_threshold_store(struct device *dev, struct device_attribute *attr,
>   
>   static DEVICE_ATTR_RW(vram_d3cold_threshold);
>   
> -static void xe_device_sysfs_fini(struct drm_device *drm, void *arg)
> +static void xe_device_sysfs_fini(void *arg)
>   {
>   	struct xe_device *xe = arg;
>   
> @@ -85,5 +85,5 @@ int xe_device_sysfs_init(struct xe_device *xe)
>   	if (ret)
>   		return ret;
>   
> -	return drmm_add_action_or_reset(&xe->drm, xe_device_sysfs_fini, xe);
> +	return devm_add_action_or_reset(xe->drm.dev, xe_device_sysfs_fini, xe);
>   }
> diff --git a/drivers/gpu/drm/xe/xe_gt_ccs_mode.c b/drivers/gpu/drm/xe/xe_gt_ccs_mode.c
> index 396aeb5b9924..890da8870b0d 100644
> --- a/drivers/gpu/drm/xe/xe_gt_ccs_mode.c
> +++ b/drivers/gpu/drm/xe/xe_gt_ccs_mode.c
> @@ -150,7 +150,7 @@ static const struct attribute *gt_ccs_mode_attrs[] = {
>   	NULL,
>   };
>   
> -static void xe_gt_ccs_mode_sysfs_fini(struct drm_device *drm, void *arg)
> +static void xe_gt_ccs_mode_sysfs_fini(void *arg)
>   {
>   	struct xe_gt *gt = arg;
>   
> @@ -182,5 +182,5 @@ int xe_gt_ccs_mode_sysfs_init(struct xe_gt *gt)
>   	if (err)
>   		return err;
>   
> -	return drmm_add_action_or_reset(&xe->drm, xe_gt_ccs_mode_sysfs_fini, gt);
> +	return devm_add_action_or_reset(xe->drm.dev, xe_gt_ccs_mode_sysfs_fini, gt);
>   }
> diff --git a/drivers/gpu/drm/xe/xe_gt_freq.c b/drivers/gpu/drm/xe/xe_gt_freq.c
> index 855de40e40ea..ed7949df5bf5 100644
> --- a/drivers/gpu/drm/xe/xe_gt_freq.c
> +++ b/drivers/gpu/drm/xe/xe_gt_freq.c
> @@ -209,7 +209,7 @@ static const struct attribute *freq_attrs[] = {
>   	NULL
>   };
>   
> -static void freq_fini(struct drm_device *drm, void *arg)
> +static void freq_fini(void *arg)
>   {
>   	struct kobject *kobj = arg;
>   
> @@ -237,7 +237,7 @@ int xe_gt_freq_init(struct xe_gt *gt)
>   	if (!gt->freq)
>   		return -ENOMEM;
>   
> -	err = drmm_add_action_or_reset(&xe->drm, freq_fini, gt->freq);
> +	err = devm_add_action_or_reset(xe->drm.dev, freq_fini, gt->freq);
>   	if (err)
>   		return err;
>   
> diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c
> index 8fc0f3f6ecc5..9e907033f32b 100644
> --- a/drivers/gpu/drm/xe/xe_gt_idle.c
> +++ b/drivers/gpu/drm/xe/xe_gt_idle.c
> @@ -144,7 +144,7 @@ static const struct attribute *gt_idle_attrs[] = {
>   	NULL,
>   };
>   
> -static void gt_idle_sysfs_fini(struct drm_device *drm, void *arg)
> +static void gt_idle_sysfs_fini(void *arg)
>   {
>   	struct kobject *kobj = arg;
>   
> @@ -181,7 +181,7 @@ int xe_gt_idle_sysfs_init(struct xe_gt_idle *gtidle)
>   		return err;
>   	}
>   
> -	return drmm_add_action_or_reset(&xe->drm, gt_idle_sysfs_fini, kobj);
> +	return devm_add_action_or_reset(xe->drm.dev, gt_idle_sysfs_fini, kobj);
>   }
>   
>   void xe_gt_idle_enable_c6(struct xe_gt *gt)
> diff --git a/drivers/gpu/drm/xe/xe_gt_sysfs.c b/drivers/gpu/drm/xe/xe_gt_sysfs.c
> index 1e5971072bc8..ec2b8246204b 100644
> --- a/drivers/gpu/drm/xe/xe_gt_sysfs.c
> +++ b/drivers/gpu/drm/xe/xe_gt_sysfs.c
> @@ -22,7 +22,7 @@ static const struct kobj_type xe_gt_sysfs_kobj_type = {
>   	.sysfs_ops = &kobj_sysfs_ops,
>   };
>   
> -static void gt_sysfs_fini(struct drm_device *drm, void *arg)
> +static void gt_sysfs_fini(void *arg)
>   {
>   	struct xe_gt *gt = arg;
>   
> @@ -51,5 +51,5 @@ int xe_gt_sysfs_init(struct xe_gt *gt)
>   
>   	gt->sysfs = &kg->base;
>   
> -	return drmm_add_action_or_reset(&xe->drm, gt_sysfs_fini, gt);
> +	return devm_add_action_or_reset(xe->drm.dev, gt_sysfs_fini, gt);
>   }
> diff --git a/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c b/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c
> index fbe21a8599ca..c9e04151286d 100644
> --- a/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c
> +++ b/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c
> @@ -229,7 +229,7 @@ static const struct attribute_group throttle_group_attrs = {
>   	.attrs = throttle_attrs,
>   };
>   
> -static void gt_throttle_sysfs_fini(struct drm_device *drm, void *arg)
> +static void gt_throttle_sysfs_fini(void *arg)
>   {
>   	struct xe_gt *gt = arg;
>   
> @@ -245,5 +245,5 @@ int xe_gt_throttle_sysfs_init(struct xe_gt *gt)
>   	if (err)
>   		return err;
>   
> -	return drmm_add_action_or_reset(&xe->drm, gt_throttle_sysfs_fini, gt);
> +	return devm_add_action_or_reset(xe->drm.dev, gt_throttle_sysfs_fini, gt);
>   }
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c
> index 844ec68cbbb8..258078a6b461 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c
> @@ -492,7 +492,7 @@ static const struct attribute * const files[] = {
>   	NULL
>   };
>   
> -static void kobj_xe_hw_engine_class_fini(struct drm_device *drm, void *arg)
> +static void kobj_xe_hw_engine_class_fini(void *arg)
>   {
>   	struct kobject *kobj = arg;
>   
> @@ -517,7 +517,7 @@ kobj_xe_hw_engine_class(struct xe_device *xe, struct kobject *parent, const char
>   	}
>   	keclass->xe = xe;
>   
> -	err = drmm_add_action_or_reset(&xe->drm, kobj_xe_hw_engine_class_fini,
> +	err = devm_add_action_or_reset(xe->drm.dev, kobj_xe_hw_engine_class_fini,
>   				       &keclass->base);
>   	if (err)
>   		return NULL;
> @@ -525,7 +525,7 @@ kobj_xe_hw_engine_class(struct xe_device *xe, struct kobject *parent, const char
>   	return keclass;
>   }
>   
> -static void hw_engine_class_defaults_fini(struct drm_device *drm, void *arg)
> +static void hw_engine_class_defaults_fini(void *arg)
>   {
>   	struct kobject *kobj = arg;
>   
> @@ -552,7 +552,7 @@ static int xe_add_hw_engine_class_defaults(struct xe_device *xe,
>   	if (err)
>   		goto err_object;
>   
> -	return drmm_add_action_or_reset(&xe->drm, hw_engine_class_defaults_fini, kobj);
> +	return devm_add_action_or_reset(xe->drm.dev, hw_engine_class_defaults_fini, kobj);
>   
>   err_object:
>   	kobject_put(kobj);
> @@ -611,7 +611,7 @@ static const struct kobj_type xe_hw_engine_sysfs_kobj_type = {
>   	.sysfs_ops = &xe_hw_engine_class_sysfs_ops,
>   };
>   
> -static void hw_engine_class_sysfs_fini(struct drm_device *drm, void *arg)
> +static void hw_engine_class_sysfs_fini(void *arg)
>   {
>   	struct kobject *kobj = arg;
>   
> @@ -698,7 +698,7 @@ int xe_hw_engine_class_sysfs_init(struct xe_gt *gt)
>   			goto err_object;
>   	}
>   
> -	return drmm_add_action_or_reset(&xe->drm, hw_engine_class_sysfs_fini, kobj);
> +	return devm_add_action_or_reset(xe->drm.dev, hw_engine_class_sysfs_fini, kobj);
>   
>   err_object:
>   	kobject_put(kobj);
> diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c
> index 64661403afcd..12b2c3e5c421 100644
> --- a/drivers/gpu/drm/xe/xe_tile_sysfs.c
> +++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c
> @@ -22,7 +22,7 @@ static const struct kobj_type xe_tile_sysfs_kobj_type = {
>   	.sysfs_ops = &kobj_sysfs_ops,
>   };
>   
> -static void tile_sysfs_fini(struct drm_device *drm, void *arg)
> +static void tile_sysfs_fini(void *arg)
>   {
>   	struct xe_tile *tile = arg;
>   
> @@ -55,5 +55,5 @@ int xe_tile_sysfs_init(struct xe_tile *tile)
>   	if (err)
>   		return err;
>   
> -	return drmm_add_action_or_reset(&xe->drm, tile_sysfs_fini, tile);
> +	return devm_add_action_or_reset(dev, tile_sysfs_fini, tile);
>   }
> diff --git a/drivers/gpu/drm/xe/xe_vram_freq.c b/drivers/gpu/drm/xe/xe_vram_freq.c
> index 3e21ddc6e60c..99ff95e408e0 100644
> --- a/drivers/gpu/drm/xe/xe_vram_freq.c
> +++ b/drivers/gpu/drm/xe/xe_vram_freq.c
> @@ -87,7 +87,7 @@ static const struct attribute_group freq_group_attrs = {
>   	.attrs = freq_attrs,
>   };
>   
> -static void vram_freq_sysfs_fini(struct drm_device *drm, void *arg)
> +static void vram_freq_sysfs_fini(void *arg)
>   {
>   	struct kobject *kobj = arg;
>   
> @@ -122,5 +122,5 @@ int xe_vram_freq_sysfs_init(struct xe_tile *tile)
>   		return err;
>   	}
>   
> -	return drmm_add_action_or_reset(&xe->drm, vram_freq_sysfs_fini, kobj);
> +	return devm_add_action_or_reset(xe->drm.dev, vram_freq_sysfs_fini, kobj);
>   }
> 
> ---
> base-commit: 4caf410766add8cf376a3afc910b17dd0961dd75
> change-id: 20240430-hotrebind_xu-897cc3413a7f
> 
> Best regards,



More information about the Intel-xe mailing list