[PATCH v3 4/7] drm/amdgpu: Add work_struct for GPU reset from debugfs

Mon May 30 07:52:59 UTC 2022

Am 25.05.22 um 21:04 schrieb Andrey Grodzovsky:
> We need to have a work_struct to cancel this reset if another
> is already in progress.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 19 +++++++++++++++++--
>   2 files changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 76df583663c7..8165ee5b0457 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1048,6 +1048,8 @@ struct amdgpu_device {
>   
>   	bool                            scpm_enabled;
>   	uint32_t                        scpm_status;
> +
> +	struct work_struct		reset_work;
>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index d16c8c1f72db..b0498ffcf7c3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -39,6 +39,7 @@
>   #include <drm/drm_drv.h>
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
> +#include "amdgpu_reset.h"
>   
>   /*
>    * Fences
> @@ -798,7 +799,10 @@ static int gpu_recover_get(void *data, u64 *val)
>   		return 0;
>   	}
>   
> -	*val = amdgpu_device_gpu_recover(adev, NULL);
> +	if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->reset_work))
> +		flush_work(&adev->reset_work);
> +
> +	*val = atomic_read(&adev->reset_domain->reset_res);
>   
>   	pm_runtime_mark_last_busy(dev->dev);
>   	pm_runtime_put_autosuspend(dev->dev);
> @@ -810,6 +814,14 @@ DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
>   DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get, NULL,
>   			 "%lld\n");
>   
> +static void amdgpu_debugfs_reset_work(struct work_struct *work)
> +{
> +	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +						  reset_work);
> +
> +	amdgpu_device_gpu_recover_imp(adev, NULL);
> +}
> +
>   #endif
>   
>   void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
> @@ -821,9 +833,12 @@ void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
>   	debugfs_create_file("amdgpu_fence_info", 0444, root, adev,
>   			    &amdgpu_debugfs_fence_info_fops);
>   
> -	if (!amdgpu_sriov_vf(adev))
> +	if (!amdgpu_sriov_vf(adev)) {

I think we should drop the check for amdgpu_sriov_vf() here. It's a 
valid requirement to be able to trigger a GPU reset for a VF as well.

But that is not the topic of this patch, so feel free to add a
Reviewed-by: Christian König <christian.koenig at amd.com>.

Regards,
Christian.

> +
> +		INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
>   		debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
>   				    &amdgpu_debugfs_gpu_recover_fops);
> +	}
>   #endif
>   }
>