[PATCH 2/2] drm/amdgpu: add work function for GPU reset event

Mon Mar 7 16:46:21 UTC 2022

On 3/7/2022 9:56 PM, Shashank Sharma wrote:
> From: Shashank Sharma <shashank.sharma at amd.com>
>
> This patch adds a work function, which will get scheduled
> in event of a GPU reset, and will send a uevent to user with
> some reset context infomration, like a PID and some flags.
>
> The userspace can do some recovery and post-processing work
> based on this event.
>
> V2:
> - Changed the name of the work to gpu_reset_event_work
>    (Christian)
> - Added a structure to accommodate some additional information
>    (like a PID and some flags)
>
> Cc: Alexander Deucher <alexander.deucher at amd.com>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>   2 files changed, 26 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d8b854fcbffa..7df219fe363f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -813,6 +813,11 @@ struct amd_powerplay {
>   #define AMDGPU_RESET_MAGIC_NUM 64
>   #define AMDGPU_MAX_DF_PERFMONS 4
>   #define AMDGPU_PRODUCT_NAME_LEN 64
> +struct amdgpu_reset_event_ctx {
> +	uint64_t pid;
> +	uint32_t flags;
> +};
> +
>   struct amdgpu_device {
>   	struct device			*dev;
>   	struct pci_dev			*pdev;
> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>   
>   	int asic_reset_res;
>   	struct work_struct		xgmi_reset_work;
> +	struct work_struct		gpu_reset_event_work;
>   	struct list_head		reset_list;
>   
>   	long				gfx_timeout;
> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>   	pci_channel_state_t		pci_channel_state;
>   
>   	struct amdgpu_reset_control     *reset_cntl;
> +	struct amdgpu_reset_event_ctx   reset_event_ctx;
>   	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>   
>   	bool				ram_is_direct_mapped;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ed077de426d9..c43d099da06d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -73,6 +73,7 @@
>   #include <linux/pm_runtime.h>
>   
>   #include <drm/drm_drv.h>
> +#include <drm/drm_sysfs.h>
>   
>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
>   	return amdgpu_device_asic_has_dc_support(adev->asic_type);
>   }
>   
> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
> +{
> +	struct amdgpu_device *adev = container_of(__work, struct amdgpu_device,
> +						  gpu_reset_event_work);

I am trying same thing but adev context is lost.

schedule_work() in amdgpu_do_asic_reset after getting/reading vram_lost 
= amdgpu_device_check_vram_lost(tmp_adev);

Regards,

S.Amarnath

> +	struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
> +
> +	/*
> +	 * A GPU reset has happened, indicate the userspace and pass the
> +	 * following information:
> +	 *	- pid of the process involved,
> +	 *	- if the VRAM is valid or not,
> +	 *	- indicate that userspace may want to collect the ftrace event
> +	 * data from the trace event.
> +	 */
> +	drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, event_ctx->flags);
> +}
> +
>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>   {
>   	struct amdgpu_device *adev =
> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   			  amdgpu_device_delay_enable_gfx_off);
>   
>   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +	INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);
>   
>   	adev->gfx.gfx_off_req_count = 1;
>   	adev->pm.ac_power = power_supply_is_system_supplied() > 0;