[PATCH] drm/amdgpu: add ring timeout information in devcoredump

Tue Mar 5 09:23:14 UTC 2024

Am 01.03.24 um 13:43 schrieb Sunil Khatri:
> Add ring timeout related information in the amdgpu
> devcoredump file for debugging purposes.
>
> During the gpu recovery process the registered call
> is triggered and add the debug information in data
> file created by devcoredump framework under the
> directory /sys/class/devcoredump/devcdx/
>
> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       | 15 +++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   | 11 +++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>   4 files changed, 38 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 9246bca0a008..9f57c7795c47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -816,6 +816,17 @@ struct amdgpu_reset_info {
>   #endif
>   };
>   
> +/*
> + *  IP and Queue information during timeout
> + */
> +struct amdgpu_ring_timeout_info {
> +	bool timeout;

What is that supposed to be good for?

> +	char ring_name[32];
> +	enum amdgpu_ring_type ip_type;

That information should already be available in the core dump.

> +	bool soft_recovery;

That doesn't make sense since we don't do a core dump in case of a soft 
recovery.

> +};
> +
> +
>   /*
>    * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
>    */
> @@ -1150,6 +1161,10 @@ struct amdgpu_device {
>   	bool                            debug_largebar;
>   	bool                            debug_disable_soft_recovery;
>   	bool                            debug_use_vram_fw_buf;
> +
> +#ifdef CONFIG_DEV_COREDUMP
> +	struct amdgpu_ring_timeout_info ring_timeout_info;
> +#endif

Please never store core-dump-related information in the amdgpu_device 
structure.

Regards,
Christian.

>   };
>   
>   static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 71a5cf37b472..e36b7352b2de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -51,8 +51,19 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   	memset(&ti, 0, sizeof(struct amdgpu_task_info));
>   	adev->job_hang = true;
>   
> +#ifdef CONFIG_DEV_COREDUMP
> +	/* Update the ring timeout info for coredump*/
> +	adev->ring_timeout_info.timeout = TRUE;
> +	sprintf(adev->ring_timeout_info.ring_name, s_job->sched->name);
> +	adev->ring_timeout_info.ip_type = ring->funcs->type;
> +	adev->ring_timeout_info.soft_recovery = FALSE;
> +#endif
> +
>   	if (amdgpu_gpu_recovery &&
>   	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
> +#ifdef CONFIG_DEV_COREDUMP
> +		adev->ring_timeout_info.soft_recovery = TRUE;
> +#endif
>   		DRM_ERROR("ring %s timeout, but soft recovered\n",
>   			  s_job->sched->name);
>   		goto exit;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 4baa300121d8..d4f892ed105f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -196,8 +196,16 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
>   			   coredump->reset_task_info.process_name,
>   			   coredump->reset_task_info.pid);
>   
> +	if (coredump->timeout_info.timeout) {
> +		drm_printf(&p, "\nRing timed out details\n");
> +		drm_printf(&p, "IP Type: %d Ring Name: %s Soft Recovery: %s\n",
> +				coredump->timeout_info.ip_type,
> +				coredump->timeout_info.ring_name,
> +				coredump->timeout_info.soft_recovery ? "Successful":"Failed");
> +	}
> +
>   	if (coredump->reset_vram_lost)
> -		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> +		drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>   	if (coredump->adev->reset_info.num_regs) {
>   		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
>   
> @@ -228,6 +236,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>   		return;
>   	}
>   
> +	coredump->timeout_info = adev->ring_timeout_info;
>   	coredump->reset_vram_lost = vram_lost;
>   
>   	if (reset_context->job && reset_context->job->vm)
> @@ -236,6 +245,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>   	coredump->adev = adev;
>   
>   	ktime_get_ts64(&coredump->reset_time);
> +	memset(&adev->ring_timeout_info, 0, sizeof(adev->ring_timeout_info));
>   
>   	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>   		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 19899f6b9b2b..37172d6e1f94 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -97,6 +97,7 @@ struct amdgpu_coredump_info {
>   	struct amdgpu_task_info         reset_task_info;
>   	struct timespec64               reset_time;
>   	bool                            reset_vram_lost;
> +	struct amdgpu_ring_timeout_info timeout_info;
>   };
>   #endif
>