[PATCH v2] drm/amdgpu: add ring timeout information in devcoredump
Christian König
christian.koenig at amd.com
Tue Mar 5 13:10:31 UTC 2024
Am 05.03.24 um 12:58 schrieb Sunil Khatri:
> Add ring timeout related information in the amdgpu
> devcoredump file for debugging purposes.
>
> During the gpu recovery process the registered call
> is triggered and add the debug information in data
> file created by devcoredump framework under the
> directory /sys/class/devcoredump/devcdx/
>
> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 +++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 2 ++
> 2 files changed, 17 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index a59364e9b6ed..aa7fed59a0d5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -196,6 +196,13 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
> coredump->reset_task_info.process_name,
> coredump->reset_task_info.pid);
>
> + if (coredump->ring_timeout) {
> + drm_printf(&p, "\nRing timed out details\n");
> + drm_printf(&p, "IP Type: %d Ring Name: %s \n",
> + coredump->ring->funcs->type,
> + coredump->ring->name);
> + }
> +
> if (coredump->reset_vram_lost)
> drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> if (coredump->adev->reset_info.num_regs) {
> @@ -220,6 +227,8 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> {
> struct amdgpu_coredump_info *coredump;
> struct drm_device *dev = adev_to_drm(adev);
> + struct amdgpu_job *job = reset_context->job;
> + struct drm_sched_job *s_job;
>
> coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
>
> @@ -228,6 +237,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> return;
> }
>
> + if (job) {
> + s_job = &job->base;
> + coredump->ring = to_amdgpu_ring(s_job->sched);
> + coredump->ring_timeout = TRUE;
> + }
> +
> coredump->reset_vram_lost = vram_lost;
>
> if (reset_context->job && reset_context->job->vm) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 19899f6b9b2b..6d67001a1057 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -97,6 +97,8 @@ struct amdgpu_coredump_info {
> struct amdgpu_task_info reset_task_info;
> struct timespec64 reset_time;
> bool reset_vram_lost;
> + struct amdgpu_ring *ring;
> + bool ring_timeout;
I think you can drop ring_timeout, just having ring as optional
information should be enough.
Apart from that looks pretty good I think.
Regards,
Christian.
> };
> #endif
>
More information about the amd-gfx
mailing list