[PATCH] drm/amdgpu: add ring timeout information in devcoredump
Khatri, Sunil
Sunil.Khatri at amd.com
Tue Mar 5 11:05:55 UTC 2024
On 3/5/2024 2:53 PM, Christian König wrote:
> Am 01.03.24 um 13:43 schrieb Sunil Khatri:
>> Add ring timeout related information in the amdgpu
>> devcoredump file for debugging purposes.
>>
>> During the gpu recovery process the registered call
>> is triggered and add the debug information in data
>> file created by devcoredump framework under the
>> directory /sys/class/devcoredump/devcdx/
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 15 +++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 11 +++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +++++++++++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 +
>> 4 files changed, 38 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 9246bca0a008..9f57c7795c47 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -816,6 +816,17 @@ struct amdgpu_reset_info {
>> #endif
>> };
>> +/*
>> + * IP and Queue information during timeout
>> + */
>> +struct amdgpu_ring_timeout_info {
>> + bool timeout;
>
> What should that be good for?
> In case of page fault or gpu reset due to other reasons there is no
> time out. In that case we are not adding any information and we are
> using this flag while dumping information.
>
>> + char ring_name[32];
>> + enum amdgpu_ring_type ip_type;
>
> Those information should already be available in the core dump.
Will update.
>
>> + bool soft_recovery;
>
> That doesn't make sense since we don't do a core dump in case of a
> soft recovery.
Noted, this can be removed.
>
>> +};
>> +
>> +
>> /*
>> * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
>> */
>> @@ -1150,6 +1161,10 @@ struct amdgpu_device {
>> bool debug_largebar;
>> bool debug_disable_soft_recovery;
>> bool debug_use_vram_fw_buf;
>> +
>> +#ifdef CONFIG_DEV_COREDUMP
>> + struct amdgpu_ring_timeout_info ring_timeout_info;
>> +#endif
>
> Please never store core dump related information in the amdgpu_device
> structure.
Let me see to it. Point taken
Thanks
Sunil
>
> Regards,
> Christian.
>
>> };
>> static inline uint32_t amdgpu_ip_version(const struct
>> amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 71a5cf37b472..e36b7352b2de 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -51,8 +51,19 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>> memset(&ti, 0, sizeof(struct amdgpu_task_info));
>> adev->job_hang = true;
>> +#ifdef CONFIG_DEV_COREDUMP
>> + /* Update the ring timeout info for coredump*/
>> + adev->ring_timeout_info.timeout = TRUE;
>> + sprintf(adev->ring_timeout_info.ring_name, s_job->sched->name);
>> + adev->ring_timeout_info.ip_type = ring->funcs->type;
>> + adev->ring_timeout_info.soft_recovery = FALSE;
>> +#endif
>> +
>> if (amdgpu_gpu_recovery &&
>> amdgpu_ring_soft_recovery(ring, job->vmid,
>> s_job->s_fence->parent)) {
>> +#ifdef CONFIG_DEV_COREDUMP
>> + adev->ring_timeout_info.soft_recovery = TRUE;
>> +#endif
>> DRM_ERROR("ring %s timeout, but soft recovered\n",
>> s_job->sched->name);
>> goto exit;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 4baa300121d8..d4f892ed105f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -196,8 +196,16 @@ amdgpu_devcoredump_read(char *buffer, loff_t
>> offset, size_t count,
>> coredump->reset_task_info.process_name,
>> coredump->reset_task_info.pid);
>> + if (coredump->timeout_info.timeout) {
>> + drm_printf(&p, "\nRing timed out details\n");
>> + drm_printf(&p, "IP Type: %d Ring Name: %s Soft Recovery: %s\n",
>> + coredump->timeout_info.ip_type,
>> + coredump->timeout_info.ring_name,
>> + coredump->timeout_info.soft_recovery ?
>> "Successful":"Failed");
>> + }
>> +
>> if (coredump->reset_vram_lost)
>> - drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>> + drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>> if (coredump->adev->reset_info.num_regs) {
>> drm_printf(&p, "AMDGPU register dumps:\nOffset:
>> Value:\n");
>> @@ -228,6 +236,7 @@ void amdgpu_coredump(struct amdgpu_device
>> *adev, bool vram_lost,
>> return;
>> }
>> + coredump->timeout_info = adev->ring_timeout_info;
>> coredump->reset_vram_lost = vram_lost;
>> if (reset_context->job && reset_context->job->vm)
>> @@ -236,6 +245,7 @@ void amdgpu_coredump(struct amdgpu_device *adev,
>> bool vram_lost,
>> coredump->adev = adev;
>> ktime_get_ts64(&coredump->reset_time);
>> + memset(&adev->ring_timeout_info, 0,
>> sizeof(adev->ring_timeout_info));
>> dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>> amdgpu_devcoredump_read, amdgpu_devcoredump_free);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index 19899f6b9b2b..37172d6e1f94 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -97,6 +97,7 @@ struct amdgpu_coredump_info {
>> struct amdgpu_task_info reset_task_info;
>> struct timespec64 reset_time;
>> bool reset_vram_lost;
>> + struct amdgpu_ring_timeout_info timeout_info;
>> };
>> #endif
>
More information about the amd-gfx
mailing list