[PATCH] drm/amdgpu: add ring timeout information in devcoredump

Tue Mar 5 11:05:55 UTC 2024

On 3/5/2024 2:53 PM, Christian König wrote:
> Am 01.03.24 um 13:43 schrieb Sunil Khatri:
>> Add ring timeout related information in the amdgpu
>> devcoredump file for debugging purposes.
>>
>> During the gpu recovery process the registered call
>> is triggered and add the debug information in data
>> file created by devcoredump framework under the
>> directory /sys/class/devcoredump/devcdx/
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       | 15 +++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   | 11 +++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>>   4 files changed, 38 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 9246bca0a008..9f57c7795c47 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -816,6 +816,17 @@ struct amdgpu_reset_info {
>>   #endif
>>   };
>>   +/*
>> + *  IP and Queue information during timeout
>> + */
>> +struct amdgpu_ring_timeout_info {
>> +    bool timeout;
>
> What should that be good for?
> In case of page fault or gpu reset due to other reasons there is no 
> time out. In that case we are not adding any information and we are 
> using this flag while dumping information.
>
>> +    char ring_name[32];
>> +    enum amdgpu_ring_type ip_type;
>
> Those information should already be available in the core dump.
Will update.
>
>> +    bool soft_recovery;
>
> That doesn't make sense since we don't do a core dump in case of a 
> soft recovery.
Noted, this can be removed.
>
>> +};
>> +
>> +
>>   /*
>>    * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
>>    */
>> @@ -1150,6 +1161,10 @@ struct amdgpu_device {
>>       bool                            debug_largebar;
>>       bool debug_disable_soft_recovery;
>>       bool                            debug_use_vram_fw_buf;
>> +
>> +#ifdef CONFIG_DEV_COREDUMP
>> +    struct amdgpu_ring_timeout_info ring_timeout_info;
>> +#endif
>
> Please never store core dump related information in the amdgpu_device 
> structure.

Let me see to it. Point taken

Thanks
Sunil

>
> Regards,
> Christian.
>
>>   };
>>     static inline uint32_t amdgpu_ip_version(const struct 
>> amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 71a5cf37b472..e36b7352b2de 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -51,8 +51,19 @@ static enum drm_gpu_sched_stat 
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>       memset(&ti, 0, sizeof(struct amdgpu_task_info));
>>       adev->job_hang = true;
>>   +#ifdef CONFIG_DEV_COREDUMP
>> +    /* Update the ring timeout info for coredump*/
>> +    adev->ring_timeout_info.timeout = TRUE;
>> +    sprintf(adev->ring_timeout_info.ring_name, s_job->sched->name);
>> +    adev->ring_timeout_info.ip_type = ring->funcs->type;
>> +    adev->ring_timeout_info.soft_recovery = FALSE;
>> +#endif
>> +
>>       if (amdgpu_gpu_recovery &&
>>           amdgpu_ring_soft_recovery(ring, job->vmid, 
>> s_job->s_fence->parent)) {
>> +#ifdef CONFIG_DEV_COREDUMP
>> +        adev->ring_timeout_info.soft_recovery = TRUE;
>> +#endif
>>           DRM_ERROR("ring %s timeout, but soft recovered\n",
>>                 s_job->sched->name);
>>           goto exit;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 4baa300121d8..d4f892ed105f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -196,8 +196,16 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
>> offset, size_t count,
>>                  coredump->reset_task_info.process_name,
>>                  coredump->reset_task_info.pid);
>>   +    if (coredump->timeout_info.timeout) {
>> +        drm_printf(&p, "\nRing timed out details\n");
>> +        drm_printf(&p, "IP Type: %d Ring Name: %s Soft Recovery: %s\n",
>> +                coredump->timeout_info.ip_type,
>> +                coredump->timeout_info.ring_name,
>> +                coredump->timeout_info.soft_recovery ? 
>> "Successful":"Failed");
>> +    }
>> +
>>       if (coredump->reset_vram_lost)
>> -        drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>> +        drm_printf(&p, "\nVRAM is lost due to GPU reset!\n");
>>       if (coredump->adev->reset_info.num_regs) {
>>           drm_printf(&p, "AMDGPU register dumps:\nOffset:     
>> Value:\n");
>>   @@ -228,6 +236,7 @@ void amdgpu_coredump(struct amdgpu_device 
>> *adev, bool vram_lost,
>>           return;
>>       }
>>   +    coredump->timeout_info = adev->ring_timeout_info;
>>       coredump->reset_vram_lost = vram_lost;
>>         if (reset_context->job && reset_context->job->vm)
>> @@ -236,6 +245,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, 
>> bool vram_lost,
>>       coredump->adev = adev;
>>         ktime_get_ts64(&coredump->reset_time);
>> +    memset(&adev->ring_timeout_info, 0, 
>> sizeof(adev->ring_timeout_info));
>>         dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>>                 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index 19899f6b9b2b..37172d6e1f94 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -97,6 +97,7 @@ struct amdgpu_coredump_info {
>>       struct amdgpu_task_info         reset_task_info;
>>       struct timespec64               reset_time;
>>       bool                            reset_vram_lost;
>> +    struct amdgpu_ring_timeout_info timeout_info;
>>   };
>>   #endif
>