[PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump
André Almeida
andrealmeid at igalia.com
Fri Jul 14 12:23:55 UTC 2023
On 14/07/2023 04:57, Christian König wrote:
> On 13.07.23 at 23:32, André Almeida wrote:
>> Log the IB addresses used by the hung job along with the stuck ring
>> name. Note that due to nested IBs, the one that actually caused the
>> reset may be at an address that is not listed.
>>
>> Signed-off-by: André Almeida <andrealmeid at igalia.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
>> 2 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index e1cc83a89d46..cfeaf93934fd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
>> struct amdgpu_task_info reset_task_info;
>> struct timespec64 reset_time;
>> bool reset_vram_lost;
>> + u64 *ibs;
>> + u32 num_ibs;
>> + char ring_name[16];
>> };
>> #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 07546781b8b8..431ccc3d7857 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>> coredump->adev->reset_dump_reg_value[i]);
>> }
>> + if (coredump->num_ibs) {
>> + drm_printf(&p, "IBs:\n");
>> + for (i = 0; i < coredump->num_ibs; i++)
>> + drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
>> + }
>> +
>> + if (coredump->ring_name[0] != '\0')
>> + drm_printf(&p, "ring name: %s\n", coredump->ring_name);
>> +
>> return count - iter.remain;
>> }
>> static void amdgpu_devcoredump_free(void *data)
>> {
>> - kfree(data);
>> + struct amdgpu_coredump_info *coredump = data;
>> +
>> + kfree(coredump->ibs);
>> + kfree(coredump);
>> }
>> static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> @@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> {
>> struct amdgpu_coredump_info *coredump;
>> struct drm_device *dev = adev_to_drm(adev);
>> + struct amdgpu_job *job = reset_context->job;
>> + int i;
>> coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
>> @@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> coredump->adev = adev;
>> + if (job && job->num_ibs) {
>
> I really really really don't want any dependency of the core dump
> feature on the job.
>
Because of the job's lifetime?
Do you think implementing amdgpu_job_get()/put() would help here?
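
For the record, a minimal sketch of what such a get/put pair could look
like, assuming a new kref member (here called 'refcount') were added to
struct amdgpu_job; these helpers don't exist today and the release path
below is only an assumption:

static inline struct amdgpu_job *amdgpu_job_get(struct amdgpu_job *job)
{
	kref_get(&job->refcount);
	return job;
}

static void amdgpu_job_release(struct kref *ref)
{
	struct amdgpu_job *job = container_of(ref, struct amdgpu_job, refcount);

	/* assumed final-release path for the job */
	amdgpu_job_free(job);
}

static inline void amdgpu_job_put(struct amdgpu_job *job)
{
	kref_put(&job->refcount, amdgpu_job_release);
}

The coredump code could then take a reference before dev_coredumpm() and
drop it in amdgpu_devcoredump_free(), so the job can't go away while the
dump is still readable.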
> What we could do is to record the first executed IB VAs in the hw fence,
> but I'm not sure how useful this is in the first place.
>
I see. Any hint here about the timed-out job would be helpful, AFAIK.
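
As a rough illustration of that idea (hypothetical, not what this patch
does): the first IB VA could be stashed in the hardware fence when the job
is emitted, so the coredump path would only need the fence. 'first_ib_va'
below is an assumed new member of struct amdgpu_fence:

static void amdgpu_fence_record_first_ib(struct amdgpu_fence *af,
					 struct amdgpu_job *job)
{
	/* hypothetical: remember the first IB's GPU VA at emit time */
	af->first_ib_va = job->num_ibs ? job->ibs[0].gpu_addr : 0;
}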
> We have some internal feature in progress to allow retrieving the VA of
> the draw command which caused the waves currently executing in the SQ.
>
>> + struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
>> + u32 num_ibs = job->num_ibs;
>> +
>> + coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs), GFP_NOWAIT);
>
> This can fail pretty easily.
Because of its size?
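
(If a fixed upper bound were acceptable, one way to avoid the atomic
allocation entirely would be to embed a small array in the coredump info;
sketch only, AMDGPU_COREDUMP_MAX_IBS is a made-up constant:

#define AMDGPU_COREDUMP_MAX_IBS 32

struct amdgpu_coredump_info {
	/* ... existing members (reset_task_info, reset_time, ...) ... */
	u64 ibs[AMDGPU_COREDUMP_MAX_IBS];
	u32 num_ibs;
	char ring_name[16];
};

with the copy loop clamped to min_t(u32, job->num_ibs,
AMDGPU_COREDUMP_MAX_IBS) and no kfree() needed in
amdgpu_devcoredump_free().)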
>
> Christian.
>
>> + if (coredump->ibs)
>> + coredump->num_ibs = num_ibs;
>> +
>> + for (i = 0; i < coredump->num_ibs; i++)
>> + coredump->ibs[i] = job->ibs[i].gpu_addr;
>> +
>> + if (ring)
>> + strncpy(coredump->ring_name, ring->name, 16);
>> + }
>> +
>> ktime_get_ts64(&coredump->reset_time);
>> dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>