[PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump
André Almeida
andrealmeid at igalia.com
Fri Jul 14 12:23:55 UTC 2023
On 14/07/2023 04:57, Christian König wrote:
> On 13.07.23 at 23:32, André Almeida wrote:
>> Log the IB addresses used by the hung job along with the stuck ring
>> name. Note that due to nested IBs, the one that actually caused the
>> reset may be at an address that is not listed.
>>
>> Signed-off-by: André Almeida <andrealmeid at igalia.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
>> 2 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index e1cc83a89d46..cfeaf93934fd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
>> struct amdgpu_task_info reset_task_info;
>> struct timespec64 reset_time;
>> bool reset_vram_lost;
>> + u64 *ibs;
>> + u32 num_ibs;
>> + char ring_name[16];
>> };
>> #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 07546781b8b8..431ccc3d7857 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>> coredump->adev->reset_dump_reg_value[i]);
>> }
>> + if (coredump->num_ibs) {
>> + drm_printf(&p, "IBs:\n");
>> + for (i = 0; i < coredump->num_ibs; i++)
>> + drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
>> + }
>> +
>> + if (coredump->ring_name[0] != '\0')
>> + drm_printf(&p, "ring name: %s\n", coredump->ring_name);
>> +
>> return count - iter.remain;
>> }
>> static void amdgpu_devcoredump_free(void *data)
>> {
>> - kfree(data);
>> + struct amdgpu_coredump_info *coredump = data;
>> +
>> + kfree(coredump->ibs);
>> + kfree(coredump);
>> }
>> static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> @@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> {
>> struct amdgpu_coredump_info *coredump;
>> struct drm_device *dev = adev_to_drm(adev);
>> + struct amdgpu_job *job = reset_context->job;
>> + int i;
>> coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
>> @@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> coredump->adev = adev;
>> + if (job && job->num_ibs) {
>
> I really really really don't want any dependency of the core dump
> feature on the job.
>
Because of the job's lifetime?
Do you think implementing amdgpu_job_get()/put() would help here?
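
For the record, a minimal sketch of what such a get/put pair could look
like, assuming a new kref member (here called 'refcount') were added to
struct amdgpu_job; these helpers don't exist today and the release path
below is only an assumption:

static inline struct amdgpu_job *amdgpu_job_get(struct amdgpu_job *job)
{
	kref_get(&job->refcount);
	return job;
}

static void amdgpu_job_release(struct kref *ref)
{
	struct amdgpu_job *job = container_of(ref, struct amdgpu_job, refcount);

	/* assumed final-release path for the job */
	amdgpu_job_free(job);
}

static inline void amdgpu_job_put(struct amdgpu_job *job)
{
	kref_put(&job->refcount, amdgpu_job_release);
}

The coredump code could then take a reference before dev_coredumpm() and
drop it in amdgpu_devcoredump_free(), so the job can't go away while the
dump is still readable.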
> What we could do is to record the first executed IB VAs in the hw fence,
> but I'm not sure how useful this is in the first place.
>
I see. Any hint here about the timed-out job would be helpful, AFAIK.
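
As a rough illustration of that idea (hypothetical, not what this patch
does): the first IB VA could be stashed in the hardware fence when the job
is emitted, so the coredump path would only need the fence. 'first_ib_va'
below is an assumed new member of struct amdgpu_fence:

static void amdgpu_fence_record_first_ib(struct amdgpu_fence *af,
					 struct amdgpu_job *job)
{
	/* hypothetical: remember the first IB's GPU VA at emit time */
	af->first_ib_va = job->num_ibs ? job->ibs[0].gpu_addr : 0;
}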
> We have some internal feature in progress to allow retrieving the VA of
> the draw command which caused the waves currently executing in the SQ.
>
>> + struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
>> + u32 num_ibs = job->num_ibs;
>> +
>> + coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs), GFP_NOWAIT);
>
> This can fail pretty easily.
Because of its size?
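
(If a fixed upper bound were acceptable, one way to avoid the atomic
allocation entirely would be to embed a small array in the coredump info;
sketch only, AMDGPU_COREDUMP_MAX_IBS is a made-up constant:

#define AMDGPU_COREDUMP_MAX_IBS 32

struct amdgpu_coredump_info {
	/* ... existing members (reset_task_info, reset_time, ...) ... */
	u64 ibs[AMDGPU_COREDUMP_MAX_IBS];
	u32 num_ibs;
	char ring_name[16];
};

with the copy loop clamped to min_t(u32, job->num_ibs,
AMDGPU_COREDUMP_MAX_IBS) and no kfree() needed in
amdgpu_devcoredump_free().)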
>
> Christian.
>
>> + if (coredump->ibs)
>> + coredump->num_ibs = num_ibs;
>> +
>> + for (i = 0; i < coredump->num_ibs; i++)
>> + coredump->ibs[i] = job->ibs[i].gpu_addr;
>> +
>> + if (ring)
>> + strncpy(coredump->ring_name, ring->name, 16);
>> + }
>> +
>> ktime_get_ts64(&coredump->reset_time);
>> dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>