[PATCH v4 2/2] drm/amdgpu: add reset register dump trace on GPU reset

Somalapuram, Amaranath asomalap at amd.com
Wed Feb 16 10:46:06 UTC 2022


On 2/15/2022 10:09 PM, Andrey Grodzovsky wrote:
>
> On 2022-02-15 05:12, Somalapuram Amaranath wrote:
>> Dump the list of register values to a trace event on GPU reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 16 ++++++++++++++++
>>   2 files changed, 32 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 1e651b959141..ff21262c6fea 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4534,6 +4534,19 @@ int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>       return r;
>>   }
>>   +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>> +{
>> +    uint32_t reg_value;
>> +    int i;
>> +
>> +    for (i = 0; i < adev->n_regs; i++) {
>> +        reg_value = RREG32(adev->reset_dump_reg_list[i]);
>> + trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>                struct amdgpu_reset_context *reset_context)
>>   {
>> @@ -4567,8 +4580,10 @@ int amdgpu_do_asic_reset(struct list_head 
>> *device_list_handle,
>>                   tmp_adev->gmc.xgmi.pending_reset = false;
>>                   if (!queue_work(system_unbound_wq, 
>> &tmp_adev->xgmi_reset_work))
>>                       r = -EALREADY;
>> -            } else
>> +            } else {
>> +                amdgpu_reset_reg_dumps(tmp_adev);
>>                   r = amdgpu_asic_reset(tmp_adev);
>> +            }
>
>
> Is there any particular reason you only dump registers in the single-ASIC 
> case and not for XGMI?
>
> Andrey
>
Not really. Should I move it to the top of the function?

Regards,

S.Amarnath

>
>>                 if (r) {
>>                   dev_err(tmp_adev->dev, "ASIC reset failed with 
>> error, %d for drm dev, %s",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> index d855cb53c7e0..b9637925e85c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> @@ -537,6 +537,22 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>                 __entry->seqno)
>>   );
>>   +TRACE_EVENT(amdgpu_reset_reg_dumps,
>> +        TP_PROTO(uint32_t address, uint32_t value),
>> +        TP_ARGS(address, value),
>> +        TP_STRUCT__entry(
>> +                 __field(uint32_t, address)
>> +                 __field(uint32_t, value)
>> +                 ),
>> +        TP_fast_assign(
>> +               __entry->address = address;
>> +               __entry->value = value;
>> +               ),
>> +        TP_printk("amdgpu register dump 0x%x: 0x%x",
>> +              __entry->address,
>> +              __entry->value)
>> +);
>> +
>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>   #endif


More information about the amd-gfx mailing list