[PATCH 2/2] drm/amdgpu: add reset register trace function on GPU reset

Wed Feb 9 07:47:42 UTC 2022

Am 08.02.22 um 16:28 schrieb Alex Deucher:
> On Tue, Feb 8, 2022 at 3:17 AM Somalapuram Amaranath
> <Amaranath.Somalapuram at amd.com> wrote:
>> Dump the list of register values to trace event on GPU reset.
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 19 +++++++++++++++++++
>>   2 files changed, 39 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 1e651b959141..057922fb7e37 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4534,6 +4534,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>          return r;
>>   }
>>
>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>> +{
>> +       int i;
>> +       uint32_t reg_value[128];
>> +
>> +       for (i = 0; adev->reset_dump_reg_list[i] != 0; i++) {
>> +               if (adev->asic_type >= CHIP_NAVI10)
> This check should be against CHIP_VEGA10.  Also, this only allows for
> GC registers.  If we wanted to dump other registers, we'd need a
> different macro.  Might be better to just use RREG32 here for
> everything and then encode the full offset using
> SOC15_REG_ENTRY_OFFSET() or a similar macro.  Also, we need to think
> about how to handle gfxoff in this case.  gfxoff needs to be disabled,
> or we'll hang the chip if we try to read GC or SDMA registers via
> MMIO, which will adversely affect the hang signature.

Well, this should execute right before a GPU reset, so I think it
shouldn't matter whether we hang the chip or not, as long as the read comes
back correctly (I remember a very long UVD debug session because of this).

But in general I agree: we should just use RREG32() here and always
encode the full register offset.

Regards,
Christian.

>
> Alex
>
>> +                       reg_value[i] = RREG32_SOC15_IP(GC, adev->reset_dump_reg_list[i]);
>> +               else
>> +                       reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
>> +       }
>> +
>> +       trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list, reg_value, i);
>> +
>> +       return 0;
>> +}
>> +
>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>                           struct amdgpu_reset_context *reset_context)
>>   {
>> @@ -4567,8 +4584,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>                                  tmp_adev->gmc.xgmi.pending_reset = false;
>>                                  if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
>>                                          r = -EALREADY;
>> -                       } else
>> +                       } else {
>> +                               amdgpu_reset_reg_dumps(tmp_adev);
>>                                  r = amdgpu_asic_reset(tmp_adev);
>> +                       }
>>
>>                          if (r) {
>>                                  dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> index d855cb53c7e0..3fe33de3564a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> @@ -537,6 +537,25 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>                        __entry->seqno)
>>   );
>>
>> +TRACE_EVENT(amdgpu_reset_reg_dumps,
>> +           TP_PROTO(long *address, uint32_t *value, int length),
>> +           TP_ARGS(address, value, length),
>> +           TP_STRUCT__entry(
>> +                            __array(long, address, 128)
>> +                            __array(uint32_t, value, 128)
>> +                            __field(int, len)
>> +                            ),
>> +           TP_fast_assign(
>> +                          memcpy(__entry->address, address, 128);
>> +                          memcpy(__entry->value,  value, 128);
>> +                          __entry->len = length;
>> +                          ),
>> +           TP_printk("amdgpu register dump offset: %s value: %s ",
>> +                     __print_array(__entry->address, __entry->len, 8),
>> +                     __print_array(__entry->value, __entry->len, 8)
>> +                    )
>> +);
>> +
>>   #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>   #endif
>>
>> --
>> 2.25.1
>>