[PATCH 3/4] drm/amdgpu: add reset register trace dump function

Christian König christian.koenig at amd.com
Mon Jan 24 16:48:11 UTC 2022


Am 24.01.22 um 17:45 schrieb Sharma, Shashank:
> Hello Christian,
> Thank for your comments, please fine mine inline:
>
> On 1/24/2022 8:15 AM, Christian König wrote:
>> Am 21.01.22 um 21:34 schrieb Sharma, Shashank:
>>> From 1c5c552eeddaffd9fb3e7d45ece1b2b28fccc575 Mon Sep 17 00:00:00 2001
>>> From: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
>>> Date: Fri, 21 Jan 2022 14:19:10 +0530
>>> Subject: [PATCH 3/4] drm/amdgpu: add reset register trace dump 
>>> function for
>>>  gfx_v10_0
>>>
>>> Implementation of register trace dump function on the AMD GPU resets
>>>
>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h |  8 ++++
>>>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c    | 53 ++++++++++++++++++++++-
>>>  drivers/gpu/drm/amd/include/amd_shared.h  |  1 +
>>>  3 files changed, 60 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> index d855cb53c7e0..c97b53b54333 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>>> @@ -537,6 +537,14 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>>>                __entry->seqno)
>>>  );
>>>
>>> +TRACE_EVENT(gfx_v10_0_reset_reg_dumps,
>>> +        TP_PROTO(char *reg_dumps),
>>> +        TP_ARGS(reg_dumps),
>>> +        TP_STRUCT__entry(__string(dumps, reg_dumps)),
>>> +        TP_fast_assign(__assign_str(dumps, reg_dumps);),
>>> +        TP_printk("amdgpu register dump {%s}", __get_str(dumps))
>>> +);
>>> +
>>>  #undef AMDGPU_JOB_GET_TIMELINE_NAME
>>>  #endif
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index 16dbe593cba2..05974ed5416d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -46,7 +46,7 @@
>>>  #include "v10_structs.h"
>>>  #include "gfx_v10_0.h"
>>>  #include "nbio_v2_3.h"
>>> -
>>> +#include "amdgpu_trace.h"
>>>  /*
>>>   * Navi10 has two graphic rings to share each graphic pipe.
>>>   * 1. Primary ring
>>> @@ -188,6 +188,12 @@
>>>  #define RLCG_ERROR_REPORT_ENABLED(adev) \
>>>      (amdgpu_sriov_reg_indirect_mmhub(adev) || 
>>> amdgpu_sriov_reg_indirect_gc(adev))
>>>
>>> +#define N_REGS (17)
>>> +#define DUMP_REG(addr) do {                    \
>>> +        (dump)[i][0] = (addr);                \
>>> +        (dump)[i++][1] =  RREG32_SOC15_IP(GC, addr);    \
>>> +    } while (0)
>>
>> Those macros need an AMDGPU_ prefix and maybe some better names.
>
> Agree, @Amar pls check this out.
>>
>>  From the design POV I'm really wondering if it wouldn't be better if 
>> userspace defines the registers we want to dump in case of a crash.
>>
>
> I am not so sure about it. This means we have to communicate with the 
> userspace and get the list of registers it wishes to see, do I smell a 
> new IOCTL or DRM property ? I also feel that we have better filtering 
> tools for a trace event in userspace than in kernel.
>
> What do you think ?

Just a writeable debugfs file should probably do it. We have a list of 
known registers filled in during boot (just reg offsets) and userspace 
can read/write the file to update it.

Christian.

>
> - Shashank
>
>> Regards,
>> Christian.
>>
>>> +
>>>  MODULE_FIRMWARE("amdgpu/navi10_ce.bin");
>>>  MODULE_FIRMWARE("amdgpu/navi10_pfp.bin");
>>>  MODULE_FIRMWARE("amdgpu/navi10_me.bin");
>>> @@ -7488,7 +7494,6 @@ static int gfx_v10_0_hw_init(void *handle)
>>>  {
>>>      int r;
>>>      struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>      if (!amdgpu_emu_mode)
>>>          gfx_v10_0_init_golden_registers(adev);
>>>
>>> @@ -7602,6 +7607,49 @@ static int gfx_v10_0_hw_fini(void *handle)
>>>      return 0;
>>>  }
>>>
>>> +static int gfx_v10_0_reset_reg_dumps(void *handle)
>>> +{
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +    uint32_t i = 0;
>>> +    uint32_t (*dump)[2], n_regs = 0;
>>> +    char *reg_dumps;
>>> +
>>> +    dump = kmalloc(N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
>>> +    reg_dumps = kmalloc(1024, GFP_KERNEL);
>>> +
>>> +    if (dump == NULL || reg_dumps == NULL)
>>> +        return -ENOMEM;
>>> +
>>> +    DUMP_REG(mmGRBM_STATUS2);
>>> +    DUMP_REG(mmGRBM_STATUS_SE0);
>>> +    DUMP_REG(mmGRBM_STATUS_SE1);
>>> +    DUMP_REG(mmGRBM_STATUS_SE2);
>>> +    DUMP_REG(mmGRBM_STATUS_SE3);
>>> +    DUMP_REG(mmSDMA0_STATUS_REG);
>>> +    DUMP_REG(mmSDMA1_STATUS_REG);
>>> +    DUMP_REG(mmCP_STAT);
>>> +    DUMP_REG(mmCP_STALLED_STAT1);
>>> +    DUMP_REG(mmCP_STALLED_STAT1);
>>> +    DUMP_REG(mmCP_STALLED_STAT3);
>>> +    DUMP_REG(mmCP_CPC_STATUS);
>>> +    DUMP_REG(mmCP_CPC_BUSY_STAT);
>>> +    DUMP_REG(mmCP_CPC_STALLED_STAT1);
>>> +    DUMP_REG(mmCP_CPF_STATUS);
>>> +    DUMP_REG(mmCP_CPF_BUSY_STAT);
>>> +    DUMP_REG(mmCP_CPF_STALLED_STAT1);
>>> +
>>> +    n_regs = i;
>>> +
>>> +    for (i = 0; i < n_regs; i++)
>>> +        sprintf(reg_dumps + strlen(reg_dumps), "%08x: %08x, ", 
>>> dump[i][0], dump[i][1]);
>>> +
>>> +    trace_gfx_v10_0_reset_reg_dumps(reg_dumps);
>>> +    kfree(dump);
>>> +    kfree(reg_dumps);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>  static int gfx_v10_0_suspend(void *handle)
>>>  {
>>>      return gfx_v10_0_hw_fini(handle);
>>> @@ -9367,6 +9415,7 @@ static const struct amd_ip_funcs 
>>> gfx_v10_0_ip_funcs = {
>>>      .set_clockgating_state = gfx_v10_0_set_clockgating_state,
>>>      .set_powergating_state = gfx_v10_0_set_powergating_state,
>>>      .get_clockgating_state = gfx_v10_0_get_clockgating_state,
>>> +    .reset_reg_dumps = gfx_v10_0_reset_reg_dumps,
>>>  };
>>>
>>>  static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h 
>>> b/drivers/gpu/drm/amd/include/amd_shared.h
>>> index 257f280d3d53..76d3a55278df 100644
>>> --- a/drivers/gpu/drm/amd/include/amd_shared.h
>>> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
>>> @@ -296,6 +296,7 @@ struct amd_ip_funcs {
>>>                       enum amd_powergating_state state);
>>>      void (*get_clockgating_state)(void *handle, u32 *flags);
>>>      int (*enable_umd_pstate)(void *handle, enum 
>>> amd_dpm_forced_level *level);
>>> +    int (*reset_reg_dumps)(void *handle);
>>>  };
>>>
>>>
>>



More information about the amd-gfx mailing list