[PATCH 3/4] drm/amdgpu: add reset register trace dump function
Sharma, Shashank
shashank.sharma at amd.com
Mon Jan 24 16:45:17 UTC 2022
Hello Christian,
Thanks for your comments, please find mine inline:
On 1/24/2022 8:15 AM, Christian König wrote:
> Am 21.01.22 um 21:34 schrieb Sharma, Shashank:
>> From 1c5c552eeddaffd9fb3e7d45ece1b2b28fccc575 Mon Sep 17 00:00:00 2001
>> From: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
>> Date: Fri, 21 Jan 2022 14:19:10 +0530
>> Subject: [PATCH 3/4] drm/amdgpu: add reset register trace dump
>> function for
>> gfx_v10_0
>>
>> Implementation of register trace dump function on the AMD GPU resets
>>
>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 8 ++++
>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 53 ++++++++++++++++++++++-
>> drivers/gpu/drm/amd/include/amd_shared.h | 1 +
>> 3 files changed, 60 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> index d855cb53c7e0..c97b53b54333 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
>> @@ -537,6 +537,14 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
>> __entry->seqno)
>> );
>>
>> +TRACE_EVENT(gfx_v10_0_reset_reg_dumps,
>> + TP_PROTO(char *reg_dumps),
>> + TP_ARGS(reg_dumps),
>> + TP_STRUCT__entry(__string(dumps, reg_dumps)),
>> + TP_fast_assign(__assign_str(dumps, reg_dumps);),
>> + TP_printk("amdgpu register dump {%s}", __get_str(dumps))
>> +);
>> +
>> #undef AMDGPU_JOB_GET_TIMELINE_NAME
>> #endif
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 16dbe593cba2..05974ed5416d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -46,7 +46,7 @@
>> #include "v10_structs.h"
>> #include "gfx_v10_0.h"
>> #include "nbio_v2_3.h"
>> -
>> +#include "amdgpu_trace.h"
>> /*
>> * Navi10 has two graphic rings to share each graphic pipe.
>> * 1. Primary ring
>> @@ -188,6 +188,12 @@
>> #define RLCG_ERROR_REPORT_ENABLED(adev) \
>> (amdgpu_sriov_reg_indirect_mmhub(adev) ||
>> amdgpu_sriov_reg_indirect_gc(adev))
>>
>> +#define N_REGS (17)
>> +#define DUMP_REG(addr) do { \
>> + (dump)[i][0] = (addr); \
>> + (dump)[i++][1] = RREG32_SOC15_IP(GC, addr); \
>> + } while (0)
>
> Those macros need an AMDGPU_ prefix and maybe some better names.
Agree, @Amar pls check this out.
>
> From the design POV I'm really wondering if it wouldn't be better if
> userspace defines the registers we want to dump in case of a crash.
>
I am not so sure about it. This means we have to communicate with the
userspace and get the list of registers it wishes to see, do I smell a
new IOCTL or DRM property ? I also feel that we have better filtering
tools for a trace event in userspace than in kernel.
What do you think ?
- Shashank
> Regards,
> Christian.
>
>> +
>> MODULE_FIRMWARE("amdgpu/navi10_ce.bin");
>> MODULE_FIRMWARE("amdgpu/navi10_pfp.bin");
>> MODULE_FIRMWARE("amdgpu/navi10_me.bin");
>> @@ -7488,7 +7494,6 @@ static int gfx_v10_0_hw_init(void *handle)
>> {
>> int r;
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>> if (!amdgpu_emu_mode)
>> gfx_v10_0_init_golden_registers(adev);
>>
>> @@ -7602,6 +7607,49 @@ static int gfx_v10_0_hw_fini(void *handle)
>> return 0;
>> }
>>
>> +static int gfx_v10_0_reset_reg_dumps(void *handle)
>> +{
>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> + uint32_t i = 0;
>> + uint32_t (*dump)[2], n_regs = 0;
>> + char *reg_dumps;
>> +
>> + dump = kmalloc(N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
>> + reg_dumps = kmalloc(1024, GFP_KERNEL);
>> +
>> + if (dump == NULL || reg_dumps == NULL)
>> + return -ENOMEM;
>> +
>> + DUMP_REG(mmGRBM_STATUS2);
>> + DUMP_REG(mmGRBM_STATUS_SE0);
>> + DUMP_REG(mmGRBM_STATUS_SE1);
>> + DUMP_REG(mmGRBM_STATUS_SE2);
>> + DUMP_REG(mmGRBM_STATUS_SE3);
>> + DUMP_REG(mmSDMA0_STATUS_REG);
>> + DUMP_REG(mmSDMA1_STATUS_REG);
>> + DUMP_REG(mmCP_STAT);
>> + DUMP_REG(mmCP_STALLED_STAT1);
>> + DUMP_REG(mmCP_STALLED_STAT1);
>> + DUMP_REG(mmCP_STALLED_STAT3);
>> + DUMP_REG(mmCP_CPC_STATUS);
>> + DUMP_REG(mmCP_CPC_BUSY_STAT);
>> + DUMP_REG(mmCP_CPC_STALLED_STAT1);
>> + DUMP_REG(mmCP_CPF_STATUS);
>> + DUMP_REG(mmCP_CPF_BUSY_STAT);
>> + DUMP_REG(mmCP_CPF_STALLED_STAT1);
>> +
>> + n_regs = i;
>> +
>> + for (i = 0; i < n_regs; i++)
>> + sprintf(reg_dumps + strlen(reg_dumps), "%08x: %08x, ",
>> dump[i][0], dump[i][1]);
>> +
>> + trace_gfx_v10_0_reset_reg_dumps(reg_dumps);
>> + kfree(dump);
>> + kfree(reg_dumps);
>> +
>> + return 0;
>> +}
>> +
>> static int gfx_v10_0_suspend(void *handle)
>> {
>> return gfx_v10_0_hw_fini(handle);
>> @@ -9367,6 +9415,7 @@ static const struct amd_ip_funcs
>> gfx_v10_0_ip_funcs = {
>> .set_clockgating_state = gfx_v10_0_set_clockgating_state,
>> .set_powergating_state = gfx_v10_0_set_powergating_state,
>> .get_clockgating_state = gfx_v10_0_get_clockgating_state,
>> + .reset_reg_dumps = gfx_v10_0_reset_reg_dumps,
>> };
>>
>> static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h
>> b/drivers/gpu/drm/amd/include/amd_shared.h
>> index 257f280d3d53..76d3a55278df 100644
>> --- a/drivers/gpu/drm/amd/include/amd_shared.h
>> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
>> @@ -296,6 +296,7 @@ struct amd_ip_funcs {
>> enum amd_powergating_state state);
>> void (*get_clockgating_state)(void *handle, u32 *flags);
>> int (*enable_umd_pstate)(void *handle, enum amd_dpm_forced_level
>> *level);
>> + int (*reset_reg_dumps)(void *handle);
>> };
>>
>>
>
More information about the amd-gfx
mailing list