[PATCH 2/6] drm/amdgpu: add support of gfx10 register dump

Khatri, Sunil sukhatri at amd.com
Tue Apr 16 17:32:33 UTC 2024


On 4/16/2024 7:30 PM, Christian König wrote:
> Am 16.04.24 um 15:55 schrieb Alex Deucher:
>> On Tue, Apr 16, 2024 at 8:08 AM Sunil Khatri <sunil.khatri at amd.com> 
>> wrote:
>>> Adding gfx10 gc registers to be used for register
>>> dump via devcoredump during a gpu reset.
>>>
>>> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  12 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   4 +
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 131 
>>> +++++++++++++++++-
>>>   .../include/asic_reg/gc/gc_10_1_0_offset.h    |  12 ++
>>>   4 files changed, 158 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index e0d7f4ee7e16..e016ac33629d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -139,6 +139,18 @@ enum amdgpu_ss {
>>>          AMDGPU_SS_DRV_UNLOAD
>>>   };
>>>
>>> +struct hwip_reg_entry {
>>> +       u32     hwip;
>>> +       u32     inst;
>>> +       u32     seg;
>>> +       u32     reg_offset;
>>> +};
>>> +
>>> +struct reg_pair {
>>> +       u32     offset;
>>> +       u32     value;
>>> +};
>>> +
>>>   struct amdgpu_watchdog_timer {
>>>          bool timeout_fatal_disable;
>>>          uint32_t period; /* maxCycles = (1 << period), the number 
>>> of cycles before a timeout */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> index 04a86dff71e6..295a2c8d2e48 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>>> @@ -433,6 +433,10 @@ struct amdgpu_gfx {
>>>          uint32_t                        num_xcc_per_xcp;
>>>          struct mutex                    partition_mutex;
>>>          bool                            mcbp; /* mid command buffer 
>>> preemption */
>>> +
>>> +       /* IP reg dump */
>>> +       struct reg_pair                 *ip_dump;
>>> +       uint32_t                        reg_count;
>>>   };
>>>
>>>   struct amdgpu_gfx_ras_reg_entry {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index a0bc4196ff8b..46e136609ff1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -276,6 +276,99 @@ MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec.bin");
>>>   MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec2.bin");
>>>   MODULE_FIRMWARE("amdgpu/gc_10_3_7_rlc.bin");
>>>
>>> +static const struct hwip_reg_entry gc_reg_list_10_1[] = {
>>> +       { SOC15_REG_ENTRY(GC, 0, mmGRBM_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmGRBM_STATUS2) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmGRBM_STATUS3) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_STALLED_STAT1) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_STALLED_STAT2) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_STALLED_STAT1) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_STALLED_STAT1) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_BUSY_STAT) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_BUSY_STAT) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_BUSY_STAT) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_BUSY_STAT2) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_BUSY_STAT2) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPF_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_GFX_ERROR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_GFX_HPD_STATUS0) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB_BASE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB_RPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB_WPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB0_BASE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB0_RPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB0_WPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB1_BASE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB1_RPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB1_WPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB2_BASE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB2_WPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_RB2_WPTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_CMD_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_CMD_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_CMD_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_CMD_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_BASE_LO) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_BASE_HI) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB1_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_BASE_LO) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_BASE_HI) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_IB2_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_BASE_LO) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_BASE_HI) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB1_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_BASE_LO) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_BASE_HI) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_IB2_BUFSZ) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCPF_UTCL1_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCPC_UTCL1_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCPG_UTCL1_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmGDS_PROTECTION_FAULT) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmGDS_VM_PROTECTION_FAULT) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmIA_UTCL1_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmIA_UTCL1_STATUS_2) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmPA_CL_CNTL_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_UTCL1_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRMI_UTCL1_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmSQC_DCACHE_UTCL0_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmSQC_ICACHE_UTCL0_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmSQG_UTCL0_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmTCP_UTCL0_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmWD_UTCL1_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmGCVM_L2_PROTECTION_FAULT_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_DEBUG) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_MEC_CNTL) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_MES_CNTL) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CE_INSTR_PNTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_MEC1_INSTR_PNTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_MEC2_INSTR_PNTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, 
>>> mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_MES_INSTR_PNTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_ME_INSTR_PNTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_PFP_INSTR_PNTR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmCP_CPC_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_STAT) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_COMMAND) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_MESSAGE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_1) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_2) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_3) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_ARGUMENT_4) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmSMU_RLC_RESPONSE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SAFE_MODE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SMU_SAFE_MODE) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_RLCS_GPM_STAT_2) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_SPP_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_RLCS_BOOTLOAD_STATUS) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_INT_STAT) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_GENERAL_6) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_DEBUG_INST_A) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_DEBUG_INST_B) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_GPM_DEBUG_INST_ADDR) },
>>> +       { SOC15_REG_ENTRY(GC, 0, mmRLC_LX6_CORE_PDEBUG_INST) }
>>> +};
>> Might want to add the string name for the register as well.
>
> I think we could hack up the SOC15_REG_ENTRY() macro for that. E.g. 
> have a const pointer to the stringified last parameter in the 
> hwip_reg_entry.


How does this sound?
struct amdgpu_hwip_reg_entry {
         u32 hwip;
         u32 inst;
         u32 seg;
         u32 reg_offset;
         char reg_name[50];
};

#define SOC15_REG_ENTRY_STR(ip, inst, reg)  ip##_HWIP, inst, reg##_BASE_IDX, reg, #reg
static const struct amdgpu_hwip_reg_entry gc_reg_list_10_1[] = {
         { SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS) },
}


>
> And btw please name that amdgpu_hwip_reg_entry.
Noted
>
> Regards,
> Christian.
>
>>    It would
>> be nice to print the register name rather than the offset in the
>> devcoredump.
>>
>> E.g.,
>>
>>
>>> +
>>>   static const struct soc15_reg_golden golden_settings_gc_10_1[] = {
>>>          SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 
>>> 0xffffffff, 0x00400014),
>>>          SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_CPF_CLK_CTRL, 
>>> 0xfcff8fff, 0xf8000100),
>>> @@ -4490,6 +4583,23 @@ static int gfx_v10_0_compute_ring_init(struct 
>>> amdgpu_device *adev, int ring_id,
>>>                               hw_prio, NULL);
>>>   }
>>>
>>> +static void gfx_v10_0_alloc_dump_mem(struct amdgpu_device *adev)
>>> +{
>>> +       uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
>>> +       struct reg_pair *ptr;
>>> +
>>> +       ptr = kcalloc(reg_count, sizeof(struct reg_pair), GFP_KERNEL);
>> I think you only need to allocate enough memory to store the register
>> values.  We already have the offsets in the array above.
>> { SOC15_REG_ENTRY(GC, 0, mmGRBM_STATUS), "mmGRBM_STATUS" },
>>
>>> +       if (ptr == NULL) {
>>> +               DRM_ERROR("Failed to allocate memory for IP Dump\n");
>>> +               adev->gfx.ip_dump = NULL;
>>> +               adev->gfx.reg_count = 0;
>>> +       }
>>> +       else {
>>> +               adev->gfx.ip_dump = ptr;
>>> +               adev->gfx.reg_count = reg_count;
>>> +       }
>>> +}
>>> +
>>>   static int gfx_v10_0_sw_init(void *handle)
>>>   {
>>>          int i, j, k, r, ring_id = 0;
>>> @@ -4642,6 +4752,8 @@ static int gfx_v10_0_sw_init(void *handle)
>>>
>>>          gfx_v10_0_gpu_early_init(adev);
>>>
>>> +       gfx_v10_0_alloc_dump_mem(adev);
>>> +
>>>          return 0;
>>>   }
>>>
>>> @@ -4694,6 +4806,8 @@ static int gfx_v10_0_sw_fini(void *handle)
>>>
>>>          gfx_v10_0_free_microcode(adev);
>>>
>>> +       kfree(adev->gfx.ip_dump);
>>> +
>>>          return 0;
>>>   }
>>>
>>> @@ -9154,6 +9268,21 @@ static void gfx_v10_0_emit_mem_sync(struct 
>>> amdgpu_ring *ring)
>>>          amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>>>   }
>>>
>>> +static void gfx_v10_ip_dump(void *handle)
>>> +{
>>> +       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +       uint32_t i;
>>> +       uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
>>> +
>>> +       if (!adev->gfx.ip_dump)
>>> +               return;
>>> +
>> Need to disallow gfxoff before reading the gfx registers:
>>
>> amdgpu_gfx_off_ctrl(adev, false);
>>
>>> +       for (i = 0; i < reg_count; i++) {
>>> +               adev->gfx.ip_dump[i].offset = 
>>> gc_reg_list_10_1[i].reg_offset;
>> No need to store the offset.  We already have it in the static array 
>> above.
>>
>>> + adev->gfx.ip_dump[i].value = 
>>> RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_10_1[i]));
>>> +       }
>> And then allow it again:
>>
>> amdgpu_gfx_off_ctrl(adev, true);
>>
>>> +}
>>> +
>>>   static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
>>>          .name = "gfx_v10_0",
>>>          .early_init = gfx_v10_0_early_init,
>>> @@ -9170,7 +9299,7 @@ static const struct amd_ip_funcs 
>>> gfx_v10_0_ip_funcs = {
>>>          .set_clockgating_state = gfx_v10_0_set_clockgating_state,
>>>          .set_powergating_state = gfx_v10_0_set_powergating_state,
>>>          .get_clockgating_state = gfx_v10_0_get_clockgating_state,
>>> -       .dump_ip_state = NULL,
>>> +       .dump_ip_state = gfx_v10_ip_dump,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>> diff --git 
>>> a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h 
>>> b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
>>> index 4908044f7409..4c8e7fdb6976 100644
>>> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
>>> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
>>> @@ -4830,6 +4830,8 @@
>>>   #define mmCP_ECC_FIRSTOCCURRENCE_RING2_BASE_IDX 0
>>>   #define mmGB_EDC_MODE 0x1e1e
>>>   #define mmGB_EDC_MODE_BASE_IDX 0
>>> +#define mmCP_DEBUG 0x1e1f
>>> +#define mmCP_DEBUG_BASE_IDX 0
>>>   #define mmCP_FETCHER_SOURCE 0x1e22
>>>   #define mmCP_FETCHER_SOURCE_BASE_IDX 0
>>>   #define mmCP_PQ_WPTR_POLL_CNTL 0x1e23
>>> @@ -7778,6 +7780,8 @@
>>>   #define mmCP_MES_DOORBELL_CONTROL5_BASE_IDX 1
>>>   #define mmCP_MES_DOORBELL_CONTROL6 0x2841
>>>   #define mmCP_MES_DOORBELL_CONTROL6_BASE_IDX 1
>>> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR 0x2842
>>> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR_BASE_IDX 1
>>>   #define mmCP_MES_GP0_LO 0x2843
>>>   #define mmCP_MES_GP0_LO_BASE_IDX 1
>>>   #define mmCP_MES_GP0_HI 0x2844
>>> @@ -9332,10 +9336,16 @@
>>>   #define mmRLC_LB_CNTR_INIT_1_BASE_IDX 1
>>>   #define mmRLC_LB_CNTR_1 0x4c1c
>>>   #define mmRLC_LB_CNTR_1_BASE_IDX 1
>>> +#define mmRLC_GPM_DEBUG_INST_ADDR 0x4c1d
>>> +#define mmRLC_GPM_DEBUG_INST_ADDR_BASE_IDX 1
>>>   #define mmRLC_JUMP_TABLE_RESTORE 0x4c1e
>>>   #define mmRLC_JUMP_TABLE_RESTORE_BASE_IDX 1
>>>   #define mmRLC_PG_DELAY_2 0x4c1f
>>>   #define mmRLC_PG_DELAY_2_BASE_IDX 1
>>> +#define mmRLC_GPM_DEBUG_INST_A 0x4c22
>>> +#define mmRLC_GPM_DEBUG_INST_A_BASE_IDX 1
>>> +#define mmRLC_GPM_DEBUG_INST_B 0x4c23
>>> +#define mmRLC_GPM_DEBUG_INST_B_BASE_IDX 1
>>>   #define mmRLC_GPU_CLOCK_COUNT_LSB 0x4c24
>>>   #define mmRLC_GPU_CLOCK_COUNT_LSB_BASE_IDX 1
>>>   #define mmRLC_GPU_CLOCK_COUNT_MSB 0x4c25
>>> @@ -9720,6 +9730,8 @@
>>>   #define mmRLC_SPM_THREAD_TRACE_CTRL_BASE_IDX 1
>>>   #define mmRLC_LB_CNTR_2 0x4de7
>>>   #define mmRLC_LB_CNTR_2_BASE_IDX 1
>>> +#define mmRLC_LX6_CORE_PDEBUG_INST 0x4deb
>>> +#define mmRLC_LX6_CORE_PDEBUG_INST_BASE_IDX 1
>>>   #define mmRLC_CPAXI_DOORBELL_MON_CTRL 0x4df1
>>>   #define mmRLC_CPAXI_DOORBELL_MON_CTRL_BASE_IDX 1
>>>   #define mmRLC_CPAXI_DOORBELL_MON_STAT 0x4df2
>>> -- 
>>> 2.34.1
>>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240416/08d8db1c/attachment-0001.htm>


More information about the amd-gfx mailing list