[PATCH v4 2/6] drm/amdgpu: add support of gfx10 register dump

Christian König christian.koenig at amd.com
Wed Apr 17 08:45:17 UTC 2024



Am 17.04.24 um 10:18 schrieb Sunil Khatri:
> Adding gfx10 gc registers to be used for register
> dump via devcoredump during a gpu reset.
>
> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   8 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |   4 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c        | 130 +++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/soc15.h            |   2 +
>   .../include/asic_reg/gc/gc_10_1_0_offset.h    |  12 ++
>   5 files changed, 155 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e0d7f4ee7e16..210af65a744c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -139,6 +139,14 @@ enum amdgpu_ss {
>   	AMDGPU_SS_DRV_UNLOAD
>   };
>   
> +struct amdgpu_hwip_reg_entry {
> +	u32	hwip;
> +	u32	inst;
> +	u32	seg;
> +	u32	reg_offset;

> +	char	reg_name[50];

Make that a const char *. Otherwise it bloats up the final binary 
because the compiler has to add zeros at the end.

> +};
> +
>   struct amdgpu_watchdog_timer {
>   	bool timeout_fatal_disable;
>   	uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 04a86dff71e6..64f197bbc866 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -433,6 +433,10 @@ struct amdgpu_gfx {
>   	uint32_t			num_xcc_per_xcp;
>   	struct mutex			partition_mutex;
>   	bool				mcbp; /* mid command buffer preemption */
> +
> +	/* IP reg dump */
> +	uint32_t			*ip_dump;
> +	uint32_t			reg_count;
>   };
>   
>   struct amdgpu_gfx_ras_reg_entry {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index a0bc4196ff8b..4a54161f4837 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -276,6 +276,99 @@ MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec.bin");
>   MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec2.bin");
>   MODULE_FIRMWARE("amdgpu/gc_10_3_7_rlc.bin");
>   
> +static const struct amdgpu_hwip_reg_entry gc_reg_list_10_1[] = {
> +	SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS2),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS3),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT1),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT2),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STALLED_STAT1),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STALLED_STAT1),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_BUSY_STAT),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT2),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT2),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_ERROR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_HPD_STATUS0),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_BASE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_RPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_WPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_BASE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_RPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_WPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_BASE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_RPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_WPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_BASE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_CMD_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_CMD_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_CMD_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_CMD_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_LO),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_HI),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_LO),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_HI),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_LO),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_HI),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_LO),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_HI),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BUFSZ),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCPF_UTCL1_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCPC_UTCL1_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCPG_UTCL1_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmGDS_PROTECTION_FAULT),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmGDS_VM_PROTECTION_FAULT),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS_2),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmPA_CL_CNTL_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_UTCL1_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRMI_UTCL1_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmSQC_DCACHE_UTCL0_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmSQC_ICACHE_UTCL0_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmSQG_UTCL0_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmTCP_UTCL0_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmWD_UTCL1_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_DEBUG),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_CNTL),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_CNTL),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_INSTR_PNTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC1_INSTR_PNTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC2_INSTR_PNTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_INSTR_PNTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_INSTR_PNTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_INSTR_PNTR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_STAT),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_COMMAND),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_MESSAGE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_1),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_2),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_3),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_4),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmSMU_RLC_RESPONSE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SAFE_MODE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_SAFE_MODE),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_GPM_STAT_2),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SPP_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_BOOTLOAD_STATUS),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_INT_STAT),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_GENERAL_6),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_A),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_B),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_ADDR),
> +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_LX6_CORE_PDEBUG_INST)
> +};
> +
>   static const struct soc15_reg_golden golden_settings_gc_10_1[] = {
>   	SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0xffffffff, 0x00400014),
>   	SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_CPF_CLK_CTRL, 0xfcff8fff, 0xf8000100),
> @@ -4490,6 +4583,22 @@ static int gfx_v10_0_compute_ring_init(struct amdgpu_device *adev, int ring_id,
>   			     hw_prio, NULL);
>   }
>   
> +static void gfx_v10_0_alloc_dump_mem(struct amdgpu_device *adev)
> +{
> +	uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
> +	uint32_t *ptr;
> +
> +	ptr = kcalloc(reg_count, sizeof(uint32_t), GFP_KERNEL);
> +	if (ptr == NULL) {
> +		DRM_ERROR("Failed to allocate memory for IP Dump\n");
> +		adev->gfx.ip_dump = NULL;
> +		adev->gfx.reg_count = 0;
> +	} else {
> +		adev->gfx.ip_dump = ptr;
> +		adev->gfx.reg_count = reg_count;
> +	}
> +}
> +
>   static int gfx_v10_0_sw_init(void *handle)
>   {
>   	int i, j, k, r, ring_id = 0;
> @@ -4642,6 +4751,8 @@ static int gfx_v10_0_sw_init(void *handle)
>   
>   	gfx_v10_0_gpu_early_init(adev);
>   
> +	gfx_v10_0_alloc_dump_mem(adev);
> +
>   	return 0;
>   }
>   
> @@ -4694,6 +4805,8 @@ static int gfx_v10_0_sw_fini(void *handle)
>   
>   	gfx_v10_0_free_microcode(adev);
>   
> +	kfree(adev->gfx.ip_dump);
> +
>   	return 0;
>   }
>   
> @@ -9154,6 +9267,21 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>   }
>   
> +static void gfx_v10_ip_dump(void *handle)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +	uint32_t i;
> +	uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
> +
> +	if (!adev->gfx.ip_dump)
> +		return;
> +
> +	amdgpu_gfx_off_ctrl(adev, false);
> +	for (i = 0; i < reg_count; i++)
> +		adev->gfx.ip_dump[i] = RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_10_1[i]));
> +	amdgpu_gfx_off_ctrl(adev, true);
> +}
> +
>   static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
>   	.name = "gfx_v10_0",
>   	.early_init = gfx_v10_0_early_init,
> @@ -9170,7 +9298,7 @@ static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
>   	.set_clockgating_state = gfx_v10_0_set_clockgating_state,
>   	.set_powergating_state = gfx_v10_0_set_powergating_state,
>   	.get_clockgating_state = gfx_v10_0_get_clockgating_state,
> -	.dump_ip_state = NULL,
> +	.dump_ip_state = gfx_v10_ip_dump,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h
> index 1444b7765e4b..282584a48be0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h
> @@ -88,6 +88,8 @@ struct soc15_ras_field_entry {
>   };
>   
>   #define SOC15_REG_ENTRY(ip, inst, reg)	ip##_HWIP, inst, reg##_BASE_IDX, reg
> +#define SOC15_REG_ENTRY_STR(ip, inst, reg) \
> +	{ ip##_HWIP, inst, reg##_BASE_IDX, reg, #reg }

Mhm, do you still get the warning that we should use ()?

In general named initializers would be better. E.g. something like this

{
     .hwip = (ip##_HWIP),
     .inst = (inst),
...
}

But you need to give the parameters unique names for that. "inst" for 
example won't work like this.

Apart from this looks good to me,
Christian.

>   
>   #define SOC15_REG_ENTRY_OFFSET(entry)	(adev->reg_offset[entry.hwip][entry.inst][entry.seg] + entry.reg_offset)
>   
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> index 4908044f7409..4c8e7fdb6976 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_10_1_0_offset.h
> @@ -4830,6 +4830,8 @@
>   #define mmCP_ECC_FIRSTOCCURRENCE_RING2_BASE_IDX                                                        0
>   #define mmGB_EDC_MODE                                                                                  0x1e1e
>   #define mmGB_EDC_MODE_BASE_IDX                                                                         0
> +#define mmCP_DEBUG                                                                                     0x1e1f
> +#define mmCP_DEBUG_BASE_IDX                                                                            0
>   #define mmCP_FETCHER_SOURCE                                                                            0x1e22
>   #define mmCP_FETCHER_SOURCE_BASE_IDX                                                                   0
>   #define mmCP_PQ_WPTR_POLL_CNTL                                                                         0x1e23
> @@ -7778,6 +7780,8 @@
>   #define mmCP_MES_DOORBELL_CONTROL5_BASE_IDX                                                            1
>   #define mmCP_MES_DOORBELL_CONTROL6                                                                     0x2841
>   #define mmCP_MES_DOORBELL_CONTROL6_BASE_IDX                                                            1
> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR                                                            0x2842
> +#define mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR_BASE_IDX                                                   1
>   #define mmCP_MES_GP0_LO                                                                                0x2843
>   #define mmCP_MES_GP0_LO_BASE_IDX                                                                       1
>   #define mmCP_MES_GP0_HI                                                                                0x2844
> @@ -9332,10 +9336,16 @@
>   #define mmRLC_LB_CNTR_INIT_1_BASE_IDX                                                                  1
>   #define mmRLC_LB_CNTR_1                                                                                0x4c1c
>   #define mmRLC_LB_CNTR_1_BASE_IDX                                                                       1
> +#define mmRLC_GPM_DEBUG_INST_ADDR                                                                      0x4c1d
> +#define mmRLC_GPM_DEBUG_INST_ADDR_BASE_IDX                                                             1
>   #define mmRLC_JUMP_TABLE_RESTORE                                                                       0x4c1e
>   #define mmRLC_JUMP_TABLE_RESTORE_BASE_IDX                                                              1
>   #define mmRLC_PG_DELAY_2                                                                               0x4c1f
>   #define mmRLC_PG_DELAY_2_BASE_IDX                                                                      1
> +#define mmRLC_GPM_DEBUG_INST_A                                                                         0x4c22
> +#define mmRLC_GPM_DEBUG_INST_A_BASE_IDX                                                                1
> +#define mmRLC_GPM_DEBUG_INST_B                                                                         0x4c23
> +#define mmRLC_GPM_DEBUG_INST_B_BASE_IDX                                                                1
>   #define mmRLC_GPU_CLOCK_COUNT_LSB                                                                      0x4c24
>   #define mmRLC_GPU_CLOCK_COUNT_LSB_BASE_IDX                                                             1
>   #define mmRLC_GPU_CLOCK_COUNT_MSB                                                                      0x4c25
> @@ -9720,6 +9730,8 @@
>   #define mmRLC_SPM_THREAD_TRACE_CTRL_BASE_IDX                                                           1
>   #define mmRLC_LB_CNTR_2                                                                                0x4de7
>   #define mmRLC_LB_CNTR_2_BASE_IDX                                                                       1
> +#define mmRLC_LX6_CORE_PDEBUG_INST                                                                     0x4deb
> +#define mmRLC_LX6_CORE_PDEBUG_INST_BASE_IDX                                                            1
>   #define mmRLC_CPAXI_DOORBELL_MON_CTRL                                                                  0x4df1
>   #define mmRLC_CPAXI_DOORBELL_MON_CTRL_BASE_IDX                                                         1
>   #define mmRLC_CPAXI_DOORBELL_MON_STAT                                                                  0x4df2



More information about the amd-gfx mailing list