[PATCH v1 1/3] drm/amdgpu: add gfx9 register support in ipdump

Khatri, Sunil Sunil.Khatri at amd.com
Wed May 29 16:25:17 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Alex Deucher <alexdeucher at gmail.com>
Sent: Wednesday, May 29, 2024 7:16 PM
To: Khatri, Sunil <Sunil.Khatri at amd.com>
Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH v1 1/3] drm/amdgpu: add gfx9 register support in ipdump

On Wed, May 29, 2024 at 5:50 AM Sunil Khatri <sunil.khatri at amd.com> wrote:
>
> Add general registers of gfx9 in ipdump for devcoredump support.
>
> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 124
> +++++++++++++++++++++++++-
>  1 file changed, 123 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 3c8c5abf35ab..528a20393313 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -149,6 +149,94 @@ MODULE_FIRMWARE("amdgpu/aldebaran_sjt_mec2.bin");
>  #define mmGOLDEN_TSC_COUNT_LOWER_Renoir                0x0026
>  #define mmGOLDEN_TSC_COUNT_LOWER_Renoir_BASE_IDX       1
>
> +static const struct amdgpu_hwip_reg_entry gc_reg_list_9[] = {
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STALLED_STAT1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STALLED_STAT1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_BUSY_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_ERROR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_RPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_RPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_RPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_BASE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_CMD_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_LO),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_HI),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BUFSZ),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCPF_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCPC_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCPG_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGDS_PROTECTION_FAULT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGDS_VM_PROTECTION_FAULT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_CNTL),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmPA_CL_CNTL_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRMI_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSQC_DCACHE_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSQC_ICACHE_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSQ_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmTCP_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmWD_UTCL1_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmVM_L2_PROTECTION_FAULT_CNTL),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmVM_L2_PROTECTION_FAULT_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_DEBUG),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_CNTL),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC1_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC2_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_INSTR_PNTR),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STATUS),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_COMMAND),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_MESSAGE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmSMU_RLC_RESPONSE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SAFE_MODE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_SAFE_MODE),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_INT_STAT),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_GENERAL_6),
> +       /* cp header registers */
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_ME1_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_ME2_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_HEADER_DUMP),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_HEADER_DUMP),
> +       /* SE status registers */
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE0),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE1),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE2),
> +       SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS_SE3) };

This list should probably be split between gfx and non-gfx registers since MI chips don't have a gfx pipe, only compute.  If
adev->gfx.num_gfx_rings == 0, then the chip does not have a gfx pipe.

Sure, Alex. I will split the registers into GFX-specific and non-GFX registers. Would it be correct to classify them by checking MI300: registers that are present on MI300 are non-GFX, and registers that are not present are GFX?
I took a quick look, and all the registers that these patches already dump are available in the MI300 spec sheet.

Also MI300 probably needs special handling because it has multiple XCCs.  For now, you might want to skip the IP dump for gfx9 chips with
adev->gfx.num_gfx_rings == 0, and then add new register lists for the
MI parts later.

For MI300, by multiple XCCs do you mean physically different nodes (i.e. multiple nodes)? If my understanding is correct, we are interested in dumping only the GPU/node that caused the hang, aren't we?

Regards
Sunil Khatri

Alex

> +
>  enum ta_ras_gfx_subblock {
>         /*CPC*/
>         TA_RAS_BLOCK__GFX_CPC_INDEX_START = 0, @@ -1994,6 +2082,20 @@
> static int gfx_v9_0_compute_ring_init(struct amdgpu_device *adev, int ring_id,
>                                 hw_prio, NULL);  }
>
> +static void gfx_v9_0_alloc_ip_dump(struct amdgpu_device *adev) {
> +       uint32_t reg_count = ARRAY_SIZE(gc_reg_list_9);
> +       uint32_t *ptr;
> +
> +       ptr = kcalloc(reg_count, sizeof(uint32_t), GFP_KERNEL);
> +       if (ptr == NULL) {
> +               DRM_ERROR("Failed to allocate memory for IP Dump\n");
> +               adev->gfx.ip_dump_core = NULL;
> +       } else {
> +               adev->gfx.ip_dump_core = ptr;
> +       }
> +}
> +
>  static int gfx_v9_0_sw_init(void *handle)  {
>         int i, j, k, r, ring_id;
> @@ -2171,6 +2273,8 @@ static int gfx_v9_0_sw_init(void *handle)
>                 return -EINVAL;
>         }
>
> +       gfx_v9_0_alloc_ip_dump(adev);
> +
>         return 0;
>  }
>
> @@ -2206,6 +2310,8 @@ static int gfx_v9_0_sw_fini(void *handle)
>         }
>         gfx_v9_0_free_microcode(adev);
>
> +       kfree(adev->gfx.ip_dump_core);
> +
>         return 0;
>  }
>
> @@ -6840,6 +6946,22 @@ static void gfx_v9_0_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
>         }
>  }
>
> +static void gfx_v9_ip_dump(void *handle) {
> +       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +       uint32_t i;
> +       uint32_t reg_count = ARRAY_SIZE(gc_reg_list_9);
> +
> +       if (!adev->gfx.ip_dump_core)
> +               return;
> +
> +       amdgpu_gfx_off_ctrl(adev, false);
> +       for (i = 0; i < reg_count; i++)
> +               adev->gfx.ip_dump_core[i] = RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_9[i]));
> +       amdgpu_gfx_off_ctrl(adev, true);
> +
> +}
> +
>  static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
>         .name = "gfx_v9_0",
>         .early_init = gfx_v9_0_early_init, @@ -6856,7 +6978,7 @@
> static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
>         .set_clockgating_state = gfx_v9_0_set_clockgating_state,
>         .set_powergating_state = gfx_v9_0_set_powergating_state,
>         .get_clockgating_state = gfx_v9_0_get_clockgating_state,
> -       .dump_ip_state = NULL,
> +       .dump_ip_state = gfx_v9_ip_dump,
>         .print_ip_state = NULL,
>  };
>
> --
> 2.34.1
>


More information about the amd-gfx mailing list