[PATCH 2/2] drm/amdgpu: only harvest gcea/mmea error status in aldebaran
Yang, Stanley
Stanley.Yang at amd.com
Fri Apr 16 10:08:50 UTC 2021
[AMD Official Use Only - Internal Distribution Only]
Series is Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>
Regards,
Stanley
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Friday, April 16, 2021 5:44 PM
> To: amd-gfx at lists.freedesktop.org; Yang, Stanley <Stanley.Yang at amd.com>;
> John Clements <John.Clemenets at amd.com>; Li, Dennis
> <Dennis.Li at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: only harvest gcea/mmea error status in
> aldebaran
>
> On Aldebaran, the driver only needs to harvest the SDP RdRspStatus and
> WrRspStatus errors, plus the first parity error on RdRsp data. Check the
> error type before harvesting the error information.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 21 ++++++++++++---------
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 11 +++++++----
> 2 files changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> index 9ca76a3ac38c..91427543aabe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> @@ -808,7 +808,7 @@ static struct gfx_v9_4_2_utc_block gfx_v9_4_2_utc_blocks[] = {
>  		  REG_SET_FIELD(0, ATC_L2_CACHE_4K_DSM_CNTL, WRITE_COUNTERS, 1) },
>  };
>
> -static const struct soc15_reg_entry gfx_v9_4_2_rdrsp_status_regs =
> +static const struct soc15_reg_entry gfx_v9_4_2_ea_err_status_regs =
>  	{ SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16 };
>
>  static int gfx_v9_4_2_get_reg_error_count(struct amdgpu_device *adev,
> @@ -1040,11 +1040,11 @@ static void gfx_v9_4_2_reset_ea_err_status(struct amdgpu_device *adev)
>  	uint32_t i, j;
>
>  	mutex_lock(&adev->grbm_idx_mutex);
> -	for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
> -		for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
> +	for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
> +		for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
>  		     j++) {
>  			gfx_v9_4_2_select_se_sh(adev, i, 0, j);
> -			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
> +			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
>  		}
>  	}
>  	gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
> @@ -1089,17 +1089,20 @@ static void gfx_v9_4_2_query_ea_err_status(struct amdgpu_device *adev)
>
>  	mutex_lock(&adev->grbm_idx_mutex);
>
> -	for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
> -		for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
> +	for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
> +		for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
>  		     j++) {
>  			gfx_v9_4_2_select_se_sh(adev, i, 0, j);
>  			reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
> -				gfx_v9_4_2_rdrsp_status_regs));
> -			if (reg_value)
> +				gfx_v9_4_2_ea_err_status_regs));
> +			if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) ||
> +			    REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) ||
> +			    REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
>  				dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
>  						j, reg_value);
> +			}
>  			/* clear after read */
> -			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
> +			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
>  		}
>  	}
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> index d0f41346ea0c..cc69c434d0de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> @@ -1286,7 +1286,7 @@ static void mmhub_v1_7_reset_ras_error_count(struct amdgpu_device *adev)
>  	}
>  }
>
> -static const struct soc15_reg_entry mmhub_v1_7_err_status_regs[] = {
> +static const struct soc15_reg_entry mmhub_v1_7_ea_err_status_regs[] = {
>  	{ SOC15_REG_ENTRY(MMHUB, 0, regMMEA0_ERR_STATUS), 0, 0, 0 },
>  	{ SOC15_REG_ENTRY(MMHUB, 0, regMMEA1_ERR_STATUS), 0, 0, 0 },
>  	{ SOC15_REG_ENTRY(MMHUB, 0, regMMEA2_ERR_STATUS), 0, 0, 0 },
> @@ -1303,12 +1303,15 @@ static void mmhub_v1_7_query_ras_error_status(struct amdgpu_device *adev)
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB))
>  		return;
>
> -	for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_err_status_regs); i++) {
> +	for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_ea_err_status_regs); i++) {
>  		reg_value =
> -			RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_err_status_regs[i]));
> -		if (reg_value)
> +			RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_ea_err_status_regs[i]));
> +		if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) ||
> +		    REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) ||
> +		    REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
>  			dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
>  					i, reg_value);
> +		}
>  	}
>  }
>
> --
> 2.17.1
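
For reference, the essence of the change is that a non-zero GCEA/MMEA ERR_STATUS value alone no longer triggers harvesting; only the SDP read-response status, write-response status, and read-response data-parity fields do. Below is a minimal userspace sketch of that filter. The SDP_*_MASK bit masks and the sample status words are hypothetical placeholders for illustration only; in the driver the real fields come from the GCEA/MMEA register headers and are extracted with REG_GET_FIELD().

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Hypothetical bit masks, used only to illustrate the filter logic.
	 * The real SDP_RDRSP_STATUS, SDP_WRRSP_STATUS and
	 * SDP_RDRSP_DATAPARITY_ERROR fields are defined in the GCEA/MMEA
	 * register headers and are read with REG_GET_FIELD() in the kernel.
	 */
	#define SDP_RDRSP_STATUS_MASK           0x0000000FU
	#define SDP_WRRSP_STATUS_MASK           0x000000F0U
	#define SDP_RDRSP_DATAPARITY_ERROR_MASK 0x00000100U

	/* Harvest only the error types the commit message lists. */
	static int ea_err_status_needs_harvest(uint32_t reg_value)
	{
		return (reg_value & SDP_RDRSP_STATUS_MASK) ||
		       (reg_value & SDP_WRRSP_STATUS_MASK) ||
		       (reg_value & SDP_RDRSP_DATAPARITY_ERROR_MASK);
	}

	int main(void)
	{
		/* Sample status words; only the SDP fields matter. */
		uint32_t samples[] = { 0x00000000, 0x10000000, 0x00000100, 0x00000003 };
		unsigned int i;

		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
			printf("status 0x%08" PRIx32 " -> %s\n", samples[i],
			       ea_err_status_needs_harvest(samples[i]) ?
			       "harvest" : "skip");
		return 0;
	}

With the previous "if (reg_value)" check, the 0x10000000 sample would also have been reported; the per-field check skips it, which is the narrower harvesting behavior the patch introduces for Aldebaran.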