[PATCH 2/2] drm/amdgpu: only harvest gcea/mmea error status in aldebaran

Fri Apr 16 10:08:50 UTC 2021

[AMD Official Use Only - Internal Distribution Only]

Series is Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>

Regards,
Stanley
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Friday, April 16, 2021 5:44 PM
> To: amd-gfx at lists.freedesktop.org; Yang, Stanley <Stanley.Yang at amd.com>;
> John Clements <John.Clemenets at amd.com>; Li, Dennis
> <Dennis.Li at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: only harvest gcea/mmea error status in
> aldebaran
> 
> In aldebaran, the driver only needs to harvest SDP RdRspStatus, WrRspStatus
> and the first parity error on RdRsp data. Check the error type before
> harvesting error information.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 21 ++++++++++++---------
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 11 +++++++----
>  2 files changed, 19 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> index 9ca76a3ac38c..91427543aabe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> @@ -808,7 +808,7 @@ static struct gfx_v9_4_2_utc_block
> gfx_v9_4_2_utc_blocks[] = {
>  	  REG_SET_FIELD(0, ATC_L2_CACHE_4K_DSM_CNTL, WRITE_COUNTERS, 1) },
>  };
> 
> -static const struct soc15_reg_entry gfx_v9_4_2_rdrsp_status_regs =
> +static const struct soc15_reg_entry gfx_v9_4_2_ea_err_status_regs =
>  	{ SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16 };
> 
>  static int gfx_v9_4_2_get_reg_error_count(struct amdgpu_device *adev,
> @@ -1040,11 +1040,11 @@ static void
> gfx_v9_4_2_reset_ea_err_status(struct amdgpu_device *adev)
>  	uint32_t i, j;
> 
>  	mutex_lock(&adev->grbm_idx_mutex);
> -	for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
> -		for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
> +	for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
> +		for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
>  		     j++) {
>  			gfx_v9_4_2_select_se_sh(adev, i, 0, j);
> -			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
> +			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
>  		}
>  	}
>  	gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
> @@ -1089,17 +1089,20 @@ static void gfx_v9_4_2_query_ea_err_status(struct amdgpu_device *adev)
> 
>  	mutex_lock(&adev->grbm_idx_mutex);
> 
> -	for (i = 0; i < gfx_v9_4_2_rdrsp_status_regs.se_num; i++) {
> -		for (j = 0; j < gfx_v9_4_2_rdrsp_status_regs.instance;
> +	for (i = 0; i < gfx_v9_4_2_ea_err_status_regs.se_num; i++) {
> +		for (j = 0; j < gfx_v9_4_2_ea_err_status_regs.instance;
>  		     j++) {
>  			gfx_v9_4_2_select_se_sh(adev, i, 0, j);
>  			reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
> -				gfx_v9_4_2_rdrsp_status_regs));
> -			if (reg_value)
> +				gfx_v9_4_2_ea_err_status_regs));
> +			if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) ||
> +			    REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) ||
> +			    REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
>  				dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n",
>  						j, reg_value);
> +			}
>  			/* clear after read */
> -			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10);
> +			WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_ea_err_status_regs), 0x10);
>  		}
>  	}
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> index d0f41346ea0c..cc69c434d0de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c
> @@ -1286,7 +1286,7 @@ static void
> mmhub_v1_7_reset_ras_error_count(struct amdgpu_device *adev)
>  	}
>  }
> 
> -static const struct soc15_reg_entry mmhub_v1_7_err_status_regs[] = {
> +static const struct soc15_reg_entry mmhub_v1_7_ea_err_status_regs[] = {
>  	{ SOC15_REG_ENTRY(MMHUB, 0, regMMEA0_ERR_STATUS), 0, 0, 0 },
>  	{ SOC15_REG_ENTRY(MMHUB, 0, regMMEA1_ERR_STATUS), 0, 0, 0 },
>  	{ SOC15_REG_ENTRY(MMHUB, 0, regMMEA2_ERR_STATUS), 0, 0, 0 },
> @@ -1303,12 +1303,15 @@ static void
> mmhub_v1_7_query_ras_error_status(struct amdgpu_device *adev)
>  	if (!amdgpu_ras_is_supported(adev,
> AMDGPU_RAS_BLOCK__MMHUB))
>  		return;
> 
> -	for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_err_status_regs); i++) {
> +	for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_ea_err_status_regs); i++) {
>  		reg_value =
> -			RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_err_status_regs[i]));
> -		if (reg_value)
> +			RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_ea_err_status_regs[i]));
> +		if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) ||
> +		    REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) ||
> +		    REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) {
>  			dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
>  					i, reg_value);
> +		}
>  	}
>  }
> 
> --
> 2.17.1