[PATCH 2/2] drm/amdgpu: Add ecc info query interface for umc v8_10

Yang, Stanley Stanley.Yang at amd.com
Wed Feb 22 07:52:34 UTC 2023


[AMD Official Use Only - General]

The series is Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>

Regards,
Stanley
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Candice Li
> Sent: Wednesday, February 22, 2023 12:35 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Li, Candice <Candice.Li at amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: Add ecc info query interface for umc
> v8_10
> 
> Support ecc info query for umc v8_10.
> 
> v2: Simplified by convert_error_address.
> v3: Remove unused variable and invalid checking.
> 
> Signed-off-by: Candice Li <candice.li at amd.com>
> Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 134
> +++++++++++++++++++++++++
>  1 file changed, 134 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> index 293ba39c8a2fda..66158219f791cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> @@ -360,6 +360,138 @@ static bool
> umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev)
>  	return true;
>  }
> 
> +static void umc_v8_10_ecc_info_query_correctable_error_count(struct
> amdgpu_device *adev,
> +				      uint32_t node_inst, uint32_t umc_inst,
> uint32_t ch_inst,
> +				      unsigned long *error_count)
> +{
> +	uint64_t mc_umc_status;
> +	uint32_t eccinfo_table_idx;
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
> +				  adev->umc.channel_inst_num +
> +				  umc_inst * adev->umc.channel_inst_num +
> +				  ch_inst;
> +
> +	/* check the MCUMC_STATUS */
> +	mc_umc_status = ras-
> >umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
> +	if (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> +	    REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
> +		*error_count += 1;
> +	}
> +}
> +
> +static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct
> amdgpu_device *adev,
> +				      uint32_t node_inst, uint32_t umc_inst,
> uint32_t ch_inst,
> +				      unsigned long *error_count)
> +{
> +	uint64_t mc_umc_status;
> +	uint32_t eccinfo_table_idx;
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
> +				  adev->umc.channel_inst_num +
> +				  umc_inst * adev->umc.channel_inst_num +
> +				  ch_inst;
> +
> +	/* check the MCUMC_STATUS */
> +	mc_umc_status = ras-
> >umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
> +	if ((REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
> +	    (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
> +	    REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
> +	    REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
> +	    REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
> +	    REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
> +		*error_count += 1;
> +	}
> +}
> +
> +static void umc_v8_10_ecc_info_query_ras_error_count(struct
> amdgpu_device *adev,
> +					void *ras_error_status)
> +{
> +	struct ras_err_data *err_data = (struct ras_err_data
> *)ras_error_status;
> +
> +	uint32_t node_inst       = 0;
> +	uint32_t umc_inst        = 0;
> +	uint32_t ch_inst         = 0;
> +
> +	/* TODO: driver needs to toggle DF Cstate to ensure
> +	 * safe access of UMC registers. Will add the protection
> +	 */
> +	LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst,
> ch_inst) {
> +		umc_v8_10_ecc_info_query_correctable_error_count(adev,
> +							node_inst, umc_inst,
> ch_inst,
> +							&(err_data-
> >ce_count));
> +
> 	umc_v8_10_ecc_info_query_uncorrectable_error_count(adev,
> +							node_inst, umc_inst,
> ch_inst,
> +							&(err_data-
> >ue_count));
> +	}
> +}
> +
> +static void umc_v8_10_ecc_info_query_error_address(struct
> amdgpu_device *adev,
> +					struct ras_err_data *err_data,
> +					uint32_t ch_inst,
> +					uint32_t umc_inst,
> +					uint32_t node_inst)
> +{
> +	uint32_t eccinfo_table_idx, channel_index;
> +	uint64_t mc_umc_status, err_addr;
> +
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
> +				  adev->umc.channel_inst_num +
> +				  umc_inst * adev->umc.channel_inst_num +
> +				  ch_inst;
> +	channel_index =
> +		adev->umc.channel_idx_tbl[node_inst * adev-
> >umc.umc_inst_num *
> +						  adev-
> >umc.channel_inst_num +
> +						  umc_inst * adev-
> >umc.channel_inst_num +
> +						  ch_inst];
> +
> +	mc_umc_status = ras-
> >umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
> +
> +	if (mc_umc_status == 0)
> +		return;
> +
> +	if (!err_data->err_addr)
> +		return;
> +
> +	/* calculate error address if ue error is detected */
> +	if (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> +	    REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 &&
> +	    (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1)) {
> +
> +		err_addr = ras-
> >umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
> +		err_addr = REG_GET_FIELD(err_addr,
> MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +
> +		umc_v8_10_convert_error_address(adev, err_data,
> err_addr,
> +					ch_inst, umc_inst, node_inst,
> mc_umc_status);
> +	}
> +}
> +
> +static void umc_v8_10_ecc_info_query_ras_error_address(struct
> amdgpu_device *adev,
> +					void *ras_error_status)
> +{
> +	struct ras_err_data *err_data = (struct ras_err_data
> *)ras_error_status;
> +
> +	uint32_t node_inst       = 0;
> +	uint32_t umc_inst        = 0;
> +	uint32_t ch_inst         = 0;
> +
> +	/* TODO: driver needs to toggle DF Cstate to ensure
> +	 * safe access of UMC registers. Will add the protection
> +	 * when firmware interface is ready
> +	 */
> +	LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst,
> ch_inst) {
> +		umc_v8_10_ecc_info_query_error_address(adev,
> +						err_data,
> +						ch_inst,
> +						umc_inst,
> +						node_inst);
> +	}
> +}
> +
>  const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
>  	.query_ras_error_count = umc_v8_10_query_ras_error_count,
>  	.query_ras_error_address = umc_v8_10_query_ras_error_address,
> @@ -371,4 +503,6 @@ struct amdgpu_umc_ras umc_v8_10_ras = {
>  	},
>  	.err_cnt_init = umc_v8_10_err_cnt_init,
>  	.query_ras_poison_mode = umc_v8_10_query_ras_poison_mode,
> +	.ecc_info_query_ras_error_count =
> umc_v8_10_ecc_info_query_ras_error_count,
> +	.ecc_info_query_ras_error_address =
> umc_v8_10_ecc_info_query_ras_error_address,
>  };
> --
> 2.17.1


More information about the amd-gfx mailing list