[PATCH 2/2] drm/amdgpu: Add ecc info query interface for umc v8_10
Yang, Stanley
Stanley.Yang at amd.com
Wed Feb 22 07:52:34 UTC 2023
[AMD Official Use Only - General]
The series is Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>
Regards,
Stanley
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Candice Li
> Sent: Wednesday, February 22, 2023 12:35 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Li, Candice <Candice.Li at amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: Add ecc info query interface for umc
> v8_10
>
> Support ecc info query for umc v8_10.
>
> v2: Simplified by convert_error_address.
> v3: Remove unused variable and invalid checking.
>
> Signed-off-by: Candice Li <candice.li at amd.com>
> Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 134
> +++++++++++++++++++++++++
> 1 file changed, 134 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> index 293ba39c8a2fda..66158219f791cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> @@ -360,6 +360,138 @@ static bool
> umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev)
> return true;
> }
>
> +static void umc_v8_10_ecc_info_query_correctable_error_count(struct
> amdgpu_device *adev,
> + uint32_t node_inst, uint32_t umc_inst,
> uint32_t ch_inst,
> + unsigned long *error_count)
> +{
> + uint64_t mc_umc_status;
> + uint32_t eccinfo_table_idx;
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
> + adev->umc.channel_inst_num +
> + umc_inst * adev->umc.channel_inst_num +
> + ch_inst;
> +
> + /* check the MCUMC_STATUS */
> + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
> + if (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
> + *error_count += 1;
> + }
> +}
> +
> +static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct
> amdgpu_device *adev,
> + uint32_t node_inst, uint32_t umc_inst,
> uint32_t ch_inst,
> + unsigned long *error_count)
> +{
> + uint64_t mc_umc_status;
> + uint32_t eccinfo_table_idx;
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
> + adev->umc.channel_inst_num +
> + umc_inst * adev->umc.channel_inst_num +
> + ch_inst;
> +
> + /* check the MCUMC_STATUS */
> + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
> + if ((REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
> + (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
> + *error_count += 1;
> + }
> +}
> +
> +static void umc_v8_10_ecc_info_query_ras_error_count(struct
> amdgpu_device *adev,
> + void *ras_error_status)
> +{
> + struct ras_err_data *err_data = (struct ras_err_data
> *)ras_error_status;
> +
> + uint32_t node_inst = 0;
> + uint32_t umc_inst = 0;
> + uint32_t ch_inst = 0;
> +
> + /* TODO: driver needs to toggle DF Cstate to ensure safe access of
> + * UMC registers. Will add the protection when firmware interface is ready
> + */
> + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst,
> ch_inst) {
> + umc_v8_10_ecc_info_query_correctable_error_count(adev,
> + node_inst, umc_inst,
> ch_inst,
> + &(err_data->ce_count));
> +
> umc_v8_10_ecc_info_query_uncorrectable_error_count(adev,
> + node_inst, umc_inst,
> ch_inst,
> + &(err_data->ue_count));
> + }
> +}
> +
> +static void umc_v8_10_ecc_info_query_error_address(struct
> amdgpu_device *adev,
> + struct ras_err_data *err_data,
> + uint32_t ch_inst,
> + uint32_t umc_inst,
> + uint32_t node_inst)
> +{
> + uint32_t eccinfo_table_idx, channel_index;
> + uint64_t mc_umc_status, err_addr;
> +
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
> + adev->umc.channel_inst_num +
> + umc_inst * adev->umc.channel_inst_num +
> + ch_inst;
> + channel_index =
> + adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
> + adev->umc.channel_inst_num +
> + umc_inst * adev->umc.channel_inst_num +
> + ch_inst];
> +
> + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
> +
> + if (mc_umc_status == 0)
> + return;
> +
> + if (!err_data->err_addr)
> + return;
> +
> + /* calculate error address if ue error is detected */
> + if (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 &&
> + (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1)) {
> +
> + err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
> + err_addr = REG_GET_FIELD(err_addr,
> MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +
> + umc_v8_10_convert_error_address(adev, err_data,
> err_addr,
> + ch_inst, umc_inst, node_inst,
> mc_umc_status);
> + }
> +}
> +
> +static void umc_v8_10_ecc_info_query_ras_error_address(struct
> amdgpu_device *adev,
> + void *ras_error_status)
> +{
> + struct ras_err_data *err_data = (struct ras_err_data
> *)ras_error_status;
> +
> + uint32_t node_inst = 0;
> + uint32_t umc_inst = 0;
> + uint32_t ch_inst = 0;
> +
> + /* TODO: driver needs to toggle DF Cstate to ensure
> + * safe access of UMC registers. Will add the protection
> + * when firmware interface is ready
> + */
> + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst,
> ch_inst) {
> + umc_v8_10_ecc_info_query_error_address(adev,
> + err_data,
> + ch_inst,
> + umc_inst,
> + node_inst);
> + }
> +}
> +
> const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
> .query_ras_error_count = umc_v8_10_query_ras_error_count,
> .query_ras_error_address = umc_v8_10_query_ras_error_address,
> @@ -371,4 +503,6 @@ struct amdgpu_umc_ras umc_v8_10_ras = {
> },
> .err_cnt_init = umc_v8_10_err_cnt_init,
> .query_ras_poison_mode = umc_v8_10_query_ras_poison_mode,
> + .ecc_info_query_ras_error_count =
> umc_v8_10_ecc_info_query_ras_error_count,
> + .ecc_info_query_ras_error_address =
> umc_v8_10_ecc_info_query_ras_error_address,
> };
> --
> 2.17.1
More information about the amd-gfx
mailing list