[PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
Zhou1, Tao
Tao.Zhou1 at amd.com
Mon Apr 22 08:14:02 UTC 2024
[AMD Official Use Only - General]
> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai at amd.com>
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice
> <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang,
> Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 +++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
> #include "umc/umc_12_0_0_sh_mask.h"
> #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT 16
[Tao] we already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this purpose, no need to introduce a new macro.
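For illustration, the suggestion would amount to something like this (just a sketch; it assumes UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL from umc_v12_0.h is an acceptable bound for one retirement batch):

	/* reuse the existing per-channel bad page limit instead of a new macro */
	struct ras_ecc_err *entries[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];

	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);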
> +
> static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
> 					    uint32_t node_inst,
> 					    uint32_t umc_inst,
> @@ -633,6 +635,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
> 	return 0;
> }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> +			struct ras_ecc_err *ecc_err, void *ras_error_status)
> +{
> +	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +	uint32_t i = 0;
> +	int ret = 0;
> +
> +	if (!err_data || !ecc_err)
> +		return -EINVAL;
> +
> +	for (i = 0; i < ecc_err->err_pages.count; i++) {
> +		ret = amdgpu_umc_fill_error_record(err_data,
> +				ecc_err->addr,
> +				ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
> +				MCA_IPID_2_UMC_CH(ecc_err->ipid),
> +				MCA_IPID_2_UMC_INST(ecc_err->ipid));
> +		if (ret)
> +			break;
> +	}
> +
> +	err_data->de_count++;
> +
> +	return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> +			void *ras_error_status)
> +{
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> +	struct radix_tree_root *ecc_tree;
> +	int new_detected, ret, i;
> +
> +	ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> +	mutex_lock(&con->umc_ecc_log.lock);
> +	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> +			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
> +	for (i = 0; i < new_detected; i++) {
> +		if (!entries[i])
> +			continue;
> +
> +		ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
> +		if (ret) {
> +			dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
> +			break;
> +		}
> +		radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
> +	}
> +	mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
> struct amdgpu_umc_ras umc_v12_0_ras = {
> 	.ras_block = {
> 		.hw_ops = &umc_v12_0_ras_hw_ops,
> @@ -640,8 +694,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
> 	},
> 	.err_cnt_init = umc_v12_0_err_cnt_init,
> 	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> -	.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
> -	.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
> +	.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
> 	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
> 	.update_ecc_status = umc_v12_0_update_ecc_status,
> };
> --
> 2.34.1
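As background on the lookup/clear pattern in umc_v12_0_query_ras_ecc_err_addr: entries in con->umc_ecc_log.de_page_tree are tagged when first logged, retirement consumes only tagged entries, and clearing the tag ensures each entry is consumed once. A minimal kernel-style sketch of that pattern (the demo_* names and NEW_ENTRY_TAG are illustrative, not from the driver):

	#include <linux/radix-tree.h>
	#include <linux/kernel.h>

	#define NEW_ENTRY_TAG	0	/* illustrative; radix tree tags are 0..2 */

	struct demo_entry {
		unsigned long index;	/* key kept in the entry, like hash_index above */
	};

	static RADIX_TREE(demo_tree, GFP_KERNEL);

	/* Producer: insert an entry and mark it as not yet consumed. */
	static int demo_log(struct demo_entry *e)
	{
		int ret = radix_tree_insert(&demo_tree, e->index, e);

		if (!ret)
			radix_tree_tag_set(&demo_tree, e->index, NEW_ENTRY_TAG);
		return ret;
	}

	/* Consumer: fetch one batch of tagged entries, clear the tag on each. */
	static void demo_consume(void)
	{
		struct demo_entry *batch[16];
		unsigned int n, i;

		n = radix_tree_gang_lookup_tag(&demo_tree, (void **)batch,
					       0, ARRAY_SIZE(batch), NEW_ENTRY_TAG);
		for (i = 0; i < n; i++)
			radix_tree_tag_clear(&demo_tree, batch[i]->index,
					     NEW_ENTRY_TAG);
	}

Because gang lookup returns entry pointers rather than indices, the entry must carry its own key for the tag_clear step, which is why the patch stores hash_index inside struct ras_ecc_err.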