[PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

Mon Apr 22 09:33:18 UTC 2024

[AMD Official Use Only - General]

update

-----------------
Best Regards,
Thomas

-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Chai, Thomas
Sent: Monday, April 22, 2024 5:21 PM
To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Monday, April 22, 2024 4:14 PM
To: Chai, Thomas <YiPeng.Chai at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai at amd.com>
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice
> <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>;
> Yang, Stanley <Stanley.Yang at amd.com>; Chai, Thomas
> <YiPeng.Chai at amd.com>
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +++++++++++++++++++++++++-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

> [Tao] we already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for the
> purposes

[Thomas]  This is from eeprom view to define the maximum number of eeprom table entries written each time.
The actual data items that need to be written to eeprom may be less than this value(e.g.  later, write the data items before address conversion), or may be more than this value.
If the value is less than this value, it will be written according to the actual value, and if it is more than this value, it will be written in multiple times.

> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>                                           uint32_t node_inst,
>                                           uint32_t umc_inst, @@ -633,6
> +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>       return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> +                             struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> +     struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> +     uint32_t i = 0;
> +     int ret = 0;
> +
> +     if (!err_data || !ecc_err)
> +             return -EINVAL;
> +
> +     for (i = 0; i < ecc_err->err_pages.count; i++) {
> +             ret = amdgpu_umc_fill_error_record(err_data,
> +                             ecc_err->addr,
> +                             ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> +                             MCA_IPID_2_UMC_CH(ecc_err->ipid),
> +                             MCA_IPID_2_UMC_INST(ecc_err->ipid));
> +             if (ret)
> +                     break;
> +     }
> +
> +     err_data->de_count++;
> +
> +     return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> +                                     void *ras_error_status) {
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> +     struct radix_tree_root *ecc_tree;
> +     int new_detected, ret, i;
> +
> +     ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> +     mutex_lock(&con->umc_ecc_log.lock);
> +     new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> +                     0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> +     for (i = 0; i < new_detected; i++) {
> +             if (!entries[i])
> +                     continue;
> +
> +             ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> +             if (ret) {
> +                     dev_err(adev->dev, "Fail to fill umc error
> + record,
> ret:%d\n", ret);
> +                     break;
> +             }
> +             radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> +     }
> +     mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>       .ras_block = {
>               .hw_ops = &umc_v12_0_ras_hw_ops, @@ -640,8 +694,7 @@
> struct amdgpu_umc_ras umc_v12_0_ras = {
>       },
>       .err_cnt_init = umc_v12_0_err_cnt_init,
>       .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> -     .ecc_info_query_ras_error_count =
> umc_v12_0_ecc_info_query_ras_error_count,
> -     .ecc_info_query_ras_error_address =
> umc_v12_0_ecc_info_query_ras_error_address,
> +     .ecc_info_query_ras_error_address =
> umc_v12_0_query_ras_ecc_err_addr,
>       .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
>       .update_ecc_status = umc_v12_0_update_ecc_status,  };
> --
> 2.34.1