[PATCH 2/3] drm/amdgpu: optimize logging deferred error info
Chai, Thomas
YiPeng.Chai at amd.com
Thu Jul 18 03:35:13 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
-----------------
Best Regards,
Thomas
-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Thursday, July 18, 2024 10:57 AM
To: Chai, Thomas <YiPeng.Chai at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
Subject: RE: [PATCH 2/3] drm/amdgpu: optimize logging deferred error info
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai at amd.com>
> Sent: Wednesday, July 17, 2024 4:16 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang,
> Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley
> <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Subject: [PATCH 2/3] drm/amdgpu: optimize logging deferred error info
>
> 1. Use pa_pfn as the radix-tree key index to log
> deferred error info.
> 2. Use local array to store expanded bad pages.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++----
> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 65 ++++++++++++-------------
> drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 5 ++
> 4 files changed, 40 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index dcf1f3dbb5c4..f607ff620015 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -476,10 +476,10 @@ struct ras_err_pages { };
>
> struct ras_ecc_err {
> - u64 hash_index;
> uint64_t status;
> uint64_t ipid;
> uint64_t addr;
> + uint64_t pa_pfn;
> struct ras_err_pages err_pages;
> };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 5d08c03fe543..2fc90799bf8d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -523,18 +523,10 @@ int amdgpu_umc_logs_ecc_err(struct amdgpu_device
> *adev,
> ecc_log = &con->umc_ecc_log;
>
> mutex_lock(&ecc_log->lock);
> - ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
> - if (!ret) {
> - struct ras_err_pages *err_pages = &ecc_err->err_pages;
> - int i;
> -
> - /* Reserve memory */
> - for (i = 0; i < err_pages->count; i++)
> - amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
> -
> + ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err);
> + if (!ret)
> radix_tree_tag_set(ecc_tree,
> - ecc_err->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> - }
> + ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
> mutex_unlock(&ecc_log->lock);
>
> return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index eca5ac6a0532..f2235c9ead29 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -524,9 +524,9 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> uint16_t hwid, mcatype;
> uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
> - uint64_t err_addr, hash_val = 0, pa_addr = 0;
> + uint64_t err_addr, pa_addr = 0;
> struct ras_ecc_err *ecc_err;
> - int count, ret;
> + int count, ret, i;
>
> hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
> mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -559,39
> +559,18 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device
> *adev,
> if (ret)
> return ret;
>
> - memset(page_pfn, 0, sizeof(page_pfn));
> - count = umc_v12_0_expand_addr_to_bad_pages(adev,
> - pa_addr,
> - page_pfn, ARRAY_SIZE(page_pfn));
> - if (count <= 0) {
> - dev_warn(adev->dev, "Fail to convert error address!
> count:%d\n", count);
> - return 0;
> - }
> -
> - ret = amdgpu_umc_build_pages_hash(adev,
> - page_pfn, count, &hash_val);
> - if (ret) {
> - dev_err(adev->dev, "Fail to build error pages hash\n");
> - return ret;
> - }
> -
> ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
> if (!ecc_err)
> return -ENOMEM;
>
> - ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn),
> GFP_KERNEL);
> - if (!ecc_err->err_pages.pfn) {
> - kfree(ecc_err);
> - return -ENOMEM;
> - }
> -
> - memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err-
> >err_pages.pfn));
> - ecc_err->err_pages.count = count;
> -
> - ecc_err->hash_index = hash_val;
> ecc_err->status = status;
> ecc_err->ipid = ipid;
> ecc_err->addr = addr;
> + ecc_err->pa_pfn = UMC_V12_ADDR_MASK_BAD_COLS(pa_addr) >>
> +AMDGPU_GPU_PAGE_SHIFT;
> +
> + /* If converted pa_pfn is 0, use pa xor pfn. */
> + if (!ecc_err->pa_pfn)
> + ecc_err->pa_pfn = BIT_ULL(UMC_V12_0_PA_R13_BIT) >>
> +AMDGPU_GPU_PAGE_SHIFT;
>[Tao] why address 0 should be avoided?
[Thomas] When address is 0, it means the data has just been loaded from EEPROM, it should calculate pa_pfn.
>
> ret = amdgpu_umc_logs_ecc_err(adev, &con-
> >umc_ecc_log.de_page_tree, ecc_err);
> if (ret) {
> @@ -600,13 +579,25 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
> else
> dev_err(adev->dev, "Fail to log ecc error!
> ret:%d\n", ret);
>
> - kfree(ecc_err->err_pages.pfn);
> kfree(ecc_err);
> return ret;
> }
>
> con->umc_ecc_log.de_queried_count++;
>
> + memset(page_pfn, 0, sizeof(page_pfn));
> + count = umc_v12_0_expand_addr_to_bad_pages(adev,
> + pa_addr,
> + page_pfn, ARRAY_SIZE(page_pfn));
> + if (count <= 0) {
> + dev_warn(adev->dev, "Fail to convert error address!
> count:%d\n", count);
> + return 0;
> + }
> +
> + /* Reserve memory */
> + for (i = 0; i < count; i++)
> + amdgpu_ras_reserve_page(adev, page_pfn[i]);
> +
> /* The problem case is as follows:
> * 1. GPU A triggers a gpu ras reset, and GPU A drives
> * GPU B to also perform a gpu ras reset.
> @@ -631,16 +622,21 @@ static int umc_v12_0_fill_error_record(struct
> amdgpu_device *adev,
> struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> - uint32_t i = 0;
> - int ret = 0;
> + uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
> + int ret, i, count;
>
> if (!err_data || !ecc_err)
> return -EINVAL;
>
> - for (i = 0; i < ecc_err->err_pages.count; i++) {
> + memset(page_pfn, 0, sizeof(page_pfn));
> + count = umc_v12_0_expand_addr_to_bad_pages(adev,
> + ecc_err->pa_pfn <<
> AMDGPU_GPU_PAGE_SHIFT,
> + page_pfn, ARRAY_SIZE(page_pfn));
> +
> + for (i = 0; i < count; i++) {
> ret = amdgpu_umc_fill_error_record(err_data,
> ecc_err->addr,
> - ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> + page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
> MCA_IPID_2_UMC_CH(ecc_err->ipid),
> MCA_IPID_2_UMC_INST(ecc_err->ipid));
> if (ret)
> @@ -674,7 +670,8 @@ static void
> umc_v12_0_query_ras_ecc_err_addr(struct
> amdgpu_device *adev,
> dev_err(adev->dev, "Fail to fill umc error
> record, ret:%d\n", ret);
> break;
> }
> - radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> + radix_tree_tag_clear(ecc_tree,
> + entries[i]->pa_pfn,
> UMC_ECC_NEW_DETECTED_TAG);
> }
> mutex_unlock(&con->umc_ecc_log.lock);
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> index b4974793850b..be5598d76c1d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> @@ -81,6 +81,11 @@
> (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
> (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))
>
> +#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \
> + ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \
> + (0x1ULL << UMC_V12_0_PA_C4_BIT) | \
> + (0x1ULL << UMC_V12_0_PA_R13_BIT)))
> +
> bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t
> mc_umc_status); bool umc_v12_0_is_uncorrectable_error(struct
> amdgpu_device *adev, uint64_t mc_umc_status); bool
> umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t
> mc_umc_status);
> --
> 2.34.1
More information about the amd-gfx
mailing list