[PATCH V2 5/5] drm/amdgpu:Support retiring multiple MCA error address pages

Zhang, Hawking Hawking.Zhang at amd.com
Thu Jan 18 08:18:36 UTC 2024


[AMD Official Use Only - General]

Series is

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: Chai, Thomas <YiPeng.Chai at amd.com>
Sent: Thursday, January 18, 2024 14:43
To: amd-gfx at lists.freedesktop.org
Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Subject: [PATCH V2 5/5] drm/amdgpu:Support retiring multiple MCA error address pages

Support retiring multiple MCA error address pages in one in-band query for umc v12_0.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 ++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 66 +++++++++++++------------
 3 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 61a02dbac087..879e1e59ac76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3909,8 +3909,7 @@ static int ras_err_info_cmp(void *priv, struct list_head *a, struct list_head *b
 }

 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
-                               struct amdgpu_smuio_mcm_config_info *mcm_info,
-                               struct ras_err_addr *err_addr)
+                               struct amdgpu_smuio_mcm_config_info *mcm_info)
 {
        struct ras_err_node *err_node;

@@ -3922,10 +3921,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
        if (!err_node)
                return NULL;

-       memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+       INIT_LIST_HEAD(&err_node->err_info.err_addr_list);

-       if (err_addr)
-               memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
+       memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));

        err_data->err_list_count++;
        list_add_tail(&err_node->node, &err_data->err_node_list);
@@ -3934,6 +3932,29 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
        return &err_node->err_info;
 }

+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
+{
+       struct ras_err_addr *mca_err_addr;
+
+       mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
+       if (!mca_err_addr)
+               return;
+
+       INIT_LIST_HEAD(&mca_err_addr->node);
+
+       mca_err_addr->err_status = err_addr->err_status;
+       mca_err_addr->err_ipid = err_addr->err_ipid;
+       mca_err_addr->err_addr = err_addr->err_addr;
+
+       list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);
+}
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
+{
+       list_del(&mca_err_addr->node);
+       kfree(mca_err_addr);
+}
+
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
                struct amdgpu_smuio_mcm_config_info *mcm_info,
                struct ras_err_addr *err_addr, u64 count)
@@ -3946,10 +3967,13 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
        if (!count)
                return 0;

-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
        if (!err_info)
                return -EINVAL;

+       if (err_addr && err_addr->err_status)
+               amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
        err_info->ue_count += count;
        err_data->ue_count += count;

@@ -3968,7 +3992,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
        if (!count)
                return 0;

-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
        if (!err_info)
                return -EINVAL;

@@ -3990,10 +4014,13 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
        if (!count)
                return 0;

-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
        if (!err_info)
                return -EINVAL;

+       if (err_addr && err_addr->err_status)
+               amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
        err_info->de_count += count;
        err_data->de_count += count;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 9c3df9985fad..a25aea6ae230 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -474,6 +474,7 @@ struct ras_fs_data {
 };

 struct ras_err_addr {
+       struct list_head node;
        uint64_t err_status;
        uint64_t err_ipid;
        uint64_t err_addr;
@@ -484,7 +485,7 @@ struct ras_err_info {
        u64 ce_count;
        u64 ue_count;
        u64 de_count;
-       struct ras_err_addr err_addr;
+       struct list_head err_addr_list;
 };

 struct ras_err_node {
@@ -856,4 +857,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
                                  struct aca_handle *handle, char *buf, void *data);

+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
+                       struct ras_err_addr *err_addr);
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
+               struct ras_err_addr *mca_err_addr);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 1e8e97d72f1e..f9dc1855ac4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -385,42 +385,46 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
 {
        struct ras_err_node *err_node;
        uint64_t mc_umc_status;
+       struct ras_err_info *err_info;
+       struct ras_err_addr *mca_err_addr, *tmp;
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

        for_each_ras_error(err_node, err_data) {
-               mc_umc_status = err_node->err_info.err_addr.err_status;
-               if (!mc_umc_status)
+               err_info = &err_node->err_info;
+               if (list_empty(&err_info->err_addr_list))
                        continue;

-               if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
-                   umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
-                       uint64_t mca_addr, err_addr, mca_ipid;
-                       uint32_t InstanceIdLo;
-                       struct amdgpu_smuio_mcm_config_info *mcm_info;
-
-                       mcm_info = &err_node->err_info.mcm_info;
-                       mca_addr = err_node->err_info.err_addr.err_addr;
-                       mca_ipid = err_node->err_info.err_addr.err_ipid;
-
-                       err_addr =  REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
-                       InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
-
-                       dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
-                               mca_ipid,
-                               mcm_info->die_id,
-                               MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-                               MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-                               err_addr);
-
-                       umc_v12_0_convert_error_address(adev,
-                               err_data, err_addr,
-                               MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-                               MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-                               mcm_info->die_id);
-
-                       /* Clear umc error address content */
-                       memset(&err_node->err_info.err_addr,
-                               0, sizeof(err_node->err_info.err_addr));
+               list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
+                       mc_umc_status = mca_err_addr->err_status;
+                       if (mc_umc_status &&
+                               (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
+                                umc_v12_0_is_deferred_error(adev, mc_umc_status))) {
+                               uint64_t mca_addr, err_addr, mca_ipid;
+                               uint32_t InstanceIdLo;
+
+                               mca_addr = mca_err_addr->err_addr;
+                               mca_ipid = mca_err_addr->err_ipid;
+
+                               err_addr = REG_GET_FIELD(mca_addr,
+                                                       MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+                               InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
+
+                               dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
+                                       mca_ipid,
+                                       err_info->mcm_info.die_id,
+                                       MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+                                       MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+                                       err_addr);
+
+                               umc_v12_0_convert_error_address(adev,
+                                       err_data, err_addr,
+                                       MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+                                       MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+                                       err_info->mcm_info.die_id);
+                       }
+
+                       /* Delete error address node from list and free memory */
+                       amdgpu_ras_del_mca_err_addr(err_info, mca_err_addr);
                }
        }
 }
--
2.34.1



More information about the amd-gfx mailing list