[PATCH V2 5/5] drm/amdgpu:Support retiring multiple MCA error address pages

YiPeng Chai YiPeng.Chai at amd.com
Thu Jan 18 06:42:57 UTC 2024


Support retiring multiple MCA error address pages in
one in-band query for umc v12_0.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 ++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 66 +++++++++++++------------
 3 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 61a02dbac087..879e1e59ac76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3909,8 +3909,7 @@ static int ras_err_info_cmp(void *priv, struct list_head *a, struct list_head *b
 }
 
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
-				struct amdgpu_smuio_mcm_config_info *mcm_info,
-				struct ras_err_addr *err_addr)
+				struct amdgpu_smuio_mcm_config_info *mcm_info)
 {
 	struct ras_err_node *err_node;
 
@@ -3922,10 +3921,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 	if (!err_node)
 		return NULL;
 
-	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+	INIT_LIST_HEAD(&err_node->err_info.err_addr_list);
 
-	if (err_addr)
-		memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
+	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
 
 	err_data->err_list_count++;
 	list_add_tail(&err_node->node, &err_data->err_node_list);
@@ -3934,6 +3932,29 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 	return &err_node->err_info;
 }
 
+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
+{
+	struct ras_err_addr *mca_err_addr;
+
+	mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
+	if (!mca_err_addr)
+		return;
+
+	INIT_LIST_HEAD(&mca_err_addr->node);
+
+	mca_err_addr->err_status = err_addr->err_status;
+	mca_err_addr->err_ipid = err_addr->err_ipid;
+	mca_err_addr->err_addr = err_addr->err_addr;
+
+	list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);
+}
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
+{
+	list_del(&mca_err_addr->node);
+	kfree(mca_err_addr);
+}
+
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 		struct amdgpu_smuio_mcm_config_info *mcm_info,
 		struct ras_err_addr *err_addr, u64 count)
@@ -3946,10 +3967,13 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
 
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
 	if (!err_info)
 		return -EINVAL;
 
+	if (err_addr && err_addr->err_status)
+		amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
 	err_info->ue_count += count;
 	err_data->ue_count += count;
 
@@ -3968,7 +3992,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
 
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
 	if (!err_info)
 		return -EINVAL;
 
@@ -3990,10 +4014,13 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
 
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
 	if (!err_info)
 		return -EINVAL;
 
+	if (err_addr && err_addr->err_status)
+		amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
 	err_info->de_count += count;
 	err_data->de_count += count;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 9c3df9985fad..a25aea6ae230 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -474,6 +474,7 @@ struct ras_fs_data {
 };
 
 struct ras_err_addr {
+	struct list_head node;
 	uint64_t err_status;
 	uint64_t err_ipid;
 	uint64_t err_addr;
@@ -484,7 +485,7 @@ struct ras_err_info {
 	u64 ce_count;
 	u64 ue_count;
 	u64 de_count;
-	struct ras_err_addr err_addr;
+	struct list_head err_addr_list;
 };
 
 struct ras_err_node {
@@ -856,4 +857,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
 				  struct aca_handle *handle, char *buf, void *data);
 
+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
+			struct ras_err_addr *err_addr);
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
+		struct ras_err_addr *mca_err_addr);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 1e8e97d72f1e..f9dc1855ac4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -385,42 +385,46 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
 {
 	struct ras_err_node *err_node;
 	uint64_t mc_umc_status;
+	struct ras_err_info *err_info;
+	struct ras_err_addr *mca_err_addr, *tmp;
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 
 	for_each_ras_error(err_node, err_data) {
-		mc_umc_status = err_node->err_info.err_addr.err_status;
-		if (!mc_umc_status)
+		err_info = &err_node->err_info;
+		if (list_empty(&err_info->err_addr_list))
 			continue;
 
-		if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
-		    umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
-			uint64_t mca_addr, err_addr, mca_ipid;
-			uint32_t InstanceIdLo;
-			struct amdgpu_smuio_mcm_config_info *mcm_info;
-
-			mcm_info = &err_node->err_info.mcm_info;
-			mca_addr = err_node->err_info.err_addr.err_addr;
-			mca_ipid = err_node->err_info.err_addr.err_ipid;
-
-			err_addr =  REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
-			InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
-
-			dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
-				mca_ipid,
-				mcm_info->die_id,
-				MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-				MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-				err_addr);
-
-			umc_v12_0_convert_error_address(adev,
-				err_data, err_addr,
-				MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-				MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-				mcm_info->die_id);
-
-			/* Clear umc error address content */
-			memset(&err_node->err_info.err_addr,
-				0, sizeof(err_node->err_info.err_addr));
+		list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
+			mc_umc_status = mca_err_addr->err_status;
+			if (mc_umc_status &&
+				(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
+				 umc_v12_0_is_deferred_error(adev, mc_umc_status))) {
+				uint64_t mca_addr, err_addr, mca_ipid;
+				uint32_t InstanceIdLo;
+
+				mca_addr = mca_err_addr->err_addr;
+				mca_ipid = mca_err_addr->err_ipid;
+
+				err_addr = REG_GET_FIELD(mca_addr,
+							MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+				InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
+
+				dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
+					mca_ipid,
+					err_info->mcm_info.die_id,
+					MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+					MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+					err_addr);
+
+				umc_v12_0_convert_error_address(adev,
+					err_data, err_addr,
+					MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+					MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+					err_info->mcm_info.die_id);
+			}
+
+			/* Delete error address node from list and free memory */
+			amdgpu_ras_del_mca_err_addr(err_info, mca_err_addr);
 		}
 	}
 }
-- 
2.34.1



More information about the amd-gfx mailing list