[PATCH 07/10] drm/amdgpu: add RAS error count query for gfx_v9_4_3

Fri May 12 21:43:48 UTC 2023

From: Tao Zhou <tao.zhou1 at amd.com>

Query GFX RAS ce/ue count.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 56 +++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 188b4d9a2cbb..bfd041ba51d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3724,6 +3724,55 @@ static const struct soc15_reg_entry gfx_v9_4_3_ea_err_status_regs = {
 	SOC15_REG_ENTRY(GC, 0, regGCEA_ERR_STATUS), 0, 1, 16
 };
 
+static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
+					void *ras_error_status, int xcc_id)
+{
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+	unsigned long ce_count = 0, ue_count = 0;
+	uint32_t i, j, k;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	for (i = 0; i < ARRAY_SIZE(gfx_v9_4_3_ce_reg_list); i++) {
+		for (j = 0; j < gfx_v9_4_3_ce_reg_list[i].se_num; j++) {
+			for (k = 0; k < gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst; k++) {
+				/* no need to select if instance number is 1 */
+				if (gfx_v9_4_3_ce_reg_list[i].se_num > 1 ||
+				    gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst > 1)
+					gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id);
+
+				amdgpu_ras_inst_query_ras_error_count(adev,
+					&(gfx_v9_4_3_ce_reg_list[i].reg_entry),
+					1,
+					gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].mem_id_ent,
+					gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].size,
+					GET_INST(GC, xcc_id),
+					AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE,
+					&ce_count);
+
+				amdgpu_ras_inst_query_ras_error_count(adev,
+					&(gfx_v9_4_3_ue_reg_list[i].reg_entry),
+					1,
+					gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent,
+					gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size,
+					GET_INST(GC, xcc_id),
+					AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+					&ue_count);
+			}
+		}
+	}
+
+	gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
+			xcc_id);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	/* the caller should make sure initialize value of
+	 * err_data->ue_count and err_data->ce_count
+	 */
+	err_data->ce_count += ce_count;
+	err_data->ue_count += ue_count;
+}
+
 static void gfx_v9_4_3_inst_query_ea_err_status(struct amdgpu_device *adev,
 					int xcc_id)
 {
@@ -3826,6 +3875,13 @@ static void gfx_v9_4_3_inst_reset_ras_err_status(struct amdgpu_device *adev,
 	gfx_v9_4_3_inst_reset_ea_err_status(adev, xcc_id);
 }
 
+static void gfx_v9_4_3_query_ras_error_count(struct amdgpu_device *adev,
+					void *ras_error_status)
+{
+	amdgpu_gfx_ras_error_func(adev, ras_error_status,
+			gfx_v9_4_3_inst_query_ras_err_count);
+}
+
 static void gfx_v9_4_3_query_ras_error_status(struct amdgpu_device *adev)
 {
 	amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_query_ras_err_status);
-- 
2.40.1