[PATCH 04/11] drm/amdgpu: implement umc query error count callback

Hawking Zhang Hawking.Zhang at amd.com
Fri Apr 2 06:43:38 UTC 2021


umc query_ras_error_count will be invoked to query
umc correctable and uncorrectable error. It will
reset the umc ras error counter after the query.

Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
Acked-by: Alex Deucher <alexander.deucher at amd.com>
Reviewed-by: John Clements <John.Clements at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 90 +++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.h |  2 +
 2 files changed, 92 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 05fec10b1ed9..fe666ac93b54 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -27,6 +27,13 @@
 #include "umc/umc_6_7_0_offset.h"
 #include "umc/umc_6_7_0_sh_mask.h"
 
+static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
+					      uint32_t umc_inst,
+					      uint32_t ch_inst)
+{
+	return adev->umc.channel_offs * ch_inst + UMC_V6_7_INST_DIST * umc_inst;
+}
+
 static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
 						   uint32_t umc_reg_offset,
 						   unsigned long *error_count)
@@ -94,6 +101,89 @@ static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev
 		*error_count += 1;
 }
 
+static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
+						   uint32_t umc_reg_offset)
+{
+	uint32_t ecc_err_cnt_addr;
+	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
+
+	ecc_err_cnt_sel_addr =
+		SOC15_REG_OFFSET(UMC, 0,
+				regUMCCH0_0_EccErrCntSel);
+	ecc_err_cnt_addr =
+		SOC15_REG_OFFSET(UMC, 0,
+				regUMCCH0_0_EccErrCnt);
+
+	/* select the lower chip */
+	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
+				       umc_reg_offset) * 4);
+	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
+					UMCCH0_0_EccErrCntSel,
+					EccErrCntCsSel, 0);
+	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
+			ecc_err_cnt_sel);
+
+	/* clear lower chip error count */
+	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
+			UMC_V6_7_CE_CNT_INIT);
+
+	/* select the higher chip */
+	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
+					umc_reg_offset) * 4);
+	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
+					UMCCH0_0_EccErrCntSel,
+					EccErrCntCsSel, 1);
+	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
+			ecc_err_cnt_sel);
+
+	/* clear higher chip error count */
+	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
+			UMC_V6_7_CE_CNT_INIT);
+}
+
+static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
+{
+	uint32_t umc_inst        = 0;
+	uint32_t ch_inst         = 0;
+	uint32_t umc_reg_offset  = 0;
+
+	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
+		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
+							 umc_inst,
+							 ch_inst);
+
+		umc_v6_7_reset_error_count_per_channel(adev,
+						       umc_reg_offset);
+	}
+}
+
+static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
+					   void *ras_error_status)
+{
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+
+	uint32_t umc_inst        = 0;
+	uint32_t ch_inst         = 0;
+	uint32_t umc_reg_offset  = 0;
+
+	/*TODO: driver needs to toggle DF Cstate to ensure
+	 * safe access of UMC registers. Will add the protection */
+	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
+		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
+							 umc_inst,
+							 ch_inst);
+		umc_v6_7_query_correctable_error_count(adev,
+						       umc_reg_offset,
+						       &(err_data->ce_count));
+		umc_v6_7_querry_uncorrectable_error_count(adev,
+							  umc_reg_offset,
+							  &(err_data->ue_count));
+	}
+
+	umc_v6_7_reset_error_count(adev);
+}
+
 const struct amdgpu_umc_funcs umc_v6_7_funcs = {
 	.ras_late_init = amdgpu_umc_ras_late_init,
+	.query_ras_error_count = umc_v6_7_query_ras_error_count,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
index 6b881226b4f3..e59dbdb6ef9b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
@@ -30,6 +30,8 @@
 /* umc ce count initial value */
 #define UMC_V6_7_CE_CNT_INIT	(UMC_V6_7_CE_CNT_MAX - UMC_V6_7_CE_INT_THRESHOLD)
 
+#define UMC_V6_7_INST_DIST	0x40000
+
 extern const struct amdgpu_umc_funcs umc_v6_7_funcs;
 
 #endif
-- 
2.17.1



More information about the amd-gfx mailing list