[Intel-xe] [PATCH 08/11] drm/xe: Support SOC NONFATAL error handling for PVC.

Himal Prasad Ghimiray himal.prasad.ghimiray at intel.com
Wed Sep 27 11:46:24 UTC 2023


Report SOC nonfatal hardware errors and update the corresponding
counters, which are incremented in case of an error.

Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
 drivers/gpu/drm/xe/xe_hw_error.c | 118 ++++++++++++++++++++++++++-----
 drivers/gpu/drm/xe/xe_hw_error.h |  42 +++++++++++
 2 files changed, 143 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index fa05bad5e684..aeece9e705dc 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -276,6 +276,67 @@ static const struct err_msg_cntr_pair soc_mstr_lcl_err_reg_fatal[] = {
 	[14 ... 31] = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
 };
 
+static const struct err_msg_cntr_pair soc_mstr_glbl_err_reg_nonfatal[] = {
+	[0]         = {"MASTER LOCAL Reported",			XE_SOC_HW_ERR_MSTR_LCL_NONFATAL},
+	[1]         = {"SLAVE GLOBAL Reported",			XE_SOC_HW_ERR_SLAVE_GLBL_NONFATAL},
+	[2]         = {"HBM SS0: Channel0",			XE_SOC_HW_ERR_HBM0_CHNL0_NONFATAL},
+	[3]         = {"HBM SS0: Channel1",			XE_SOC_HW_ERR_HBM0_CHNL1_NONFATAL},
+	[4]         = {"HBM SS0: Channel2",			XE_SOC_HW_ERR_HBM0_CHNL2_NONFATAL},
+	[5]         = {"HBM SS0: Channel3",			XE_SOC_HW_ERR_HBM0_CHNL3_NONFATAL},
+	[6]         = {"HBM SS0: Channel4",			XE_SOC_HW_ERR_HBM0_CHNL4_NONFATAL},
+	[7]         = {"HBM SS0: Channel5",			XE_SOC_HW_ERR_HBM0_CHNL5_NONFATAL},
+	[8]         = {"HBM SS0: Channel6",			XE_SOC_HW_ERR_HBM0_CHNL6_NONFATAL},
+	[9]         = {"HBM SS0: Channel7",			XE_SOC_HW_ERR_HBM0_CHNL7_NONFATAL},
+	[10]        = {"HBM SS1: Channel0",			XE_SOC_HW_ERR_HBM1_CHNL0_NONFATAL},
+	[11]        = {"HBM SS1: Channel1",			XE_SOC_HW_ERR_HBM1_CHNL1_NONFATAL},
+	[12]        = {"HBM SS1: Channel2",			XE_SOC_HW_ERR_HBM1_CHNL2_NONFATAL},
+	[13]        = {"HBM SS1: Channel3",			XE_SOC_HW_ERR_HBM1_CHNL3_NONFATAL},
+	[14]        = {"HBM SS1: Channel4",			XE_SOC_HW_ERR_HBM1_CHNL4_NONFATAL},
+	[15]        = {"HBM SS1: Channel5",			XE_SOC_HW_ERR_HBM1_CHNL5_NONFATAL},
+	[16]        = {"HBM SS1: Channel6",			XE_SOC_HW_ERR_HBM1_CHNL6_NONFATAL},
+	[17]        = {"HBM SS1: Channel7",			XE_SOC_HW_ERR_HBM1_CHNL7_NONFATAL},
+	[18 ... 31] = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+};
+
+static const struct err_msg_cntr_pair soc_slave_glbl_err_reg_nonfatal[] = {
+	[0]         = {"SLAVE LOCAL Reported",			XE_SOC_HW_ERR_SLAVE_LCL_NONFATAL},
+	[1]         = {"HBM SS2: Channel0",			XE_SOC_HW_ERR_HBM2_CHNL0_NONFATAL},
+	[2]         = {"HBM SS2: Channel1",			XE_SOC_HW_ERR_HBM2_CHNL1_NONFATAL},
+	[3]         = {"HBM SS2: Channel2",			XE_SOC_HW_ERR_HBM2_CHNL2_NONFATAL},
+	[4]         = {"HBM SS2: Channel3",			XE_SOC_HW_ERR_HBM2_CHNL3_NONFATAL},
+	[5]         = {"HBM SS2: Channel4",			XE_SOC_HW_ERR_HBM2_CHNL4_NONFATAL},
+	[6]         = {"HBM SS2: Channel5",			XE_SOC_HW_ERR_HBM2_CHNL5_NONFATAL},
+	[7]         = {"HBM SS2: Channel6",			XE_SOC_HW_ERR_HBM2_CHNL6_NONFATAL},
+	[8]         = {"HBM SS2: Channel7",			XE_SOC_HW_ERR_HBM2_CHNL7_NONFATAL},
+	[9]         = {"HBM SS3: Channel0",			XE_SOC_HW_ERR_HBM3_CHNL0_NONFATAL},
+	[10]        = {"HBM SS3: Channel1",			XE_SOC_HW_ERR_HBM3_CHNL1_NONFATAL},
+	[11]        = {"HBM SS3: Channel2",			XE_SOC_HW_ERR_HBM3_CHNL2_NONFATAL},
+	[12]        = {"HBM SS3: Channel3",			XE_SOC_HW_ERR_HBM3_CHNL3_NONFATAL},
+	[13]        = {"HBM SS3: Channel4",			XE_SOC_HW_ERR_HBM3_CHNL4_NONFATAL},
+	[14]        = {"HBM SS3: Channel5",			XE_SOC_HW_ERR_HBM3_CHNL5_NONFATAL},
+	[15]        = {"HBM SS3: Channel6",			XE_SOC_HW_ERR_HBM3_CHNL6_NONFATAL},
+	[16]        = {"HBM SS3: Channel7",			XE_SOC_HW_ERR_HBM3_CHNL7_NONFATAL},
+	[17]        = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+	[18]        = {"ANR MDFI",				XE_SOC_HW_ERR_ANR_MDFI_NONFATAL},
+	[19 ... 31] = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+};
+
+static const struct err_msg_cntr_pair soc_slave_lcl_err_reg_nonfatal[] = {
+	[0 ... 31]  = {"Undefined",			XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+};
+
+static const struct err_msg_cntr_pair soc_mstr_lcl_err_reg_nonfatal[] = {
+	[0 ... 3]   = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+	[4]         = {"Base Die MDFI T2T",			XE_SOC_HW_ERR_MDFI_T2T_NONFATAL},
+	[5]         = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+	[6]         = {"Base Die MDFI T2C",			XE_SOC_HW_ERR_MDFI_T2C_NONFATAL},
+	[7]         = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+	[8]         = {"Invalid CSC PSF Command Parity",	XE_SOC_HW_ERR_CSC_PSF_CMD_NONFATAL},
+	[9]         = {"Invalid CSC PSF Unexpected Completion",	XE_SOC_HW_ERR_CSC_PSF_CMP_NONFATAL},
+	[10]        = {"Invalid CSC PSF Unsupported Request",	XE_SOC_HW_ERR_CSC_PSF_REQ_NONFATAL},
+	[11 ... 31] = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_NONFATAL},
+};
+
 static void xe_assign_hw_err_regs(struct xe_device *xe)
 {
 	const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
@@ -521,18 +582,20 @@ xe_gsc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 }
 
 static void
-xe_soc_log_err_update_cntr(struct xe_tile *tile,
+xe_soc_log_err_update_cntr(struct xe_tile *tile, const enum hardware_error hw_err,
 			   u32 errbit, const struct err_msg_cntr_pair *reg_info)
 {
 	const char *errmsg;
 	u32 indx;
 
+	const char *hwerr_to_str = hardware_error_type_to_str(hw_err);
+
 	errmsg = reg_info[errbit].errmsg;
 	indx = reg_info[errbit].cntr_indx;
 
 	drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
-			    "Tile%d %s SOC FATAL error, bit[%d] is set\n",
-			    tile->id, errmsg, errbit);
+			    "Tile%d %s SOC %s error, bit[%d] is set\n",
+			    tile->id, errmsg, hwerr_to_str, errbit);
 	tile->errors.count[indx]++;
 }
 
@@ -540,15 +603,34 @@ static void
 xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
 	unsigned long mst_glb_errstat, slv_glb_errstat, lcl_errstat;
+
+	const struct err_msg_cntr_pair *soc_mstr_glbl_err_reg;
+	const struct err_msg_cntr_pair *soc_mstr_lcl_err_reg;
+	const struct err_msg_cntr_pair *soc_slave_glbl_err_reg;
+	const struct err_msg_cntr_pair *soc_slave_lcl_err_reg;
 	u32 errbit, base, slave_base;
 	int i;
+
+	const char *hwerr_to_str = hardware_error_type_to_str(hw_err);
 	struct xe_gt *gt = tile->primary_gt;
 
 	lockdep_assert_held(&tile_to_xe(tile)->irq.lock);
 
-	if ((tile_to_xe(tile)->info.platform != XE_PVC) && hw_err != HARDWARE_ERROR_FATAL)
+	if ((tile_to_xe(tile)->info.platform != XE_PVC) || hw_err == HARDWARE_ERROR_CORRECTABLE)
 		return;
 
+	if (hw_err == HARDWARE_ERROR_FATAL) {
+		soc_mstr_glbl_err_reg = soc_mstr_glbl_err_reg_fatal;
+		soc_mstr_lcl_err_reg = soc_mstr_lcl_err_reg_fatal;
+		soc_slave_glbl_err_reg = soc_slave_glbl_err_reg_fatal;
+		soc_slave_lcl_err_reg = soc_slave_lcl_err_reg_fatal;
+	} else if (hw_err == HARDWARE_ERROR_NONFATAL) {
+		soc_mstr_glbl_err_reg = soc_mstr_glbl_err_reg_nonfatal;
+		soc_mstr_lcl_err_reg = soc_mstr_lcl_err_reg_nonfatal;
+		soc_slave_glbl_err_reg = soc_slave_glbl_err_reg_nonfatal;
+		soc_slave_lcl_err_reg = soc_slave_lcl_err_reg_nonfatal;
+	}
+
 	base = SOC_PVC_BASE;
 	slave_base = SOC_PVC_SLAVE_BASE;
 
@@ -564,33 +646,34 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 
 	mst_glb_errstat = xe_mmio_read32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err));
 	drm_info(&tile_to_xe(tile)->drm, HW_ERR
-		 "Tile%d SOC_GLOBAL_ERR_STAT_MASTER_REG_FATAL:0x%08lx\n",
-		 tile->id, mst_glb_errstat);
+		 "Tile%d SOC_GLOBAL_ERR_STAT_MASTER_REG_%s:0x%08lx\n",
+		 tile->id, hwerr_to_str, mst_glb_errstat);
 
 	if (mst_glb_errstat & REG_BIT(SOC_SLAVE_IEH)) {
 		slv_glb_errstat = xe_mmio_read32(gt,
 						 SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err));
 		 drm_info(&tile_to_xe(tile)->drm, HW_ERR
-			  "Tile%d SOC_GLOBAL_ERR_STAT_SLAVE_REG_FATAL:0x%08lx\n",
-			  tile->id, slv_glb_errstat);
+			  "Tile%d SOC_GLOBAL_ERR_STAT_SLAVE_REG_%s:0x%08lx\n",
+			  tile->id, hwerr_to_str, slv_glb_errstat);
 
 		if (slv_glb_errstat & REG_BIT(SOC_IEH1_LOCAL_ERR_STATUS)) {
 			lcl_errstat = xe_mmio_read32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base,
 										      hw_err));
 			 drm_info(&tile_to_xe(tile)->drm, HW_ERR
-				  "Tile%d SOC_LOCAL_ERR_STAT_SLAVE_REG_FATAL:0x%08lx\n",
-				  tile->id, lcl_errstat);
+				  "Tile%d SOC_LOCAL_ERR_STAT_SLAVE_REG_%s:0x%08lx\n",
+				  tile->id, hwerr_to_str, lcl_errstat);
 
 			for_each_set_bit(errbit, &lcl_errstat, 32)
-				xe_soc_log_err_update_cntr(tile, errbit,
-							   soc_slave_lcl_err_reg_fatal);
+				xe_soc_log_err_update_cntr(tile, hw_err, errbit,
+							   soc_slave_lcl_err_reg);
 
 			xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
 					lcl_errstat);
 		}
 
 		for_each_set_bit(errbit, &slv_glb_errstat, 32)
-			xe_soc_log_err_update_cntr(tile, errbit, soc_slave_glbl_err_reg_fatal);
+			xe_soc_log_err_update_cntr(tile, hw_err, errbit,
+						   soc_slave_glbl_err_reg);
 
 		xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
 				slv_glb_errstat);
@@ -598,17 +681,18 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 
 	if (mst_glb_errstat & REG_BIT(SOC_IEH0_LOCAL_ERR_STATUS)) {
 		lcl_errstat = xe_mmio_read32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err));
-		drm_info(&tile_to_xe(tile)->drm, HW_ERR "SOC_LOCAL_ERR_STAT_MASTER_REG_FATAL:0x%08lx\n",
-			 lcl_errstat);
+		drm_info(&tile_to_xe(tile)->drm, HW_ERR "Tile%d SOC_LOCAL_ERR_STAT_MASTER_REG_%s:0x%08lx\n",
+			 tile->id, hwerr_to_str, lcl_errstat);
 
 		for_each_set_bit(errbit, &lcl_errstat, 32)
-			xe_soc_log_err_update_cntr(tile, errbit, soc_mstr_lcl_err_reg_fatal);
+			xe_soc_log_err_update_cntr(tile, hw_err, errbit,
+						   soc_mstr_lcl_err_reg);
 
 		xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err), lcl_errstat);
 	}
 
 	for_each_set_bit(errbit, &mst_glb_errstat, 32)
-		xe_soc_log_err_update_cntr(tile, errbit, soc_mstr_glbl_err_reg_fatal);
+		xe_soc_log_err_update_cntr(tile, hw_err, errbit, soc_mstr_glbl_err_reg);
 
 	xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err),
 			mst_glb_errstat);
diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
index 05838e082abd..a458a90b34a2 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.h
+++ b/drivers/gpu/drm/xe/xe_hw_error.h
@@ -115,6 +115,48 @@ enum xe_tile_hw_errors {
 	XE_SOC_HW_ERR_PCIE_PSF_CMD_FATAL,
 	XE_SOC_HW_ERR_PCIE_PSF_CMP_FATAL,
 	XE_SOC_HW_ERR_PCIE_PSF_REQ_FATAL,
+	XE_SOC_HW_ERR_MSTR_LCL_NONFATAL,
+	XE_SOC_HW_ERR_SLAVE_GLBL_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL0_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL1_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL2_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL3_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL4_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL5_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL6_NONFATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL7_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL0_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL1_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL2_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL3_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL4_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL5_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL6_NONFATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL7_NONFATAL,
+	XE_SOC_HW_ERR_UNKNOWN_NONFATAL,
+	XE_SOC_HW_ERR_SLAVE_LCL_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL0_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL1_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL2_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL3_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL4_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL5_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL6_NONFATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL7_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL0_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL1_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL2_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL3_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL4_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL5_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL6_NONFATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL7_NONFATAL,
+	XE_SOC_HW_ERR_ANR_MDFI_NONFATAL,
+	XE_SOC_HW_ERR_MDFI_T2T_NONFATAL,
+	XE_SOC_HW_ERR_MDFI_T2C_NONFATAL,
+	XE_SOC_HW_ERR_CSC_PSF_CMD_NONFATAL,
+	XE_SOC_HW_ERR_CSC_PSF_CMP_NONFATAL,
+	XE_SOC_HW_ERR_CSC_PSF_REQ_NONFATAL,
 	XE_TILE_HW_ERROR_MAX,
 };
 
-- 
2.25.1



More information about the Intel-xe mailing list