[Intel-xe] [PATCH 07/11] drm/xe: Support SOC FATAL error handling for PVC.

Himal Prasad Ghimiray himal.prasad.ghimiray at intel.com
Wed Sep 27 11:46:23 UTC 2023


Report SOC fatal hardware errors and update the corresponding counters,
which are incremented in case of an error.

Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h |  28 +++
 drivers/gpu/drm/xe/xe_hw_error.c             | 170 +++++++++++++++++++
 drivers/gpu/drm/xe/xe_hw_error.h             |  58 ++++++-
 3 files changed, 254 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
index fa16eaf9436b..04701c62f0d9 100644
--- a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
@@ -20,4 +20,32 @@
 #define GSC_HEC_ERR_STAT_REG(base, x)                  XE_REG(_PICK_EVEN((x), \
 								(base) + _GSC_HEC_CORR_ERR_STATUS, \
 								(base) + _GSC_HEC_UNCOR_ERR_STATUS))
+#define SOC_PVC_BASE	               0x00282000	/* base passed for the master IEH registers */
+#define SOC_PVC_SLAVE_BASE             0x00283000	/* base passed for the slave IEH registers */
+
+#define _SOC_LERRCORSTS		       0x000294	/* picked when x <= HARDWARE_ERROR_CORRECTABLE */
+#define _SOC_LERRUNCSTS		       0x000280	/* picked when x > HARDWARE_ERROR_CORRECTABLE */
+#define SOC_LOCAL_ERR_STAT_SLAVE_REG(base, x)		XE_REG((x) > HARDWARE_ERROR_CORRECTABLE ? \
+								(base) + _SOC_LERRUNCSTS : \
+								(base) + _SOC_LERRCORSTS)
+
+#define SOC_LOCAL_ERR_STAT_MASTER_REG(base, x)		XE_REG((x) > HARDWARE_ERROR_CORRECTABLE ? \
+								(base) + _SOC_LERRUNCSTS : \
+								(base) + _SOC_LERRCORSTS)
+#define _SOC_GSYSEVTCTL		        0x000264
+/* x picks the instance: 0 -> (base), 1 -> (slave_base); all arguments are parenthesized */
+#define SOC_GSYSEVTCTL_REG(base, slave_base, x)		XE_REG(_PICK_EVEN((x), \
+								(base) + _SOC_GSYSEVTCTL, \
+								(slave_base) + _SOC_GSYSEVTCTL))
+#define _SOC_GCOERRSTS		        0x000200
+#define _SOC_GNFERRSTS		        0x000210
+#define _SOC_GFAERRSTS		        0x000220	/* NOTE(review): never named below; presumably reached via _PICK_EVEN stride (x == 2) - confirm */
+#define SOC_GLOBAL_ERR_STAT_SLAVE_REG(base, x)		XE_REG(_PICK_EVEN((x), \
+								(base) + _SOC_GCOERRSTS, \
+								(base) + _SOC_GNFERRSTS))
+
+#define SOC_GLOBAL_ERR_STAT_MASTER_REG(base, x)		XE_REG(_PICK_EVEN((x), \
+								(base) + _SOC_GCOERRSTS, \
+								(base) + _SOC_GNFERRSTS))
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 76ae12df013c..fa05bad5e684 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -207,6 +207,75 @@ static const struct err_msg_cntr_pair gsc_correctable_err_reg[] = {
 	[2 ... 31] = {"Undefined",					XE_GSC_HW_ERR_UNKNOWN_CORR},
 };
 
+static const struct err_msg_cntr_pair soc_mstr_glbl_err_reg_fatal[] = { /* index: status bit */
+	[0]         = {"MASTER LOCAL Reported",			XE_SOC_HW_ERR_MSTR_LCL_FATAL},
+	[1]         = {"SLAVE GLOBAL Reported",			XE_SOC_HW_ERR_SLAVE_GLBL_FATAL},
+	[2]         = {"HBM SS0: Channel0",			XE_SOC_HW_ERR_HBM0_CHNL0_FATAL},
+	[3]         = {"HBM SS0: Channel1",			XE_SOC_HW_ERR_HBM0_CHNL1_FATAL},
+	[4]         = {"HBM SS0: Channel2",			XE_SOC_HW_ERR_HBM0_CHNL2_FATAL},
+	[5]         = {"HBM SS0: Channel3",			XE_SOC_HW_ERR_HBM0_CHNL3_FATAL},
+	[6]         = {"HBM SS0: Channel4",			XE_SOC_HW_ERR_HBM0_CHNL4_FATAL},
+	[7]         = {"HBM SS0: Channel5",			XE_SOC_HW_ERR_HBM0_CHNL5_FATAL},
+	[8]         = {"HBM SS0: Channel6",                     XE_SOC_HW_ERR_HBM0_CHNL6_FATAL},
+	[9]         = {"HBM SS0: Channel7",                     XE_SOC_HW_ERR_HBM0_CHNL7_FATAL},
+	[10]        = {"HBM SS1: Channel0",                     XE_SOC_HW_ERR_HBM1_CHNL0_FATAL},
+	[11]        = {"HBM SS1: Channel1",                     XE_SOC_HW_ERR_HBM1_CHNL1_FATAL},
+	[12]        = {"HBM SS1: Channel2",                     XE_SOC_HW_ERR_HBM1_CHNL2_FATAL},
+	[13]        = {"HBM SS1: Channel3",                     XE_SOC_HW_ERR_HBM1_CHNL3_FATAL},
+	[14]        = {"HBM SS1: Channel4",                     XE_SOC_HW_ERR_HBM1_CHNL4_FATAL},
+	[15]        = {"HBM SS1: Channel5",                     XE_SOC_HW_ERR_HBM1_CHNL5_FATAL},
+	[16]        = {"HBM SS1: Channel6",                     XE_SOC_HW_ERR_HBM1_CHNL6_FATAL},
+	[17]        = {"HBM SS1: Channel7",                     XE_SOC_HW_ERR_HBM1_CHNL7_FATAL},
+	[18]	    = {"PUNIT",					XE_SOC_HW_ERR_PUNIT_FATAL},
+	[19 ... 31] = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
+};
+
+static const struct err_msg_cntr_pair soc_slave_glbl_err_reg_fatal[] = { /* index: status bit */
+	[0]         = {"SLAVE LOCAL Reported",			XE_SOC_HW_ERR_SLAVE_LCL_FATAL},
+	[1]         = {"HBM SS2: Channel0",			XE_SOC_HW_ERR_HBM2_CHNL0_FATAL},
+	[2]         = {"HBM SS2: Channel1",			XE_SOC_HW_ERR_HBM2_CHNL1_FATAL},
+	[3]         = {"HBM SS2: Channel2",			XE_SOC_HW_ERR_HBM2_CHNL2_FATAL},
+	[4]         = {"HBM SS2: Channel3",			XE_SOC_HW_ERR_HBM2_CHNL3_FATAL},
+	[5]         = {"HBM SS2: Channel4",			XE_SOC_HW_ERR_HBM2_CHNL4_FATAL},
+	[6]         = {"HBM SS2: Channel5",			XE_SOC_HW_ERR_HBM2_CHNL5_FATAL},
+	[7]         = {"HBM SS2: Channel6",			XE_SOC_HW_ERR_HBM2_CHNL6_FATAL},
+	[8]         = {"HBM SS2: Channel7",			XE_SOC_HW_ERR_HBM2_CHNL7_FATAL},
+	[9]         = {"HBM SS3: Channel0",			XE_SOC_HW_ERR_HBM3_CHNL0_FATAL},
+	[10]        = {"HBM SS3: Channel1",			XE_SOC_HW_ERR_HBM3_CHNL1_FATAL},
+	[11]        = {"HBM SS3: Channel2",			XE_SOC_HW_ERR_HBM3_CHNL2_FATAL},
+	[12]        = {"HBM SS3: Channel3",			XE_SOC_HW_ERR_HBM3_CHNL3_FATAL},
+	[13]        = {"HBM SS3: Channel4",			XE_SOC_HW_ERR_HBM3_CHNL4_FATAL},
+	[14]        = {"HBM SS3: Channel5",			XE_SOC_HW_ERR_HBM3_CHNL5_FATAL},
+	[15]        = {"HBM SS3: Channel6",			XE_SOC_HW_ERR_HBM3_CHNL6_FATAL},
+	[16]        = {"HBM SS3: Channel7",			XE_SOC_HW_ERR_HBM3_CHNL7_FATAL},
+	[17]        = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
+	[18]        = {"ANR MDFI",				XE_SOC_HW_ERR_ANR_MDFI_FATAL},
+	[19 ... 31] = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
+};
+
+static const struct err_msg_cntr_pair soc_slave_lcl_err_reg_fatal[] = { /* index: status bit */
+	[0]         = {"Local IEH Internal: Malformed PCIe AER",     XE_SOC_HW_ERR_PCIE_AER_FATAL},
+	[1]         = {"Local IEH Internal: Malformed PCIe ERR",     XE_SOC_HW_ERR_PCIE_ERR_FATAL},
+	[2]         = {"Local IEH Internal: UR CONDITIONS IN IEH",   XE_SOC_HW_ERR_UR_COND_FATAL},
+	[3]         = {"Local IEH Internal: FROM SERR SOURCES",      XE_SOC_HW_ERR_SERR_SRCS_FATAL},
+	[4 ... 31]  = {"Undefined",				     XE_SOC_HW_ERR_UNKNOWN_FATAL},
+};
+
+static const struct err_msg_cntr_pair soc_mstr_lcl_err_reg_fatal[] = { /* index: status bit */
+	[0 ... 3]   = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
+	[4]         = {"Base Die MDFI T2T",			XE_SOC_HW_ERR_MDFI_T2T_FATAL},
+	[5]         = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
+	[6]         = {"Base Die MDFI T2C",			XE_SOC_HW_ERR_MDFI_T2C_FATAL},
+	[7]         = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
+	[8]         = {"Invalid CSC PSF Command Parity",	XE_SOC_HW_ERR_CSC_PSF_CMD_FATAL},
+	[9]         = {"Invalid CSC PSF Unexpected Completion",	XE_SOC_HW_ERR_CSC_PSF_CMP_FATAL},
+	[10]        = {"Invalid CSC PSF Unsupported Request",	XE_SOC_HW_ERR_CSC_PSF_REQ_FATAL},
+	[11]        = {"Invalid PCIe PSF Command Parity",	XE_SOC_HW_ERR_PCIE_PSF_CMD_FATAL},
+	[12]        = {"PCIe PSF Unexpected Completion",	XE_SOC_HW_ERR_PCIE_PSF_CMP_FATAL},
+	[13]        = {"PCIe PSF Unsupported Request",		XE_SOC_HW_ERR_PCIE_PSF_REQ_FATAL},
+	[14 ... 31] = {"Undefined",				XE_SOC_HW_ERR_UNKNOWN_FATAL},
+};
+
 static void xe_assign_hw_err_regs(struct xe_device *xe)
 {
 	const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
@@ -451,6 +520,104 @@ xe_gsc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 	xe_mmio_write32(mmio, GSC_HEC_ERR_STAT_REG(base, hw_err), errsrc);
 }
 
+static void
+xe_soc_log_err_update_cntr(struct xe_tile *tile,
+			   u32 errbit, const struct err_msg_cntr_pair *reg_info)
+{
+	const char *errmsg;
+	u32 indx;
+
+	errmsg = reg_info[errbit].errmsg;
+	indx = reg_info[errbit].cntr_indx;
+
+	drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
+			    "Tile%d %s SOC FATAL error, bit[%d] is set\n",
+			    tile->id, errmsg, errbit);
+	tile->errors.count[indx]++;
+}
+
+/*
+ * Handle a SOC fatal error on a PVC tile: log and count every bit set in the
+ * master/slave global and local IEH status registers and write each value
+ * read back to its register. Only PVC + FATAL is handled (tables are fatal-only).
+ */
+static void
+xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
+{
+	const u32 base = SOC_PVC_BASE, slave_base = SOC_PVC_SLAVE_BASE;
+	unsigned long mst_glb_errstat, slv_glb_errstat, lcl_errstat;
+	struct xe_gt *gt = tile->primary_gt;
+	u32 errbit;
+	int i;
+
+	lockdep_assert_held(&tile_to_xe(tile)->irq.lock);
+
+	if (tile_to_xe(tile)->info.platform != XE_PVC || hw_err != HARDWARE_ERROR_FATAL)
+		return;
+
+	/*
+	 * Mask the error type in GSYSEVTCTL so no new errors of it are reported.
+	 * Master global BIT 1 -> slave IEH first; global BIT 0 -> local register.
+	 */
+	for (i = 0; i < PVC_NUM_IEH; i++)
+		xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i), ~REG_BIT(hw_err));
+
+	mst_glb_errstat = xe_mmio_read32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err));
+	drm_info(&tile_to_xe(tile)->drm, HW_ERR
+		 "Tile%d SOC_GLOBAL_ERR_STAT_MASTER_REG_FATAL:0x%08lx\n",
+		 tile->id, mst_glb_errstat);
+
+	if (mst_glb_errstat & REG_BIT(SOC_SLAVE_IEH)) {
+		slv_glb_errstat = xe_mmio_read32(gt,
+						 SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err));
+		drm_info(&tile_to_xe(tile)->drm, HW_ERR
+			 "Tile%d SOC_GLOBAL_ERR_STAT_SLAVE_REG_FATAL:0x%08lx\n",
+			 tile->id, slv_glb_errstat);
+
+		if (slv_glb_errstat & REG_BIT(SOC_IEH1_LOCAL_ERR_STATUS)) {
+			lcl_errstat = xe_mmio_read32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base,
+										      hw_err));
+			drm_info(&tile_to_xe(tile)->drm, HW_ERR
+				 "Tile%d SOC_LOCAL_ERR_STAT_SLAVE_REG_FATAL:0x%08lx\n",
+				 tile->id, lcl_errstat);
+
+			for_each_set_bit(errbit, &lcl_errstat, 32)
+				xe_soc_log_err_update_cntr(tile, errbit,
+							   soc_slave_lcl_err_reg_fatal);
+
+			xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
+					lcl_errstat);
+		}
+
+		for_each_set_bit(errbit, &slv_glb_errstat, 32)
+			xe_soc_log_err_update_cntr(tile, errbit, soc_slave_glbl_err_reg_fatal);
+
+		xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
+				slv_glb_errstat);
+	}
+
+	if (mst_glb_errstat & REG_BIT(SOC_IEH0_LOCAL_ERR_STATUS)) {
+		lcl_errstat = xe_mmio_read32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err));
+		drm_info(&tile_to_xe(tile)->drm, HW_ERR
+			 "Tile%d SOC_LOCAL_ERR_STAT_MASTER_REG_FATAL:0x%08lx\n",
+			 tile->id, lcl_errstat);
+
+		for_each_set_bit(errbit, &lcl_errstat, 32)
+			xe_soc_log_err_update_cntr(tile, errbit, soc_mstr_lcl_err_reg_fatal);
+
+		xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err), lcl_errstat);
+	}
+
+	for_each_set_bit(errbit, &mst_glb_errstat, 32)
+		xe_soc_log_err_update_cntr(tile, errbit, soc_mstr_glbl_err_reg_fatal);
+
+	xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err), mst_glb_errstat);
+
+	for (i = 0; i < PVC_NUM_IEH; i++)
+		xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i),
+				(HARDWARE_ERROR_MAX << 1) + 1);
+}
+
 static void
 xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
@@ -498,6 +665,9 @@ xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_er
 
 		if (errbit == 8)
 			xe_gsc_hw_error_handler(tile, hw_err);
+
+		if (errbit == 16)
+			xe_soc_hw_error_handler(tile, hw_err);
 	}
 
 	xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
index ee7705b3343b..05838e082abd 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.h
+++ b/drivers/gpu/drm/xe/xe_hw_error.h
@@ -65,6 +65,56 @@ enum xe_tile_hw_errors {
 	XE_GSC_HW_ERR_SELF_MBIST_UNCOR,
 	XE_GSC_HW_ERR_AON_RF_PARITY_UNCOR,
 	XE_GSC_HW_ERR_UNKNOWN_UNCOR,
+	XE_SOC_HW_ERR_MSTR_LCL_FATAL,
+	XE_SOC_HW_ERR_SLAVE_GLBL_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL0_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL1_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL2_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL3_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL4_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL5_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL6_FATAL,
+	XE_SOC_HW_ERR_HBM0_CHNL7_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL0_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL1_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL2_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL3_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL4_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL5_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL6_FATAL,
+	XE_SOC_HW_ERR_HBM1_CHNL7_FATAL,
+	XE_SOC_HW_ERR_PUNIT_FATAL,
+	XE_SOC_HW_ERR_UNKNOWN_FATAL,
+	XE_SOC_HW_ERR_SLAVE_LCL_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL0_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL1_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL2_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL3_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL4_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL5_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL6_FATAL,
+	XE_SOC_HW_ERR_HBM2_CHNL7_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL0_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL1_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL2_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL3_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL4_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL5_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL6_FATAL,
+	XE_SOC_HW_ERR_HBM3_CHNL7_FATAL,
+	XE_SOC_HW_ERR_ANR_MDFI_FATAL,
+	XE_SOC_HW_ERR_PCIE_AER_FATAL,
+	XE_SOC_HW_ERR_PCIE_ERR_FATAL,
+	XE_SOC_HW_ERR_UR_COND_FATAL,
+	XE_SOC_HW_ERR_SERR_SRCS_FATAL,
+	XE_SOC_HW_ERR_MDFI_T2T_FATAL,
+	XE_SOC_HW_ERR_MDFI_T2C_FATAL,
+	XE_SOC_HW_ERR_CSC_PSF_CMD_FATAL,
+	XE_SOC_HW_ERR_CSC_PSF_CMP_FATAL,
+	XE_SOC_HW_ERR_CSC_PSF_REQ_FATAL,
+	XE_SOC_HW_ERR_PCIE_PSF_CMD_FATAL,
+	XE_SOC_HW_ERR_PCIE_PSF_CMP_FATAL,
+	XE_SOC_HW_ERR_PCIE_PSF_REQ_FATAL,
 	XE_TILE_HW_ERROR_MAX,
 };
 
@@ -109,8 +159,12 @@ enum xe_gt_hw_errors {
 	XE_GT_HW_ERROR_MAX,
 };
 
-#define ERR_STAT_GT_COR_VCTR_LEN (4)
-#define ERR_STAT_GT_FATAL_VCTR_LEN (8)
+#define ERR_STAT_GT_COR_VCTR_LEN	(4)
+#define ERR_STAT_GT_FATAL_VCTR_LEN	(8)
+#define PVC_NUM_IEH			(1)	/* GSYSEVTCTL loop bound; NOTE(review): 1 writes only the master instance - confirm */
+#define SOC_SLAVE_IEH                   (1)	/* bit tested in the master global error status */
+#define SOC_IEH0_LOCAL_ERR_STATUS       (0)	/* bit tested in the master global error status */
+#define SOC_IEH1_LOCAL_ERR_STATUS       (0)	/* bit tested in the slave global error status */
 
 struct err_msg_cntr_pair {
 	const char *errmsg;
-- 
2.25.1



More information about the Intel-xe mailing list