[Intel-xe] [PATCH v2 07/10] drm/xe: Support SOC NONFATAL error handling for PVC.

Himal Prasad Ghimiray himal.prasad.ghimiray at intel.com
Wed Oct 18 04:00:30 UTC 2023


Report SOC nonfatal hardware errors and update the counters, which are
incremented in case of an error.

v2
- Use xe_assign_hw_err_regs to initialize registers.
- Don't use the counters if the error is being reported by second-level
  registers.
- Fix Num of IEH to 2.
- Follow the convention source_typeoferror_errorname for enum and error
  reporting. (Aravind)

Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
 drivers/gpu/drm/xe/xe_hw_error.c | 70 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_hw_error.h | 39 ++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 55f8613e8b6d..8b968b117c18 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -258,6 +258,67 @@ static const struct err_name_index_pair pvc_soc_mstr_lcl_err_reg_fatal[] = {
 	[14 ... 31] = {"Undefined",				XE_HW_ERR_SOC_FATAL_UNKNOWN},
 };
 
+static const struct err_name_index_pair pvc_soc_mstr_glbl_err_reg_nonfatal[] = {
+	[0]         = {"MASTER LOCAL Reported",			XE_HW_ERR_TILE_UNSPEC},
+	[1]         = {"SLAVE GLOBAL Reported",			XE_HW_ERR_TILE_UNSPEC},
+	[2]         = {"HBM SS0: Channel0",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL0},
+	[3]         = {"HBM SS0: Channel1",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL1},
+	[4]         = {"HBM SS0: Channel2",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL2},
+	[5]         = {"HBM SS0: Channel3",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL3},
+	[6]         = {"HBM SS0: Channel4",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL4},
+	[7]         = {"HBM SS0: Channel5",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL5},
+	[8]         = {"HBM SS0: Channel6",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL6},
+	[9]         = {"HBM SS0: Channel7",			XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL7},
+	[10]        = {"HBM SS1: Channel0",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL0},
+	[11]        = {"HBM SS1: Channel1",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL1},
+	[12]        = {"HBM SS1: Channel2",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL2},
+	[13]        = {"HBM SS1: Channel3",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL3},
+	[14]        = {"HBM SS1: Channel4",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL4},
+	[15]        = {"HBM SS1: Channel5",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL5},
+	[16]        = {"HBM SS1: Channel6",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL6},
+	[17]        = {"HBM SS1: Channel7",			XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL7},
+	[18 ... 31] = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_soc_slave_glbl_err_reg_nonfatal[] = {
+	[0]         = {"SLAVE LOCAL Reported",			XE_HW_ERR_TILE_UNSPEC},
+	[1]         = {"HBM SS2: Channel0",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL0},
+	[2]         = {"HBM SS2: Channel1",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL1},
+	[3]         = {"HBM SS2: Channel2",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL2},
+	[4]         = {"HBM SS2: Channel3",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL3},
+	[5]         = {"HBM SS2: Channel4",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL4},
+	[6]         = {"HBM SS2: Channel5",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL5},
+	[7]         = {"HBM SS2: Channel6",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL6},
+	[8]         = {"HBM SS2: Channel7",			XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL7},
+	[9]         = {"HBM SS3: Channel0",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL0},
+	[10]        = {"HBM SS3: Channel1",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL1},
+	[11]        = {"HBM SS3: Channel2",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL2},
+	[12]        = {"HBM SS3: Channel3",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL3},
+	[13]        = {"HBM SS3: Channel4",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL4},
+	[14]        = {"HBM SS3: Channel5",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL5},
+	[15]        = {"HBM SS3: Channel6",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL6},
+	[16]        = {"HBM SS3: Channel7",			XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL7},
+	[17]        = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[18]        = {"ANR MDFI",				XE_HW_ERR_SOC_NONFATAL_ANR_MDFI},
+	[19 ... 31] = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_soc_slave_lcl_err_reg_nonfatal[] = {
+	[0 ... 31]  = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_soc_mstr_lcl_err_reg_nonfatal[] = {
+	[0 ... 3]   = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[4]         = {"Base Die MDFI T2T",			XE_HW_ERR_SOC_NONFATAL_MDFI_T2T},
+	[5]         = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[6]         = {"Base Die MDFI T2C",			XE_HW_ERR_SOC_NONFATAL_MDFI_T2C},
+	[7]         = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+	[8]         = {"Invalid CSC PSF Command Parity",	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMD},
+	[9]         = {"Invalid CSC PSF Unexpected Completion",	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMP},
+	[10]        = {"Invalid CSC PSF Unsupported Request",	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_REQ},
+	[11 ... 31] = {"Undefined",				XE_HW_ERR_SOC_NONFATAL_UNKNOWN},
+};	/* one entry per index 0-31; presumably the register bit number - TODO confirm */
+
 void xe_assign_hw_err_regs(struct xe_device *xe)
 {
 	const struct err_name_index_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
@@ -295,6 +356,10 @@ void xe_assign_hw_err_regs(struct xe_device *xe)
 		soc_mstr_lcl[HARDWARE_ERROR_FATAL] = pvc_soc_mstr_lcl_err_reg_fatal;
 		soc_slave_glbl[HARDWARE_ERROR_FATAL] = pvc_soc_slave_glbl_err_reg_fatal;
 		soc_slave_lcl[HARDWARE_ERROR_FATAL] = pvc_soc_slave_lcl_err_reg_fatal;
+		soc_mstr_glbl[HARDWARE_ERROR_NONFATAL] = pvc_soc_mstr_glbl_err_reg_nonfatal;
+		soc_mstr_lcl[HARDWARE_ERROR_NONFATAL] = pvc_soc_mstr_lcl_err_reg_nonfatal;
+		soc_slave_glbl[HARDWARE_ERROR_NONFATAL] = pvc_soc_slave_glbl_err_reg_nonfatal;
+		soc_slave_lcl[HARDWARE_ERROR_NONFATAL] = pvc_soc_slave_lcl_err_reg_nonfatal;
 	}
 
 }
@@ -578,7 +643,10 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 
 	lockdep_assert_held(&tile_to_xe(tile)->irq.lock);
 
-	if ((tile_to_xe(tile)->info.platform != XE_PVC) ||  hw_err != HARDWARE_ERROR_FATAL)
+	if (tile_to_xe(tile)->info.platform != XE_PVC)
+		return;
+
+	if (hw_err == HARDWARE_ERROR_CORRECTABLE)
 		return;
 
 	base = SOC_PVC_BASE;
diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
index 700474aed171..59b331f52783 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.h
+++ b/drivers/gpu/drm/xe/xe_hw_error.h
@@ -112,6 +112,45 @@ enum xe_soc_hw_errors {
 	XE_HW_ERR_SOC_FATAL_PCIE_PSF_CMD,
 	XE_HW_ERR_SOC_FATAL_PCIE_PSF_CMP,
 	XE_HW_ERR_SOC_FATAL_PCIE_PSF_REQ,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_UNKNOWN,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL0,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL1,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL2,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL3,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL4,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL5,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL6,
+	XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL7,
+	XE_HW_ERR_SOC_NONFATAL_ANR_MDFI,
+	XE_HW_ERR_SOC_NONFATAL_MDFI_T2T,
+	XE_HW_ERR_SOC_NONFATAL_MDFI_T2C,
+	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMD,
+	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMP,
+	XE_HW_ERR_SOC_NONFATAL_CSC_PSF_REQ,
 	XE_TILE_HW_ERROR_MAX,
 };
 
-- 
2.25.1



More information about the Intel-xe mailing list