[Intel-xe] [PATCH v2 09/11] drm/xe: Handle MDFI error severity.
Himal Prasad Ghimiray
himal.prasad.ghimiray at intel.com
Thu Oct 19 13:25:32 UTC 2023
NONFATAL and FATAL MDFI (T2T/T2C) errors are reported by the same IEH
register and bits (Bit 4 and Bit 6 of 0x282280). To determine the
severity, read the local first error header log register (0x2822b0).
A value of 0x00330000 indicates that the severity is FATAL, and 0x00310000
indicates a NONFATAL error. This register doesn't need explicit clearing;
clearing the MDFI bit in the IEH register will clear this register too. In
case a NONFATAL value is reported by the header log register in the FATAL
flow, don't clear the MDFI IEH bit and continue. The same applies when the
value read from the header log register is FATAL in the NONFATAL flow.
v2
- Add commit message.
Cc: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
Reviewed-by: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 9 +++++++++
drivers/gpu/drm/xe/xe_hw_error.c | 16 ++++++++++++++--
2 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
index 1b35fe5ee785..235dc93ce274 100644
--- a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
@@ -37,6 +37,8 @@
#define SOC_LOCAL_ERR_STAT_MASTER_REG(base, x) XE_REG((x) > HARDWARE_ERROR_CORRECTABLE ? \
(base) + _SOC_LERRUNCSTS : \
(base) + _SOC_LERRCORSTS)
+#define MDFI_T2T 4
+#define MDFI_T2C 6
#define _DEV_ERR_STAT_NONFATAL 0x100178
@@ -49,6 +51,13 @@
#define XE_SOC_ERROR 16
#define SOC_PVC_BASE 0x282000
+
+#define LOCAL_FIRST_IEH_HEADER_LOG_REG XE_REG(0x2822b0)
+#define MDFI_SEVERITY_FATAL 0x00330000
+#define MDFI_SEVERITY_NONFATAL 0x00310000
+#define MDFI_SEVERITY(x) ((x) == HARDWARE_ERROR_FATAL ? \
+ MDFI_SEVERITY_FATAL : \
+ MDFI_SEVERITY_NONFATAL)
#define SOC_PVC_SLAVE_BASE 0x283000
#define PVC_GSC_HECI1_BASE 0x284000
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 7c41e51fbb11..9b960292470f 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -629,7 +629,7 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
{
unsigned long mst_glb_errstat, slv_glb_errstat, lcl_errstat;
struct hardware_errors_regs *err_regs;
- u32 errbit, base, slave_base;
+ u32 errbit, base, slave_base, ieh_header;
int i;
struct xe_gt *gt = tile->primary_gt;
@@ -697,9 +697,21 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
"Tile%d reported SOC_LOCAL_ERR_STAT_MASTER_REG_FATAL:0x%08lx\n",
tile->id, lcl_errstat);
- for_each_set_bit(errbit, &lcl_errstat, XE_RAS_REG_SIZE)
+ for_each_set_bit(errbit, &lcl_errstat, XE_RAS_REG_SIZE) {
+ if (errbit == MDFI_T2T || errbit == MDFI_T2C) {
+ ieh_header = xe_mmio_read32(gt, LOCAL_FIRST_IEH_HEADER_LOG_REG);
+ drm_info(&tile_to_xe(tile)->drm, HW_ERR "Tile%d LOCAL_FIRST_IEH_HEADER_LOG_REG:0x%08x\n",
+ tile->id, ieh_header);
+
+ if (ieh_header != MDFI_SEVERITY(hw_err)) {
+ lcl_errstat &= ~REG_BIT(errbit);
+ continue;
+ }
+ }
+
xe_soc_log_err_update_cntr(tile, hw_err, errbit,
err_regs->soc_mstr_lcl[hw_err]);
+ }
xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err), lcl_errstat);
}
--
2.25.1
More information about the Intel-xe
mailing list