[Intel-xe] [PATCH v2 4/4] drm/xe: Process fatal hardware errors.

Thu Aug 10 05:07:55 UTC 2023

Fatal errors are reported as PCIe errors. When a PCIe error is asserted,
the OS will perform a device warm reset which causes the driver to reload.
The error registers are sticky and the values are maintained through a
warm reset. We read these registers during the boot flow of the driver and
increment the respective error counters.

Bspec: 53076

Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Matt Roper <matthew.d.roper at intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
 drivers/gpu/drm/xe/regs/xe_gt_error_regs.h |  3 ++
 drivers/gpu/drm/xe/xe_irq.c                | 37 +++++++++++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
index 27a54b7c278a..b389ddd140d0 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
@@ -42,6 +42,9 @@ enum gt_vctr_registers {
 						ERR_STAT_GT_COR_VCTR_REG(x) : \
 						ERR_STAT_GT_FATAL_VCTR_REG(x))
 
+#define DEV_PCIEERR_STATUS              XE_REG(0x100180)
+#define DEV_PCIEERR_IS_FATAL(x)         REG_BIT((x) * 4 + 2) /* x: tile index */
+
 /* Count of GT Correctable and FATAL HW ERRORS */
 enum xe_gt_hw_errors {
 	XE_GT_HW_ERR_SUBSLICE_CORR,
diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
index ab29ac625d3a..7193744c2c45 100644
--- a/drivers/gpu/drm/xe/xe_irq.c
+++ b/drivers/gpu/drm/xe/xe_irq.c
@@ -784,6 +784,41 @@ xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
 	}
 }
 
+/**
+ * process_hw_errors - checks for the occurrence of HW errors
+ * @xe: xe device instance
+ *
+ * Checks for HW errors, including FATAL errors, that might have occurred
+ * in the previous boot of the driver. A FATAL error initiates a PCIe FLR
+ * reset of the device and causes the driver to reload.
+ */
+static void process_hw_errors(struct xe_device *xe)
+{
+	struct xe_tile *root_tile = xe_device_get_root_tile(xe);
+	struct xe_gt *root_mmio = root_tile->primary_gt;
+
+	u32 dev_pcieerr_status, master_ctl;
+	struct xe_tile *tile;
+	int i;
+
+	update_valid_error_regs(xe);
+	/* Sticky register: still holds errors latched before the reset. */
+	dev_pcieerr_status = xe_mmio_read32(root_mmio, DEV_PCIEERR_STATUS);
+
+	for_each_tile(tile, xe, i) {
+		struct xe_gt *mmio = tile->primary_gt;
+
+		if (dev_pcieerr_status & DEV_PCIEERR_IS_FATAL(i))
+			xe_hw_error_source_handler(tile, HARDWARE_ERROR_FATAL);
+
+		master_ctl = xe_mmio_read32(mmio, GFX_MSTR_IRQ);
+		xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
+		xe_hw_error_irq_handler(tile, master_ctl);
+	}
+	if (dev_pcieerr_status)
+		xe_mmio_write32(root_mmio, DEV_PCIEERR_STATUS, dev_pcieerr_status);
+}
+
 /*
  * Top-level interrupt handler for Xe_LP+ and beyond.  These platforms have
  * a "master tile" interrupt register which must be consulted before the
@@ -984,7 +1019,7 @@ int xe_irq_install(struct xe_device *xe)
 		return -EINVAL;
 	}
 
-	update_valid_error_regs(xe);
+	process_hw_errors(xe);
 
 	xe->irq.enabled = true;
 
-- 
2.25.1