[Intel-xe] [PATCH v5 4/4] drm/xe: Process fatal hardware errors.
Himal Prasad Ghimiray
himal.prasad.ghimiray at intel.com
Wed Aug 23 08:58:42 UTC 2023
Fatal errors are reported as PCIe errors. When a PCIe error is asserted,
the OS will perform a device warm reset which causes the driver to reload.
The error registers are sticky and the values are maintained through a
warm reset. We read these registers during the boot flow of the driver and
increment the respective error counters.
Bspec: 53076
Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Matt Roper <matthew.d.roper at intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
drivers/gpu/drm/xe/regs/xe_regs.h | 3 +++
drivers/gpu/drm/xe/xe_hw_error.c | 37 ++++++++++++++++++++++++++++++-
drivers/gpu/drm/xe/xe_hw_error.h | 3 ++-
drivers/gpu/drm/xe/xe_irq.c | 2 +-
4 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
index e223975a5acf..b8f2b1762d3f 100644
--- a/drivers/gpu/drm/xe/regs/xe_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_regs.h
@@ -90,5 +90,8 @@
#define GT_DW_IRQ(x) REG_BIT(x)
#define XE_ERROR_IRQ(x) REG_BIT(26 + (x))
+#define DEV_PCIEERR_STATUS XE_REG(0x100180)
+#define DEV_PCIEERR_IS_FATAL(x) REG_BIT((x) * 4 + 2) /* x: tile index */
+
#define PVC_RP_STATE_CAP XE_REG(0x281014)
#endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index deb020a509d2..9595e3369656 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -183,7 +183,7 @@ static const struct err_msg_cntr_pair err_stat_gt_correctable_vectr_reg[] = {
[2 ... 3] = {"L3BANK", XE_GT_HW_ERR_L3BANK_CORR},
};
-void xe_assign_hw_err_regs(struct xe_device *xe)
+static void xe_assign_hw_err_regs(struct xe_device *xe)
{
const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
const struct err_msg_cntr_pair **err_stat_gt = xe->hw_err_regs.err_stat_gt;
@@ -417,3 +417,38 @@ xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
xe_hw_error_source_handler(tile, hw_err);
}
}
+
+/**
+ * xe_process_hw_errors - checks for the occurrence of HW errors
+ * @xe: xe device instance
+ *
+ * Checks for HW errors, including FATAL errors that might have occurred
+ * in the previous boot of the driver, which will initiate a PCIe FLR of
+ * the device and cause the driver to reload.
+ */
+void xe_process_hw_errors(struct xe_device *xe)
+{
+ struct xe_tile *root_tile = xe_device_get_root_tile(xe);
+ struct xe_gt *root_mmio = root_tile->primary_gt;
+
+ u32 dev_pcieerr_status, master_ctl;
+ struct xe_tile *tile;
+ int i;
+
+ xe_assign_hw_err_regs(xe);
+
+ dev_pcieerr_status = xe_mmio_read32(root_mmio, DEV_PCIEERR_STATUS);
+
+ for_each_tile(tile, xe, i) {
+ struct xe_gt *mmio = tile->primary_gt;
+
+ if (dev_pcieerr_status & DEV_PCIEERR_IS_FATAL(i))
+ xe_hw_error_source_handler(tile, HARDWARE_ERROR_FATAL);
+
+ master_ctl = xe_mmio_read32(mmio, GFX_MSTR_IRQ);
+ xe_hw_error_irq_handler(tile, master_ctl);
+ xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
+ }
+ if (dev_pcieerr_status)
+ xe_mmio_write32(root_mmio, DEV_PCIEERR_STATUS, dev_pcieerr_status);
+}
diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
index 3fcbbcc338fe..2812407dd4bf 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.h
+++ b/drivers/gpu/drm/xe/xe_hw_error.h
@@ -104,5 +104,6 @@ struct xe_device;
struct xe_tile;
void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
-void xe_assign_hw_err_regs(struct xe_device *xe);
+void xe_process_hw_errors(struct xe_device *xe);
+
#endif
diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
index 48b933234342..be152ebd6ce9 100644
--- a/drivers/gpu/drm/xe/xe_irq.c
+++ b/drivers/gpu/drm/xe/xe_irq.c
@@ -573,7 +573,7 @@ int xe_irq_install(struct xe_device *xe)
return -EINVAL;
}
- xe_assign_hw_err_regs(xe);
+ xe_process_hw_errors(xe);
xe->irq.enabled = true;
--
2.25.1
More information about the Intel-xe
mailing list