[Intel-xe] [PATCH 01/11] drm/xe: Handle errors from various components.

Himal Prasad Ghimiray himal.prasad.ghimiray at intel.com
Wed Sep 27 11:46:17 UTC 2023


The GFX device can generate several classes of errors under the new
infrastructure: correctable, non-fatal, and fatal errors.

The non-fatal and fatal error classes distinguish between levels of
severity for uncorrectable errors. The driver only logs errors from
the various components within the graphics device and updates the
corresponding counters. Anything more is handled at the system level.

For errors that are routed as interrupts, three bits in the Master
Interrupt Register are used to convey the class of error.

For each class of error, determine the source of the error (IP block)
by reading the Device Error Source Register (RW1C) that
corresponds to the class of error being serviced.

Bspec: 50875, 53073, 53074, 53075

Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Matt Roper <matthew.d.roper at intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Jani Nikula <jani.nikula at intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
 drivers/gpu/drm/xe/Makefile                  |   1 +
 drivers/gpu/drm/xe/regs/xe_regs.h            |   2 +-
 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h |  15 ++
 drivers/gpu/drm/xe/xe_device_types.h         |  11 +
 drivers/gpu/drm/xe/xe_hw_error.c             | 211 +++++++++++++++++++
 drivers/gpu/drm/xe/xe_hw_error.h             |  64 ++++++
 drivers/gpu/drm/xe/xe_irq.c                  |   5 +
 7 files changed, 308 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
 create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
 create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index b1681d1416eb..be1f3afec3dc 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -76,6 +76,7 @@ xe-y += xe_bb.o \
 	xe_guc_submit.o \
 	xe_hw_engine.o \
 	xe_hw_engine_class_sysfs.o \
+	xe_hw_error.o \
 	xe_hw_fence.o \
 	xe_huc.o \
 	xe_huc_debugfs.o \
diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
index 8a93ab169e04..863df80d69bf 100644
--- a/drivers/gpu/drm/xe/regs/xe_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_regs.h
@@ -91,7 +91,7 @@
 #define   GU_MISC_IRQ				REG_BIT(29)
 #define   DISPLAY_IRQ				REG_BIT(16)
 #define   GT_DW_IRQ(x)				REG_BIT(x)
+#define   XE_ERROR_IRQ(x)			REG_BIT(26 + (x))
 
 #define PVC_RP_STATE_CAP			XE_REG(0x281014)
-
 #endif
diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
new file mode 100644
index 000000000000..db78d6687213
--- /dev/null
+++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+#ifndef XE_TILE_ERROR_REGS_H_
+#define XE_TILE_ERROR_REGS_H_
+
+#include <linux/stddef.h>
+
+#define _DEV_ERR_STAT_NONFATAL                         0x100178
+#define _DEV_ERR_STAT_CORRECTABLE                      0x10017c
+#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
+								_DEV_ERR_STAT_CORRECTABLE, \
+								_DEV_ERR_STAT_NONFATAL))
+#endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 32ab0fea04ee..4d8b0724d1fe 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -14,6 +14,7 @@
 
 #include "xe_devcoredump_types.h"
 #include "xe_gt_types.h"
+#include "xe_hw_error.h"
 #include "xe_platform_types.h"
 #include "xe_pmu.h"
 #include "xe_step_types.h"
@@ -173,6 +174,11 @@ struct xe_tile {
 
 	/** @sysfs: sysfs' kobj used by xe_tile_sysfs */
 	struct kobject *sysfs;
+
+	/** @errors: hardware errors reported for the tile */
+	struct tile_hw_errors {
+		unsigned long count[XE_TILE_HW_ERROR_MAX];
+	} errors;
 };
 
 /**
@@ -365,6 +371,11 @@ struct xe_device {
 	/** @pmu: performance monitoring unit */
 	struct xe_pmu pmu;
 
+	/** @hw_err_regs: list of hw error regs */
+	struct hardware_errors_regs {
+		const struct err_msg_cntr_pair *dev_err_stat[HARDWARE_ERROR_MAX];
+	} hw_err_regs;
+
 	/* private: */
 
 #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
new file mode 100644
index 000000000000..357d0f962d91
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#include "xe_hw_error.h"
+
+#include "regs/xe_regs.h"
+#include "regs/xe_tile_error_regs.h"
+#include "xe_device.h"
+#include "xe_mmio.h"
+
+static const char *
+hardware_error_type_to_str(const enum hardware_error hw_err)
+{
+	switch (hw_err) {
+	case HARDWARE_ERROR_CORRECTABLE:
+		return "CORRECTABLE";
+	case HARDWARE_ERROR_NONFATAL:
+		return "NONFATAL";
+	case HARDWARE_ERROR_FATAL:
+		return "FATAL";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+static const struct err_msg_cntr_pair dg2_err_stat_fatal_reg[] = {
+	[0]         = {"GT",			XE_TILE_HW_ERR_GT_FATAL},
+	[1 ... 3]   = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[4]         = {"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_FATAL},
+	[5 ... 7]   = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[8]         = {"GSC error",		XE_TILE_HW_ERR_GSC_FATAL},
+	[9 ... 11]  = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[12]        = {"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_FATAL},
+	[13 ... 15] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[16]        = {"SOC",			XE_TILE_HW_ERR_SOC_FATAL},
+	[17 ... 31] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_FATAL},
+};
+
+static const struct err_msg_cntr_pair dg2_err_stat_nonfatal_reg[] = {
+	[0]         = {"GT",			XE_TILE_HW_ERR_GT_NONFATAL},
+	[1 ... 3]   = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[4]         = {"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_NONFATAL},
+	[5 ... 7]   = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[8]         = {"GSC error",		XE_TILE_HW_ERR_GSC_NONFATAL},
+	[9 ... 11]  = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[12]        = {"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_NONFATAL},
+	[13 ... 15] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[16]        = {"SOC",			XE_TILE_HW_ERR_SOC_NONFATAL},
+	[17 ... 19] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[20]        = {"MERT",			XE_TILE_HW_ERR_MERT_NONFATAL},
+	[21 ... 31] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+};
+
+static const struct err_msg_cntr_pair dg2_err_stat_correctable_reg[] = {
+	[0]         = {"GT",			XE_TILE_HW_ERR_GT_CORR},
+	[1 ... 3]   = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
+	[4]         = {"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_CORR},
+	[5 ... 7]   = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
+	[8]         = {"GSC error",		XE_TILE_HW_ERR_GSC_CORR},
+	[9 ... 11]  = {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
+	[12]        = {"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_CORR},
+	[13 ... 15] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_CORR},
+	[16]        = {"SOC",			XE_TILE_HW_ERR_SOC_CORR},
+	[17 ... 31] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_CORR},
+};
+
+static const struct err_msg_cntr_pair pvc_err_stat_fatal_reg[] = {
+	[0]         =  {"GT",			XE_TILE_HW_ERR_GT_FATAL},
+	[1]         =  {"SGGI Cmd Parity",	XE_TILE_HW_ERR_SGGI_FATAL},
+	[2 ... 7]   =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[8]         =  {"GSC error",		XE_TILE_HW_ERR_GSC_FATAL},
+	[9]         =  {"SGLI Cmd Parity",	XE_TILE_HW_ERR_SGLI_FATAL},
+	[10 ... 12] =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[13]        =  {"SGCI Cmd Parity",	XE_TILE_HW_ERR_SGCI_FATAL},
+	[14 ... 15] =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[16]        =  {"SOC ERROR",		XE_TILE_HW_ERR_SOC_FATAL},
+	[17 ... 19] =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+	[20]        =  {"MERT Cmd Parity",	XE_TILE_HW_ERR_MERT_FATAL},
+	[21 ... 31] =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+};
+
+static const struct err_msg_cntr_pair pvc_err_stat_nonfatal_reg[] = {
+	[0]         =  {"GT",			XE_TILE_HW_ERR_GT_NONFATAL},
+	[1]         =  {"SGGI Data Parity",	XE_TILE_HW_ERR_SGGI_NONFATAL},
+	[2 ... 7]   =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[8]         =  {"GSC",			XE_TILE_HW_ERR_GSC_NONFATAL},
+	[9]         =  {"SGLI Data Parity",	XE_TILE_HW_ERR_SGLI_NONFATAL},
+	[10 ... 12] =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[13]        =  {"SGCI Data Parity",	XE_TILE_HW_ERR_SGCI_NONFATAL},
+	[14 ... 15] =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[16]        =  {"SOC",			XE_TILE_HW_ERR_SOC_NONFATAL},
+	[17 ... 19] =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+	[20]        =  {"MERT Data Parity",	XE_TILE_HW_ERR_MERT_NONFATAL},
+	[21 ... 31] =  {"Undefined",            XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+};
+
+static const struct err_msg_cntr_pair pvc_err_stat_correctable_reg[] = {
+	[0]         =  {"GT",			XE_TILE_HW_ERR_GT_CORR},
+	[1 ... 7]   =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
+	[8]         =  {"GSC",			XE_TILE_HW_ERR_GSC_CORR},
+	[9 ... 31]  =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
+};
+
+static const struct err_msg_cntr_pair dev_err_stat_fatal_reg[] = {
+	[0]         =  {"GT",			XE_TILE_HW_ERR_GT_FATAL},
+	[1 ... 31]  =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
+};
+
+static const struct err_msg_cntr_pair dev_err_stat_nonfatal_reg[] = {
+	[0]         =  {"GT",			XE_TILE_HW_ERR_GT_NONFATAL},
+	[1 ... 31]  =  {"Undefined",            XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+};
+
+static const struct err_msg_cntr_pair dev_err_stat_correctable_reg[] = {
+	[0]         =  {"GT",			XE_TILE_HW_ERR_GT_CORR},
+	[1 ... 31]  =  {"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
+};
+
+void xe_assign_hw_err_regs(struct xe_device *xe)
+{
+	const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
+
+	if (xe->info.platform == XE_DG2) {
+		dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dg2_err_stat_correctable_reg;
+		dev_err_stat[HARDWARE_ERROR_NONFATAL] = dg2_err_stat_nonfatal_reg;
+		dev_err_stat[HARDWARE_ERROR_FATAL] = dg2_err_stat_fatal_reg;
+	} else if (xe->info.platform == XE_PVC) {
+		dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_correctable_reg;
+		dev_err_stat[HARDWARE_ERROR_NONFATAL] = pvc_err_stat_nonfatal_reg;
+		dev_err_stat[HARDWARE_ERROR_FATAL] = pvc_err_stat_fatal_reg;
+	} else {
+		/* For other platforms report only GT errors */
+		dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dev_err_stat_correctable_reg;
+		dev_err_stat[HARDWARE_ERROR_NONFATAL] = dev_err_stat_nonfatal_reg;
+		dev_err_stat[HARDWARE_ERROR_FATAL] = dev_err_stat_fatal_reg;
+	}
+}
+
+static void
+xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
+{
+	const char *hw_err_str = hardware_error_type_to_str(hw_err);
+	const struct hardware_errors_regs *err_regs;
+	const struct err_msg_cntr_pair *errstat;
+	unsigned long errsrc;
+	unsigned long flags;
+	const char *errmsg;
+	struct xe_gt *mmio;
+	u32 indx;
+	u32 errbit;
+
+	spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
+	err_regs = &tile_to_xe(tile)->hw_err_regs;
+	errstat = err_regs->dev_err_stat[hw_err];
+	mmio = tile->primary_gt;
+	errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
+	if (!errsrc) {
+		drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
+				    "TILE%d detected DEV_ERR_STAT_REG_%s blank!\n",
+				    tile->id, hw_err_str);
+		goto unlock;
+	}
+
+	drm_info(&tile_to_xe(tile)->drm, HW_ERR
+		 "TILE%d DEV_ERR_STAT_REG_%s=0x%08lx\n", tile->id, hw_err_str, errsrc);
+
+	for_each_set_bit(errbit, &errsrc, 32) {
+		errmsg = errstat[errbit].errmsg;
+		indx = errstat[errbit].cntr_indx;
+
+		if (hw_err == HARDWARE_ERROR_CORRECTABLE)
+			drm_warn(&tile_to_xe(tile)->drm,
+				 HW_ERR "TILE%d detected %s %s error, bit[%d] is set\n",
+				 tile->id, errmsg, hw_err_str, errbit);
+
+		else
+			drm_err_ratelimited(&tile_to_xe(tile)->drm,
+					    HW_ERR "TILE%d detected %s %s error, bit[%d] is set\n",
+					    tile->id, errmsg, hw_err_str, errbit);
+		tile->errors.count[indx]++;
+	}
+
+	xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
+unlock:
+	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
+}
+
+/*
+ * XE platforms add three error bits to the Master Interrupt
+ * Register to support error handling. These three bits are
+ * used to convey the class of error:
+ * FATAL, NONFATAL, or CORRECTABLE.
+ *
+ * To process an interrupt:
+ *	 Determine the source of the error (IP block) by reading
+ *	 the Device Error Source Register (RW1C) that
+ *	 corresponds to the class of error being serviced
+ *	 and log the error.
+ */
+void
+xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
+{
+	enum hardware_error hw_err;
+
+	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
+		if (master_ctl & XE_ERROR_IRQ(hw_err))
+			xe_hw_error_source_handler(tile, hw_err);
+	}
+}
diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
new file mode 100644
index 000000000000..c0c05b9130eb
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_hw_error.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+#ifndef XE_HW_ERRORS_H_
+#define XE_HW_ERRORS_H_
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/* Error categories reported by hardware */
+enum hardware_error {
+	HARDWARE_ERROR_CORRECTABLE = 0,
+	HARDWARE_ERROR_NONFATAL = 1,
+	HARDWARE_ERROR_FATAL = 2,
+	HARDWARE_ERROR_MAX,
+};
+
+/* Counter indices for correctable and uncorrectable errors reported on a tile */
+enum xe_tile_hw_errors {
+	XE_TILE_HW_ERR_GT_FATAL = 0,
+	XE_TILE_HW_ERR_SGGI_FATAL,
+	XE_TILE_HW_ERR_DISPLAY_FATAL,
+	XE_TILE_HW_ERR_SGDI_FATAL,
+	XE_TILE_HW_ERR_SGLI_FATAL,
+	XE_TILE_HW_ERR_SGUNIT_FATAL,
+	XE_TILE_HW_ERR_SGCI_FATAL,
+	XE_TILE_HW_ERR_GSC_FATAL,
+	XE_TILE_HW_ERR_SOC_FATAL,
+	XE_TILE_HW_ERR_MERT_FATAL,
+	XE_TILE_HW_ERR_SGMI_FATAL,
+	XE_TILE_HW_ERR_UNKNOWN_FATAL,
+	XE_TILE_HW_ERR_SGGI_NONFATAL,
+	XE_TILE_HW_ERR_DISPLAY_NONFATAL,
+	XE_TILE_HW_ERR_SGDI_NONFATAL,
+	XE_TILE_HW_ERR_SGLI_NONFATAL,
+	XE_TILE_HW_ERR_GT_NONFATAL,
+	XE_TILE_HW_ERR_SGUNIT_NONFATAL,
+	XE_TILE_HW_ERR_SGCI_NONFATAL,
+	XE_TILE_HW_ERR_GSC_NONFATAL,
+	XE_TILE_HW_ERR_SOC_NONFATAL,
+	XE_TILE_HW_ERR_MERT_NONFATAL,
+	XE_TILE_HW_ERR_SGMI_NONFATAL,
+	XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
+	XE_TILE_HW_ERR_GT_CORR,
+	XE_TILE_HW_ERR_DISPLAY_CORR,
+	XE_TILE_HW_ERR_SGUNIT_CORR,
+	XE_TILE_HW_ERR_GSC_CORR,
+	XE_TILE_HW_ERR_SOC_CORR,
+	XE_TILE_HW_ERR_UNKNOWN_CORR,
+	XE_TILE_HW_ERROR_MAX,
+};
+
+struct err_msg_cntr_pair {
+	const char *errmsg;
+	const u32 cntr_indx;
+};
+
+struct xe_device;
+struct xe_tile;
+
+void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
+void xe_assign_hw_err_regs(struct xe_device *xe);
+#endif
diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
index 504cb94d0ee8..68f5f7b6a74e 100644
--- a/drivers/gpu/drm/xe/xe_irq.c
+++ b/drivers/gpu/drm/xe/xe_irq.c
@@ -434,6 +434,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
 		xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
 
 		gt_irq_handler(tile, master_ctl, intr_dw, identity);
+		xe_hw_error_irq_handler(tile, master_ctl);
 
 		/*
 		 * Display interrupts (including display backlight operations
@@ -591,6 +592,10 @@ int xe_irq_install(struct xe_device *xe)
 		return -EINVAL;
 	}
 
+	xe_assign_hw_err_regs(xe);
+
+	xe->irq.enabled = true;
+
 	xe_irq_reset(xe);
 
 	err = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI | PCI_IRQ_MSIX);
-- 
2.25.1



More information about the Intel-xe mailing list