[Intel-gfx] [PATCH 27/37] drm/i915/dg1: Log counter on SLM ECC error

Lucas De Marchi lucas.demarchi at intel.com
Thu May 21 00:37:53 UTC 2020


From: Fernando Pacheco <fernando.pacheco at intel.com>

Correctable and uncorrectable Shared Local Memory (SLM)
ECC errors will be counted in two different Thread Dispatch
Logic (TDL) registers. GuC will receive a message
from TDL when the first correctable/uncorrectable error is
detected by SLM (first after a reset or register clear). This
message is then forwarded to the appropriate severity register.

Correctable errors will be routed to the kernel driver and uncorrectable
errors are expected to be routed as PCIe errors, although the option
exists to route both as interrupts.

Service the interrupt and read the TDL registers for the error count.

Cc: Paulo Zanoni <paulo.r.zanoni at intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Cc: Fernando Pacheco <fernando.pacheco at intel.com>
Cc: Radhakrishna Sripada <radhakrishna.sripada at intel.com>
Signed-off-by: Fernando Pacheco <fernando.pacheco at intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c | 10 +++++++++-
 drivers/gpu/drm/i915/i915_reg.h |  7 +++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 17e679b910da..ca35edef492d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2536,7 +2536,7 @@ gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
 {
 	void __iomem * const regs = i915->uncore.regs;
 	const char *hw_err_str = hardware_error_type_to_str(hw_err);
-	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
+	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR | SLM_ERROR);
 	u32 errstat;
 
 	lockdep_assert_held(&i915->irq_lock);
@@ -2565,6 +2565,14 @@ gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
 	if (errstat & EU_IC_ERROR)
 		DRM_ERROR("detected EU IC %s hardware error\n", hw_err_str);
 
+	if (errstat & SLM_ERROR) {
+		struct drm_i915_private *dev_priv = i915;
+
+		DRM_ERROR("detected %u SLM %s hardware error(s)\n",
+			  I915_READ(SLM_ECC_ERROR_CNTR(hw_err)),
+			  hw_err_str);
+	}
+
 	/*
 	 * TODO: The remaining GT errors don't have a
 	 * need for targeted logging at the moment. We
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 40cb361b4254..b9c142f86611 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7765,6 +7765,13 @@ enum hardware_error {
 						_ERR_STAT_GT_NONFATAL))
 #define  EU_GRF_ERROR			(1 << 15)
 #define  EU_IC_ERROR			(1 << 14)
+#define  SLM_ERROR			(1 << 13)
+
+#define _SLM_ECC_ERROR_CNT		0xe7f4
+#define _SLM_UNCORR_ECC_ERROR_CNT	0xe7c0
+#define SLM_ECC_ERROR_CNTR(x)		_MMIO((x) == HARDWARE_ERROR_CORRECTABLE ? \
+						_SLM_ECC_ERROR_CNT : \
+						_SLM_UNCORR_ECC_ERROR_CNT)
 
 #define GEN11_RENDER_COPY_INTR_ENABLE	_MMIO(0x190030)
 #define GEN11_VCS_VECS_INTR_ENABLE	_MMIO(0x190034)
-- 
2.26.2



More information about the Intel-gfx mailing list