[PATCH v4 7/9] drm/xe: Add support to handle hardware errors

Fri Jul 11 17:34:54 UTC 2025

On Fri, Jul 11, 2025 at 11:05:04AM +0530, Riana Tauro wrote:
>Hi Umesh
>
>On 7/11/2025 2:39 AM, Umesh Nerlige Ramappa wrote:
>>Resending since it got lost earlier...
>>
>>On Wed, Jul 09, 2025 at 04:50:19PM +0530, Riana Tauro wrote:
>>>Gfx device reports two classes of errors: uncorrectable and
>>>correctable. Depending on the severity uncorrectable errors are
>>>further classified as non fatal and fatal
>>>
>>>Correctable and non-fatal errors are reported as MSI's and bits in
>>>the Master Interrupt Register indicate the class of the error.
>>>The source of the error is then read from the Device Error Source
>>>Register.
>>
>>nit: Since Fatal is a separate category, maybe a split here into a 
>>separate paragraph and some formatting would be good.
>>
>>>Fatal errors are reported as PCIe errors.
>>>When a PCIe error is asserted, the OS will perform a device warm reset
>>>which causes the driver to reload. The error registers are sticky
>>>and the values are maintained through a warm reset.
>>>
>>>Add basic support to handle these errors
>>>
>>>Bspec: 50875, 53073, 53074, 53075, 53076
>>>
>>>Co-developed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>>Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>>Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>>>---
>>>drivers/gpu/drm/xe/Makefile                |   1 +
>>>drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  15 +++
>>>drivers/gpu/drm/xe/regs/xe_irq_regs.h      |   1 +
>>>drivers/gpu/drm/xe/xe_hw_error.c           | 108 +++++++++++++++++++++
>>>drivers/gpu/drm/xe/xe_hw_error.h           |  15 +++
>>>drivers/gpu/drm/xe/xe_irq.c                |   4 +
>>>6 files changed, 144 insertions(+)
>>>create mode 100644 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>>>create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>>>create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
>>>
>>>diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>>>index 1d97e5b63f4e..fea8ee3b0785 100644
>>>--- a/drivers/gpu/drm/xe/Makefile
>>>+++ b/drivers/gpu/drm/xe/Makefile
>>>@@ -73,6 +73,7 @@ xe-y += xe_bb.o \
>>>    xe_hw_engine.o \
>>>    xe_hw_engine_class_sysfs.o \
>>>    xe_hw_engine_group.o \
>>>+    xe_hw_error.o \
>>>    xe_hw_fence.o \
>>>    xe_irq.o \
>>>    xe_lrc.o \
>>>diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h 
>>>b/drivers/gpu/ drm/xe/regs/xe_hw_error_regs.h
>>>new file mode 100644
>>>index 000000000000..ed9b81fb28a0
>>>--- /dev/null
>>>+++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>>>@@ -0,0 +1,15 @@
>>>+/* SPDX-License-Identifier: MIT */
>>>+/*
>>>+ * Copyright © 2025 Intel Corporation
>>>+ */
>>>+
>>>+#ifndef _XE_HW_ERROR_REGS_H_
>>>+#define _XE_HW_ERROR_REGS_H_
>>>+
>>>+#define DEV_ERR_STAT_NONFATAL            0x100178
>>>+#define DEV_ERR_STAT_CORRECTABLE        0x10017c
>>>+#define DEV_ERR_STAT_REG(x)            XE_REG(_PICK_EVEN((x), \
>>>+                                  DEV_ERR_STAT_CORRECTABLE, \
>>>+                                  DEV_ERR_STAT_NONFATAL))
>>
>>For x = 1 and x = 2, I don't see the above result in correct values.
>>Can you please double check?
>
>I was confused by the same thing when I took the patch from the other
>series. But the second term of the macro becomes negative, so the
>resulting register offsets are correct.
>
>Calculations for 1 and 2
>
>#define _PICK_EVEN(__index, __a, __b) ((__a) + (__index) * ((__b) - (__a)))
>
>_PICK_EVEN([HARDWARE_ERROR_NONFATAL = 1]) = DEV_ERR_STAT_CORRECTABLE + 
>1 * (DEV_ERR_STAT_NONFATAL - DEV_ERR_STAT_CORRECTABLE)
>					    0x10017c + 1 * (0x100178 - 0x10017c)
>   					    0x100178
>
>_PICK_EVEN([HARDWARE_ERROR_FATAL = 2]) = DEV_ERR_STAT_CORRECTABLE + 2
>* (DEV_ERR_STAT_NONFATAL - DEV_ERR_STAT_CORRECTABLE)
>					    0x10017c + 2 * (0x100178 - 0x10017c)
>   					    0x100174

ok, makes sense now,

Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>

Thanks,
Umesh
>
>Thanks
>Riana
>		
>
>>
>>What about DEV_ERR_STAT_FATAL?
>>
>>Rest looks good,
>>
>>Umesh
>>
>>>+
>>>+#endif
>>>diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h 
>>>b/drivers/gpu/drm/ xe/regs/xe_irq_regs.h
>>>index f0ecfcac4003..2758b64cec9e 100644
>>>--- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>>>+++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>>>@@ -18,6 +18,7 @@
>>>#define GFX_MSTR_IRQ                XE_REG(0x190010, XE_REG_OPTION_VF)
>>>#define   MASTER_IRQ                REG_BIT(31)
>>>#define   GU_MISC_IRQ                REG_BIT(29)
>>>+#define   ERROR_IRQ(x)                REG_BIT(26 + (x))
>>>#define   DISPLAY_IRQ                REG_BIT(16)
>>>#define   GT_DW_IRQ(x)                REG_BIT(x)
>>>
>>>diff --git a/drivers/gpu/drm/xe/xe_hw_error.c 
>>>b/drivers/gpu/drm/xe/ xe_hw_error.c
>>>new file mode 100644
>>>index 000000000000..0f2590839900
>>>--- /dev/null
>>>+++ b/drivers/gpu/drm/xe/xe_hw_error.c
>>>@@ -0,0 +1,108 @@
>>>+// SPDX-License-Identifier: MIT
>>>+/*
>>>+ * Copyright © 2025 Intel Corporation
>>>+ */
>>>+
>>>+#include "regs/xe_hw_error_regs.h"
>>>+#include "regs/xe_irq_regs.h"
>>>+
>>>+#include "xe_device.h"
>>>+#include "xe_hw_error.h"
>>>+#include "xe_mmio.h"
>>>+
>>>+/* Error categories reported by hardware */
>>>+enum hardware_error {
>>>+    HARDWARE_ERROR_CORRECTABLE = 0,
>>>+    HARDWARE_ERROR_NONFATAL = 1,
>>>+    HARDWARE_ERROR_FATAL = 2,
>>>+    HARDWARE_ERROR_MAX,
>>>+};
>>>+
>>>+static const char *hw_error_to_str(const enum hardware_error hw_err)
>>>+{
>>>+    switch (hw_err) {
>>>+    case HARDWARE_ERROR_CORRECTABLE:
>>>+        return "CORRECTABLE";
>>>+    case HARDWARE_ERROR_NONFATAL:
>>>+        return "NONFATAL";
>>>+    case HARDWARE_ERROR_FATAL:
>>>+        return "FATAL";
>>>+    default:
>>>+        return "UNKNOWN";
>>>+    }
>>>+}
>>>+
>>>+static void hw_error_source_handler(struct xe_tile *tile, const 
>>>enum hardware_error hw_err)
>>>+{
>>>+    const char *hw_err_str = hw_error_to_str(hw_err);
>>>+    struct xe_device *xe = tile_to_xe(tile);
>>>+    unsigned long flags;
>>>+    u32 err_src;
>>>+
>>>+    if (xe->info.platform != XE_BATTLEMAGE)
>>>+        return;
>>>+
>>>+    spin_lock_irqsave(&xe->irq.lock, flags);
>>>+    err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
>>>+    if (!err_src) {
>>>+        drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported 
>>>DEV_ERR_STAT_%s blank!\n",
>>>+                    tile->id, hw_err_str);
>>>+        goto unlock;
>>>+    }
>>>+
>>>+    /* TODO: Process errors per source */
>>>+
>>>+    xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
>>>+
>>>+unlock:
>>>+    spin_unlock_irqrestore(&xe->irq.lock, flags);
>>>+}
>>>+
>>>+/**
>>>+ * xe_hw_error_irq_handler - irq handling for hw errors
>>>+ * @tile: tile instance
>>>+ * @master_ctl: value read from master interrupt register
>>>+ *
>>>+ * Xe platforms add three error bits to the master interrupt 
>>>register to support error handling.
>>>+ * These three bits are used to convey the class of error FATAL, 
>>>NONFATAL, or CORRECTABLE.
>>>+ * To process the interrupt, determine the source of error by 
>>>reading the Device Error Source
>>>+ * Register that corresponds to the class of error being serviced.
>>>+ */
>>>+void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>>>+{
>>>+    enum hardware_error hw_err;
>>>+
>>>+    for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
>>>+        if (master_ctl & ERROR_IRQ(hw_err))
>>>+            hw_error_source_handler(tile, hw_err);
>>>+}
>>>+
>>>+/*
>>>+ * Process hardware errors during boot
>>>+ */
>>>+static void process_hw_errors(struct xe_device *xe)
>>>+{
>>>+    struct xe_tile *tile;
>>>+    u32 master_ctl;
>>>+    u8 id;
>>>+
>>>+    for_each_tile(tile, xe, id) {
>>>+        master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ);
>>>+        xe_hw_error_irq_handler(tile, master_ctl);
>>>+        xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl);
>>>+    }
>>>+}
>>>+
>>>+/**
>>>+ * xe_hw_error_init - Initialize hw errors
>>>+ * @xe: xe device instance
>>>+ *
>>>+ * Initialize and process hw errors
>>>+ */
>>>+void xe_hw_error_init(struct xe_device *xe)
>>>+{
>>>+    if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
>>>+        return;
>>>+
>>>+    process_hw_errors(xe);
>>>+}
>>>diff --git a/drivers/gpu/drm/xe/xe_hw_error.h 
>>>b/drivers/gpu/drm/xe/ xe_hw_error.h
>>>new file mode 100644
>>>index 000000000000..d86e28c5180c
>>>--- /dev/null
>>>+++ b/drivers/gpu/drm/xe/xe_hw_error.h
>>>@@ -0,0 +1,15 @@
>>>+/* SPDX-License-Identifier: MIT */
>>>+/*
>>>+ * Copyright © 2025 Intel Corporation
>>>+ */
>>>+#ifndef XE_HW_ERROR_H_
>>>+#define XE_HW_ERROR_H_
>>>+
>>>+#include <linux/types.h>
>>>+
>>>+struct xe_tile;
>>>+struct xe_device;
>>>+
>>>+void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 
>>>master_ctl);
>>>+void xe_hw_error_init(struct xe_device *xe);
>>>+#endif
>>>diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>>index 5362d3174b06..24ccf3bec52c 100644
>>>--- a/drivers/gpu/drm/xe/xe_irq.c
>>>+++ b/drivers/gpu/drm/xe/xe_irq.c
>>>@@ -18,6 +18,7 @@
>>>#include "xe_gt.h"
>>>#include "xe_guc.h"
>>>#include "xe_hw_engine.h"
>>>+#include "xe_hw_error.h"
>>>#include "xe_memirq.h"
>>>#include "xe_mmio.h"
>>>#include "xe_pxp.h"
>>>@@ -466,6 +467,7 @@ static irqreturn_t dg1_irq_handler(int irq, 
>>>void *arg)
>>>        xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>>
>>>        gt_irq_handler(tile, master_ctl, intr_dw, identity);
>>>+        xe_hw_error_irq_handler(tile, master_ctl);
>>>
>>>        /*
>>>         * Display interrupts (including display backlight operations
>>>@@ -753,6 +755,8 @@ int xe_irq_install(struct xe_device *xe)
>>>    int nvec = 1;
>>>    int err;
>>>
>>>+    xe_hw_error_init(xe);
>>>+
>>>    xe_irq_reset(xe);
>>>
>>>    if (xe_device_has_msix(xe)) {
>>>-- 
>>>2.47.1
>>>
>