[PATCH v4 7/9] drm/xe: Add support to handle hardware errors

Fri Jul 11 05:35:04 UTC 2025

Hi Umesh

On 7/11/2025 2:39 AM, Umesh Nerlige Ramappa wrote:
> Resending since it got lost earlier...
> 
> On Wed, Jul 09, 2025 at 04:50:19PM +0530, Riana Tauro wrote:
>> Gfx device reports two classes of errors: uncorrectable and
>> correctable. Depending on the severity uncorrectable errors are
>> further classified as non fatal and fatal
>>
>> Correctable and non-fatal errors are reported as MSI's and bits in
>> the Master Interrupt Register indicate the class of the error.
>> The source of the error is then read from the Device Error Source
>> Register.
> 
> nit: Since Fatal is a separate category, maybe a split here into a 
> separate paragraph and some formatting would be good.
> 
>> Fatal errors are reported as PCIe errors.
>> When a PCIe error is asserted, the OS will perform a device warm reset
>> which causes the driver to reload. The error registers are sticky
>> and the values are maintained through a warm reset.
>>
>> Add basic support to handle these errors
>>
>> Bspec: 50875, 53073, 53074, 53075, 53076
>>
>> Co-developed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>> ---
>> drivers/gpu/drm/xe/Makefile                |   1 +
>> drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  15 +++
>> drivers/gpu/drm/xe/regs/xe_irq_regs.h      |   1 +
>> drivers/gpu/drm/xe/xe_hw_error.c           | 108 +++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_hw_error.h           |  15 +++
>> drivers/gpu/drm/xe/xe_irq.c                |   4 +
>> 6 files changed, 144 insertions(+)
>> create mode 100644 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 1d97e5b63f4e..fea8ee3b0785 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -73,6 +73,7 @@ xe-y += xe_bb.o \
>>     xe_hw_engine.o \
>>     xe_hw_engine_class_sysfs.o \
>>     xe_hw_engine_group.o \
>> +    xe_hw_error.o \
>>     xe_hw_fence.o \
>>     xe_irq.o \
>>     xe_lrc.o \
>> diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/ 
>> drm/xe/regs/xe_hw_error_regs.h
>> new file mode 100644
>> index 000000000000..ed9b81fb28a0
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_HW_ERROR_REGS_H_
>> +#define _XE_HW_ERROR_REGS_H_
>> +
>> +#define DEV_ERR_STAT_NONFATAL            0x100178
>> +#define DEV_ERR_STAT_CORRECTABLE        0x10017c
>> +#define DEV_ERR_STAT_REG(x)            XE_REG(_PICK_EVEN((x), \
>> +                                  DEV_ERR_STAT_CORRECTABLE, \
>> +                                  DEV_ERR_STAT_NONFATAL))
> 
> For x = 1 and x = 2, I don't see the above result in correct values.
> Can you please double check?

I was confused by the same thing when I took the patch from the other
series. However, the second term of the macro, (__b) - (__a), is negative
here (0x100178 - 0x10017c = -4), so the resulting register offsets are
correct.

Calculations for 1 and 2

#define _PICK_EVEN(__index, __a, __b) ((__a) + (__index) * ((__b) - (__a)))

_PICK_EVEN([HARDWARE_ERROR_NONFATAL = 1])
    = DEV_ERR_STAT_CORRECTABLE + 1 * (DEV_ERR_STAT_NONFATAL - DEV_ERR_STAT_CORRECTABLE)
    = 0x10017c + 1 * (0x100178 - 0x10017c)
    = 0x100178

_PICK_EVEN([HARDWARE_ERROR_FATAL = 2])
    = DEV_ERR_STAT_CORRECTABLE + 2 * (DEV_ERR_STAT_NONFATAL - DEV_ERR_STAT_CORRECTABLE)
    = 0x10017c + 2 * (0x100178 - 0x10017c)
    = 0x100174

Thanks
Riana

> 
> What about DEV_ERR_STAT_FATAL?
> 
> Rest looks good,
> 
> Umesh
> 
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h b/drivers/gpu/drm/ 
>> xe/regs/xe_irq_regs.h
>> index f0ecfcac4003..2758b64cec9e 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> @@ -18,6 +18,7 @@
>> #define GFX_MSTR_IRQ                XE_REG(0x190010, XE_REG_OPTION_VF)
>> #define   MASTER_IRQ                REG_BIT(31)
>> #define   GU_MISC_IRQ                REG_BIT(29)
>> +#define   ERROR_IRQ(x)                REG_BIT(26 + (x))
>> #define   DISPLAY_IRQ                REG_BIT(16)
>> #define   GT_DW_IRQ(x)                REG_BIT(x)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/ 
>> xe_hw_error.c
>> new file mode 100644
>> index 000000000000..0f2590839900
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -0,0 +1,108 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include "regs/xe_hw_error_regs.h"
>> +#include "regs/xe_irq_regs.h"
>> +
>> +#include "xe_device.h"
>> +#include "xe_hw_error.h"
>> +#include "xe_mmio.h"
>> +
>> +/* Error categories reported by hardware */
>> +enum hardware_error {
>> +    HARDWARE_ERROR_CORRECTABLE = 0,
>> +    HARDWARE_ERROR_NONFATAL = 1,
>> +    HARDWARE_ERROR_FATAL = 2,
>> +    HARDWARE_ERROR_MAX,
>> +};
>> +
>> +static const char *hw_error_to_str(const enum hardware_error hw_err)
>> +{
>> +    switch (hw_err) {
>> +    case HARDWARE_ERROR_CORRECTABLE:
>> +        return "CORRECTABLE";
>> +    case HARDWARE_ERROR_NONFATAL:
>> +        return "NONFATAL";
>> +    case HARDWARE_ERROR_FATAL:
>> +        return "FATAL";
>> +    default:
>> +        return "UNKNOWN";
>> +    }
>> +}
>> +
>> +static void hw_error_source_handler(struct xe_tile *tile, const enum 
>> hardware_error hw_err)
>> +{
>> +    const char *hw_err_str = hw_error_to_str(hw_err);
>> +    struct xe_device *xe = tile_to_xe(tile);
>> +    unsigned long flags;
>> +    u32 err_src;
>> +
>> +    if (xe->info.platform != XE_BATTLEMAGE)
>> +        return;
>> +
>> +    spin_lock_irqsave(&xe->irq.lock, flags);
>> +    err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
>> +    if (!err_src) {
>> +        drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported 
>> DEV_ERR_STAT_%s blank!\n",
>> +                    tile->id, hw_err_str);
>> +        goto unlock;
>> +    }
>> +
>> +    /* TODO: Process errors per source */
>> +
>> +    xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
>> +
>> +unlock:
>> +    spin_unlock_irqrestore(&xe->irq.lock, flags);
>> +}
>> +
>> +/**
>> + * xe_hw_error_irq_handler - irq handling for hw errors
>> + * @tile: tile instance
>> + * @master_ctl: value read from master interrupt register
>> + *
>> + * Xe platforms add three error bits to the master interrupt register 
>> to support error handling.
>> + * These three bits are used to convey the class of error FATAL, 
>> NONFATAL, or CORRECTABLE.
>> + * To process the interrupt, determine the source of error by reading 
>> the Device Error Source
>> + * Register that corresponds to the class of error being serviced.
>> + */
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>> +{
>> +    enum hardware_error hw_err;
>> +
>> +    for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
>> +        if (master_ctl & ERROR_IRQ(hw_err))
>> +            hw_error_source_handler(tile, hw_err);
>> +}
>> +
>> +/*
>> + * Process hardware errors during boot
>> + */
>> +static void process_hw_errors(struct xe_device *xe)
>> +{
>> +    struct xe_tile *tile;
>> +    u32 master_ctl;
>> +    u8 id;
>> +
>> +    for_each_tile(tile, xe, id) {
>> +        master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ);
>> +        xe_hw_error_irq_handler(tile, master_ctl);
>> +        xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl);
>> +    }
>> +}
>> +
>> +/**
>> + * xe_hw_error_init - Initialize hw errors
>> + * @xe: xe device instance
>> + *
>> + * Initialize and process hw errors
>> + */
>> +void xe_hw_error_init(struct xe_device *xe)
>> +{
>> +    if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
>> +        return;
>> +
>> +    process_hw_errors(xe);
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/ 
>> xe_hw_error.h
>> new file mode 100644
>> index 000000000000..d86e28c5180c
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +#ifndef XE_HW_ERROR_H_
>> +#define XE_HW_ERROR_H_
>> +
>> +#include <linux/types.h>
>> +
>> +struct xe_tile;
>> +struct xe_device;
>> +
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 
>> master_ctl);
>> +void xe_hw_error_init(struct xe_device *xe);
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>> index 5362d3174b06..24ccf3bec52c 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -18,6 +18,7 @@
>> #include "xe_gt.h"
>> #include "xe_guc.h"
>> #include "xe_hw_engine.h"
>> +#include "xe_hw_error.h"
>> #include "xe_memirq.h"
>> #include "xe_mmio.h"
>> #include "xe_pxp.h"
>> @@ -466,6 +467,7 @@ static irqreturn_t dg1_irq_handler(int irq, void 
>> *arg)
>>         xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>
>>         gt_irq_handler(tile, master_ctl, intr_dw, identity);
>> +        xe_hw_error_irq_handler(tile, master_ctl);
>>
>>         /*
>>          * Display interrupts (including display backlight operations
>> @@ -753,6 +755,8 @@ int xe_irq_install(struct xe_device *xe)
>>     int nvec = 1;
>>     int err;
>>
>> +    xe_hw_error_init(xe);
>> +
>>     xe_irq_reset(xe);
>>
>>     if (xe_device_has_msix(xe)) {
>> -- 
>> 2.47.1
>>