[PATCH v3 5/7] drm/xe: Add support to handle hardware errors

Thu Jul 10 05:54:07 UTC 2025

Hi Stuart

On 7/9/2025 10:57 PM, Summers, Stuart wrote:
> On Wed, 2025-07-02 at 19:41 +0530, Riana Tauro wrote:
>> Gfx device reports two classes of errors: uncorrectable and
>> correctable. Depending on the severity uncorrectable errors are
>> further classified as non fatal and fatal
>>
>> Correctable and non-fatal errors are reported as MSI's and bits in
>> the Master Interrupt Register indicate the class of the error.
>> The source of the error is then read from the Device Error Source
>> Register. Fatal errors are reported as PCIe errors
>> When a PCIe error is asserted, the OS will perform a device warm
>> reset
>> which causes the driver to reload. The error registers are sticky
>> and the values are maintained through a warm reset
>>
>> Add basic support to handle these errors
>>
>> Bspec: 50875, 53073, 53074, 53075, 53076
>>
>> Co-developed-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>> ---
>>   drivers/gpu/drm/xe/Makefile                |   1 +
>>   drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  15 +++
>>   drivers/gpu/drm/xe/regs/xe_irq_regs.h      |   1 +
>>   drivers/gpu/drm/xe/xe_hw_error.c           | 108
>> +++++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_hw_error.h           |  15 +++
>>   drivers/gpu/drm/xe/xe_irq.c                |   4 +
>>   6 files changed, 144 insertions(+)
>>   create mode 100644 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>>   create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>>   create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile
>> b/drivers/gpu/drm/xe/Makefile
>> index 1d97e5b63f4e..fea8ee3b0785 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -73,6 +73,7 @@ xe-y += xe_bb.o \
>>          xe_hw_engine.o \
>>          xe_hw_engine_class_sysfs.o \
>>          xe_hw_engine_group.o \
>> +       xe_hw_error.o \
>>          xe_hw_fence.o \
>>          xe_irq.o \
>>          xe_lrc.o \
>> diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> new file mode 100644
>> index 000000000000..ed9b81fb28a0
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_HW_ERROR_REGS_H_
>> +#define _XE_HW_ERROR_REGS_H_
>> +
>> +#define DEV_ERR_STAT_NONFATAL                  0x100178
>> +#define DEV_ERR_STAT_CORRECTABLE               0x10017c
>> +#define
>> DEV_ERR_STAT_REG(x)                    XE_REG(_PICK_EVEN((x), \
>> +
>> DEV_ERR_STAT_CORRECTABLE, \
>> +
>> DEV_ERR_STAT_NONFATAL))
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> index f0ecfcac4003..2758b64cec9e 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> @@ -18,6 +18,7 @@
>>   #define GFX_MSTR_IRQ                           XE_REG(0x190010,
>> XE_REG_OPTION_VF)
>>   #define   MASTER_IRQ                           REG_BIT(31)
>>   #define   GU_MISC_IRQ                          REG_BIT(29)
>> +#define   ERROR_IRQ(x)                         REG_BIT(26 + (x))
>>   #define   DISPLAY_IRQ                          REG_BIT(16)
>>   #define   GT_DW_IRQ(x)                         REG_BIT(x)
>>   
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c
>> b/drivers/gpu/drm/xe/xe_hw_error.c
>> new file mode 100644
>> index 000000000000..0f2590839900
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -0,0 +1,108 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include "regs/xe_hw_error_regs.h"
>> +#include "regs/xe_irq_regs.h"
>> +
>> +#include "xe_device.h"
>> +#include "xe_hw_error.h"
>> +#include "xe_mmio.h"
>> +
>> +/* Error categories reported by hardware */
>> +enum hardware_error {
>> +       HARDWARE_ERROR_CORRECTABLE = 0,
>> +       HARDWARE_ERROR_NONFATAL = 1,
>> +       HARDWARE_ERROR_FATAL = 2,
>> +       HARDWARE_ERROR_MAX,
>> +};
>> +
>> +static const char *hw_error_to_str(const enum hardware_error hw_err)
>> +{
>> +       switch (hw_err) {
>> +       case HARDWARE_ERROR_CORRECTABLE:
>> +               return "CORRECTABLE";
>> +       case HARDWARE_ERROR_NONFATAL:
>> +               return "NONFATAL";
>> +       case HARDWARE_ERROR_FATAL:
>> +               return "FATAL";
>> +       default:
>> +               return "UNKNOWN";
>> +       }
>> +}
>> +
>> +static void hw_error_source_handler(struct xe_tile *tile, const enum
>> hardware_error hw_err)
>> +{
>> +       const char *hw_err_str = hw_error_to_str(hw_err);
>> +       struct xe_device *xe = tile_to_xe(tile);
>> +       unsigned long flags;
>> +       u32 err_src;
>> +
>> +       if (xe->info.platform != XE_BATTLEMAGE)
> 
> Why is this only on BMG? I see these same bits available on other
> platforms, e.g. LNL.
> 
>> +               return;
>> +
>> +       spin_lock_irqsave(&xe->irq.lock, flags);
>> +       err_src = xe_mmio_read32(&tile->mmio,
>> DEV_ERR_STAT_REG(hw_err));
>> +       if (!err_src) {
>> +               drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported
>> DEV_ERR_STAT_%s blank!\n",
>> +                                   tile->id, hw_err_str);
>> +               goto unlock;
>> +       }
>> +
>> +       /* TODO: Process errrors per source */
> 
> Should at least print the bits out on the initial implementation?

This patch is taken from 
https://patchwork.freedesktop.org/series/125373/ which was not merged 
due to absence of upstream consumer
Himal/Aravind can provide more details..

I have taken a single patch in this series to add support for
csc errors as it has recovery mechanism and a upstream consumer ie. fwupd.

The processing of all the bits according to source should be a separate 
series. I can retain the TODO and remove the bmg check

> 
>> +
>> +       xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err),
>> err_src);
>> +
>> +unlock:
>> +       spin_unlock_irqrestore(&xe->irq.lock, flags);
>> +}
>> +
>> +/**
>> + * xe_hw_error_irq_handler - irq handling for hw errors
>> + * @tile: tile instance
>> + * @master_ctl: value read from master interrupt register
>> + *
>> + * Xe platforms add three error bits to the master interrupt
>> register to support error handling.
>> + * These three bits are used to convey the class of error FATAL,
>> NONFATAL, or CORRECTABLE.
>> + * To process the interrupt, determine the source of error by
>> reading the Device Error Source
>> + * Register that corresponds to the class of error being serviced.
>> + */
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
>> master_ctl)
>> +{
>> +       enum hardware_error hw_err;
>> +
>> +       for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
>> +               if (master_ctl & ERROR_IRQ(hw_err))
>> +                       hw_error_source_handler(tile, hw_err);
>> +}
>> +
>> +/*
>> + * Process hardware errors during boot
>> + */
>> +static void process_hw_errors(struct xe_device *xe)
>> +{
>> +       struct xe_tile *tile;
>> +       u32 master_ctl;
>> +       u8 id;
>> +
>> +       for_each_tile(tile, xe, id) {
>> +               master_ctl = xe_mmio_read32(&tile->mmio,
>> GFX_MSTR_IRQ);
>> +               xe_hw_error_irq_handler(tile, master_ctl);
>> +               xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ,
>> master_ctl);
>> +       }
>> +}
>> +
>> +/**
>> + * xe_hw_error_init - Initialize hw errors
>> + * @xe: xe device instance
>> + *
>> + * Initialize and process hw errors
>> + */
>> +void xe_hw_error_init(struct xe_device *xe)
>> +{
>> +       if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
> 
> Again, why skipping integrated? It seems like this might also be viable
> for, for instance, LNL? Of course some of the bits might make less
> sense for those platforms if they are PCIe-specific. But at least
> printing the register on an error seems interesting.

Are you suggesting to print the raw value in a drm_err log?
I could add that and remove the dgfx check, but if it is processing
of the indiviual sources and keeping count then that should be a 
different series

Thanks
Riana

> 
> Thanks,
> Stuart
> 
>> +               return;
>> +
>> +       process_hw_errors(xe);
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h
>> b/drivers/gpu/drm/xe/xe_hw_error.h
>> new file mode 100644
>> index 000000000000..d86e28c5180c
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +#ifndef XE_HW_ERROR_H_
>> +#define XE_HW_ERROR_H_
>> +
>> +#include <linux/types.h>
>> +
>> +struct xe_tile;
>> +struct xe_device;
>> +
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
>> master_ctl);
>> +void xe_hw_error_init(struct xe_device *xe);
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c
>> b/drivers/gpu/drm/xe/xe_irq.c
>> index 5362d3174b06..24ccf3bec52c 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -18,6 +18,7 @@
>>   #include "xe_gt.h"
>>   #include "xe_guc.h"
>>   #include "xe_hw_engine.h"
>> +#include "xe_hw_error.h"
>>   #include "xe_memirq.h"
>>   #include "xe_mmio.h"
>>   #include "xe_pxp.h"
>> @@ -466,6 +467,7 @@ static irqreturn_t dg1_irq_handler(int irq, void
>> *arg)
>>                  xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>   
>>                  gt_irq_handler(tile, master_ctl, intr_dw, identity);
>> +               xe_hw_error_irq_handler(tile, master_ctl);
>>   
>>                  /*
>>                   * Display interrupts (including display backlight
>> operations
>> @@ -753,6 +755,8 @@ int xe_irq_install(struct xe_device *xe)
>>          int nvec = 1;
>>          int err;
>>   
>> +       xe_hw_error_init(xe);
>> +
>>          xe_irq_reset(xe);
>>   
>>          if (xe_device_has_msix(xe)) {
>