[PATCH v5 7/9] drm/xe: Add support to handle hardware errors

Tue Jul 15 16:53:05 UTC 2025

On Tue, 2025-07-15 at 22:18 +0530, Riana Tauro wrote:
> Hi Stuart
> 
> On 7/15/2025 7:38 PM, Summers, Stuart wrote:
> > On Tue, 2025-07-15 at 16:17 +0530, Riana Tauro wrote:
> > > Gfx device reports two classes of errors: uncorrectable and
> > > correctable. Depending on the severity uncorrectable errors are
> > > further
> > > classified Non-Fatal and Fatal
> > > 
> > > Correctable and Non-Fatal errors: These errors are reported as
> > > MSI.
> > > Bits in
> > > the Master Interrupt Register indicate the class of the error.
> > > The source of the error is then read from the Device Error Source
> > > Register.
> > > 
> > > Fatal errors: These are reported as PCIe errors
> > > When a PCIe error is asserted, the OS will perform a SBR
> > > (Secondary
> > > Bus reset) which causes the driver to reload.
> > > The error registers are sticky and the values are maintained
> > > through
> > > SBR
> > > 
> > > Add basic support to handle these errors
> > > 
> > > Bspec: 50875, 53073, 53074, 53075, 53076
> > > 
> > > v2: Remove platform checks and print raw error value (Stuart)
> > >      Format commit message (Umesh)
> > > 
> > > Cc: Stuart Summers <stuart.summers at intel.com>
> > > Co-developed-by: Himal Prasad Ghimiray
> > > <himal.prasad.ghimiray at intel.com>
> > > Signed-off-by: Himal Prasad Ghimiray
> > > <himal.prasad.ghimiray at intel.com>
> > > Signed-off-by: Riana Tauro <riana.tauro at intel.com>
> > > Reviewed-by: Umesh Nerlige Ramappa
> > > <umesh.nerlige.ramappa at intel.com>
> > > ---
> > >   drivers/gpu/drm/xe/Makefile                |   1 +
> > >   drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  15 +++
> > >   drivers/gpu/drm/xe/regs/xe_irq_regs.h      |   1 +
> > >   drivers/gpu/drm/xe/xe_hw_error.c           | 106
> > > +++++++++++++++++++++
> > >   drivers/gpu/drm/xe/xe_hw_error.h           |  15 +++
> > >   drivers/gpu/drm/xe/xe_irq.c                |   4 +
> > >   6 files changed, 142 insertions(+)
> > >   create mode 100644 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> > >   create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
> > >   create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
> > > 
> > > diff --git a/drivers/gpu/drm/xe/Makefile
> > > b/drivers/gpu/drm/xe/Makefile
> > > index 07c71a29963d..02f73dffec55 100644
> > > --- a/drivers/gpu/drm/xe/Makefile
> > > +++ b/drivers/gpu/drm/xe/Makefile
> > > @@ -80,6 +80,7 @@ xe-y += xe_bb.o \
> > >          xe_hw_engine.o \
> > >          xe_hw_engine_class_sysfs.o \
> > >          xe_hw_engine_group.o \
> > > +       xe_hw_error.o \
> > >          xe_hw_fence.o \
> > >          xe_irq.o \
> > >          xe_lrc.o \
> > > diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> > > b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> > > new file mode 100644
> > > index 000000000000..ed9b81fb28a0
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
> > > @@ -0,0 +1,15 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2025 Intel Corporation
> > > + */
> > > +
> > > +#ifndef _XE_HW_ERROR_REGS_H_
> > > +#define _XE_HW_ERROR_REGS_H_
> > > +
> > > +#define DEV_ERR_STAT_NONFATAL                  0x100178
> > > +#define DEV_ERR_STAT_CORRECTABLE               0x10017c
> > > +#define
> > > DEV_ERR_STAT_REG(x)                    XE_REG(_PICK_EVEN((x), \
> > > +
> > > DEV_ERR_STAT_CORRECTABLE, \
> > > +
> > > DEV_ERR_STAT_NONFATAL))
> > > +
> > > +#endif
> > > diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
> > > b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
> > > index 13635e4331d4..7c2a3a140142 100644
> > > --- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
> > > +++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
> > > @@ -18,6 +18,7 @@
> > >   #define GFX_MSTR_IRQ                           XE_REG(0x190010,
> > > XE_REG_OPTION_VF)
> > >   #define   MASTER_IRQ                           REG_BIT(31)
> > >   #define   GU_MISC_IRQ                          REG_BIT(29)
> > > +#define   ERROR_IRQ(x)                         REG_BIT(26 + (x))
> > >   #define   DISPLAY_IRQ                          REG_BIT(16)
> > >   #define   I2C_IRQ                              REG_BIT(12)
> > >   #define   GT_DW_IRQ(x)                         REG_BIT(x)
> > > diff --git a/drivers/gpu/drm/xe/xe_hw_error.c
> > > b/drivers/gpu/drm/xe/xe_hw_error.c
> > > new file mode 100644
> > > index 000000000000..0b09f9950cb9
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> > > @@ -0,0 +1,106 @@
> > > +// SPDX-License-Identifier: MIT
> > > +/*
> > > + * Copyright © 2025 Intel Corporation
> > > + */
> > > +
> > > +#include "regs/xe_hw_error_regs.h"
> > > +#include "regs/xe_irq_regs.h"
> > > +
> > > +#include "xe_device.h"
> > > +#include "xe_hw_error.h"
> > > +#include "xe_mmio.h"
> > > +
> > > +/* Error categories reported by hardware */
> > > +enum hardware_error {
> > > +       HARDWARE_ERROR_CORRECTABLE = 0,
> > > +       HARDWARE_ERROR_NONFATAL = 1,
> > > +       HARDWARE_ERROR_FATAL = 2,
> > > +       HARDWARE_ERROR_MAX,
> > > +};
> > > +
> > > +static const char *hw_error_to_str(const enum hardware_error
> > > hw_err)
> > > +{
> > > +       switch (hw_err) {
> > > +       case HARDWARE_ERROR_CORRECTABLE:
> > > +               return "CORRECTABLE";
> > > +       case HARDWARE_ERROR_NONFATAL:
> > > +               return "NONFATAL";
> > > +       case HARDWARE_ERROR_FATAL:
> > > +               return "FATAL";
> > > +       default:
> > > +               return "UNKNOWN";
> > > +       }
> > > +}
> > > +
> > > +static void hw_error_source_handler(struct xe_tile *tile, const
> > > enum
> > > hardware_error hw_err)
> > > +{
> > > +       const char *hw_err_str = hw_error_to_str(hw_err);
> > > +       struct xe_device *xe = tile_to_xe(tile);
> > > +       unsigned long flags;
> > > +       u32 err_src;
> > > +
> > > +       spin_lock_irqsave(&xe->irq.lock, flags);
> > > +       err_src = xe_mmio_read32(&tile->mmio,
> > > DEV_ERR_STAT_REG(hw_err));
> > > +       if (!err_src) {
> > > +               drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d
> > > reported
> > > DEV_ERR_STAT_%s blank!\n",
> > > +                                   tile->id, hw_err_str);
> > 
> > This message is essentially the same as the one below, just with
> > "blank" instead of "0x0". And writing back a 0 IMO is
> > inconsequential
> > since this already is some kind of hardware bug (bad interrupt,
> > MMIO
> > write error, etc) if we get to this point.
> 
> May need to check if we really need error logs or debug should be 
> sufficient. BAT shows errors on probe on DG2
> 
> https://intel-gfx-ci.01.org/tree/intel-xe/xe-pw-149756v5/bat-dg2-oem2/igt@xe_module_load@load.html#dmesg-warnings262
> 
> The previous series had a skip for DG2 
> https://patchwork.freedesktop.org/patch/563668/?series=125373&rev=1 b
> ut
> the review comments don't provide more data.
> 
> Will check why there is a failure. We might not need error logs for a
> correctable error but need to confirm for Non-fatal.

Well I guess what I mean here is the 0-value is still an interesting
case. It means we got an error interrupt but the data was empty which
is unexpected, regardless of whether this is correctable or nonfatal or
fatal even. When we get an interrupt, there should be a source to that
interrupt.

Now for correctable we can of course decide that we don't care which
source printed because it was apparently corrected and thus this is
more informational - we aren't expecting a functional difference.

My personal opinion is we should print everything we get here -
correctable or otherwise. That way we can at least track if a given
hardware is seeing an unexpectedly high error count in any given
category.

Thanks,
Stuart

> 
> Thanks
> Riana
> 
> > 
> > Can we save a few lines and get rid of this if check? So the
> > function
> > just becomes:
> > 
> > spin_lock()
> > read_mmio()
> > print_results()
> > write_mmio()
> > spin_unlock()
> > 
> > Without any goto or extra check.
> > 
> > > +               goto unlock;
> > > +       }
> > > +
> > > +       drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s
> > > error 0x%x\n",
> > > +                           tile->id, hw_err_str, err_src);
> > > +
> > > +       xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err),
> > > err_src);
> > > +
> > > +unlock:
> > > +       spin_unlock_irqrestore(&xe->irq.lock, flags);
> > > +}
> > > +
> > > +/**
> > > + * xe_hw_error_irq_handler - irq handling for hw errors
> > > + * @tile: tile instance
> > > + * @master_ctl: value read from master interrupt register
> > > + *
> > > + * Xe platforms add three error bits to the master interrupt
> > > register to support error handling.
> > > + * These three bits are used to convey the class of error FATAL,
> > > NONFATAL, or CORRECTABLE.
> > > + * To process the interrupt, determine the source of error by
> > > reading the Device Error Source
> > > + * Register that corresponds to the class of error being
> > > serviced.
> > > + */
> > > +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
> > > master_ctl)
> > > +{
> > > +       enum hardware_error hw_err;
> > > +
> > > +       for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
> > > +               if (master_ctl & ERROR_IRQ(hw_err))
> > > +                       hw_error_source_handler(tile, hw_err);
> > > +}
> > > +
> > > +/*
> > > + * Process hardware errors during boot
> > > + */
> > > +static void process_hw_errors(struct xe_device *xe)
> > > +{
> > > +       struct xe_tile *tile;
> > > +       u32 master_ctl;
> > > +       u8 id;
> > > +
> > > +       for_each_tile(tile, xe, id) {
> > > +               master_ctl = xe_mmio_read32(&tile->mmio,
> > > GFX_MSTR_IRQ);
> > > +               xe_hw_error_irq_handler(tile, master_ctl);
> > > +               xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ,
> > > master_ctl);
> > > +       }
> > > +}
> > > +
> > > +/**
> > > + * xe_hw_error_init - Initialize hw errors
> > > + * @xe: xe device instance
> > > + *
> > > + * Initialize and process hw errors
> > > + */
> > > +void xe_hw_error_init(struct xe_device *xe)
> > > +{
> > > +       if (IS_SRIOV_VF(xe))
> > > +               return;
> > > +
> > 
> > Maybe a comment here as to why we are doing this on init? Something
> > like:
> > /* Check for errors that occurred during boot prior to driver load
> > */
> > 
> > Thanks,
> > Stuart
> > 
> > > +       process_hw_errors(xe);
> > > +}
> > > diff --git a/drivers/gpu/drm/xe/xe_hw_error.h
> > > b/drivers/gpu/drm/xe/xe_hw_error.h
> > > new file mode 100644
> > > index 000000000000..d86e28c5180c
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/xe_hw_error.h
> > > @@ -0,0 +1,15 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2025 Intel Corporation
> > > + */
> > > +#ifndef XE_HW_ERROR_H_
> > > +#define XE_HW_ERROR_H_
> > > +
> > > +#include <linux/types.h>
> > > +
> > > +struct xe_tile;
> > > +struct xe_device;
> > > +
> > > +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
> > > master_ctl);
> > > +void xe_hw_error_init(struct xe_device *xe);
> > > +#endif
> > > diff --git a/drivers/gpu/drm/xe/xe_irq.c
> > > b/drivers/gpu/drm/xe/xe_irq.c
> > > index 5df5b8c2a3e4..870edaf69388 100644
> > > --- a/drivers/gpu/drm/xe/xe_irq.c
> > > +++ b/drivers/gpu/drm/xe/xe_irq.c
> > > @@ -18,6 +18,7 @@
> > >   #include "xe_gt.h"
> > >   #include "xe_guc.h"
> > >   #include "xe_hw_engine.h"
> > > +#include "xe_hw_error.h"
> > >   #include "xe_i2c.h"
> > >   #include "xe_memirq.h"
> > >   #include "xe_mmio.h"
> > > @@ -468,6 +469,7 @@ static irqreturn_t dg1_irq_handler(int irq,
> > > void
> > > *arg)
> > >                  xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
> > >   
> > >                  gt_irq_handler(tile, master_ctl, intr_dw,
> > > identity);
> > > +               xe_hw_error_irq_handler(tile, master_ctl);
> > >   
> > >                  /*
> > >                   * Display interrupts (including display
> > > backlight
> > > operations
> > > @@ -756,6 +758,8 @@ int xe_irq_install(struct xe_device *xe)
> > >          int nvec = 1;
> > >          int err;
> > >   
> > > +       xe_hw_error_init(xe);
> > > +
> > >          xe_irq_reset(xe);
> > >   
> > >          if (xe_device_has_msix(xe)) {
> > 
>