[PATCH v3 5/7] drm/xe: Add support to handle hardware errors
Riana Tauro
riana.tauro at intel.com
Thu Jul 10 05:54:07 UTC 2025
Hi Stuart
On 7/9/2025 10:57 PM, Summers, Stuart wrote:
> On Wed, 2025-07-02 at 19:41 +0530, Riana Tauro wrote:
>> Gfx device reports two classes of errors: uncorrectable and
>> correctable. Depending on the severity uncorrectable errors are
>> further classified as non fatal and fatal
>>
>> Correctable and non-fatal errors are reported as MSI's and bits in
>> the Master Interrupt Register indicate the class of the error.
>> The source of the error is then read from the Device Error Source
>> Register. Fatal errors are reported as PCIe errors
>> When a PCIe error is asserted, the OS will perform a device warm
>> reset
>> which causes the driver to reload. The error registers are sticky
>> and the values are maintained through a warm reset
>>
>> Add basic support to handle these errors
>>
>> Bspec: 50875, 53073, 53074, 53075, 53076
>>
>> Co-developed-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Himal Prasad Ghimiray
>> <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>> ---
>> drivers/gpu/drm/xe/Makefile | 1 +
>> drivers/gpu/drm/xe/regs/xe_hw_error_regs.h | 15 +++
>> drivers/gpu/drm/xe/regs/xe_irq_regs.h | 1 +
>> drivers/gpu/drm/xe/xe_hw_error.c | 108
>> +++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_hw_error.h | 15 +++
>> drivers/gpu/drm/xe/xe_irq.c | 4 +
>> 6 files changed, 144 insertions(+)
>> create mode 100644 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile
>> b/drivers/gpu/drm/xe/Makefile
>> index 1d97e5b63f4e..fea8ee3b0785 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -73,6 +73,7 @@ xe-y += xe_bb.o \
>> xe_hw_engine.o \
>> xe_hw_engine_class_sysfs.o \
>> xe_hw_engine_group.o \
>> + xe_hw_error.o \
>> xe_hw_fence.o \
>> xe_irq.o \
>> xe_lrc.o \
>> diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> new file mode 100644
>> index 000000000000..ed9b81fb28a0
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_HW_ERROR_REGS_H_
>> +#define _XE_HW_ERROR_REGS_H_
>> +
>> +#define DEV_ERR_STAT_NONFATAL 0x100178
>> +#define DEV_ERR_STAT_CORRECTABLE 0x10017c
>> +#define
>> DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
>> +
>> DEV_ERR_STAT_CORRECTABLE, \
>> +
>> DEV_ERR_STAT_NONFATAL))
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> index f0ecfcac4003..2758b64cec9e 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> @@ -18,6 +18,7 @@
>> #define GFX_MSTR_IRQ XE_REG(0x190010,
>> XE_REG_OPTION_VF)
>> #define MASTER_IRQ REG_BIT(31)
>> #define GU_MISC_IRQ REG_BIT(29)
>> +#define ERROR_IRQ(x) REG_BIT(26 + (x))
>> #define DISPLAY_IRQ REG_BIT(16)
>> #define GT_DW_IRQ(x) REG_BIT(x)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c
>> b/drivers/gpu/drm/xe/xe_hw_error.c
>> new file mode 100644
>> index 000000000000..0f2590839900
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -0,0 +1,108 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include "regs/xe_hw_error_regs.h"
>> +#include "regs/xe_irq_regs.h"
>> +
>> +#include "xe_device.h"
>> +#include "xe_hw_error.h"
>> +#include "xe_mmio.h"
>> +
>> +/* Error categories reported by hardware */
>> +enum hardware_error {
>> + HARDWARE_ERROR_CORRECTABLE = 0,
>> + HARDWARE_ERROR_NONFATAL = 1,
>> + HARDWARE_ERROR_FATAL = 2,
>> + HARDWARE_ERROR_MAX,
>> +};
>> +
>> +static const char *hw_error_to_str(const enum hardware_error hw_err)
>> +{
>> + switch (hw_err) {
>> + case HARDWARE_ERROR_CORRECTABLE:
>> + return "CORRECTABLE";
>> + case HARDWARE_ERROR_NONFATAL:
>> + return "NONFATAL";
>> + case HARDWARE_ERROR_FATAL:
>> + return "FATAL";
>> + default:
>> + return "UNKNOWN";
>> + }
>> +}
>> +
>> +static void hw_error_source_handler(struct xe_tile *tile, const enum
>> hardware_error hw_err)
>> +{
>> + const char *hw_err_str = hw_error_to_str(hw_err);
>> + struct xe_device *xe = tile_to_xe(tile);
>> + unsigned long flags;
>> + u32 err_src;
>> +
>> + if (xe->info.platform != XE_BATTLEMAGE)
>
> Why is this only on BMG? I see these same bits available on other
> platforms, e.g. LNL.
>
>> + return;
>> +
>> + spin_lock_irqsave(&xe->irq.lock, flags);
>> + err_src = xe_mmio_read32(&tile->mmio,
>> DEV_ERR_STAT_REG(hw_err));
>> + if (!err_src) {
>> + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported
>> DEV_ERR_STAT_%s blank!\n",
>> + tile->id, hw_err_str);
>> + goto unlock;
>> + }
>> +
>> + /* TODO: Process errrors per source */
>
> Should at least print the bits out on the initial implementation?
This patch is taken from
https://patchwork.freedesktop.org/series/125373/ which was not merged
due to absence of upstream consumer
Himal/Aravind can provide more details..
I have taken a single patch in this series to add support for
csc errors as it has recovery mechanism and a upstream consumer ie. fwupd.
The processing of all the bits according to source should be a separate
series. I can retain the TODO and remove the bmg check
>
>> +
>> + xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err),
>> err_src);
>> +
>> +unlock:
>> + spin_unlock_irqrestore(&xe->irq.lock, flags);
>> +}
>> +
>> +/**
>> + * xe_hw_error_irq_handler - irq handling for hw errors
>> + * @tile: tile instance
>> + * @master_ctl: value read from master interrupt register
>> + *
>> + * Xe platforms add three error bits to the master interrupt
>> register to support error handling.
>> + * These three bits are used to convey the class of error FATAL,
>> NONFATAL, or CORRECTABLE.
>> + * To process the interrupt, determine the source of error by
>> reading the Device Error Source
>> + * Register that corresponds to the class of error being serviced.
>> + */
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
>> master_ctl)
>> +{
>> + enum hardware_error hw_err;
>> +
>> + for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
>> + if (master_ctl & ERROR_IRQ(hw_err))
>> + hw_error_source_handler(tile, hw_err);
>> +}
>> +
>> +/*
>> + * Process hardware errors during boot
>> + */
>> +static void process_hw_errors(struct xe_device *xe)
>> +{
>> + struct xe_tile *tile;
>> + u32 master_ctl;
>> + u8 id;
>> +
>> + for_each_tile(tile, xe, id) {
>> + master_ctl = xe_mmio_read32(&tile->mmio,
>> GFX_MSTR_IRQ);
>> + xe_hw_error_irq_handler(tile, master_ctl);
>> + xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ,
>> master_ctl);
>> + }
>> +}
>> +
>> +/**
>> + * xe_hw_error_init - Initialize hw errors
>> + * @xe: xe device instance
>> + *
>> + * Initialize and process hw errors
>> + */
>> +void xe_hw_error_init(struct xe_device *xe)
>> +{
>> + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
>
> Again, why skipping integrated? It seems like this might also be viable
> for, for instance, LNL? Of course some of the bits might make less
> sense for those platforms if they are PCIe-specific. But at least
> printing the register on an error seems interesting.
Are you suggesting to print the raw value in a drm_err log?
I could add that and remove the dgfx check, but if it is processing
of the indiviual sources and keeping count then that should be a
different series
Thanks
Riana
>
> Thanks,
> Stuart
>
>> + return;
>> +
>> + process_hw_errors(xe);
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h
>> b/drivers/gpu/drm/xe/xe_hw_error.h
>> new file mode 100644
>> index 000000000000..d86e28c5180c
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +#ifndef XE_HW_ERROR_H_
>> +#define XE_HW_ERROR_H_
>> +
>> +#include <linux/types.h>
>> +
>> +struct xe_tile;
>> +struct xe_device;
>> +
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
>> master_ctl);
>> +void xe_hw_error_init(struct xe_device *xe);
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c
>> b/drivers/gpu/drm/xe/xe_irq.c
>> index 5362d3174b06..24ccf3bec52c 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -18,6 +18,7 @@
>> #include "xe_gt.h"
>> #include "xe_guc.h"
>> #include "xe_hw_engine.h"
>> +#include "xe_hw_error.h"
>> #include "xe_memirq.h"
>> #include "xe_mmio.h"
>> #include "xe_pxp.h"
>> @@ -466,6 +467,7 @@ static irqreturn_t dg1_irq_handler(int irq, void
>> *arg)
>> xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>
>> gt_irq_handler(tile, master_ctl, intr_dw, identity);
>> + xe_hw_error_irq_handler(tile, master_ctl);
>>
>> /*
>> * Display interrupts (including display backlight
>> operations
>> @@ -753,6 +755,8 @@ int xe_irq_install(struct xe_device *xe)
>> int nvec = 1;
>> int err;
>>
>> + xe_hw_error_init(xe);
>> +
>> xe_irq_reset(xe);
>>
>> if (xe_device_has_msix(xe)) {
>
More information about the Intel-xe
mailing list