[PATCH v4 7/9] drm/xe: Add support to handle hardware errors
Riana Tauro
riana.tauro at intel.com
Fri Jul 11 05:35:04 UTC 2025
Hi Umesh
On 7/11/2025 2:39 AM, Umesh Nerlige Ramappa wrote:
> Resending since it got lost earlier...
>
> On Wed, Jul 09, 2025 at 04:50:19PM +0530, Riana Tauro wrote:
>> Gfx device reports two classes of errors: uncorrectable and
>> correctable. Depending on the severity uncorrectable errors are
>> further classified as non fatal and fatal
>>
>> Correctable and non-fatal errors are reported as MSI's and bits in
>> the Master Interrupt Register indicate the class of the error.
>> The source of the error is then read from the Device Error Source
>> Register.
>
> nit: Since Fatal is a separate category, maybe a split here into a
> separate paragraph and some formatting would be good.
>
>> Fatal errors are reported as PCIe errors
>> When a PCIe error is asserted, the OS will perform a device warm reset
>> which causes the driver to reload. The error registers are sticky
>> and the values are maintained through a warm reset
>>
>> Add basic support to handle these errors
>>
>> Bspec: 50875, 53073, 53074, 53075, 53076
>>
>> Co-developed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>> ---
>> drivers/gpu/drm/xe/Makefile | 1 +
>> drivers/gpu/drm/xe/regs/xe_hw_error_regs.h | 15 +++
>> drivers/gpu/drm/xe/regs/xe_irq_regs.h | 1 +
>> drivers/gpu/drm/xe/xe_hw_error.c | 108 +++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_hw_error.h | 15 +++
>> drivers/gpu/drm/xe/xe_irq.c | 4 +
>> 6 files changed, 144 insertions(+)
>> create mode 100644 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 1d97e5b63f4e..fea8ee3b0785 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -73,6 +73,7 @@ xe-y += xe_bb.o \
>> xe_hw_engine.o \
>> xe_hw_engine_class_sysfs.o \
>> xe_hw_engine_group.o \
>> + xe_hw_error.o \
>> xe_hw_fence.o \
>> xe_irq.o \
>> xe_lrc.o \
>> diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/
>> drm/xe/regs/xe_hw_error_regs.h
>> new file mode 100644
>> index 000000000000..ed9b81fb28a0
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_HW_ERROR_REGS_H_
>> +#define _XE_HW_ERROR_REGS_H_
>> +
>> +#define DEV_ERR_STAT_NONFATAL 0x100178
>> +#define DEV_ERR_STAT_CORRECTABLE 0x10017c
>> +#define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
>> + DEV_ERR_STAT_CORRECTABLE, \
>> + DEV_ERR_STAT_NONFATAL))
>
> For x = 1 and x = 2, I don't see the above result in correct values. Can
> you please double check?
I was confused by the same thing when I took the patch from the other
series. However, the second term of the macro, (__b) - (__a), is negative
here, so the resulting register offsets are correct.
Calculations for x = 1 and x = 2:
#define _PICK_EVEN(__index, __a, __b) ((__a) + (__index) * ((__b) - (__a)))
_PICK_EVEN([HARDWARE_ERROR_NONFATAL = 1]) = DEV_ERR_STAT_CORRECTABLE + 1
* (DEV_ERR_STAT_NONFATAL - DEV_ERR_STAT_CORRECTABLE)
0x10017c + 1 * (0x100178 - 0x10017c)
0x100178
_PICK_EVEN([HARDWARE_ERROR_FATAL = 2]) = DEV_ERR_STAT_CORRECTABLE + 2 *
(DEV_ERR_STAT_NONFATAL - DEV_ERR_STAT_CORRECTABLE)
0x10017c + 2 * (0x100178 - 0x10017c)
0x100174
Thanks
Riana
>
> What about DEV_ERR_STAT_FATAL?
>
> Rest looks good,
>
> Umesh
>
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h b/drivers/gpu/drm/
>> xe/regs/xe_irq_regs.h
>> index f0ecfcac4003..2758b64cec9e 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
>> @@ -18,6 +18,7 @@
>> #define GFX_MSTR_IRQ XE_REG(0x190010, XE_REG_OPTION_VF)
>> #define MASTER_IRQ REG_BIT(31)
>> #define GU_MISC_IRQ REG_BIT(29)
>> +#define ERROR_IRQ(x) REG_BIT(26 + (x))
>> #define DISPLAY_IRQ REG_BIT(16)
>> #define GT_DW_IRQ(x) REG_BIT(x)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/
>> xe_hw_error.c
>> new file mode 100644
>> index 000000000000..0f2590839900
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -0,0 +1,108 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include "regs/xe_hw_error_regs.h"
>> +#include "regs/xe_irq_regs.h"
>> +
>> +#include "xe_device.h"
>> +#include "xe_hw_error.h"
>> +#include "xe_mmio.h"
>> +
>> +/* Error categories reported by hardware */
>> +enum hardware_error {
>> + HARDWARE_ERROR_CORRECTABLE = 0,
>> + HARDWARE_ERROR_NONFATAL = 1,
>> + HARDWARE_ERROR_FATAL = 2,
>> + HARDWARE_ERROR_MAX,
>> +};
>> +
>> +static const char *hw_error_to_str(const enum hardware_error hw_err)
>> +{
>> + switch (hw_err) {
>> + case HARDWARE_ERROR_CORRECTABLE:
>> + return "CORRECTABLE";
>> + case HARDWARE_ERROR_NONFATAL:
>> + return "NONFATAL";
>> + case HARDWARE_ERROR_FATAL:
>> + return "FATAL";
>> + default:
>> + return "UNKNOWN";
>> + }
>> +}
>> +
>> +static void hw_error_source_handler(struct xe_tile *tile, const enum
>> hardware_error hw_err)
>> +{
>> + const char *hw_err_str = hw_error_to_str(hw_err);
>> + struct xe_device *xe = tile_to_xe(tile);
>> + unsigned long flags;
>> + u32 err_src;
>> +
>> + if (xe->info.platform != XE_BATTLEMAGE)
>> + return;
>> +
>> + spin_lock_irqsave(&xe->irq.lock, flags);
>> + err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
>> + if (!err_src) {
>> + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported
>> DEV_ERR_STAT_%s blank!\n",
>> + tile->id, hw_err_str);
>> + goto unlock;
>> + }
>> +
>> + /* TODO: Process errors per source */
>> +
>> + xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
>> +
>> +unlock:
>> + spin_unlock_irqrestore(&xe->irq.lock, flags);
>> +}
>> +
>> +/**
>> + * xe_hw_error_irq_handler - irq handling for hw errors
>> + * @tile: tile instance
>> + * @master_ctl: value read from master interrupt register
>> + *
>> + * Xe platforms add three error bits to the master interrupt register
>> to support error handling.
>> + * These three bits are used to convey the class of error FATAL,
>> NONFATAL, or CORRECTABLE.
>> + * To process the interrupt, determine the source of error by reading
>> the Device Error Source
>> + * Register that corresponds to the class of error being serviced.
>> + */
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>> +{
>> + enum hardware_error hw_err;
>> +
>> + for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
>> + if (master_ctl & ERROR_IRQ(hw_err))
>> + hw_error_source_handler(tile, hw_err);
>> +}
>> +
>> +/*
>> + * Process hardware errors during boot
>> + */
>> +static void process_hw_errors(struct xe_device *xe)
>> +{
>> + struct xe_tile *tile;
>> + u32 master_ctl;
>> + u8 id;
>> +
>> + for_each_tile(tile, xe, id) {
>> + master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ);
>> + xe_hw_error_irq_handler(tile, master_ctl);
>> + xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl);
>> + }
>> +}
>> +
>> +/**
>> + * xe_hw_error_init - Initialize hw errors
>> + * @xe: xe device instance
>> + *
>> + * Initialize and process hw errors
>> + */
>> +void xe_hw_error_init(struct xe_device *xe)
>> +{
>> + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
>> + return;
>> +
>> + process_hw_errors(xe);
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/
>> xe_hw_error.h
>> new file mode 100644
>> index 000000000000..d86e28c5180c
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -0,0 +1,15 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +#ifndef XE_HW_ERROR_H_
>> +#define XE_HW_ERROR_H_
>> +
>> +#include <linux/types.h>
>> +
>> +struct xe_tile;
>> +struct xe_device;
>> +
>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
>> master_ctl);
>> +void xe_hw_error_init(struct xe_device *xe);
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>> index 5362d3174b06..24ccf3bec52c 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -18,6 +18,7 @@
>> #include "xe_gt.h"
>> #include "xe_guc.h"
>> #include "xe_hw_engine.h"
>> +#include "xe_hw_error.h"
>> #include "xe_memirq.h"
>> #include "xe_mmio.h"
>> #include "xe_pxp.h"
>> @@ -466,6 +467,7 @@ static irqreturn_t dg1_irq_handler(int irq, void
>> *arg)
>> xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>
>> gt_irq_handler(tile, master_ctl, intr_dw, identity);
>> + xe_hw_error_irq_handler(tile, master_ctl);
>>
>> /*
>> * Display interrupts (including display backlight operations
>> @@ -753,6 +755,8 @@ int xe_irq_install(struct xe_device *xe)
>> int nvec = 1;
>> int err;
>>
>> + xe_hw_error_init(xe);
>> +
>> xe_irq_reset(xe);
>>
>> if (xe_device_has_msix(xe)) {
>> --
>> 2.47.1
>>
More information about the Intel-xe
mailing list