[Intel-xe] [PATCH v5 1/4] drm/xe: Handle errors from various components.

Tue Sep 26 10:09:11 UTC 2023

On 26/09/23 10:27, Ghimiray, Himal Prasad wrote:
>
> On 26-09-2023 09:50, Aravind Iddamsetty wrote:
>> On 23/08/23 14:28, Himal Prasad Ghimiray wrote:
>>> The GFX device can generate numbers of classes of error under the new
>>> infrastructure: correctable, non-fatal, and fatal errors.
>> The GFX device reports two classes of errors: uncorrectable and correctable.
>> Depending on the severity uncorrectable errors are further classified as non fatal and fatal.
>>> The non-fatal and fatal error classes distinguish between levels of
>>> severity for uncorrectable errors. Driver will only handle logging
>>> of errors and updating counters from various components within the
>>> graphics device. Anything more will be handled at system level.
>>>
>>> For errors that will route as interrupts, three bits in the Master
>>> Interrupt Register will be used to convey the class of error.
>>>
>>> For each class of error: Determine source of error (IP block) by reading
>>> the Device Error Source Register (RW1C) that
>>> corresponds to the class of error being serviced.
>>>
>>> Bspec: 50875, 53073, 53074, 53075
>> Also may be you want to squash this with the last patch where fatal error processing is done,
>> fatal errors are defined here but processed in your last patch or move all fatal definition to last patch.
> Makes sense. Will squash in last patch.
>>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>>> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
>>> Cc: Matthew Brost <matthew.brost at intel.com>
>>> Cc: Matt Roper <matthew.d.roper at intel.com>
>>> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
>>> Cc: Jani Nikula <jani.nikula at intel.com>
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/Makefile                  |   1 +
>>>   drivers/gpu/drm/xe/regs/xe_regs.h            |   2 +-
>>>   drivers/gpu/drm/xe/regs/xe_tile_error_regs.h |  15 ++
>>>   drivers/gpu/drm/xe/xe_device_types.h         |  11 +
>>>   drivers/gpu/drm/xe/xe_hw_error.c             | 211 +++++++++++++++++++
>>>   drivers/gpu/drm/xe/xe_hw_error.h             |  64 ++++++
>>>   drivers/gpu/drm/xe/xe_irq.c                  |   3 +
>>>   7 files changed, 306 insertions(+), 1 deletion(-)
>>>   create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>>   create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>>>   create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>>> index 550cdfed729e..6290c8ce0e84 100644
>>> --- a/drivers/gpu/drm/xe/Makefile
>>> +++ b/drivers/gpu/drm/xe/Makefile
>>> @@ -75,6 +75,7 @@ xe-y += xe_bb.o \
>>>       xe_guc_submit.o \
>>>       xe_hw_engine.o \
>>>       xe_hw_engine_class_sysfs.o \
>>> +    xe_hw_error.o \
>>>       xe_hw_fence.o \
>>>       xe_huc.o \
>>>       xe_huc_debugfs.o \
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> index 39d7b0740bf0..e223975a5acf 100644
>>> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
>>> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> @@ -88,7 +88,7 @@
>>>   #define   GU_MISC_IRQ                REG_BIT(29)
>>>   #define   DISPLAY_IRQ                REG_BIT(16)
>>>   #define   GT_DW_IRQ(x)                REG_BIT(x)
>>> +#define   XE_ERROR_IRQ(x)            REG_BIT(26 + (x))
>>>     #define PVC_RP_STATE_CAP            XE_REG(0x281014)
>>> -
>>>   #endif
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> new file mode 100644
>>> index 000000000000..db78d6687213
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> @@ -0,0 +1,15 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +#ifndef XE_TILE_ERROR_REGS_H_
>>> +#define XE_TILE_ERROR_REGS_H_
>>> +
>>> +#include <linux/stddef.h>
>>> +
>>> +#define _DEV_ERR_STAT_NONFATAL                         0x100178
>>> +#define _DEV_ERR_STAT_CORRECTABLE                      0x10017c
>>> +#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
>>> +                                _DEV_ERR_STAT_CORRECTABLE, \
>>> +                                _DEV_ERR_STAT_NONFATAL))
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>> index dbb732e14606..4e4184977709 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -14,6 +14,7 @@
>>>     #include "xe_devcoredump_types.h"
>>>   #include "xe_gt_types.h"
>>> +#include "xe_hw_error.h"
>>>   #include "xe_platform_types.h"
>>>   #include "xe_step_types.h"
>>>   @@ -172,6 +173,11 @@ struct xe_tile {
>>>         /** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>>>       struct kobject *sysfs;
>>> +
>>> +    /** @tile_hw_errors: hardware errors reported for the tile */
>>> +    struct tile_hw_errors {
>>> +        unsigned long count[XE_TILE_HW_ERROR_MAX];
>>> +    } errors;
>>>   };
>>>     /**
>>> @@ -359,6 +365,11 @@ struct xe_device {
>>>        */
>>>       struct task_struct *pm_callback_task;
>>>   +    /** @hardware_errors_regs: list of hw error regs*/
>>> +    struct hardware_errors_regs {
>>> +        const struct err_msg_cntr_pair *dev_err_stat[HARDWARE_ERROR_MAX];
>> I'm just thinking if it makes sense to move it to respective structs like tile or gt, any thoughts?
> These structures are platform dependent not tiles/gt. IMO device is right place.

they may be platform specific, but if I notice dev_err_stat is accounted in tile_hw_errors and similarly
err_stat_gt is accounted in gt_hw_errors so they are associated to tile and gt.
>>> +    } hw_err_regs;
>>> +
>>>       /* private: */
>>>     #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>>> new file mode 100644
>>> index 000000000000..357d0f962d91
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>>> @@ -0,0 +1,211 @@
>>> +// SPDX-License-Identifier: MIT
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +
>>> +#include "xe_hw_error.h"
>>> +
>>> +#include "regs/xe_regs.h"
>>> +#include "regs/xe_tile_error_regs.h"
>>> +#include "xe_device.h"
>>> +#include "xe_mmio.h"
>>> +
>>> +static const char *
>>> +hardware_error_type_to_str(const enum hardware_error hw_err)
>>> +{
>>> +    switch (hw_err) {
>>> +    case HARDWARE_ERROR_CORRECTABLE:
>>> +        return "CORRECTABLE";
>>> +    case HARDWARE_ERROR_NONFATAL:
>>> +        return "NONFATAL";
>>> +    case HARDWARE_ERROR_FATAL:
>>> +        return "FATAL";
>>> +    default:
>>> +        return "UNKNOWN";
>>> +    }
>>> +}
>>> +
>>> +static const struct err_msg_cntr_pair dg2_err_stat_fatal_reg[] = {
>> the name err_msg_cntr_pair might not be appropriate as it err name and index into
>> tile_hw_errors. err_name_index_pair ?? thoughts
> ok.
>>> +    [0]         = {"GT",            XE_TILE_HW_ERR_GT_FATAL},
>>> +    [1 ... 3]   = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [4]         = {"DISPLAY",        XE_TILE_HW_ERR_DISPLAY_FATAL},
>>> +    [5 ... 7]   = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [8]         = {"GSC error",        XE_TILE_HW_ERR_GSC_FATAL},
>>> +    [9 ... 11]  = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [12]        = {"SGUNIT",        XE_TILE_HW_ERR_SGUNIT_FATAL},
>>> +    [13 ... 15] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [16]        = {"SOC",            XE_TILE_HW_ERR_SOC_FATAL},
>>> +    [17 ... 31] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dg2_err_stat_nonfatal_reg[] = {
>>> +    [0]         = {"GT",            XE_TILE_HW_ERR_GT_NONFATAL},
>>> +    [1 ... 3]   = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [4]         = {"DISPLAY",        XE_TILE_HW_ERR_DISPLAY_NONFATAL},
>>> +    [5 ... 7]   = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [8]         = {"GSC error",        XE_TILE_HW_ERR_GSC_NONFATAL},
>>> +    [9 ... 11]  = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [12]        = {"SGUNIT",        XE_TILE_HW_ERR_SGUNIT_NONFATAL},
>>> +    [13 ... 15] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [16]        = {"SOC",            XE_TILE_HW_ERR_SOC_NONFATAL},
>>> +    [17 ... 19] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [20]        = {"MERT",            XE_TILE_HW_ERR_MERT_NONFATAL},
>>> +    [21 ... 31] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dg2_err_stat_correctable_reg[] = {
>>> +    [0]         = {"GT",            XE_TILE_HW_ERR_GT_CORR},
>>> +    [1 ... 3]   = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +    [4]         = {"DISPLAY",        XE_TILE_HW_ERR_DISPLAY_CORR},
>>> +    [5 ... 7]   = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +    [8]         = {"GSC error",        XE_TILE_HW_ERR_GSC_CORR},
>>> +    [9 ... 11]  = {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +    [12]        = {"SGUNIT",        XE_TILE_HW_ERR_SGUNIT_CORR},
>>> +    [13 ... 15] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +    [16]        = {"SOC",            XE_TILE_HW_ERR_SOC_CORR},
>>> +    [17 ... 31] = {"Undefined",             XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair pvc_err_stat_fatal_reg[] = {
>>> +    [0]         =  {"GT",            XE_TILE_HW_ERR_GT_FATAL},
>>> +    [1]         =  {"SGGI Cmd Parity",    XE_TILE_HW_ERR_SGGI_FATAL},
>>> +    [2 ... 7]   =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [8]         =  {"GSC error",        XE_TILE_HW_ERR_GSC_FATAL},
>>> +    [9]         =  {"SGLI Cmd Parity",    XE_TILE_HW_ERR_SGLI_FATAL},
>>> +    [10 ... 12] =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [13]        =  {"SGCI Cmd Parity",    XE_TILE_HW_ERR_SGCI_FATAL},
>>> +    [14 ... 15] =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [16]        =  {"SOC ERROR",        XE_TILE_HW_ERR_SOC_FATAL},
>>> +    [17 ... 19] =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +    [20]        =  {"MERT Cmd Parity",    XE_TILE_HW_ERR_MERT_FATAL},
>>> +    [21 ... 31] =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair pvc_err_stat_nonfatal_reg[] = {
>>> +    [0]         =  {"GT",            XE_TILE_HW_ERR_GT_NONFATAL},
>>> +    [1]         =  {"SGGI Data Parity",    XE_TILE_HW_ERR_SGGI_NONFATAL},
>>> +    [2 ... 7]   =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [8]         =  {"GSC",            XE_TILE_HW_ERR_GSC_NONFATAL},
>>> +    [9]         =  {"SGLI Data Parity",    XE_TILE_HW_ERR_SGLI_NONFATAL},
>>> +    [10 ... 12] =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [13]        =  {"SGCI Data Parity",    XE_TILE_HW_ERR_SGCI_NONFATAL},
>>> +    [14 ... 15] =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [16]        =  {"SOC",            XE_TILE_HW_ERR_SOC_NONFATAL},
>>> +    [17 ... 19] =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +    [20]        =  {"MERT Data Parity",    XE_TILE_HW_ERR_MERT_NONFATAL},
>>> +    [21 ... 31] =  {"Undefined",            XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair pvc_err_stat_correctable_reg[] = {
>>> +    [0]         =  {"GT",            XE_TILE_HW_ERR_GT_CORR},
>>> +    [1 ... 7]   =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +    [8]         =  {"GSC",            XE_TILE_HW_ERR_GSC_CORR},
>>> +    [9 ... 31]  =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dev_err_stat_fatal_reg[] = {
>>> +    [0]         =  {"GT",            XE_TILE_HW_ERR_GT_FATAL},
>>> +    [1 ... 31]  =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dev_err_stat_nonfatal_reg[] = {
>>> +    [0]         =  {"GT",            XE_TILE_HW_ERR_GT_NONFATAL},
>>> +    [1 ... 31]  =  {"Undefined",            XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dev_err_stat_correctable_reg[] = {
>>> +    [0]         =  {"GT",            XE_TILE_HW_ERR_GT_CORR},
>>> +    [1 ... 31]  =  {"Undefined",        XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +void xe_assign_hw_err_regs(struct xe_device *xe)
>>> +{
>>> +    const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
>>> +
>>> +    if (xe->info.platform == XE_DG2) {
>>> +        dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dg2_err_stat_correctable_reg;
>>> +        dev_err_stat[HARDWARE_ERROR_NONFATAL] = dg2_err_stat_nonfatal_reg;
>>> +        dev_err_stat[HARDWARE_ERROR_FATAL] = dg2_err_stat_fatal_reg;
>>> +    } else if (xe->info.platform == XE_PVC) {
>>> +        dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_correctable_reg;
>>> +        dev_err_stat[HARDWARE_ERROR_NONFATAL] = pvc_err_stat_nonfatal_reg;
>>> +        dev_err_stat[HARDWARE_ERROR_FATAL] = pvc_err_stat_fatal_reg;
>>> +    } else {
>>> +        /* For other platforms report only GT errors */
>> why only GT errors??
>
> Because GT errors will only be common to all platforms. The other errors are platform specific.

and why are we not enabling those other errors?
>
>>> +        dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dev_err_stat_correctable_reg;
>>> +        dev_err_stat[HARDWARE_ERROR_NONFATAL] = dev_err_stat_nonfatal_reg;
>>> +        dev_err_stat[HARDWARE_ERROR_FATAL] = dev_err_stat_fatal_reg;
>>> +    }
>>> +}
>>> +
>>> +static void
>>> +xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>> +{
>>> +    const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>> +    const struct hardware_errors_regs *err_regs;
>>> +    const struct err_msg_cntr_pair *errstat;
>>> +    unsigned long errsrc;
>>> +    unsigned long flags;
>>> +    const char *errmsg;
>>> +    struct xe_gt *mmio;
>>> +    u32 indx;
>>> +    u32 errbit;
>>> +
>>> +    spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
>>> +    err_regs = &tile_to_xe(tile)->hw_err_regs;
>>> +    errstat = err_regs->dev_err_stat[hw_err];
>>> +    mmio = tile->primary_gt;
>>> +    errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
>>> +    if (!errsrc) {
>>> +        drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
>>> +                    "TILE%d detected DEV_ERR_STAT_REG_%s blank!\n",
>>> +                    tile->id, hw_err_str);
>>> +        goto unlock;
>>> +    }
>>> +
>>> +    drm_info(&tile_to_xe(tile)->drm, HW_ERR
>>> +         "TILE%d DEV_ERR_STAT_REG_%s=0x%08lx\n", tile->id, hw_err_str, errsrc);
>>> +
>>> +    for_each_set_bit(errbit, &errsrc, 32) {
>>> +        errmsg = errstat[errbit].errmsg;
>>> +        indx = errstat[errbit].cntr_indx;
>>> +
>>> +        if (hw_err == HARDWARE_ERROR_CORRECTABLE)
>>> +            drm_warn(&tile_to_xe(tile)->drm,
>>> +                 HW_ERR "TILE%d detected %s %s error, bit[%d] is set\n",
>>> +                 tile->id, errmsg, hw_err_str, errbit);
>>> +
>>> +        else
>>> +            drm_err_ratelimited(&tile_to_xe(tile)->drm,
>>> +                        HW_ERR "TILE%d detected %s %s error, bit[%d] is set\n",
>>> +                        tile->id, errmsg, hw_err_str, errbit);
>>> +        tile->errors.count[indx]++;
>> The register here is a top level register and some of the sources have second error level registers
>> so the count shall be at second level source for all those that have and not at global level as here
>> it will not give granularity.
>
> My idea of having counter at top level was to have cumulative numbers for errors.

It doesn't give cumulative, as multiple sub errors can be reported in a single MSI, so always
count at lower level and leave the accounting to userspace.

Thanks,

Aravind.
>
>
> It will provide summation of all MSI's in case of correctable gt. Can we removed but looks logical to retain it.
>
>>> +    }
>>> +
>>> +    xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
>>> +unlock:
>>> +    spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>>> +}
>>> +
>>> +/*
>>> + * XE Platforms adds three Error bits to the Master Interrupt
>>> + * Register to support error handling. These three bits are
>>> + * used to convey the class of error:
>>> + * FATAL, NONFATAL, or CORRECTABLE.
>>> + *
>>> + * To process an interrupt:
>>> + *       Determine source of error (IP block) by reading
>>> + *     the Device Error Source Register (RW1C) that
>>> + *     corresponds to the class of error being serviced
>>> + *     and log the error.
>>> + */
>>> +void
>>> +xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>>> +{
>>> +    enum hardware_error hw_err;
>>> +
>>> +    for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
>>> +        if (master_ctl & XE_ERROR_IRQ(hw_err))
>>> +            xe_hw_error_source_handler(tile, hw_err);
>>> +    }
>>> +}
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
>>> new file mode 100644
>>> index 000000000000..c0c05b9130eb
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>>> @@ -0,0 +1,64 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +#ifndef XE_HW_ERRORS_H_
>>> +#define XE_HW_ERRORS_H_
>>> +
>>> +#include <linux/stddef.h>
>>> +#include <linux/types.h>
>>> +
>>> +/* Error categories reported by hardware */
>>> +enum hardware_error {
>>> +    HARDWARE_ERROR_CORRECTABLE = 0,
>>> +    HARDWARE_ERROR_NONFATAL = 1,
>>> +    HARDWARE_ERROR_FATAL = 2,
>>> +    HARDWARE_ERROR_MAX,
>>> +};
>>> +
>>> +/* Count of  Correctable and Uncorrectable errors reported on tile */
>>> +enum xe_tile_hw_errors {
>>> +    XE_TILE_HW_ERR_GT_FATAL = 0,
>>> +    XE_TILE_HW_ERR_SGGI_FATAL,
>>> +    XE_TILE_HW_ERR_DISPLAY_FATAL,
>>> +    XE_TILE_HW_ERR_SGDI_FATAL,
>>> +    XE_TILE_HW_ERR_SGLI_FATAL,
>>> +    XE_TILE_HW_ERR_SGUNIT_FATAL,
>>> +    XE_TILE_HW_ERR_SGCI_FATAL,
>>> +    XE_TILE_HW_ERR_GSC_FATAL,
>>> +    XE_TILE_HW_ERR_SOC_FATAL,
>>> +    XE_TILE_HW_ERR_MERT_FATAL,
>>> +    XE_TILE_HW_ERR_SGMI_FATAL,
>>> +    XE_TILE_HW_ERR_UNKNOWN_FATAL,
>>> +    XE_TILE_HW_ERR_SGGI_NONFATAL,
>>> +    XE_TILE_HW_ERR_DISPLAY_NONFATAL,
>>> +    XE_TILE_HW_ERR_SGDI_NONFATAL,
>>> +    XE_TILE_HW_ERR_SGLI_NONFATAL,
>>> +    XE_TILE_HW_ERR_GT_NONFATAL,
>>> +    XE_TILE_HW_ERR_SGUNIT_NONFATAL,
>>> +    XE_TILE_HW_ERR_SGCI_NONFATAL,
>>> +    XE_TILE_HW_ERR_GSC_NONFATAL,
>>> +    XE_TILE_HW_ERR_SOC_NONFATAL,
>>> +    XE_TILE_HW_ERR_MERT_NONFATAL,
>>> +    XE_TILE_HW_ERR_SGMI_NONFATAL,
>>> +    XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
>>> +    XE_TILE_HW_ERR_GT_CORR,
>>> +    XE_TILE_HW_ERR_DISPLAY_CORR,
>>> +    XE_TILE_HW_ERR_SGUNIT_CORR,
>>> +    XE_TILE_HW_ERR_GSC_CORR,
>>> +    XE_TILE_HW_ERR_SOC_CORR,
>>> +    XE_TILE_HW_ERR_UNKNOWN_CORR,
>>> +    XE_TILE_HW_ERROR_MAX,
>>> +};
>>> +
>>> +struct err_msg_cntr_pair {
>>> +    const char *errmsg;
>>> +    const u32 cntr_indx;
>>> +};
>>> +
>>> +struct xe_device;
>>> +struct xe_tile;
>>> +
>>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
>>> +void xe_assign_hw_err_regs(struct xe_device *xe);
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>> index 1dee3e832eb5..48b933234342 100644
>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>> @@ -418,6 +418,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>           xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>>             gt_irq_handler(tile, master_ctl, intr_dw, identity);
>>> +        xe_hw_error_irq_handler(tile, master_ctl);
>>>             /*
>>>            * Display interrupts (including display backlight operations
>>> @@ -572,6 +573,8 @@ int xe_irq_install(struct xe_device *xe)
>>>           return -EINVAL;
>>>       }
>>>   +    xe_assign_hw_err_regs(xe);
>>> +
>>>       xe->irq.enabled = true;
>>>         xe_irq_reset(xe);
>> Thanks,
>> Aravind.