[Intel-xe] [PATCH v5 1/4] drm/xe: Handle errors from various components.
Aravind Iddamsetty
aravind.iddamsetty at linux.intel.com
Tue Sep 26 10:09:11 UTC 2023
On 26/09/23 10:27, Ghimiray, Himal Prasad wrote:
>
> On 26-09-2023 09:50, Aravind Iddamsetty wrote:
>> On 23/08/23 14:28, Himal Prasad Ghimiray wrote:
>>> The GFX device can generate numbers of classes of error under the new
>>> infrastructure: correctable, non-fatal, and fatal errors.
>> The GFX device reports two classes of errors: uncorrectable and correctable.
>> Depending on the severity, uncorrectable errors are further classified as non-fatal and fatal.
>>> The non-fatal and fatal error classes distinguish between levels of
>>> severity for uncorrectable errors. Driver will only handle logging
>>> of errors and updating counters from various components within the
>>> graphics device. Anything more will be handled at system level.
>>>
>>> For errors that will route as interrupts, three bits in the Master
>>> Interrupt Register will be used to convey the class of error.
>>>
>>> For each class of error: Determine source of error (IP block) by reading
>>> the Device Error Source Register (RW1C) that
>>> corresponds to the class of error being serviced.
>>>
>>> Bspec: 50875, 53073, 53074, 53075
>> Also, maybe you want to squash this with the last patch, where fatal error processing is done:
>> fatal errors are defined here but processed in your last patch. Alternatively, move all fatal definitions to the last patch.
> Makes sense. Will squash in last patch.
>>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>>> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
>>> Cc: Matthew Brost <matthew.brost at intel.com>
>>> Cc: Matt Roper <matthew.d.roper at intel.com>
>>> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
>>> Cc: Jani Nikula <jani.nikula at intel.com>
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/Makefile | 1 +
>>> drivers/gpu/drm/xe/regs/xe_regs.h | 2 +-
>>> drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 15 ++
>>> drivers/gpu/drm/xe/xe_device_types.h | 11 +
>>> drivers/gpu/drm/xe/xe_hw_error.c | 211 +++++++++++++++++++
>>> drivers/gpu/drm/xe/xe_hw_error.h | 64 ++++++
>>> drivers/gpu/drm/xe/xe_irq.c | 3 +
>>> 7 files changed, 306 insertions(+), 1 deletion(-)
>>> create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>>> create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>>> index 550cdfed729e..6290c8ce0e84 100644
>>> --- a/drivers/gpu/drm/xe/Makefile
>>> +++ b/drivers/gpu/drm/xe/Makefile
>>> @@ -75,6 +75,7 @@ xe-y += xe_bb.o \
>>> xe_guc_submit.o \
>>> xe_hw_engine.o \
>>> xe_hw_engine_class_sysfs.o \
>>> + xe_hw_error.o \
>>> xe_hw_fence.o \
>>> xe_huc.o \
>>> xe_huc_debugfs.o \
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> index 39d7b0740bf0..e223975a5acf 100644
>>> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
>>> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> @@ -88,7 +88,7 @@
>>> #define GU_MISC_IRQ REG_BIT(29)
>>> #define DISPLAY_IRQ REG_BIT(16)
>>> #define GT_DW_IRQ(x) REG_BIT(x)
>>> +#define XE_ERROR_IRQ(x) REG_BIT(26 + (x))
>>> #define PVC_RP_STATE_CAP XE_REG(0x281014)
>>> -
>>> #endif
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> new file mode 100644
>>> index 000000000000..db78d6687213
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> @@ -0,0 +1,15 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +#ifndef XE_TILE_ERROR_REGS_H_
>>> +#define XE_TILE_ERROR_REGS_H_
>>> +
>>> +#include <linux/stddef.h>
>>> +
>>> +#define _DEV_ERR_STAT_NONFATAL 0x100178
>>> +#define _DEV_ERR_STAT_CORRECTABLE 0x10017c
>>> +#define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
>>> + _DEV_ERR_STAT_CORRECTABLE, \
>>> + _DEV_ERR_STAT_NONFATAL))
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>> index dbb732e14606..4e4184977709 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -14,6 +14,7 @@
>>> #include "xe_devcoredump_types.h"
>>> #include "xe_gt_types.h"
>>> +#include "xe_hw_error.h"
>>> #include "xe_platform_types.h"
>>> #include "xe_step_types.h"
>>> @@ -172,6 +173,11 @@ struct xe_tile {
>>> /** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>>> struct kobject *sysfs;
>>> +
>>> + /** @tile_hw_errors: hardware errors reported for the tile */
>>> + struct tile_hw_errors {
>>> + unsigned long count[XE_TILE_HW_ERROR_MAX];
>>> + } errors;
>>> };
>>> /**
>>> @@ -359,6 +365,11 @@ struct xe_device {
>>> */
>>> struct task_struct *pm_callback_task;
>>> + /** @hardware_errors_regs: list of hw error regs*/
>>> + struct hardware_errors_regs {
>>> + const struct err_msg_cntr_pair *dev_err_stat[HARDWARE_ERROR_MAX];
>> I'm just thinking if it makes sense to move it to respective structs like tile or gt, any thoughts?
> These structures are platform dependent, not tile/gt dependent. IMO the device is the right place.
They may be platform specific, but I notice that dev_err_stat is accounted in tile_hw_errors and, similarly,
err_stat_gt is accounted in gt_hw_errors, so they are associated with the tile and the gt.
>>> + } hw_err_regs;
>>> +
>>> /* private: */
>>> #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>>> new file mode 100644
>>> index 000000000000..357d0f962d91
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>>> @@ -0,0 +1,211 @@
>>> +// SPDX-License-Identifier: MIT
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +
>>> +#include "xe_hw_error.h"
>>> +
>>> +#include "regs/xe_regs.h"
>>> +#include "regs/xe_tile_error_regs.h"
>>> +#include "xe_device.h"
>>> +#include "xe_mmio.h"
>>> +
>>> +static const char *
>>> +hardware_error_type_to_str(const enum hardware_error hw_err)
>>> +{
>>> + switch (hw_err) {
>>> + case HARDWARE_ERROR_CORRECTABLE:
>>> + return "CORRECTABLE";
>>> + case HARDWARE_ERROR_NONFATAL:
>>> + return "NONFATAL";
>>> + case HARDWARE_ERROR_FATAL:
>>> + return "FATAL";
>>> + default:
>>> + return "UNKNOWN";
>>> + }
>>> +}
>>> +
>>> +static const struct err_msg_cntr_pair dg2_err_stat_fatal_reg[] = {
>> The name err_msg_cntr_pair might not be appropriate, as it holds an error name and an index into
>> tile_hw_errors. err_name_index_pair? Thoughts?
> ok.
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_FATAL},
>>> + [1 ... 3] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [4] = {"DISPLAY", XE_TILE_HW_ERR_DISPLAY_FATAL},
>>> + [5 ... 7] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [8] = {"GSC error", XE_TILE_HW_ERR_GSC_FATAL},
>>> + [9 ... 11] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [12] = {"SGUNIT", XE_TILE_HW_ERR_SGUNIT_FATAL},
>>> + [13 ... 15] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [16] = {"SOC", XE_TILE_HW_ERR_SOC_FATAL},
>>> + [17 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dg2_err_stat_nonfatal_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_NONFATAL},
>>> + [1 ... 3] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [4] = {"DISPLAY", XE_TILE_HW_ERR_DISPLAY_NONFATAL},
>>> + [5 ... 7] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [8] = {"GSC error", XE_TILE_HW_ERR_GSC_NONFATAL},
>>> + [9 ... 11] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [12] = {"SGUNIT", XE_TILE_HW_ERR_SGUNIT_NONFATAL},
>>> + [13 ... 15] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [16] = {"SOC", XE_TILE_HW_ERR_SOC_NONFATAL},
>>> + [17 ... 19] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [20] = {"MERT", XE_TILE_HW_ERR_MERT_NONFATAL},
>>> + [21 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dg2_err_stat_correctable_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_CORR},
>>> + [1 ... 3] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + [4] = {"DISPLAY", XE_TILE_HW_ERR_DISPLAY_CORR},
>>> + [5 ... 7] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + [8] = {"GSC error", XE_TILE_HW_ERR_GSC_CORR},
>>> + [9 ... 11] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + [12] = {"SGUNIT", XE_TILE_HW_ERR_SGUNIT_CORR},
>>> + [13 ... 15] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + [16] = {"SOC", XE_TILE_HW_ERR_SOC_CORR},
>>> + [17 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair pvc_err_stat_fatal_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_FATAL},
>>> + [1] = {"SGGI Cmd Parity", XE_TILE_HW_ERR_SGGI_FATAL},
>>> + [2 ... 7] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [8] = {"GSC error", XE_TILE_HW_ERR_GSC_FATAL},
>>> + [9] = {"SGLI Cmd Parity", XE_TILE_HW_ERR_SGLI_FATAL},
>>> + [10 ... 12] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [13] = {"SGCI Cmd Parity", XE_TILE_HW_ERR_SGCI_FATAL},
>>> + [14 ... 15] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [16] = {"SOC ERROR", XE_TILE_HW_ERR_SOC_FATAL},
>>> + [17 ... 19] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + [20] = {"MERT Cmd Parity", XE_TILE_HW_ERR_MERT_FATAL},
>>> + [21 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair pvc_err_stat_nonfatal_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_NONFATAL},
>>> + [1] = {"SGGI Data Parity", XE_TILE_HW_ERR_SGGI_NONFATAL},
>>> + [2 ... 7] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [8] = {"GSC", XE_TILE_HW_ERR_GSC_NONFATAL},
>>> + [9] = {"SGLI Data Parity", XE_TILE_HW_ERR_SGLI_NONFATAL},
>>> + [10 ... 12] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [13] = {"SGCI Data Parity", XE_TILE_HW_ERR_SGCI_NONFATAL},
>>> + [14 ... 15] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [16] = {"SOC", XE_TILE_HW_ERR_SOC_NONFATAL},
>>> + [17 ... 19] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + [20] = {"MERT Data Parity", XE_TILE_HW_ERR_MERT_NONFATAL},
>>> + [21 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair pvc_err_stat_correctable_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_CORR},
>>> + [1 ... 7] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + [8] = {"GSC", XE_TILE_HW_ERR_GSC_CORR},
>>> + [9 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dev_err_stat_fatal_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_FATAL},
>>> + [1 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dev_err_stat_nonfatal_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_NONFATAL},
>>> + [1 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +};
>>> +
>>> +static const struct err_msg_cntr_pair dev_err_stat_correctable_reg[] = {
>>> + [0] = {"GT", XE_TILE_HW_ERR_GT_CORR},
>>> + [1 ... 31] = {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +void xe_assign_hw_err_regs(struct xe_device *xe)
>>> +{
>>> + const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
>>> +
>>> + if (xe->info.platform == XE_DG2) {
>>> + dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dg2_err_stat_correctable_reg;
>>> + dev_err_stat[HARDWARE_ERROR_NONFATAL] = dg2_err_stat_nonfatal_reg;
>>> + dev_err_stat[HARDWARE_ERROR_FATAL] = dg2_err_stat_fatal_reg;
>>> + } else if (xe->info.platform == XE_PVC) {
>>> + dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_correctable_reg;
>>> + dev_err_stat[HARDWARE_ERROR_NONFATAL] = pvc_err_stat_nonfatal_reg;
>>> + dev_err_stat[HARDWARE_ERROR_FATAL] = pvc_err_stat_fatal_reg;
>>> + } else {
>>> + /* For other platforms report only GT errors */
>> why only GT errors??
>
> Because only GT errors are common to all platforms. The other errors are platform specific.
and why are we not enabling those other errors?
>
>>> + dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dev_err_stat_correctable_reg;
>>> + dev_err_stat[HARDWARE_ERROR_NONFATAL] = dev_err_stat_nonfatal_reg;
>>> + dev_err_stat[HARDWARE_ERROR_FATAL] = dev_err_stat_fatal_reg;
>>> + }
>>> +}
>>> +
>>> +static void
>>> +xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>> +{
>>> + const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>> + const struct hardware_errors_regs *err_regs;
>>> + const struct err_msg_cntr_pair *errstat;
>>> + unsigned long errsrc;
>>> + unsigned long flags;
>>> + const char *errmsg;
>>> + struct xe_gt *mmio;
>>> + u32 indx;
>>> + u32 errbit;
>>> +
>>> + spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
>>> + err_regs = &tile_to_xe(tile)->hw_err_regs;
>>> + errstat = err_regs->dev_err_stat[hw_err];
>>> + mmio = tile->primary_gt;
>>> + errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
>>> + if (!errsrc) {
>>> + drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
>>> + "TILE%d detected DEV_ERR_STAT_REG_%s blank!\n",
>>> + tile->id, hw_err_str);
>>> + goto unlock;
>>> + }
>>> +
>>> + drm_info(&tile_to_xe(tile)->drm, HW_ERR
>>> + "TILE%d DEV_ERR_STAT_REG_%s=0x%08lx\n", tile->id, hw_err_str, errsrc);
>>> +
>>> + for_each_set_bit(errbit, &errsrc, 32) {
>>> + errmsg = errstat[errbit].errmsg;
>>> + indx = errstat[errbit].cntr_indx;
>>> +
>>> + if (hw_err == HARDWARE_ERROR_CORRECTABLE)
>>> + drm_warn(&tile_to_xe(tile)->drm,
>>> + HW_ERR "TILE%d detected %s %s error, bit[%d] is set\n",
>>> + tile->id, errmsg, hw_err_str, errbit);
>>> +
>>> + else
>>> + drm_err_ratelimited(&tile_to_xe(tile)->drm,
>>> + HW_ERR "TILE%d detected %s %s error, bit[%d] is set\n",
>>> + tile->id, errmsg, hw_err_str, errbit);
>>> + tile->errors.count[indx]++;
>> The register here is a top-level register, and some of the sources have second-level error registers,
>> so the count should be kept at the second-level source for all those that have one, and not at the global level as here,
>> since that will not give granularity.
>
> My idea of having counter at top level was to have cumulative numbers for errors.
It doesn't give a cumulative count, as multiple sub-errors can be reported in a single MSI, so always
count at the lower level and leave the accounting to userspace.
Thanks,
Aravind.
>
>
> It will provide a summation of all MSIs in the case of correctable GT errors. It can be removed, but it looks logical to retain it.
>
>>> + }
>>> +
>>> + xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
>>> +unlock:
>>> + spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>>> +}
>>> +
>>> +/*
>>> + * XE Platforms adds three Error bits to the Master Interrupt
>>> + * Register to support error handling. These three bits are
>>> + * used to convey the class of error:
>>> + * FATAL, NONFATAL, or CORRECTABLE.
>>> + *
>>> + * To process an interrupt:
>>> + * Determine source of error (IP block) by reading
>>> + * the Device Error Source Register (RW1C) that
>>> + * corresponds to the class of error being serviced
>>> + * and log the error.
>>> + */
>>> +void
>>> +xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>>> +{
>>> + enum hardware_error hw_err;
>>> +
>>> + for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
>>> + if (master_ctl & XE_ERROR_IRQ(hw_err))
>>> + xe_hw_error_source_handler(tile, hw_err);
>>> + }
>>> +}
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
>>> new file mode 100644
>>> index 000000000000..c0c05b9130eb
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>>> @@ -0,0 +1,64 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +#ifndef XE_HW_ERRORS_H_
>>> +#define XE_HW_ERRORS_H_
>>> +
>>> +#include <linux/stddef.h>
>>> +#include <linux/types.h>
>>> +
>>> +/* Error categories reported by hardware */
>>> +enum hardware_error {
>>> + HARDWARE_ERROR_CORRECTABLE = 0,
>>> + HARDWARE_ERROR_NONFATAL = 1,
>>> + HARDWARE_ERROR_FATAL = 2,
>>> + HARDWARE_ERROR_MAX,
>>> +};
>>> +
>>> +/* Count of Correctable and Uncorrectable errors reported on tile */
>>> +enum xe_tile_hw_errors {
>>> + XE_TILE_HW_ERR_GT_FATAL = 0,
>>> + XE_TILE_HW_ERR_SGGI_FATAL,
>>> + XE_TILE_HW_ERR_DISPLAY_FATAL,
>>> + XE_TILE_HW_ERR_SGDI_FATAL,
>>> + XE_TILE_HW_ERR_SGLI_FATAL,
>>> + XE_TILE_HW_ERR_SGUNIT_FATAL,
>>> + XE_TILE_HW_ERR_SGCI_FATAL,
>>> + XE_TILE_HW_ERR_GSC_FATAL,
>>> + XE_TILE_HW_ERR_SOC_FATAL,
>>> + XE_TILE_HW_ERR_MERT_FATAL,
>>> + XE_TILE_HW_ERR_SGMI_FATAL,
>>> + XE_TILE_HW_ERR_UNKNOWN_FATAL,
>>> + XE_TILE_HW_ERR_SGGI_NONFATAL,
>>> + XE_TILE_HW_ERR_DISPLAY_NONFATAL,
>>> + XE_TILE_HW_ERR_SGDI_NONFATAL,
>>> + XE_TILE_HW_ERR_SGLI_NONFATAL,
>>> + XE_TILE_HW_ERR_GT_NONFATAL,
>>> + XE_TILE_HW_ERR_SGUNIT_NONFATAL,
>>> + XE_TILE_HW_ERR_SGCI_NONFATAL,
>>> + XE_TILE_HW_ERR_GSC_NONFATAL,
>>> + XE_TILE_HW_ERR_SOC_NONFATAL,
>>> + XE_TILE_HW_ERR_MERT_NONFATAL,
>>> + XE_TILE_HW_ERR_SGMI_NONFATAL,
>>> + XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
>>> + XE_TILE_HW_ERR_GT_CORR,
>>> + XE_TILE_HW_ERR_DISPLAY_CORR,
>>> + XE_TILE_HW_ERR_SGUNIT_CORR,
>>> + XE_TILE_HW_ERR_GSC_CORR,
>>> + XE_TILE_HW_ERR_SOC_CORR,
>>> + XE_TILE_HW_ERR_UNKNOWN_CORR,
>>> + XE_TILE_HW_ERROR_MAX,
>>> +};
>>> +
>>> +struct err_msg_cntr_pair {
>>> + const char *errmsg;
>>> + const u32 cntr_indx;
>>> +};
>>> +
>>> +struct xe_device;
>>> +struct xe_tile;
>>> +
>>> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
>>> +void xe_assign_hw_err_regs(struct xe_device *xe);
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>> index 1dee3e832eb5..48b933234342 100644
>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>> @@ -418,6 +418,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>> xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>> gt_irq_handler(tile, master_ctl, intr_dw, identity);
>>> + xe_hw_error_irq_handler(tile, master_ctl);
>>> /*
>>> * Display interrupts (including display backlight operations
>>> @@ -572,6 +573,8 @@ int xe_irq_install(struct xe_device *xe)
>>> return -EINVAL;
>>> }
>>> + xe_assign_hw_err_regs(xe);
>>> +
>>> xe->irq.enabled = true;
>>> xe_irq_reset(xe);
>> Thanks,
>> Aravind.
More information about the Intel-xe
mailing list