[Intel-xe] [PATCH v2 1/4] drm/xe: Handle errors from various components.
Ghimiray, Himal Prasad
himal.prasad.ghimiray at intel.com
Thu Aug 10 09:38:03 UTC 2023
On 10-08-2023 15:01, Ghimiray, Himal Prasad wrote:
>
>
> On 10-08-2023 13:24, Jani Nikula wrote:
>> On Thu, 10 Aug 2023, Himal Prasad Ghimiray<himal.prasad.ghimiray at intel.com> wrote:
>>> The GFX device can generate numbers of classes of error under the new
>>> infrastructure: correctable, non-fatal, and fatal errors.
>>>
>>> The non-fatal and fatal error classes distinguish between levels of
>>> severity for uncorrectable errors. Driver will only handle logging
>>> of errors and updating counters from various components within the
>>> graphics device. Anything more will be handled at system level.
>>>
>>> For errors that will route as interrupts, three bits in the Master
>>> Interrupt Register will be used to convey the class of error.
>>>
>>> For each class of error: Determine source of error (IP block) by reading
>>> the Device Error Source Register (RW1C) that
>>> corresponds to the class of error being serviced.
>>>
>>> Bspec: 50875, 53073, 53074, 53075
>>>
>>> Cc: Rodrigo Vivi<rodrigo.vivi at intel.com>
>>> Cc: Aravind Iddamsetty<aravind.iddamsetty at intel.com>
>>> Cc: Matthew Brost<matthew.brost at intel.com>
>>> Cc: Matt Roper<matthew.d.roper at intel.com>
>>> Cc: Joonas Lahtinen<joonas.lahtinen at linux.intel.com>
>>> Cc: Jani Nikula<jani.nikula at intel.com>
>>> Signed-off-by: Himal Prasad Ghimiray<himal.prasad.ghimiray at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/regs/xe_regs.h | 7 +
>>> drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 108 +++++++++
>>> drivers/gpu/drm/xe/xe_device_types.h | 6 +
>>> drivers/gpu/drm/xe/xe_irq.c | 220 +++++++++++++++++++
>>> 4 files changed, 341 insertions(+)
>>> create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> index ec45b1ba9db1..9901e55fc89c 100644
>>> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
>>> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> @@ -87,7 +87,14 @@
>>> #define GU_MISC_IRQ REG_BIT(29)
>>> #define DISPLAY_IRQ REG_BIT(16)
>>> #define GT_DW_IRQ(x) REG_BIT(x)
>>> +#define XE_ERROR_IRQ(x) REG_BIT(26 + (x))
>>>
>>> #define PVC_RP_STATE_CAP XE_REG(0x281014)
>>>
>>> +enum hardware_error {
>>> + HARDWARE_ERROR_CORRECTABLE = 0,
>>> + HARDWARE_ERROR_NONFATAL = 1,
>>> + HARDWARE_ERROR_FATAL = 2,
>>> + HARDWARE_ERROR_MAX,
>>> +};
>> This file is about registers. IMO enums belong somewhere else. Define
>> hardware registers using macros.
>
> Hmm. Will look for better placeholder for this enum.
>
>>> #endif
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> new file mode 100644
>>> index 000000000000..fbb794b2f183
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> @@ -0,0 +1,108 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +#ifndef XE_TILE_ERROR_REGS_H_
>>> +#define XE_TILE_ERROR_REGS_H_
>>> +
>>> +#include <linux/stddef.h>
>>> +
>>> +#define _DEV_ERR_STAT_NONFATAL 0x100178
>>> +#define _DEV_ERR_STAT_CORRECTABLE 0x10017c
>>> +#define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \
>>> + _DEV_ERR_STAT_CORRECTABLE, \
>>> + _DEV_ERR_STAT_NONFATAL))
>>> +
>>> +#define DEV_ERR_STAT_MAX_ERROR_BIT (21)
>>> +
>>> +/* Count of Correctable and Uncorrectable errors reported on tile */
>>> +enum xe_tile_hw_errors {g
>>> + XE_TILE_HW_ERR_GT_FATAL = 0,
>>> + XE_TILE_HW_ERR_SGGI_FATAL,
>>> + XE_TILE_HW_ERR_DISPLAY_FATAL,
>>> + XE_TILE_HW_ERR_SGDI_FATAL,
>>> + XE_TILE_HW_ERR_SGLI_FATAL,
>>> + XE_TILE_HW_ERR_SGUNIT_FATAL,
>>> + XE_TILE_HW_ERR_SGCI_FATAL,
>>> + XE_TILE_HW_ERR_GSC_FATAL,
>>> + XE_TILE_HW_ERR_SOC_FATAL,
>>> + XE_TILE_HW_ERR_MERT_FATAL,
>>> + XE_TILE_HW_ERR_SGMI_FATAL,
>>> + XE_TILE_HW_ERR_UNKNOWN_FATAL,
>>> + XE_TILE_HW_ERR_SGGI_NONFATAL,
>>> + XE_TILE_HW_ERR_DISPLAY_NONFATAL,
>>> + XE_TILE_HW_ERR_SGDI_NONFATAL,
>>> + XE_TILE_HW_ERR_SGLI_NONFATAL,
>>> + XE_TILE_HW_ERR_GT_NONFATAL,
>>> + XE_TILE_HW_ERR_SGUNIT_NONFATAL,
>>> + XE_TILE_HW_ERR_SGCI_NONFATAL,
>>> + XE_TILE_HW_ERR_GSC_NONFATAL,
>>> + XE_TILE_HW_ERR_SOC_NONFATAL,
>>> + XE_TILE_HW_ERR_MERT_NONFATAL,
>>> + XE_TILE_HW_ERR_SGMI_NONFATAL,
>>> + XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
>>> + XE_TILE_HW_ERR_GT_CORR,
>>> + XE_TILE_HW_ERR_DISPLAY_CORR,
>>> + XE_TILE_HW_ERR_SGUNIT_CORR,
>>> + XE_TILE_HW_ERR_GSC_CORR,
>>> + XE_TILE_HW_ERR_SOC_CORR,
>>> + XE_TILE_HW_ERR_UNKNOWN_CORR,
>>> +};
>> Ditto about enums and regs.
> Will address.
>>> +
>>> +#define XE_TILE_HW_ERROR_MAX (XE_TILE_HW_ERR_UNKNOWN_CORR + 1)
>> If it's an enum, adding that last in the enum does the trick.
> Makes sense.
>>> +
>>> +#define PVC_DEV_ERR_STAT_FATAL_MASK \
>>> + (REG_BIT(0) | \
>>> + REG_BIT(1) | \
>>> + REG_BIT(8) | \
>>> + REG_BIT(9) | \
>>> + REG_BIT(13) | \
>>> + REG_BIT(16) | \
>>> + REG_BIT(20))
>>> +
>>> +#define PVC_DEV_ERR_STAT_NONFATAL_MASK \
>>> + (REG_BIT(0) | \
>>> + REG_BIT(1) | \
>>> + REG_BIT(8) | \
>>> + REG_BIT(9) | \
>>> + REG_BIT(13) | \
>>> + REG_BIT(16) | \
>>> + REG_BIT(20))
>>> +
>>> +#define PVC_DEV_ERR_STAT_CORRECTABLE_MASK \
>>> + (REG_BIT(0) | \
>>> + REG_BIT(8))
>>> +
>>> +#define DG2_DEV_ERR_STAT_FATAL_MASK \
>>> + (REG_BIT(0) | \
>>> + REG_BIT(4) | \
>>> + REG_BIT(8) | \
>>> + REG_BIT(12) | \
>>> + REG_BIT(16))
>>> +
>>> +#define DG2_DEV_ERR_STAT_NONFATAL_MASK \
>>> + (REG_BIT(0) | \
>>> + REG_BIT(4) | \
>>> + REG_BIT(8) | \
>>> + REG_BIT(12) | \
>>> + REG_BIT(16) | \
>>> + REG_BIT(20))
>>> +
>>> +#define DG2_DEV_ERR_STAT_CORRECTABLE_MASK \
>>> + (REG_BIT(0) | \
>>> + REG_BIT(4) | \
>>> + REG_BIT(8) | \
>>> + REG_BIT(12) | \
>>> + REG_BIT(16))
>> Are the above supposed to match what's in xe_tile_hw_errors? Seems
>> rather unmaintainable.
> xe_tile_hw_errors contains superset of applicable platforms and mask determines
> what are applicable bits for a platform.
>>> +
>>> +#define REG_SIZE 32
>>> +
>>> +#define xe_tile_log_hw_err(tile, fmt, ...) \
>>> + drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
>>> + tile->id, ##__VA_ARGS__)
>>> +
>>> +#define xe_tile_log_hw_warn(tile, fmt, ...) \
>>> + drm_warn(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
>>> + tile->id, ##__VA_ARGS__)
>> Do we really want to keep adding new macros for all possible scenarios
>> in the driver? This is getting out of hand.
>>
>> Where's HW_ERR defined?
> include/linux/printk.h defines HW_ERR.
>>> +
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>> index f84ecb976f5d..1335ba74981a 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -16,6 +16,7 @@
>>> #include "xe_gt_types.h"
>>> #include "xe_platform_types.h"
>>> #include "xe_step_types.h"
>>> +#include "regs/xe_tile_error_regs.h"
>>>
>>> #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>>> #include "ext/intel_device_info.h"
>>> @@ -166,6 +167,11 @@ struct xe_tile {
>>>
>>> /** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>>> struct kobject *sysfs;
>>> +
>>> + /** @tile_hw_errors: hardware errors reported for the tile */
>>> + struct tile_hw_errors {
>>> + unsigned long hw[XE_TILE_HW_ERROR_MAX];
>> Even with the documentation comment, I have to look up the source code
>> to realize this is the *number* of errors for each class.
>>
>> Maybe "count" is more informative than "hw".
> Ok.
>>> + } errors;
>>> };
>>>
>>> /**
>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>> index 2022a5643e01..04a665faea23 100644
>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>> @@ -362,6 +362,223 @@ static void dg1_intr_enable(struct xe_device *xe, bool stall)
>>> xe_mmio_read32(mmio, DG1_MSTR_TILE_INTR);
>>> }
>>>
>>> +static const char *
>>> +hardware_error_type_to_str(const enum hardware_error hw_err)
>>> +{
>>> + switch (hw_err) {
>>> + case HARDWARE_ERROR_CORRECTABLE:
>>> + return "CORRECTABLE";
>>> + case HARDWARE_ERROR_NONFATAL:
>>> + return "NONFATAL";
>>> + case HARDWARE_ERROR_FATAL:
>>> + return "FATAL";
>>> + default:
>>> + return "UNKNOWN";
>>> + }
>>> +}
>>> +
>>> +struct error_msg_counter_pair {
>>> + const char *errmsg;
>>> + int errcounter;
>> Counter? Or type/class/whatever?
>>
>>> +};
>>> +
>>> +struct error_msg_counter_pair dev_err_stat_fatal_reg[] = {
>>> + {"GT", XE_TILE_HW_ERR_GT_FATAL /* Bit Pos 0 */},
>> Does this again tie the enums and the bit positions together, similar to
>> how the mask macros also do above?
>>
>> There needs to be a single point of truth for all of this.
>>
>> I think this needs a redesign.
>
> As commented above this is array which defines all valid bit positions
> irrespective of platform. Mask was
>
> to determine platform specific applicability.
>
>> BR,
>> Jani.
>>
>>> + {"SGGI Cmd Parity", XE_TILE_HW_ERR_SGGI_FATAL /* Bit Pos 1 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"DISPLAY", XE_TILE_HW_ERR_DISPLAY_FATAL /* Bit Pos 4 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"GSC error", XE_TILE_HW_ERR_GSC_FATAL /* Bit Pos 8 */},
>>> + {"SGLI Cmd Parity", XE_TILE_HW_ERR_SGLI_FATAL /* Bit Pos 9 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"SGUNIT", XE_TILE_HW_ERR_SGUNIT_FATAL /* Bit Pos 12 */},
>>> + {"SGCI Cmd Parity", XE_TILE_HW_ERR_SGCI_FATAL /* Bit Pos 13 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"SOC ERROR", XE_TILE_HW_ERR_SOC_FATAL /* Bit Pos 16 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> + {"MERT Cmd Parity", XE_TILE_HW_ERR_MERT_FATAL /* Bit Pos 20 */},
>>> +};
>>> +
>>> +struct error_msg_counter_pair dev_err_stat_nonfatal_reg[] = {
>>> + {"GT", XE_TILE_HW_ERR_GT_NONFATAL /* Bit Pos 0 */},
>>> + {"SGGI Data Parity", XE_TILE_HW_ERR_SGGI_NONFATAL /* Bit Pos 1 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"DISPLAY", XE_TILE_HW_ERR_DISPLAY_NONFATAL /* Bit Pos 4 */},
>>> + {"SGDI Data Parity", XE_TILE_HW_ERR_SGDI_NONFATAL /* Bit Pos 5 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"GSC", XE_TILE_HW_ERR_GSC_NONFATAL /* Bit Pos 8 */},
>>> + {"SGLI Data Parity", XE_TILE_HW_ERR_SGLI_NONFATAL /* Bit Pos 9 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"SGUNIT", XE_TILE_HW_ERR_SGUNIT_NONFATAL /* Bit Pos 12 */},
>>> + {"SGCI Data Parity", XE_TILE_HW_ERR_SGCI_NONFATAL /* Bit Pos 13 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"SOC", XE_TILE_HW_ERR_SOC_NONFATAL /* Bit Pos 16 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL /* Bit Pos 17 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> + {"MERT Data Parity", XE_TILE_HW_ERR_MERT_NONFATAL /* Bit Pos 20 */},
>>> +};
>>> +
>>> +struct error_msg_counter_pair dev_err_stat_correctable_reg[] = {
>>> + {"GT", XE_TILE_HW_ERR_GT_CORR /* Bit Pos 0 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"DISPLAY", XE_TILE_HW_ERR_DISPLAY_CORR /* Bit Pos 4 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"GSC", XE_TILE_HW_ERR_GSC_CORR /* Bit Pos 8 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"SGUNIT", XE_TILE_HW_ERR_SGUNIT_CORR /* Bit Pos 12 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"SOC", XE_TILE_HW_ERR_SOC_CORR /* Bit Pos 16 */},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> + {"Undefined", XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +static void update_valid_error_regs(struct xe_device *xe)
>>> +{
>>> + unsigned long mask = 0;
>>> +
>>> + u32 i;
>>> +
>>> + if (xe->info.platform == XE_DG2) {
>>> + mask = ~(0 | DG2_DEV_ERR_STAT_FATAL_MASK);
>>> + for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> + dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
>>> + {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
>> Nope. For one thing, the arrays really should be static const, placed in
>> rodata, and not mutable.
> My Idea of keeping it mutable is to avoid multiple platform specific
> arrays.
>> For another, if you have a platform with two or more different devices,
>> whichever gets probed last clobbers the data.
>
> Thanks for pointing it out. I had missed this particular scenario. I
> believe defining platform specific array is better
>
> because then we can ensure them to be immutable and we will not be
> required to have platform specific mask also.
>
> I believe this will help with maintainability too.
>
> BR
>
> Himal Ghimiray
>
>>> +
>>> + mask = ~(0 | DG2_DEV_ERR_STAT_NONFATAL_MASK);
>>> + for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> + dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
>>> + {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
>>> +
>>> + mask = ~(0 | DG2_DEV_ERR_STAT_CORRECTABLE_MASK);
>>> + for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> + dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
>>> + {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
>>> + } else if (xe->info.platform == XE_PVC) {
>>> + mask = ~(0 | PVC_DEV_ERR_STAT_FATAL_MASK);
>>> + for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> + dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
>>> + {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
>>> +
>>> + mask = ~(0 | PVC_DEV_ERR_STAT_NONFATAL_MASK);
>>> + for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> + dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
>>> + {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
>>> +
>>> + mask = ~(0 | PVC_DEV_ERR_STAT_CORRECTABLE_MASK);
>>> + for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> + dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
>>> + {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
>>> + }
>>> +}
>>> +
>>> +static void
>>> +xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>> +{
>>> + const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>> + struct error_msg_counter_pair *errstat;
>>> + unsigned long errsrc;
>>> + unsigned long flags;
>>> + const char *errmsg;
>>> + struct xe_gt *mmio;
>>> + u32 counter;
>>> + u32 errcntr;
>>> + u32 errbit;
>>> +
>>> + switch (hw_err) {
>>> + case HARDWARE_ERROR_FATAL:
>>> + errstat = (struct error_msg_counter_pair *)dev_err_stat_fatal_reg;
>> Why the casts?
>>
>>> + counter = XE_TILE_HW_ERR_UNKNOWN_FATAL;
>>> + break;
>>> + case HARDWARE_ERROR_NONFATAL:
>>> + errstat = (struct error_msg_counter_pair *)dev_err_stat_nonfatal_reg;
>>> + counter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL;
>>> + break;
>>> + case HARDWARE_ERROR_CORRECTABLE:
>>> + errstat = (struct error_msg_counter_pair *)dev_err_stat_correctable_reg;
>>> + counter = XE_TILE_HW_ERR_UNKNOWN_CORR;
>>> + break;
>>> + default:
>>> + return;
>>> + }
>>> +
>>> + spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
>>> + mmio = tile->primary_gt;
>>> + errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
>>> +
>>> + if (!errsrc) {
>>> + xe_tile_log_hw_err(tile, "DEV_ERR_STAT_REG_%s blank!\n", hw_err_str);
>>> + goto unlock;
>>> + }
>>> +
>>> + for_each_set_bit(errbit, &errsrc, REG_SIZE) {
>>> + if (errbit < DEV_ERR_STAT_MAX_ERROR_BIT) {
>>> + errmsg = errstat[errbit].errmsg;
>>> + errcntr = errstat[errbit].errcounter;
>>> + } else {
>>> + errmsg = "Undefined";
>>> + errcntr = counter;
>>> + }
>>> +
>>> + if (hw_err == HARDWARE_ERROR_CORRECTABLE)
>>> + xe_tile_log_hw_warn(tile, "%s %s error bit[%d] is set\n",
>>> + errmsg, hw_err_str, errbit);
>>> + else
>>> + xe_tile_log_hw_err(tile, "%s %s error bit[%d] is set\n",
>>> + errmsg, hw_err_str, errbit);
>>> +
>>> + tile->errors.hw[errcntr]++;
>>> + }
>>> +
>>> + xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
>>> +unlock:
>>> + spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>>> +}
>>> +
>>> +/*
>>> + * XE Platforms adds three Error bits to the Master Interrupt
>>> + * Register to support error handling. These three bits are
>>> + * used to convey the class of error:
>>> + * FATAL, NONFATAL, or CORRECTABLE.
>>> + *
>>> + * To process an interrupt:
>>> + * Determine source of error (IP block) by reading
>>> + * the Device Error Source Register (RW1C) that
>>> + * corresponds to the class of error being serviced
>>> + * and log the error.
>>> + */
>>> +static void
>>> +xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>>> +{
>>> + enum hardware_error hw_err;
>>> +
>>> + for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
>>> + if (master_ctl & XE_ERROR_IRQ(hw_err))
>>> + xe_hw_error_source_handler(tile, hw_err);
>>> + }
>>> +}
>>> +
>>> /*
>>> * Top-level interrupt handler for Xe_LP+ and beyond. These platforms have
>>> * a "master tile" interrupt register which must be consulted before the
>>> @@ -413,6 +630,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>> xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>>
>>> gt_irq_handler(tile, master_ctl, intr_dw, identity);
>>> + xe_hw_error_irq_handler(tile, master_ctl);
>>>
>>> /*
>>> * Display interrupts (including display backlight operations
>>> @@ -561,6 +779,8 @@ int xe_irq_install(struct xe_device *xe)
>>> return -EINVAL;
>>> }
>>>
>>> + update_valid_error_regs(xe);
>>> +
>>> xe->irq.enabled = true;
>>>
>>> xe_irq_reset(xe);
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/intel-xe/attachments/20230810/522f21d3/attachment-0001.htm>
More information about the Intel-xe
mailing list