[Intel-xe] [PATCH v2 1/4] drm/xe: Handle errors from various components.

Thu Aug 10 10:17:38 UTC 2023

Is there any new content here? This is what it looks like to me:

https://lore.kernel.org/r/78b0e2d2-3c4c-484f-bcd4-fa896adef534@intel.com

BR,
Jani

On Thu, 10 Aug 2023, "Ghimiray, Himal Prasad" <himal.prasad.ghimiray at intel.com> wrote:
> On 10-08-2023 15:01, Ghimiray, Himal Prasad wrote:
>>
>>
>> On 10-08-2023 13:24, Jani Nikula wrote:
>>> On Thu, 10 Aug 2023, Himal Prasad Ghimiray<himal.prasad.ghimiray at intel.com>  wrote:
>>>> The GFX device can generate numbers of classes of error under the new
>>>> infrastructure: correctable, non-fatal, and fatal errors.
>>>>
>>>> The non-fatal and fatal error classes distinguish between levels of
>>>> severity for uncorrectable errors. Driver will only handle logging
>>>> of errors and updating counters from various components within the
>>>> graphics device. Anything more will be handled at system level.
>>>>
>>>> For errors that will route as interrupts, three bits in the Master
>>>> Interrupt Register will be used to convey the class of error.
>>>>
>>>> For each class of error: Determine source of error (IP block) by reading
>>>> the Device Error Source Register (RW1C) that
>>>> corresponds to the class of error being serviced.
>>>>
>>>> Bspec: 50875, 53073, 53074, 53075
>>>>
>>>> Cc: Rodrigo Vivi<rodrigo.vivi at intel.com>
>>>> Cc: Aravind Iddamsetty<aravind.iddamsetty at intel.com>
>>>> Cc: Matthew Brost<matthew.brost at intel.com>
>>>> Cc: Matt Roper<matthew.d.roper at intel.com>
>>>> Cc: Joonas Lahtinen<joonas.lahtinen at linux.intel.com>
>>>> Cc: Jani Nikula<jani.nikula at intel.com>
>>>> Signed-off-by: Himal Prasad Ghimiray<himal.prasad.ghimiray at intel.com>
>>>> ---
>>>>   drivers/gpu/drm/xe/regs/xe_regs.h            |   7 +
>>>>   drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 108 +++++++++
>>>>   drivers/gpu/drm/xe/xe_device_types.h         |   6 +
>>>>   drivers/gpu/drm/xe/xe_irq.c                  | 220 +++++++++++++++++++
>>>>   4 files changed, 341 insertions(+)
>>>>   create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
>>>> index ec45b1ba9db1..9901e55fc89c 100644
>>>> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
>>>> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
>>>> @@ -87,7 +87,14 @@
>>>>   #define   GU_MISC_IRQ				REG_BIT(29)
>>>>   #define   DISPLAY_IRQ				REG_BIT(16)
>>>>   #define   GT_DW_IRQ(x)				REG_BIT(x)
>>>> +#define   XE_ERROR_IRQ(x)			REG_BIT(26 + (x))
>>>>   
>>>>   #define PVC_RP_STATE_CAP			XE_REG(0x281014)
>>>>   
>>>> +enum hardware_error {
>>>> +	HARDWARE_ERROR_CORRECTABLE = 0,
>>>> +	HARDWARE_ERROR_NONFATAL = 1,
>>>> +	HARDWARE_ERROR_FATAL = 2,
>>>> +	HARDWARE_ERROR_MAX,
>>>> +};
>>> This file is about registers. IMO enums belong somewhere else. Define
>>> hardware registers using macros.
>>
>> Hmm. Will look for better placeholder for this enum.
>>
>>>>   #endif
>>>> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>>> new file mode 100644
>>>> index 000000000000..fbb794b2f183
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>>> @@ -0,0 +1,108 @@
>>>> +/* SPDX-License-Identifier: MIT */
>>>> +/*
>>>> + * Copyright © 2023 Intel Corporation
>>>> + */
>>>> +#ifndef XE_TILE_ERROR_REGS_H_
>>>> +#define XE_TILE_ERROR_REGS_H_
>>>> +
>>>> +#include <linux/stddef.h>
>>>> +
>>>> +#define _DEV_ERR_STAT_NONFATAL                         0x100178
>>>> +#define _DEV_ERR_STAT_CORRECTABLE                      0x10017c
>>>> +#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
>>>> +								_DEV_ERR_STAT_CORRECTABLE, \
>>>> +								_DEV_ERR_STAT_NONFATAL))
>>>> +
>>>> +#define  DEV_ERR_STAT_MAX_ERROR_BIT         (21)
>>>> +
>>>> +/* Count of  Correctable and Uncorrectable errors reported on tile */
>>>> +enum xe_tile_hw_errors {g
>>>> +	XE_TILE_HW_ERR_GT_FATAL = 0,
>>>> +	XE_TILE_HW_ERR_SGGI_FATAL,
>>>> +	XE_TILE_HW_ERR_DISPLAY_FATAL,
>>>> +	XE_TILE_HW_ERR_SGDI_FATAL,
>>>> +	XE_TILE_HW_ERR_SGLI_FATAL,
>>>> +	XE_TILE_HW_ERR_SGUNIT_FATAL,
>>>> +	XE_TILE_HW_ERR_SGCI_FATAL,
>>>> +	XE_TILE_HW_ERR_GSC_FATAL,
>>>> +	XE_TILE_HW_ERR_SOC_FATAL,
>>>> +	XE_TILE_HW_ERR_MERT_FATAL,
>>>> +	XE_TILE_HW_ERR_SGMI_FATAL,
>>>> +	XE_TILE_HW_ERR_UNKNOWN_FATAL,
>>>> +	XE_TILE_HW_ERR_SGGI_NONFATAL,
>>>> +	XE_TILE_HW_ERR_DISPLAY_NONFATAL,
>>>> +	XE_TILE_HW_ERR_SGDI_NONFATAL,
>>>> +	XE_TILE_HW_ERR_SGLI_NONFATAL,
>>>> +	XE_TILE_HW_ERR_GT_NONFATAL,
>>>> +	XE_TILE_HW_ERR_SGUNIT_NONFATAL,
>>>> +	XE_TILE_HW_ERR_SGCI_NONFATAL,
>>>> +	XE_TILE_HW_ERR_GSC_NONFATAL,
>>>> +	XE_TILE_HW_ERR_SOC_NONFATAL,
>>>> +	XE_TILE_HW_ERR_MERT_NONFATAL,
>>>> +	XE_TILE_HW_ERR_SGMI_NONFATAL,
>>>> +	XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
>>>> +	XE_TILE_HW_ERR_GT_CORR,
>>>> +	XE_TILE_HW_ERR_DISPLAY_CORR,
>>>> +	XE_TILE_HW_ERR_SGUNIT_CORR,
>>>> +	XE_TILE_HW_ERR_GSC_CORR,
>>>> +	XE_TILE_HW_ERR_SOC_CORR,
>>>> +	XE_TILE_HW_ERR_UNKNOWN_CORR,
>>>> +};
>>> Ditto about enums and regs.
>> Will address.
>>>> +
>>>> +#define XE_TILE_HW_ERROR_MAX (XE_TILE_HW_ERR_UNKNOWN_CORR + 1)
>>> If it's an enum, adding that last in the enum does the trick.
>> Makes sense.
>>>> +
>>>> +#define PVC_DEV_ERR_STAT_FATAL_MASK \
>>>> +		(REG_BIT(0) | \
>>>> +		 REG_BIT(1) | \
>>>> +		 REG_BIT(8) | \
>>>> +		 REG_BIT(9) | \
>>>> +		 REG_BIT(13) | \
>>>> +		 REG_BIT(16) | \
>>>> +		 REG_BIT(20))
>>>> +
>>>> +#define PVC_DEV_ERR_STAT_NONFATAL_MASK \
>>>> +		(REG_BIT(0) | \
>>>> +		 REG_BIT(1) | \
>>>> +		 REG_BIT(8) | \
>>>> +		 REG_BIT(9) | \
>>>> +		 REG_BIT(13) | \
>>>> +		 REG_BIT(16) | \
>>>> +		 REG_BIT(20))
>>>> +
>>>> +#define PVC_DEV_ERR_STAT_CORRECTABLE_MASK \
>>>> +		(REG_BIT(0) | \
>>>> +		 REG_BIT(8))
>>>> +
>>>> +#define DG2_DEV_ERR_STAT_FATAL_MASK \
>>>> +		(REG_BIT(0) | \
>>>> +		 REG_BIT(4) | \
>>>> +		 REG_BIT(8) | \
>>>> +		 REG_BIT(12) | \
>>>> +		 REG_BIT(16))
>>>> +
>>>> +#define DG2_DEV_ERR_STAT_NONFATAL_MASK \
>>>> +		(REG_BIT(0) | \
>>>> +		 REG_BIT(4) | \
>>>> +		 REG_BIT(8) | \
>>>> +		 REG_BIT(12) | \
>>>> +		 REG_BIT(16) | \
>>>> +		 REG_BIT(20))
>>>> +
>>>> +#define DG2_DEV_ERR_STAT_CORRECTABLE_MASK \
>>>> +		(REG_BIT(0) | \
>>>> +		 REG_BIT(4) | \
>>>> +		 REG_BIT(8) | \
>>>> +		 REG_BIT(12) | \
>>>> +		 REG_BIT(16))
>>> Are the above supposed to match what's in xe_tile_hw_errors? Seems
>>> rather unmaintainable.
>> xe_tile_hw_errors contains superset of applicable platforms and mask determines
>> what are applicable bits for a platform.
>>>> +
>>>> +#define REG_SIZE 32
>>>> +
>>>> +#define xe_tile_log_hw_err(tile, fmt, ...) \
>>>> +	drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
>>>> +			    tile->id, ##__VA_ARGS__)
>>>> +
>>>> +#define xe_tile_log_hw_warn(tile, fmt, ...) \
>>>> +	drm_warn(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
>>>> +		 tile->id, ##__VA_ARGS__)
>>> Do we really want to keep adding new macros for all possible scenarios
>>> in the driver? This is getting out of hand.
>>>
>>> Where's HW_ERR defined?
>> include/linux/printk.h defines HW_ERR.
>>>> +
>>>> +#endif
>>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>>> index f84ecb976f5d..1335ba74981a 100644
>>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>>> @@ -16,6 +16,7 @@
>>>>   #include "xe_gt_types.h"
>>>>   #include "xe_platform_types.h"
>>>>   #include "xe_step_types.h"
>>>> +#include "regs/xe_tile_error_regs.h"
>>>>   
>>>>   #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>>>>   #include "ext/intel_device_info.h"
>>>> @@ -166,6 +167,11 @@ struct xe_tile {
>>>>   
>>>>   	/** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>>>>   	struct kobject *sysfs;
>>>> +
>>>> +	/** @tile_hw_errors: hardware errors reported for the tile */
>>>> +	struct tile_hw_errors {
>>>> +		unsigned long hw[XE_TILE_HW_ERROR_MAX];
>>> Even with the documentation comment, I have to look up the source code
>>> to realize this is the *number* of errors for each class.
>>>
>>> Maybe "count" is more informative than "hw".
>> Ok.
>>>> +	} errors;
>>>>   };
>>>>   
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>>> index 2022a5643e01..04a665faea23 100644
>>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>>> @@ -362,6 +362,223 @@ static void dg1_intr_enable(struct xe_device *xe, bool stall)
>>>>   		xe_mmio_read32(mmio, DG1_MSTR_TILE_INTR);
>>>>   }
>>>>   
>>>> +static const char *
>>>> +hardware_error_type_to_str(const enum hardware_error hw_err)
>>>> +{
>>>> +	switch (hw_err) {
>>>> +	case HARDWARE_ERROR_CORRECTABLE:
>>>> +		return "CORRECTABLE";
>>>> +	case HARDWARE_ERROR_NONFATAL:
>>>> +		return "NONFATAL";
>>>> +	case HARDWARE_ERROR_FATAL:
>>>> +		return "FATAL";
>>>> +	default:
>>>> +		return "UNKNOWN";
>>>> +	}
>>>> +}
>>>> +
>>>> +struct error_msg_counter_pair {
>>>> +	const char *errmsg;
>>>> +	int errcounter;
>>> Counter? Or type/class/whatever?
>>>
>>>> +};
>>>> +
>>>> +struct error_msg_counter_pair dev_err_stat_fatal_reg[] = {
>>>> +	{"GT",			XE_TILE_HW_ERR_GT_FATAL			/* Bit Pos 0 */},
>>> Does this again tie the enums and the bit positions together, similar to
>>> how the mask macros also do above?
>>>
>>> There needs to be a single point of truth for all of this.
>>>
>>> I think this needs a redesign.
>>
>> As commented above this is array which defines all valid bit positions 
>> irrespective of platform. Mask was
>>
>> to determine platform specific applicability.
>>
>>> BR,
>>> Jani.
>>>
>>>> +	{"SGGI Cmd Parity",	XE_TILE_HW_ERR_SGGI_FATAL		/* Bit Pos 1 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_FATAL		/* Bit Pos 4 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"GSC error",		XE_TILE_HW_ERR_GSC_FATAL		/* Bit Pos 8 */},
>>>> +	{"SGLI Cmd Parity",	XE_TILE_HW_ERR_SGLI_FATAL		/* Bit Pos 9 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_FATAL		/* Bit Pos 12 */},
>>>> +	{"SGCI Cmd Parity",	XE_TILE_HW_ERR_SGCI_FATAL		/* Bit Pos 13 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"SOC ERROR",		XE_TILE_HW_ERR_SOC_FATAL		/* Bit Pos 16 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>>> +	{"MERT Cmd Parity",	XE_TILE_HW_ERR_MERT_FATAL		/* Bit Pos 20 */},
>>>> +};
>>>> +
>>>> +struct error_msg_counter_pair dev_err_stat_nonfatal_reg[] = {
>>>> +	{"GT",			XE_TILE_HW_ERR_GT_NONFATAL		/* Bit Pos 0 */},
>>>> +	{"SGGI Data Parity",	XE_TILE_HW_ERR_SGGI_NONFATAL		/* Bit Pos 1 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_NONFATAL		/* Bit Pos 4 */},
>>>> +	{"SGDI Data Parity",	XE_TILE_HW_ERR_SGDI_NONFATAL		/* Bit Pos 5 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"GSC",			XE_TILE_HW_ERR_GSC_NONFATAL		/* Bit Pos 8 */},
>>>> +	{"SGLI Data Parity",	XE_TILE_HW_ERR_SGLI_NONFATAL		/* Bit Pos 9 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_NONFATAL		/* Bit Pos 12 */},
>>>> +	{"SGCI Data Parity",	XE_TILE_HW_ERR_SGCI_NONFATAL		/* Bit Pos 13 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"SOC",			XE_TILE_HW_ERR_SOC_NONFATAL		/* Bit Pos 16 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL		/* Bit Pos 17 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>>> +	{"MERT Data Parity",	XE_TILE_HW_ERR_MERT_NONFATAL		/* Bit Pos 20 */},
>>>> +};
>>>> +
>>>> +struct error_msg_counter_pair dev_err_stat_correctable_reg[] = {
>>>> +	{"GT",			XE_TILE_HW_ERR_GT_CORR			/* Bit Pos 0 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_CORR		/* Bit Pos 4 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"GSC",			XE_TILE_HW_ERR_GSC_CORR			/* Bit Pos 8 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_CORR		/* Bit Pos 12 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"SOC",			XE_TILE_HW_ERR_SOC_CORR			/* Bit Pos 16 */},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>>> +};
>>>> +
>>>> +static void update_valid_error_regs(struct xe_device *xe)
>>>> +{
>>>> +	unsigned long mask = 0;
>>>> +
>>>> +	u32 i;
>>>> +
>>>> +	if (xe->info.platform == XE_DG2) {
>>>> +		mask = ~(0 | DG2_DEV_ERR_STAT_FATAL_MASK);
>>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>>> +			dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
>>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
>>> Nope. For one thing, the arrays really should be static const, placed in
>>> rodata, and not mutable.
>> My Idea of keeping it mutable is to avoid multiple platform specific 
>> arrays.
>>> For another, if you have a platform with two or more different devices,
>>> whichever gets probed last clobbers the data.
>>
>> Thanks for pointing it out. I had missed this particular scenario.  I 
>> believe defining platform specific array is better
>>
>> because then we can ensure them to be immutable and we will not be 
>> required to have platform specific mask also.
>>
>> I believe this will help with maintainability too.
>>
>> BR
>>
>> Himal Ghimiray
>>
>>>> +
>>>> +		mask = ~(0 | DG2_DEV_ERR_STAT_NONFATAL_MASK);
>>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>>> +			dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
>>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
>>>> +
>>>> +		mask = ~(0 | DG2_DEV_ERR_STAT_CORRECTABLE_MASK);
>>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>>> +			dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
>>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
>>>> +	} else if (xe->info.platform == XE_PVC) {
>>>> +		mask = ~(0 | PVC_DEV_ERR_STAT_FATAL_MASK);
>>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>>> +			dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
>>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
>>>> +
>>>> +		mask = ~(0 | PVC_DEV_ERR_STAT_NONFATAL_MASK);
>>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>>> +			dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
>>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
>>>> +
>>>> +		mask = ~(0 | PVC_DEV_ERR_STAT_CORRECTABLE_MASK);
>>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>>> +			dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
>>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
>>>> +	}
>>>> +}
>>>> +
>>>> +static void
>>>> +xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>>> +{
>>>> +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>>> +	struct error_msg_counter_pair *errstat;
>>>> +	unsigned long errsrc;
>>>> +	unsigned long flags;
>>>> +	const char *errmsg;
>>>> +	struct xe_gt *mmio;
>>>> +	u32 counter;
>>>> +	u32 errcntr;
>>>> +	u32 errbit;
>>>> +
>>>> +	switch (hw_err) {
>>>> +	case HARDWARE_ERROR_FATAL:
>>>> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_fatal_reg;
>>> Why the casts?
>>>
>>>> +		counter = XE_TILE_HW_ERR_UNKNOWN_FATAL;
>>>> +		break;
>>>> +	case HARDWARE_ERROR_NONFATAL:
>>>> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_nonfatal_reg;
>>>> +		counter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL;
>>>> +		break;
>>>> +	case HARDWARE_ERROR_CORRECTABLE:
>>>> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_correctable_reg;
>>>> +		counter = XE_TILE_HW_ERR_UNKNOWN_CORR;
>>>> +		break;
>>>> +	default:
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
>>>> +	mmio = tile->primary_gt;
>>>> +	errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
>>>> +
>>>> +	if (!errsrc) {
>>>> +		xe_tile_log_hw_err(tile, "DEV_ERR_STAT_REG_%s blank!\n", hw_err_str);
>>>> +		goto unlock;
>>>> +	}
>>>> +
>>>> +	for_each_set_bit(errbit, &errsrc, REG_SIZE) {
>>>> +		if (errbit < DEV_ERR_STAT_MAX_ERROR_BIT) {
>>>> +			errmsg = errstat[errbit].errmsg;
>>>> +			errcntr = errstat[errbit].errcounter;
>>>> +		} else {
>>>> +			errmsg = "Undefined";
>>>> +			errcntr = counter;
>>>> +		}
>>>> +
>>>> +		if (hw_err == HARDWARE_ERROR_CORRECTABLE)
>>>> +			xe_tile_log_hw_warn(tile, "%s %s error bit[%d] is set\n",
>>>> +					    errmsg, hw_err_str, errbit);
>>>> +		else
>>>> +			xe_tile_log_hw_err(tile, "%s %s error bit[%d] is set\n",
>>>> +					   errmsg, hw_err_str, errbit);
>>>> +
>>>> +		tile->errors.hw[errcntr]++;
>>>> +	}
>>>> +
>>>> +	xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
>>>> +unlock:
>>>> +	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>>>> +}
>>>> +
>>>> +/*
>>>> + * XE Platforms adds three Error bits to the Master Interrupt
>>>> + * Register to support error handling. These three bits are
>>>> + * used to convey the class of error:
>>>> + * FATAL, NONFATAL, or CORRECTABLE.
>>>> + *
>>>> + * To process an interrupt:
>>>> + *       Determine source of error (IP block) by reading
>>>> + *	 the Device Error Source Register (RW1C) that
>>>> + *	 corresponds to the class of error being serviced
>>>> + *	 and log the error.
>>>> + */
>>>> +static void
>>>> +xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>>>> +{
>>>> +	enum hardware_error hw_err;
>>>> +
>>>> +	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
>>>> +		if (master_ctl & XE_ERROR_IRQ(hw_err))
>>>> +			xe_hw_error_source_handler(tile, hw_err);
>>>> +	}
>>>> +}
>>>> +
>>>>   /*
>>>>    * Top-level interrupt handler for Xe_LP+ and beyond.  These platforms have
>>>>    * a "master tile" interrupt register which must be consulted before the
>>>> @@ -413,6 +630,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>>   		xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>>>   
>>>>   		gt_irq_handler(tile, master_ctl, intr_dw, identity);
>>>> +		xe_hw_error_irq_handler(tile, master_ctl);
>>>>   
>>>>   		/*
>>>>   		 * Display interrupts (including display backlight operations
>>>> @@ -561,6 +779,8 @@ int xe_irq_install(struct xe_device *xe)
>>>>   		return -EINVAL;
>>>>   	}
>>>>   
>>>> +	update_valid_error_regs(xe);
>>>> +
>>>>   	xe->irq.enabled = true;
>>>>   
>>>>   	xe_irq_reset(xe);

-- 
Jani Nikula, Intel Open Source Graphics Center