[Intel-xe] [PATCH v2 1/4] drm/xe: Handle errors from various components.

Thu Aug 10 09:38:03 UTC 2023

On 10-08-2023 15:01, Ghimiray, Himal Prasad wrote:
>
>
> On 10-08-2023 13:24, Jani Nikula wrote:
>> On Thu, 10 Aug 2023, Himal Prasad Ghimiray<himal.prasad.ghimiray at intel.com>  wrote:
>>> The GFX device can generate several classes of errors under the new
>>> infrastructure: correctable, non-fatal, and fatal errors.
>>>
>>> The non-fatal and fatal error classes distinguish between levels of
>>> severity for uncorrectable errors. Driver will only handle logging
>>> of errors and updating counters from various components within the
>>> graphics device. Anything more will be handled at system level.
>>>
>>> For errors that will route as interrupts, three bits in the Master
>>> Interrupt Register will be used to convey the class of error.
>>>
>>> For each class of error: Determine source of error (IP block) by reading
>>> the Device Error Source Register (RW1C) that
>>> corresponds to the class of error being serviced.
>>>
>>> Bspec: 50875, 53073, 53074, 53075
>>>
>>> Cc: Rodrigo Vivi<rodrigo.vivi at intel.com>
>>> Cc: Aravind Iddamsetty<aravind.iddamsetty at intel.com>
>>> Cc: Matthew Brost<matthew.brost at intel.com>
>>> Cc: Matt Roper<matthew.d.roper at intel.com>
>>> Cc: Joonas Lahtinen<joonas.lahtinen at linux.intel.com>
>>> Cc: Jani Nikula<jani.nikula at intel.com>
>>> Signed-off-by: Himal Prasad Ghimiray<himal.prasad.ghimiray at intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/regs/xe_regs.h            |   7 +
>>>   drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 108 +++++++++
>>>   drivers/gpu/drm/xe/xe_device_types.h         |   6 +
>>>   drivers/gpu/drm/xe/xe_irq.c                  | 220 +++++++++++++++++++
>>>   4 files changed, 341 insertions(+)
>>>   create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>>
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> index ec45b1ba9db1..9901e55fc89c 100644
>>> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
>>> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
>>> @@ -87,7 +87,14 @@
>>>   #define   GU_MISC_IRQ				REG_BIT(29)
>>>   #define   DISPLAY_IRQ				REG_BIT(16)
>>>   #define   GT_DW_IRQ(x)				REG_BIT(x)
>>> +#define   XE_ERROR_IRQ(x)			REG_BIT(26 + (x))
>>>   
>>>   #define PVC_RP_STATE_CAP			XE_REG(0x281014)
>>>   
>>> +enum hardware_error {
>>> +	HARDWARE_ERROR_CORRECTABLE = 0,
>>> +	HARDWARE_ERROR_NONFATAL = 1,
>>> +	HARDWARE_ERROR_FATAL = 2,
>>> +	HARDWARE_ERROR_MAX,
>>> +};
>> This file is about registers. IMO enums belong somewhere else. Define
>> hardware registers using macros.
>
> Hmm. Will look for a better place for this enum.
>
>>>   #endif
>>> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> new file mode 100644
>>> index 000000000000..fbb794b2f183
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>>> @@ -0,0 +1,108 @@
>>> +/* SPDX-License-Identifier: MIT */
>>> +/*
>>> + * Copyright © 2023 Intel Corporation
>>> + */
>>> +#ifndef XE_TILE_ERROR_REGS_H_
>>> +#define XE_TILE_ERROR_REGS_H_
>>> +
>>> +#include <linux/stddef.h>
>>> +
>>> +#define _DEV_ERR_STAT_NONFATAL                         0x100178
>>> +#define _DEV_ERR_STAT_CORRECTABLE                      0x10017c
>>> +#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
>>> +								_DEV_ERR_STAT_CORRECTABLE, \
>>> +								_DEV_ERR_STAT_NONFATAL))
>>> +
>>> +#define  DEV_ERR_STAT_MAX_ERROR_BIT         (21)
>>> +
>>> +/* Count of  Correctable and Uncorrectable errors reported on tile */
>>> +enum xe_tile_hw_errors {
>>> +	XE_TILE_HW_ERR_GT_FATAL = 0,
>>> +	XE_TILE_HW_ERR_SGGI_FATAL,
>>> +	XE_TILE_HW_ERR_DISPLAY_FATAL,
>>> +	XE_TILE_HW_ERR_SGDI_FATAL,
>>> +	XE_TILE_HW_ERR_SGLI_FATAL,
>>> +	XE_TILE_HW_ERR_SGUNIT_FATAL,
>>> +	XE_TILE_HW_ERR_SGCI_FATAL,
>>> +	XE_TILE_HW_ERR_GSC_FATAL,
>>> +	XE_TILE_HW_ERR_SOC_FATAL,
>>> +	XE_TILE_HW_ERR_MERT_FATAL,
>>> +	XE_TILE_HW_ERR_SGMI_FATAL,
>>> +	XE_TILE_HW_ERR_UNKNOWN_FATAL,
>>> +	XE_TILE_HW_ERR_SGGI_NONFATAL,
>>> +	XE_TILE_HW_ERR_DISPLAY_NONFATAL,
>>> +	XE_TILE_HW_ERR_SGDI_NONFATAL,
>>> +	XE_TILE_HW_ERR_SGLI_NONFATAL,
>>> +	XE_TILE_HW_ERR_GT_NONFATAL,
>>> +	XE_TILE_HW_ERR_SGUNIT_NONFATAL,
>>> +	XE_TILE_HW_ERR_SGCI_NONFATAL,
>>> +	XE_TILE_HW_ERR_GSC_NONFATAL,
>>> +	XE_TILE_HW_ERR_SOC_NONFATAL,
>>> +	XE_TILE_HW_ERR_MERT_NONFATAL,
>>> +	XE_TILE_HW_ERR_SGMI_NONFATAL,
>>> +	XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
>>> +	XE_TILE_HW_ERR_GT_CORR,
>>> +	XE_TILE_HW_ERR_DISPLAY_CORR,
>>> +	XE_TILE_HW_ERR_SGUNIT_CORR,
>>> +	XE_TILE_HW_ERR_GSC_CORR,
>>> +	XE_TILE_HW_ERR_SOC_CORR,
>>> +	XE_TILE_HW_ERR_UNKNOWN_CORR,
>>> +};
>> Ditto about enums and regs.
> Will address.
>>> +
>>> +#define XE_TILE_HW_ERROR_MAX (XE_TILE_HW_ERR_UNKNOWN_CORR + 1)
>> If it's an enum, adding that last in the enum does the trick.
> Makes sense.
>>> +
>>> +#define PVC_DEV_ERR_STAT_FATAL_MASK \
>>> +		(REG_BIT(0) | \
>>> +		 REG_BIT(1) | \
>>> +		 REG_BIT(8) | \
>>> +		 REG_BIT(9) | \
>>> +		 REG_BIT(13) | \
>>> +		 REG_BIT(16) | \
>>> +		 REG_BIT(20))
>>> +
>>> +#define PVC_DEV_ERR_STAT_NONFATAL_MASK \
>>> +		(REG_BIT(0) | \
>>> +		 REG_BIT(1) | \
>>> +		 REG_BIT(8) | \
>>> +		 REG_BIT(9) | \
>>> +		 REG_BIT(13) | \
>>> +		 REG_BIT(16) | \
>>> +		 REG_BIT(20))
>>> +
>>> +#define PVC_DEV_ERR_STAT_CORRECTABLE_MASK \
>>> +		(REG_BIT(0) | \
>>> +		 REG_BIT(8))
>>> +
>>> +#define DG2_DEV_ERR_STAT_FATAL_MASK \
>>> +		(REG_BIT(0) | \
>>> +		 REG_BIT(4) | \
>>> +		 REG_BIT(8) | \
>>> +		 REG_BIT(12) | \
>>> +		 REG_BIT(16))
>>> +
>>> +#define DG2_DEV_ERR_STAT_NONFATAL_MASK \
>>> +		(REG_BIT(0) | \
>>> +		 REG_BIT(4) | \
>>> +		 REG_BIT(8) | \
>>> +		 REG_BIT(12) | \
>>> +		 REG_BIT(16) | \
>>> +		 REG_BIT(20))
>>> +
>>> +#define DG2_DEV_ERR_STAT_CORRECTABLE_MASK \
>>> +		(REG_BIT(0) | \
>>> +		 REG_BIT(4) | \
>>> +		 REG_BIT(8) | \
>>> +		 REG_BIT(12) | \
>>> +		 REG_BIT(16))
>> Are the above supposed to match what's in xe_tile_hw_errors? Seems
>> rather unmaintainable.
> xe_tile_hw_errors contains the superset of errors across all supported platforms,
> and the masks determine which bits are applicable to a given platform.
>>> +
>>> +#define REG_SIZE 32
>>> +
>>> +#define xe_tile_log_hw_err(tile, fmt, ...) \
>>> +	drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
>>> +			    tile->id, ##__VA_ARGS__)
>>> +
>>> +#define xe_tile_log_hw_warn(tile, fmt, ...) \
>>> +	drm_warn(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
>>> +		 tile->id, ##__VA_ARGS__)
>> Do we really want to keep adding new macros for all possible scenarios
>> in the driver? This is getting out of hand.
>>
>> Where's HW_ERR defined?
> include/linux/printk.h defines HW_ERR.
>>> +
>>> +#endif
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>> index f84ecb976f5d..1335ba74981a 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -16,6 +16,7 @@
>>>   #include "xe_gt_types.h"
>>>   #include "xe_platform_types.h"
>>>   #include "xe_step_types.h"
>>> +#include "regs/xe_tile_error_regs.h"
>>>   
>>>   #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>>>   #include "ext/intel_device_info.h"
>>> @@ -166,6 +167,11 @@ struct xe_tile {
>>>   
>>>   	/** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>>>   	struct kobject *sysfs;
>>> +
>>> +	/** @tile_hw_errors: hardware errors reported for the tile */
>>> +	struct tile_hw_errors {
>>> +		unsigned long hw[XE_TILE_HW_ERROR_MAX];
>> Even with the documentation comment, I have to look up the source code
>> to realize this is the *number* of errors for each class.
>>
>> Maybe "count" is more informative than "hw".
> Ok.
>>> +	} errors;
>>>   };
>>>   
>>>   /**
>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>> index 2022a5643e01..04a665faea23 100644
>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>> @@ -362,6 +362,223 @@ static void dg1_intr_enable(struct xe_device *xe, bool stall)
>>>   		xe_mmio_read32(mmio, DG1_MSTR_TILE_INTR);
>>>   }
>>>   
>>> +static const char *
>>> +hardware_error_type_to_str(const enum hardware_error hw_err)
>>> +{
>>> +	switch (hw_err) {
>>> +	case HARDWARE_ERROR_CORRECTABLE:
>>> +		return "CORRECTABLE";
>>> +	case HARDWARE_ERROR_NONFATAL:
>>> +		return "NONFATAL";
>>> +	case HARDWARE_ERROR_FATAL:
>>> +		return "FATAL";
>>> +	default:
>>> +		return "UNKNOWN";
>>> +	}
>>> +}
>>> +
>>> +struct error_msg_counter_pair {
>>> +	const char *errmsg;
>>> +	int errcounter;
>> Counter? Or type/class/whatever?
>>
>>> +};
>>> +
>>> +struct error_msg_counter_pair dev_err_stat_fatal_reg[] = {
>>> +	{"GT",			XE_TILE_HW_ERR_GT_FATAL			/* Bit Pos 0 */},
>> Does this again tie the enums and the bit positions together, similar to
>> how the mask macros also do above?
>>
>> There needs to be a single point of truth for all of this.
>>
>> I think this needs a redesign.
>
> As commented above, this is an array that defines all valid bit positions
> irrespective of platform. The mask was meant to determine
> platform-specific applicability.
>
>> BR,
>> Jani.
>>
>>> +	{"SGGI Cmd Parity",	XE_TILE_HW_ERR_SGGI_FATAL		/* Bit Pos 1 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_FATAL		/* Bit Pos 4 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"GSC error",		XE_TILE_HW_ERR_GSC_FATAL		/* Bit Pos 8 */},
>>> +	{"SGLI Cmd Parity",	XE_TILE_HW_ERR_SGLI_FATAL		/* Bit Pos 9 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_FATAL		/* Bit Pos 12 */},
>>> +	{"SGCI Cmd Parity",	XE_TILE_HW_ERR_SGCI_FATAL		/* Bit Pos 13 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"SOC ERROR",		XE_TILE_HW_ERR_SOC_FATAL		/* Bit Pos 16 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
>>> +	{"MERT Cmd Parity",	XE_TILE_HW_ERR_MERT_FATAL		/* Bit Pos 20 */},
>>> +};
>>> +
>>> +struct error_msg_counter_pair dev_err_stat_nonfatal_reg[] = {
>>> +	{"GT",			XE_TILE_HW_ERR_GT_NONFATAL		/* Bit Pos 0 */},
>>> +	{"SGGI Data Parity",	XE_TILE_HW_ERR_SGGI_NONFATAL		/* Bit Pos 1 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_NONFATAL		/* Bit Pos 4 */},
>>> +	{"SGDI Data Parity",	XE_TILE_HW_ERR_SGDI_NONFATAL		/* Bit Pos 5 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"GSC",			XE_TILE_HW_ERR_GSC_NONFATAL		/* Bit Pos 8 */},
>>> +	{"SGLI Data Parity",	XE_TILE_HW_ERR_SGLI_NONFATAL		/* Bit Pos 9 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_NONFATAL		/* Bit Pos 12 */},
>>> +	{"SGCI Data Parity",	XE_TILE_HW_ERR_SGCI_NONFATAL		/* Bit Pos 13 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"SOC",			XE_TILE_HW_ERR_SOC_NONFATAL		/* Bit Pos 16 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL		/* Bit Pos 17 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
>>> +	{"MERT Data Parity",	XE_TILE_HW_ERR_MERT_NONFATAL		/* Bit Pos 20 */},
>>> +};
>>> +
>>> +struct error_msg_counter_pair dev_err_stat_correctable_reg[] = {
>>> +	{"GT",			XE_TILE_HW_ERR_GT_CORR			/* Bit Pos 0 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_CORR		/* Bit Pos 4 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"GSC",			XE_TILE_HW_ERR_GSC_CORR			/* Bit Pos 8 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_CORR		/* Bit Pos 12 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"SOC",			XE_TILE_HW_ERR_SOC_CORR			/* Bit Pos 16 */},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
>>> +};
>>> +
>>> +static void update_valid_error_regs(struct xe_device *xe)
>>> +{
>>> +	unsigned long mask = 0;
>>> +
>>> +	u32 i;
>>> +
>>> +	if (xe->info.platform == XE_DG2) {
>>> +		mask = ~(0 | DG2_DEV_ERR_STAT_FATAL_MASK);
>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> +			dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
>> Nope. For one thing, the arrays really should be static const, placed in
>> rodata, and not mutable.
> My idea in keeping it mutable was to avoid multiple platform-specific
> arrays.
>> For another, if you have a platform with two or more different devices,
>> whichever gets probed last clobbers the data.
>
> Thanks for pointing it out. I had missed this particular scenario. I
> believe defining platform-specific arrays is better, because then we
> can ensure they are immutable and we will no longer need the
> platform-specific masks.
>
> I believe this will help with maintainability too.
>
> BR
>
> Himal Ghimiray
>
>>> +
>>> +		mask = ~(0 | DG2_DEV_ERR_STAT_NONFATAL_MASK);
>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> +			dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
>>> +
>>> +		mask = ~(0 | DG2_DEV_ERR_STAT_CORRECTABLE_MASK);
>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> +			dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
>>> +	} else if (xe->info.platform == XE_PVC) {
>>> +		mask = ~(0 | PVC_DEV_ERR_STAT_FATAL_MASK);
>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> +			dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
>>> +
>>> +		mask = ~(0 | PVC_DEV_ERR_STAT_NONFATAL_MASK);
>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> +			dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
>>> +
>>> +		mask = ~(0 | PVC_DEV_ERR_STAT_CORRECTABLE_MASK);
>>> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
>>> +			dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
>>> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
>>> +	}
>>> +}
>>> +
>>> +static void
>>> +xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>> +{
>>> +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>> +	struct error_msg_counter_pair *errstat;
>>> +	unsigned long errsrc;
>>> +	unsigned long flags;
>>> +	const char *errmsg;
>>> +	struct xe_gt *mmio;
>>> +	u32 counter;
>>> +	u32 errcntr;
>>> +	u32 errbit;
>>> +
>>> +	switch (hw_err) {
>>> +	case HARDWARE_ERROR_FATAL:
>>> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_fatal_reg;
>> Why the casts?
>>
>>> +		counter = XE_TILE_HW_ERR_UNKNOWN_FATAL;
>>> +		break;
>>> +	case HARDWARE_ERROR_NONFATAL:
>>> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_nonfatal_reg;
>>> +		counter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL;
>>> +		break;
>>> +	case HARDWARE_ERROR_CORRECTABLE:
>>> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_correctable_reg;
>>> +		counter = XE_TILE_HW_ERR_UNKNOWN_CORR;
>>> +		break;
>>> +	default:
>>> +		return;
>>> +	}
>>> +
>>> +	spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
>>> +	mmio = tile->primary_gt;
>>> +	errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
>>> +
>>> +	if (!errsrc) {
>>> +		xe_tile_log_hw_err(tile, "DEV_ERR_STAT_REG_%s blank!\n", hw_err_str);
>>> +		goto unlock;
>>> +	}
>>> +
>>> +	for_each_set_bit(errbit, &errsrc, REG_SIZE) {
>>> +		if (errbit < DEV_ERR_STAT_MAX_ERROR_BIT) {
>>> +			errmsg = errstat[errbit].errmsg;
>>> +			errcntr = errstat[errbit].errcounter;
>>> +		} else {
>>> +			errmsg = "Undefined";
>>> +			errcntr = counter;
>>> +		}
>>> +
>>> +		if (hw_err == HARDWARE_ERROR_CORRECTABLE)
>>> +			xe_tile_log_hw_warn(tile, "%s %s error bit[%d] is set\n",
>>> +					    errmsg, hw_err_str, errbit);
>>> +		else
>>> +			xe_tile_log_hw_err(tile, "%s %s error bit[%d] is set\n",
>>> +					   errmsg, hw_err_str, errbit);
>>> +
>>> +		tile->errors.hw[errcntr]++;
>>> +	}
>>> +
>>> +	xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
>>> +unlock:
>>> +	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>>> +}
>>> +
>>> +/*
>>> + * XE Platforms adds three Error bits to the Master Interrupt
>>> + * Register to support error handling. These three bits are
>>> + * used to convey the class of error:
>>> + * FATAL, NONFATAL, or CORRECTABLE.
>>> + *
>>> + * To process an interrupt:
>>> + *       Determine source of error (IP block) by reading
>>> + *	 the Device Error Source Register (RW1C) that
>>> + *	 corresponds to the class of error being serviced
>>> + *	 and log the error.
>>> + */
>>> +static void
>>> +xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>>> +{
>>> +	enum hardware_error hw_err;
>>> +
>>> +	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
>>> +		if (master_ctl & XE_ERROR_IRQ(hw_err))
>>> +			xe_hw_error_source_handler(tile, hw_err);
>>> +	}
>>> +}
>>> +
>>>   /*
>>>    * Top-level interrupt handler for Xe_LP+ and beyond.  These platforms have
>>>    * a "master tile" interrupt register which must be consulted before the
>>> @@ -413,6 +630,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>   		xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>>>   
>>>   		gt_irq_handler(tile, master_ctl, intr_dw, identity);
>>> +		xe_hw_error_irq_handler(tile, master_ctl);
>>>   
>>>   		/*
>>>   		 * Display interrupts (including display backlight operations
>>> @@ -561,6 +779,8 @@ int xe_irq_install(struct xe_device *xe)
>>>   		return -EINVAL;
>>>   	}
>>>   
>>> +	update_valid_error_regs(xe);
>>> +
>>>   	xe->irq.enabled = true;
>>>   
>>>   	xe_irq_reset(xe);
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/intel-xe/attachments/20230810/522f21d3/attachment-0001.htm>