[Intel-xe] [PATCH v2 1/4] drm/xe: Handle errors from various components.

Thu Aug 10 07:54:56 UTC 2023

On Thu, 10 Aug 2023, Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com> wrote:
> The GFX device can generate a number of classes of error under the new
> infrastructure: correctable, non-fatal, and fatal errors.
>
> The non-fatal and fatal error classes distinguish between levels of
> severity for uncorrectable errors. Driver will only handle logging
> of errors and updating counters from various components within the
> graphics device. Anything more will be handled at system level.
>
> For errors that will route as interrupts, three bits in the Master
> Interrupt Register will be used to convey the class of error.
>
> For each class of error: Determine source of error (IP block) by reading
> the Device Error Source Register (RW1C) that
> corresponds to the class of error being serviced.
>
> Bspec: 50875, 53073, 53074, 53075
>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Matt Roper <matthew.d.roper at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Jani Nikula <jani.nikula at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_regs.h            |   7 +
>  drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 108 +++++++++
>  drivers/gpu/drm/xe/xe_device_types.h         |   6 +
>  drivers/gpu/drm/xe/xe_irq.c                  | 220 +++++++++++++++++++
>  4 files changed, 341 insertions(+)
>  create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
> index ec45b1ba9db1..9901e55fc89c 100644
> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
> @@ -87,7 +87,14 @@
>  #define   GU_MISC_IRQ				REG_BIT(29)
>  #define   DISPLAY_IRQ				REG_BIT(16)
>  #define   GT_DW_IRQ(x)				REG_BIT(x)
> +#define   XE_ERROR_IRQ(x)			REG_BIT(26 + (x))
>  
>  #define PVC_RP_STATE_CAP			XE_REG(0x281014)
>  
> +enum hardware_error {
> +	HARDWARE_ERROR_CORRECTABLE = 0,
> +	HARDWARE_ERROR_NONFATAL = 1,
> +	HARDWARE_ERROR_FATAL = 2,
> +	HARDWARE_ERROR_MAX,
> +};

This file is about registers. IMO enums belong somewhere else. Define
hardware registers using macros.

>  #endif
> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> new file mode 100644
> index 000000000000..fbb794b2f183
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> @@ -0,0 +1,108 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2023 Intel Corporation
> + */
> +#ifndef XE_TILE_ERROR_REGS_H_
> +#define XE_TILE_ERROR_REGS_H_
> +
> +#include <linux/stddef.h>
> +
> +#define _DEV_ERR_STAT_NONFATAL                         0x100178
> +#define _DEV_ERR_STAT_CORRECTABLE                      0x10017c
> +#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
> +								_DEV_ERR_STAT_CORRECTABLE, \
> +								_DEV_ERR_STAT_NONFATAL))
> +
> +#define  DEV_ERR_STAT_MAX_ERROR_BIT         (21)
> +
> +/* Count of  Correctable and Uncorrectable errors reported on tile */
> +enum xe_tile_hw_errors {
> +	XE_TILE_HW_ERR_GT_FATAL = 0,
> +	XE_TILE_HW_ERR_SGGI_FATAL,
> +	XE_TILE_HW_ERR_DISPLAY_FATAL,
> +	XE_TILE_HW_ERR_SGDI_FATAL,
> +	XE_TILE_HW_ERR_SGLI_FATAL,
> +	XE_TILE_HW_ERR_SGUNIT_FATAL,
> +	XE_TILE_HW_ERR_SGCI_FATAL,
> +	XE_TILE_HW_ERR_GSC_FATAL,
> +	XE_TILE_HW_ERR_SOC_FATAL,
> +	XE_TILE_HW_ERR_MERT_FATAL,
> +	XE_TILE_HW_ERR_SGMI_FATAL,
> +	XE_TILE_HW_ERR_UNKNOWN_FATAL,
> +	XE_TILE_HW_ERR_SGGI_NONFATAL,
> +	XE_TILE_HW_ERR_DISPLAY_NONFATAL,
> +	XE_TILE_HW_ERR_SGDI_NONFATAL,
> +	XE_TILE_HW_ERR_SGLI_NONFATAL,
> +	XE_TILE_HW_ERR_GT_NONFATAL,
> +	XE_TILE_HW_ERR_SGUNIT_NONFATAL,
> +	XE_TILE_HW_ERR_SGCI_NONFATAL,
> +	XE_TILE_HW_ERR_GSC_NONFATAL,
> +	XE_TILE_HW_ERR_SOC_NONFATAL,
> +	XE_TILE_HW_ERR_MERT_NONFATAL,
> +	XE_TILE_HW_ERR_SGMI_NONFATAL,
> +	XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
> +	XE_TILE_HW_ERR_GT_CORR,
> +	XE_TILE_HW_ERR_DISPLAY_CORR,
> +	XE_TILE_HW_ERR_SGUNIT_CORR,
> +	XE_TILE_HW_ERR_GSC_CORR,
> +	XE_TILE_HW_ERR_SOC_CORR,
> +	XE_TILE_HW_ERR_UNKNOWN_CORR,
> +};

Ditto about enums and regs.

> +
> +#define XE_TILE_HW_ERROR_MAX (XE_TILE_HW_ERR_UNKNOWN_CORR + 1)

If it's an enum, adding that last in the enum does the trick.

> +
> +#define PVC_DEV_ERR_STAT_FATAL_MASK \
> +		(REG_BIT(0) | \
> +		 REG_BIT(1) | \
> +		 REG_BIT(8) | \
> +		 REG_BIT(9) | \
> +		 REG_BIT(13) | \
> +		 REG_BIT(16) | \
> +		 REG_BIT(20))
> +
> +#define PVC_DEV_ERR_STAT_NONFATAL_MASK \
> +		(REG_BIT(0) | \
> +		 REG_BIT(1) | \
> +		 REG_BIT(8) | \
> +		 REG_BIT(9) | \
> +		 REG_BIT(13) | \
> +		 REG_BIT(16) | \
> +		 REG_BIT(20))
> +
> +#define PVC_DEV_ERR_STAT_CORRECTABLE_MASK \
> +		(REG_BIT(0) | \
> +		 REG_BIT(8))
> +
> +#define DG2_DEV_ERR_STAT_FATAL_MASK \
> +		(REG_BIT(0) | \
> +		 REG_BIT(4) | \
> +		 REG_BIT(8) | \
> +		 REG_BIT(12) | \
> +		 REG_BIT(16))
> +
> +#define DG2_DEV_ERR_STAT_NONFATAL_MASK \
> +		(REG_BIT(0) | \
> +		 REG_BIT(4) | \
> +		 REG_BIT(8) | \
> +		 REG_BIT(12) | \
> +		 REG_BIT(16) | \
> +		 REG_BIT(20))
> +
> +#define DG2_DEV_ERR_STAT_CORRECTABLE_MASK \
> +		(REG_BIT(0) | \
> +		 REG_BIT(4) | \
> +		 REG_BIT(8) | \
> +		 REG_BIT(12) | \
> +		 REG_BIT(16))

Are the above supposed to match what's in xe_tile_hw_errors? Seems
rather unmaintainable.

> +
> +#define REG_SIZE 32
> +
> +#define xe_tile_log_hw_err(tile, fmt, ...) \
> +	drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
> +			    tile->id, ##__VA_ARGS__)
> +
> +#define xe_tile_log_hw_warn(tile, fmt, ...) \
> +	drm_warn(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
> +		 tile->id, ##__VA_ARGS__)

Do we really want to keep adding new macros for all possible scenarios
in the driver? This is getting out of hand.

Where's HW_ERR defined?

> +
> +#endif
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index f84ecb976f5d..1335ba74981a 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -16,6 +16,7 @@
>  #include "xe_gt_types.h"
>  #include "xe_platform_types.h"
>  #include "xe_step_types.h"
> +#include "regs/xe_tile_error_regs.h"
>  
>  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>  #include "ext/intel_device_info.h"
> @@ -166,6 +167,11 @@ struct xe_tile {
>  
>  	/** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>  	struct kobject *sysfs;
> +
> +	/** @tile_hw_errors: hardware errors reported for the tile */
> +	struct tile_hw_errors {
> +		unsigned long hw[XE_TILE_HW_ERROR_MAX];

Even with the documentation comment, I have to look up the source code
to realize this is the *number* of errors for each class.

Maybe "count" is more informative than "hw".

> +	} errors;
>  };
>  
>  /**
> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
> index 2022a5643e01..04a665faea23 100644
> --- a/drivers/gpu/drm/xe/xe_irq.c
> +++ b/drivers/gpu/drm/xe/xe_irq.c
> @@ -362,6 +362,223 @@ static void dg1_intr_enable(struct xe_device *xe, bool stall)
>  		xe_mmio_read32(mmio, DG1_MSTR_TILE_INTR);
>  }
>  
> +static const char *
> +hardware_error_type_to_str(const enum hardware_error hw_err)
> +{
> +	switch (hw_err) {
> +	case HARDWARE_ERROR_CORRECTABLE:
> +		return "CORRECTABLE";
> +	case HARDWARE_ERROR_NONFATAL:
> +		return "NONFATAL";
> +	case HARDWARE_ERROR_FATAL:
> +		return "FATAL";
> +	default:
> +		return "UNKNOWN";
> +	}
> +}
> +
> +struct error_msg_counter_pair {
> +	const char *errmsg;
> +	int errcounter;

Counter? Or type/class/whatever?

> +};
> +
> +struct error_msg_counter_pair dev_err_stat_fatal_reg[] = {
> +	{"GT",			XE_TILE_HW_ERR_GT_FATAL			/* Bit Pos 0 */},

Does this again tie the enums and the bit positions together, similar to
how the mask macros also do above?

There needs to be a single point of truth for all of this.

I think this needs a redesign.

BR,
Jani.

> +	{"SGGI Cmd Parity",	XE_TILE_HW_ERR_SGGI_FATAL		/* Bit Pos 1 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_FATAL		/* Bit Pos 4 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"GSC error",		XE_TILE_HW_ERR_GSC_FATAL		/* Bit Pos 8 */},
> +	{"SGLI Cmd Parity",	XE_TILE_HW_ERR_SGLI_FATAL		/* Bit Pos 9 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_FATAL		/* Bit Pos 12 */},
> +	{"SGCI Cmd Parity",	XE_TILE_HW_ERR_SGCI_FATAL		/* Bit Pos 13 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"SOC ERROR",		XE_TILE_HW_ERR_SOC_FATAL		/* Bit Pos 16 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_FATAL},
> +	{"MERT Cmd Parity",	XE_TILE_HW_ERR_MERT_FATAL		/* Bit Pos 20 */},
> +};
> +
> +struct error_msg_counter_pair dev_err_stat_nonfatal_reg[] = {
> +	{"GT",			XE_TILE_HW_ERR_GT_NONFATAL		/* Bit Pos 0 */},
> +	{"SGGI Data Parity",	XE_TILE_HW_ERR_SGGI_NONFATAL		/* Bit Pos 1 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_NONFATAL		/* Bit Pos 4 */},
> +	{"SGDI Data Parity",	XE_TILE_HW_ERR_SGDI_NONFATAL		/* Bit Pos 5 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"GSC",			XE_TILE_HW_ERR_GSC_NONFATAL		/* Bit Pos 8 */},
> +	{"SGLI Data Parity",	XE_TILE_HW_ERR_SGLI_NONFATAL		/* Bit Pos 9 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_NONFATAL		/* Bit Pos 12 */},
> +	{"SGCI Data Parity",	XE_TILE_HW_ERR_SGCI_NONFATAL		/* Bit Pos 13 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"SOC",			XE_TILE_HW_ERR_SOC_NONFATAL		/* Bit Pos 16 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL		/* Bit Pos 17 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
> +	{"MERT Data Parity",	XE_TILE_HW_ERR_MERT_NONFATAL		/* Bit Pos 20 */},
> +};
> +
> +struct error_msg_counter_pair dev_err_stat_correctable_reg[] = {
> +	{"GT",			XE_TILE_HW_ERR_GT_CORR			/* Bit Pos 0 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"DISPLAY",		XE_TILE_HW_ERR_DISPLAY_CORR		/* Bit Pos 4 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"GSC",			XE_TILE_HW_ERR_GSC_CORR			/* Bit Pos 8 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"SGUNIT",		XE_TILE_HW_ERR_SGUNIT_CORR		/* Bit Pos 12 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"SOC",			XE_TILE_HW_ERR_SOC_CORR			/* Bit Pos 16 */},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +	{"Undefined",		XE_TILE_HW_ERR_UNKNOWN_CORR},
> +};
> +
> +static void update_valid_error_regs(struct xe_device *xe)
> +{
> +	unsigned long mask = 0;
> +
> +	u32 i;
> +
> +	if (xe->info.platform == XE_DG2) {
> +		mask = ~(0 | DG2_DEV_ERR_STAT_FATAL_MASK);
> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
> +			dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};

Nope. For one thing, the arrays really should be static const, placed in
rodata, and not mutable.

For another, if you have a platform with two or more different devices,
whichever gets probed last clobbers the data.

> +
> +		mask = ~(0 | DG2_DEV_ERR_STAT_NONFATAL_MASK);
> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
> +			dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
> +
> +		mask = ~(0 | DG2_DEV_ERR_STAT_CORRECTABLE_MASK);
> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
> +			dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
> +	} else if (xe->info.platform == XE_PVC) {
> +		mask = ~(0 | PVC_DEV_ERR_STAT_FATAL_MASK);
> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
> +			dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
> +
> +		mask = ~(0 | PVC_DEV_ERR_STAT_NONFATAL_MASK);
> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
> +			dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
> +
> +		mask = ~(0 | PVC_DEV_ERR_STAT_CORRECTABLE_MASK);
> +		for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
> +			dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
> +			{.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
> +	}
> +}
> +
> +static void
> +xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
> +{
> +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
> +	struct error_msg_counter_pair *errstat;
> +	unsigned long errsrc;
> +	unsigned long flags;
> +	const char *errmsg;
> +	struct xe_gt *mmio;
> +	u32 counter;
> +	u32 errcntr;
> +	u32 errbit;
> +
> +	switch (hw_err) {
> +	case HARDWARE_ERROR_FATAL:
> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_fatal_reg;

Why the casts?

> +		counter = XE_TILE_HW_ERR_UNKNOWN_FATAL;
> +		break;
> +	case HARDWARE_ERROR_NONFATAL:
> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_nonfatal_reg;
> +		counter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL;
> +		break;
> +	case HARDWARE_ERROR_CORRECTABLE:
> +		errstat = (struct error_msg_counter_pair *)dev_err_stat_correctable_reg;
> +		counter = XE_TILE_HW_ERR_UNKNOWN_CORR;
> +		break;
> +	default:
> +		return;
> +	}
> +
> +	spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
> +	mmio = tile->primary_gt;
> +	errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
> +
> +	if (!errsrc) {
> +		xe_tile_log_hw_err(tile, "DEV_ERR_STAT_REG_%s blank!\n", hw_err_str);
> +		goto unlock;
> +	}
> +
> +	for_each_set_bit(errbit, &errsrc, REG_SIZE) {
> +		if (errbit < DEV_ERR_STAT_MAX_ERROR_BIT) {
> +			errmsg = errstat[errbit].errmsg;
> +			errcntr = errstat[errbit].errcounter;
> +		} else {
> +			errmsg = "Undefined";
> +			errcntr = counter;
> +		}
> +
> +		if (hw_err == HARDWARE_ERROR_CORRECTABLE)
> +			xe_tile_log_hw_warn(tile, "%s %s error bit[%d] is set\n",
> +					    errmsg, hw_err_str, errbit);
> +		else
> +			xe_tile_log_hw_err(tile, "%s %s error bit[%d] is set\n",
> +					   errmsg, hw_err_str, errbit);
> +
> +		tile->errors.hw[errcntr]++;
> +	}
> +
> +	xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
> +unlock:
> +	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
> +}
> +
> +/*
> + * XE platforms add three error bits to the Master Interrupt
> + * Register to support error handling. These three bits are
> + * used to convey the class of error:
> + * FATAL, NONFATAL, or CORRECTABLE.
> + *
> + * To process an interrupt:
> + *       Determine source of error (IP block) by reading
> + *	 the Device Error Source Register (RW1C) that
> + *	 corresponds to the class of error being serviced
> + *	 and log the error.
> + */
> +static void
> +xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
> +{
> +	enum hardware_error hw_err;
> +
> +	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
> +		if (master_ctl & XE_ERROR_IRQ(hw_err))
> +			xe_hw_error_source_handler(tile, hw_err);
> +	}
> +}
> +
>  /*
>   * Top-level interrupt handler for Xe_LP+ and beyond.  These platforms have
>   * a "master tile" interrupt register which must be consulted before the
> @@ -413,6 +630,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>  		xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>  
>  		gt_irq_handler(tile, master_ctl, intr_dw, identity);
> +		xe_hw_error_irq_handler(tile, master_ctl);
>  
>  		/*
>  		 * Display interrupts (including display backlight operations
> @@ -561,6 +779,8 @@ int xe_irq_install(struct xe_device *xe)
>  		return -EINVAL;
>  	}
>  
> +	update_valid_error_regs(xe);
> +
>  	xe->irq.enabled = true;
>  
>  	xe_irq_reset(xe);

-- 
Jani Nikula, Intel Open Source Graphics Center