[PATCH 01/10] drm/xe: Handle errors from various components.

Michal Wajdeczko michal.wajdeczko at intel.com
Wed Jul 30 09:08:29 UTC 2025



On 7/30/2025 7:48 AM, Aravind Iddamsetty wrote:
> From: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> 
> The GFX device reports two classes of errors: uncorrectable and
> correctable. Depending on the severity uncorrectable errors are
> further classified as non fatal and fatal. Driver will only handle
> logging of errors and updating counters from various components within
> the graphics device. Anything more will be handled at system level.
> 
> Correctable and NonFatal errors are reported as interrupts, bits in
> the Master Interrupt Register will be used to convey the class of error.
> Determine source of error (IP block) by reading the Device Error Source
> Register (RW1C) that corresponds to the class of error being serviced.
> 
> Fatal errors are reported as PCIe errors. When a PCIe error is asserted,
> the OS will perform a device warm reset which causes the driver to
> reload. The error registers are sticky and the values are maintained
> through a warm reset. We read these registers during the boot flow of the
> driver and increment the respective error counters.
> 
> Bspec: 50875, 53073, 53074, 53075, 53076
> 
> v6
> - Limit the implementation to DG2 and PVC.
> - Limit the tile level logging to only PVC.
> - Use xarray instead of array for error counters.
> - Squash the fatal error reporting patch with this patch.
> - use drm_dbg instead of drm_info to dump register values.
> - use XE_HW_ERR_UNSPEC for error which are reported by leaf registers.
> - use source_typeoferror_errorname convention for enum and error logging.
> - Clean up unused enums; as there are no display-supported RAS errors,
> categorize them as unknown.
> - Don't make xe_assign_hw_err_regs static.
> - Use err_name_index_pair instead of err_msg_cntr_pair.(Aravind)
> 
> v7
> - Ci fix
> 
> v8
> - Avoid unnecessary write if reg is empty in case of DG2.
> 
> v9
> - For reg being blank print error for DG2 too.
> - Maintain order of headers.
> - Make XE_HW_ERR_UNSPEC 0. (Aravind)
> 
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Matt Roper <matthew.d.roper at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Jani Nikula <jani.nikula at intel.com>
> Reviewed-by: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
>  drivers/gpu/drm/xe/Makefile                  |   1 +
>  drivers/gpu/drm/xe/regs/xe_irq_regs.h        |   1 +
>  drivers/gpu/drm/xe/regs/xe_regs.h            |   3 +
>  drivers/gpu/drm/xe/regs/xe_tile_error_regs.h |  13 +
>  drivers/gpu/drm/xe/xe_device.c               |  13 +
>  drivers/gpu/drm/xe/xe_device_types.h         |  10 +
>  drivers/gpu/drm/xe/xe_hw_error.c             | 258 +++++++++++++++++++
>  drivers/gpu/drm/xe/xe_hw_error.h             |  50 ++++
>  drivers/gpu/drm/xe/xe_irq.c                  |   1 +
>  drivers/gpu/drm/xe/xe_tile.c                 |   2 +
>  10 files changed, 352 insertions(+)
>  create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
>  create mode 100644 drivers/gpu/drm/xe/xe_hw_error.c
>  create mode 100644 drivers/gpu/drm/xe/xe_hw_error.h
> 
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index 42c6ca5b3f76..80eecd35e807 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -82,6 +82,7 @@ xe-y += xe_bb.o \
>  	xe_hw_engine.o \
>  	xe_hw_engine_class_sysfs.o \
>  	xe_hw_engine_group.o \
> +	xe_hw_error.o \
>  	xe_hw_fence.o \
>  	xe_irq.o \
>  	xe_lrc.o \
> diff --git a/drivers/gpu/drm/xe/regs/xe_irq_regs.h b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
> index 13635e4331d4..086ec7584b1a 100644
> --- a/drivers/gpu/drm/xe/regs/xe_irq_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_irq_regs.h
> @@ -18,6 +18,7 @@
>  #define GFX_MSTR_IRQ				XE_REG(0x190010, XE_REG_OPTION_VF)
>  #define   MASTER_IRQ				REG_BIT(31)
>  #define   GU_MISC_IRQ				REG_BIT(29)
> +#define   XE_ERROR_IRQ(x)			REG_BIT(26 + (x))
>  #define   DISPLAY_IRQ				REG_BIT(16)
>  #define   I2C_IRQ				REG_BIT(12)
>  #define   GT_DW_IRQ(x)				REG_BIT(x)
> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
> index 1926b4044314..00900d3821f7 100644
> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
> @@ -9,6 +9,9 @@
>  
>  #define SOC_BASE				0x280000
>  
> +#define DEV_PCIEERR_STATUS			XE_REG(0x100180)
> +#define   DEV_PCIEERR_IS_FATAL(x)		REG_BIT(x * 4 + 2)
> +
>  #define GU_CNTL_PROTECTED			XE_REG(0x10100C)
>  #define   DRIVERINT_FLR_DIS			REG_BIT(31)
>  
> diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> new file mode 100644
> index 000000000000..ba5480fb2789
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2023 Intel Corporation
> + */
> +#ifndef XE_TILE_ERROR_REGS_H_
> +#define XE_TILE_ERROR_REGS_H_
> +
> +#define _DEV_ERR_STAT_NONFATAL                         0x100178
> +#define _DEV_ERR_STAT_CORRECTABLE                      0x10017c
> +#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
> +								_DEV_ERR_STAT_CORRECTABLE, \
> +								_DEV_ERR_STAT_NONFATAL))
> +#endif
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index d04a0ae018e6..e0625fa5b1ca 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -875,6 +875,8 @@ int xe_device_probe(struct xe_device *xe)
>  			return err;
>  	}
>  
> +	xe_init_hw_errors(xe);
> +
>  	err = xe_irq_install(xe);
>  	if (err)
>  		return err;
> @@ -952,6 +954,15 @@ int xe_device_probe(struct xe_device *xe)
>  	return err;
>  }
>  
> +static void xe_hw_error_fini(struct xe_device *xe)
> +{
> +	struct xe_tile *tile;
> +	int i;
> +
> +	for_each_tile(tile, xe, i)
> +		xa_destroy(&tile->errors.hw_error);
> +}
> +
>  void xe_device_remove(struct xe_device *xe)
>  {
>  	xe_display_unregister(xe);
> @@ -961,6 +972,8 @@ void xe_device_remove(struct xe_device *xe)
>  	drm_dev_unplug(&xe->drm);
>  
>  	xe_bo_pci_dev_remove_all(xe);
> +
> +	xe_hw_error_fini(xe);

this should be a devm action registered by xe_init_hw_errors()

>  }
>  
>  void xe_device_shutdown(struct xe_device *xe)
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 38c8329b4d2c..233c2751d09f 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -14,6 +14,7 @@
>  
>  #include "xe_devcoredump_types.h"
>  #include "xe_heci_gsc.h"
> +#include "xe_hw_error.h"
>  #include "xe_lmtt_types.h"
>  #include "xe_memirq_types.h"
>  #include "xe_oa_types.h"
> @@ -206,6 +207,11 @@ struct xe_tile {
>  
>  	/** @debugfs: debugfs directory associated with this tile */
>  	struct dentry *debugfs;
> +
> +	/** @errors: count of hardware errors reported for the tile */
> +	struct tile_hw_errors {
> +		struct xarray hw_error;
> +	} errors;
>  };
>  
>  /**
> @@ -575,6 +581,10 @@ struct xe_device {
>  	 */
>  	atomic64_t global_total_pages;
>  #endif
> +	/** @hw_err_regs: list of hw error regs*/
> +	struct hardware_errors_regs {
> +		const struct err_name_index_pair *dev_err_stat[HARDWARE_ERROR_MAX];
> +	} hw_err_regs;
>  
>  	/* private: */
>  
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> new file mode 100644
> index 000000000000..84830ad81813
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -0,0 +1,258 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2023 Intel Corporation

shouldn't this be 2025? (same for every new file in this patch)

> + */
> +
> +#include "xe_hw_error.h"
> +
> +#include "regs/xe_regs.h"
> +#include "regs/xe_irq_regs.h"
> +#include "regs/xe_tile_error_regs.h"
> +#include "xe_device.h"
> +#include "xe_mmio.h"
> +
> +static const char *
> +hardware_error_type_to_str(const enum hardware_error hw_err)
> +{
> +	switch (hw_err) {
> +	case HARDWARE_ERROR_CORRECTABLE:
> +		return "CORRECTABLE";
> +	case HARDWARE_ERROR_NONFATAL:
> +		return "NONFATAL";
> +	case HARDWARE_ERROR_FATAL:
> +		return "FATAL";
> +	default:
> +		return "UNKNOWN";
> +	}
> +}
> +
> +static const struct err_name_index_pair dg2_err_stat_fatal_reg[] = {
> +	[0]         = {"GT",			XE_HW_ERR_TILE_UNSPEC},
> +	[1 ... 3]   = {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[4 ... 7]   = {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[8]         = {"GSC",			XE_HW_ERR_TILE_UNSPEC},
> +	[9 ... 11]  = {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[12]        = {"SGUNIT",		XE_HW_ERR_TILE_FATAL_SGUNIT},
> +	[13 ... 15] = {"Undefined",             XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[16]        = {"SOC",			XE_HW_ERR_TILE_UNSPEC},
> +	[17 ... 31] = {"Undefined",             XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +};
> +
> +static const struct err_name_index_pair dg2_err_stat_nonfatal_reg[] = {
> +	[0]         = {"GT",			XE_HW_ERR_TILE_UNSPEC},
> +	[1 ... 3]   = {"Undefined",		XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[4 ... 7]   = {"Undefined",		XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[8]         = {"GSC",			XE_HW_ERR_TILE_UNSPEC},
> +	[9 ... 11]  = {"Undefined",		XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[12]        = {"SGUNIT",		XE_HW_ERR_TILE_NONFATAL_SGUNIT},
> +	[13 ... 15] = {"Undefined",             XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[16]        = {"SOC",			XE_HW_ERR_TILE_UNSPEC},
> +	[17 ... 19] = {"Undefined",             XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[20]        = {"MERT",			XE_HW_ERR_TILE_NONFATAL_MERT},
> +	[21 ... 31] = {"Undefined",             XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +};
> +
> +static const struct err_name_index_pair dg2_err_stat_correctable_reg[] = {
> +	[0]         = {"GT",			XE_HW_ERR_TILE_UNSPEC},
> +	[1 ... 3]   = {"Undefined",		XE_HW_ERR_TILE_CORR_UNKNOWN},
> +	[4 ... 7]   = {"Undefined",		XE_HW_ERR_TILE_CORR_UNKNOWN},
> +	[8]         = {"GSC",			XE_HW_ERR_TILE_UNSPEC},
> +	[9 ... 11]  = {"Undefined",		XE_HW_ERR_TILE_CORR_UNKNOWN},
> +	[12]        = {"SGUNIT",		XE_HW_ERR_TILE_CORR_SGUNIT},
> +	[13 ... 15] = {"Undefined",             XE_HW_ERR_TILE_CORR_UNKNOWN},
> +	[16]        = {"SOC",			XE_HW_ERR_TILE_UNSPEC},
> +	[17 ... 31] = {"Undefined",             XE_HW_ERR_TILE_CORR_UNKNOWN},
> +};
> +
> +static const struct err_name_index_pair pvc_err_stat_fatal_reg[] = {
> +	[0]         =  {"GT",			XE_HW_ERR_TILE_UNSPEC},
> +	[1]         =  {"SGGI Cmd Parity",	XE_HW_ERR_TILE_FATAL_SGGI},
> +	[2 ... 7]   =  {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[8]         =  {"GSC",			XE_HW_ERR_TILE_UNSPEC},
> +	[9]         =  {"SGLI Cmd Parity",	XE_HW_ERR_TILE_FATAL_SGLI},
> +	[10 ... 12] =  {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[13]        =  {"SGCI Cmd Parity",	XE_HW_ERR_TILE_FATAL_SGCI},
> +	[14 ... 15] =  {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[16]        =  {"SOC ERROR",		XE_HW_ERR_TILE_UNSPEC},
> +	[17 ... 19] =  {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +	[20]        =  {"MERT Cmd Parity",	XE_HW_ERR_TILE_FATAL_MERT},
> +	[21 ... 31] =  {"Undefined",		XE_HW_ERR_TILE_FATAL_UNKNOWN},
> +};
> +
> +static const struct err_name_index_pair pvc_err_stat_nonfatal_reg[] = {
> +	[0]         =  {"GT",			XE_HW_ERR_TILE_UNSPEC},
> +	[1]         =  {"SGGI Data Parity",	XE_HW_ERR_TILE_NONFATAL_SGGI},
> +	[2 ... 7]   =  {"Undefined",		XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[8]         =  {"GSC",			XE_HW_ERR_TILE_UNSPEC},
> +	[9]         =  {"SGLI Data Parity",	XE_HW_ERR_TILE_NONFATAL_SGLI},
> +	[10 ... 12] =  {"Undefined",		XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[13]        =  {"SGCI Data Parity",	XE_HW_ERR_TILE_NONFATAL_SGCI},
> +	[14 ... 15] =  {"Undefined",		XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[16]        =  {"SOC",			XE_HW_ERR_TILE_UNSPEC},
> +	[17 ... 19] =  {"Undefined",		XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +	[20]        =  {"MERT Data Parity",	XE_HW_ERR_TILE_NONFATAL_MERT},
> +	[21 ... 31] =  {"Undefined",            XE_HW_ERR_TILE_NONFATAL_UNKNOWN},
> +};
> +
> +static const struct err_name_index_pair pvc_err_stat_correctable_reg[] = {
> +	[0]         =  {"GT",			XE_HW_ERR_TILE_UNSPEC},
> +	[1 ... 7]   =  {"Undefined",		XE_HW_ERR_TILE_CORR_UNKNOWN},
> +	[8]         =  {"GSC",			XE_HW_ERR_TILE_UNSPEC},
> +	[9 ... 31]  =  {"Undefined",		XE_HW_ERR_TILE_CORR_UNKNOWN},
> +};
> +
> +static void xe_assign_hw_err_regs(struct xe_device *xe)
> +{
> +	const struct err_name_index_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
> +
> +	/* Error reporting is supported only for DG2 and PVC currently. */
> +	if (xe->info.platform == XE_DG2) {
> +		dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dg2_err_stat_correctable_reg;
> +		dev_err_stat[HARDWARE_ERROR_NONFATAL] = dg2_err_stat_nonfatal_reg;
> +		dev_err_stat[HARDWARE_ERROR_FATAL] = dg2_err_stat_fatal_reg;
> +	}
> +

else

> +	if (xe->info.platform == XE_PVC) {
> +		dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_correctable_reg;
> +		dev_err_stat[HARDWARE_ERROR_NONFATAL] = pvc_err_stat_nonfatal_reg;
> +		dev_err_stat[HARDWARE_ERROR_FATAL] = pvc_err_stat_fatal_reg;
> +	}

and I guess the 'if' ladder shall start with newest platforms first

> +}
> +
> +static bool xe_platform_has_ras(struct xe_device *xe)
> +{
> +	if (xe->info.platform == XE_PVC || xe->info.platform == XE_DG2)
> +		return true;
> +
> +	return false;

this could be one line

	return xe->info.platform == XE_PVC || xe->info.platform == XE_DG2;

but likely it should start with

	if (IS_SRIOV_VF(xe))
		return false;

> +}
> +
> +static void
> +xe_update_hw_error_cnt(struct drm_device *drm, struct xarray *hw_error, unsigned long index)
> +{
> +	unsigned long flags;
> +	void *entry;
> +
> +	entry = xa_load(hw_error, index);
> +	entry = xa_mk_value(xa_to_value(entry) + 1);
> +
> +	xa_lock_irqsave(hw_error, flags);
> +	if (xa_is_err(__xa_store(hw_error, index, entry, GFP_ATOMIC)))
> +		drm_err_ratelimited(drm,
> +				    HW_ERR "Error reported by index %ld is lost\n", index);
> +	xa_unlock_irqrestore(hw_error, flags);
> +}
> +
> +static void
> +xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
> +{
> +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
> +	const struct hardware_errors_regs *err_regs;
> +	const struct err_name_index_pair *errstat;
> +	unsigned long errsrc;
> +	unsigned long flags;
> +	const char *name;
> +	u32 indx;
> +	u32 errbit;
> +
> +	if (!xe_platform_has_ras(tile_to_xe(tile)))
> +		return;
> +
> +	spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
> +	err_regs = &tile_to_xe(tile)->hw_err_regs;
> +	errstat = err_regs->dev_err_stat[hw_err];
> +	errsrc = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err));
> +	if (!errsrc) {
> +		drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
> +				    "TILE%d reported DEV_ERR_STAT_REG_%s blank!\n",
> +				    tile->id, hw_err_str);
> +		goto unlock;
> +	}
> +
> +	if (tile_to_xe(tile)->info.platform != XE_DG2)
> +		drm_dbg(&tile_to_xe(tile)->drm, HW_ERR
> +			"TILE%d reported DEV_ERR_STAT_REG_%s=0x%08lx\n",
> +			tile->id, hw_err_str, errsrc);
> +
> +	for_each_set_bit(errbit, &errsrc, XE_RAS_REG_SIZE) {
> +		name = errstat[errbit].name;
> +		indx = errstat[errbit].index;
> +
> +		if (hw_err == HARDWARE_ERROR_CORRECTABLE &&
> +		    tile_to_xe(tile)->info.platform != XE_DG2)
> +			drm_warn(&tile_to_xe(tile)->drm,
> +				 HW_ERR "TILE%d reported %s %s error, bit[%d] is set\n",
> +				 tile->id, name, hw_err_str, errbit);
> +
> +		else if (tile_to_xe(tile)->info.platform != XE_DG2)
> +			drm_err_ratelimited(&tile_to_xe(tile)->drm,
> +					    HW_ERR "TILE%d reported %s %s error, bit[%d] is set\n",
> +					    tile->id, name, hw_err_str, errbit);
> +
> +		if (indx != XE_HW_ERR_TILE_UNSPEC)
> +			xe_update_hw_error_cnt(&tile_to_xe(tile)->drm,
> +					       &tile->errors.hw_error, indx);
> +	}
> +
> +	xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
> +unlock:
> +	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
> +}
> +
> +/*
> + * XE Platforms adds three Error bits to the Master Interrupt
> + * Register to support error handling. These three bits are
> + * used to convey the class of error:
> + * FATAL, NONFATAL, or CORRECTABLE.
> + *
> + * To process an interrupt:
> + *       Determine source of error (IP block) by reading
> + *	 the Device Error Source Register (RW1C) that
> + *	 corresponds to the class of error being serviced
> + *	 and log the error.
> + */
> +void
> +xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
> +{
> +	enum hardware_error hw_err;
> +
> +	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
> +		if (master_ctl & XE_ERROR_IRQ(hw_err))
> +			xe_hw_error_source_handler(tile, hw_err);
> +	}
> +}
> +
> +/*
> + * xe_process_hw_errors - checks for the occurrence of HW errors
> + *
> + * Fatal will result in a card warm reset and driver will be reloaded.
> + * This checks for the HW Errors that might have occurred in the
> + * previous boot of the driver.
> + */
> +static void xe_process_hw_errors(struct xe_device *xe)
> +{
> +	struct xe_mmio *root_mmio = xe_root_tile_mmio(xe);
> +
> +	u32 dev_pcieerr_status, master_ctl;
> +	struct xe_tile *tile;
> +	int i;
> +
> +	dev_pcieerr_status = xe_mmio_read32(root_mmio, DEV_PCIEERR_STATUS);
> +
> +	for_each_tile(tile, xe, i) {
> +		if (dev_pcieerr_status & DEV_PCIEERR_IS_FATAL(i))
> +			xe_hw_error_source_handler(tile, HARDWARE_ERROR_FATAL);
> +
> +		master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ);
> +		xe_hw_error_irq_handler(tile, master_ctl);
> +		xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl);
> +	}
> +	if (dev_pcieerr_status)
> +		xe_mmio_write32(root_mmio, DEV_PCIEERR_STATUS, dev_pcieerr_status);
> +}
> +

missing kernel-doc for public function

> +void xe_init_hw_errors(struct xe_device *xe)
> +{

shouldn't you check xe_platform_has_ras() here?

> +	xe_assign_hw_err_regs(xe);
> +	xe_process_hw_errors(xe);
> +}
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
> new file mode 100644
> index 000000000000..398e2a7f2ac6
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2023 Intel Corporation
> + */
> +#ifndef XE_HW_ERRORS_H_
> +#define XE_HW_ERRORS_H_
> +
> +#include <linux/stddef.h>
> +#include <linux/types.h>
> +
> +#define XE_RAS_REG_SIZE 32
> +
> +/* Error categories reported by hardware */
> +enum hardware_error {
> +	HARDWARE_ERROR_CORRECTABLE = 0,
> +	HARDWARE_ERROR_NONFATAL = 1,
> +	HARDWARE_ERROR_FATAL = 2,
> +	HARDWARE_ERROR_MAX,
> +};
> +
> +/* Count of Correctable and Uncorrectable errors reported on tile */
> +enum xe_tile_hw_errors {
> +	XE_HW_ERR_TILE_UNSPEC = 0,
> +	XE_HW_ERR_TILE_FATAL_SGGI,
> +	XE_HW_ERR_TILE_FATAL_SGLI,
> +	XE_HW_ERR_TILE_FATAL_SGUNIT,
> +	XE_HW_ERR_TILE_FATAL_SGCI,
> +	XE_HW_ERR_TILE_FATAL_MERT,
> +	XE_HW_ERR_TILE_FATAL_UNKNOWN,
> +	XE_HW_ERR_TILE_NONFATAL_SGGI,
> +	XE_HW_ERR_TILE_NONFATAL_SGLI,
> +	XE_HW_ERR_TILE_NONFATAL_SGUNIT,
> +	XE_HW_ERR_TILE_NONFATAL_SGCI,
> +	XE_HW_ERR_TILE_NONFATAL_MERT,
> +	XE_HW_ERR_TILE_NONFATAL_UNKNOWN,
> +	XE_HW_ERR_TILE_CORR_SGUNIT,
> +	XE_HW_ERR_TILE_CORR_UNKNOWN,
> +};
> +
> +struct err_name_index_pair {
> +	const char *name;
> +	const u32 index;
> +};

shouldn't above defs be in xe_hw_error_types.h ?

> +
> +struct xe_device;
> +struct xe_tile;
> +
> +void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
> +void xe_init_hw_errors(struct xe_device *xe);

naming seems wrong:

void xe_hw_error_init(struct xe_device *xe);
void xe_hw_error_irq_handler(struct xe_tile *tile, u32 master_ctl);

> +#endif
> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
> index 5df5b8c2a3e4..1e9cfb8bb85d 100644
> --- a/drivers/gpu/drm/xe/xe_irq.c
> +++ b/drivers/gpu/drm/xe/xe_irq.c
> @@ -468,6 +468,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>  		xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>  
>  		gt_irq_handler(tile, master_ctl, intr_dw, identity);
> +		xe_hw_error_irq_handler(tile, master_ctl);
>  
>  		/*
>  		 * Display interrupts (including display backlight operations
> diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> index d49ba3401963..b00c517e5559 100644
> --- a/drivers/gpu/drm/xe/xe_tile.c
> +++ b/drivers/gpu/drm/xe/xe_tile.c
> @@ -91,6 +91,8 @@
>   */
>  static int xe_tile_alloc(struct xe_tile *tile)
>  {
> +	xa_init(&tile->errors.hw_error);

shouldn't this be part of xe_init_hw_errors() ?

> +
>  	tile->mem.ggtt = xe_ggtt_alloc(tile);
>  	if (!tile->mem.ggtt)
>  		return -ENOMEM;



More information about the Intel-xe mailing list