[Intel-xe] [PATCH 2/4] drm/xe/ras: Log the GT hw errors.

Fri Apr 28 08:00:04 UTC 2023


> -----Original Message-----
> From: Roper, Matthew D <matthew.d.roper at intel.com>
> Sent: 25 April 2023 05:53
> To: Ghimiray, Himal Prasad <himal.prasad.ghimiray at intel.com>
> Cc: intel-xe at lists.freedesktop.org
> Subject: Re: [Intel-xe] [PATCH 2/4] drm/xe/ras: Log the GT hw errors.
> 
> On Thu, Apr 06, 2023 at 02:56:29PM +0530, Himal Prasad Ghimiray wrote:
> > From: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
> >
> > Count the CORRECTABLE and FATAL GT hardware errors as signaled by
> > relevant interrupt and respective registers.
> >
> > For non relevant interrupts count them as driver interrupt error.
> >
> > For platform supporting error vector registers count and report the
> > respective vector errors.
> >
> > Co-authored-by: Himal Prasad Ghimiray
> > <himal.prasad.ghimiray at intel.com>
> > Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
> > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> > ---
> >  drivers/gpu/drm/xe/regs/xe_regs.h    |  77 ++++++-
> >  drivers/gpu/drm/xe/xe_device_types.h |   2 +
> >  drivers/gpu/drm/xe/xe_gt.c           |  29 +++
> >  drivers/gpu/drm/xe/xe_gt_types.h     |  43 ++++
> >  drivers/gpu/drm/xe/xe_irq.c          | 332 ++++++++++++++++++++++++---
> >  drivers/gpu/drm/xe/xe_pci.c          |   3 +
> >  6 files changed, 453 insertions(+), 33 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h
> > b/drivers/gpu/drm/xe/regs/xe_regs.h
> > index dff74b093d4e..b3d35d0c5a77 100644
> > --- a/drivers/gpu/drm/xe/regs/xe_regs.h
> > +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
> > @@ -122,14 +122,50 @@ enum hardware_error {
> >  	HARDWARE_ERROR_MAX,
> >  };
> >
> > +#define DEV_PCIEERR_STATUS              _MMIO(0x100180)
> > +#define DEV_PCIEERR_IS_FATAL(x)         (REG_BIT(2) << (x * 4))
> 
> Just move the math inside the REG_BIT parameter.  I.e.
> 
>         #define DEV_PCIEERR_IS_FATAL(x)   REG_BIT(x * 4 + 2)
> 
> >  #define _DEV_ERR_STAT_FATAL             0x100174
> >  #define _DEV_ERR_STAT_NONFATAL          0x100178
> >  #define _DEV_ERR_STAT_CORRECTABLE       0x10017c
> >  #define DEV_ERR_STAT_REG(x)             _MMIO(_PICK_EVEN((x), \
> >
> 	_DEV_ERR_STAT_CORRECTABLE, \
> >
> 	_DEV_ERR_STAT_NONFATAL))
> > +
> >  #define  DEV_ERR_STAT_GT_ERROR          REG_BIT(0)
> >
> > +enum gt_vctr_registers {
> > +	ERR_STAT_GT_VCTR0 = 0,
> > +	ERR_STAT_GT_VCTR1,
> > +	ERR_STAT_GT_VCTR2,
> > +	ERR_STAT_GT_VCTR3,
> > +	ERR_STAT_GT_VCTR4,
> > +	ERR_STAT_GT_VCTR5,
> > +	ERR_STAT_GT_VCTR6,
> > +	ERR_STAT_GT_VCTR7,
> > +};
> > +
> > +#define ERR_STAT_GT_COR_VCTR_LEN        (4)
> > +#define _ERR_STAT_GT_COR_VCTR_0         0x1002a0
> > +#define _ERR_STAT_GT_COR_VCTR_1         0x1002a4
> > +#define _ERR_STAT_GT_COR_VCTR_2         0x1002a8
> > +#define _ERR_STAT_GT_COR_VCTR_3         0x1002ac
> > +#define ERR_STAT_GT_COR_VCTR_REG(x)     _MMIO(_PICK_EVEN((x), \
> > +
> 	_ERR_STAT_GT_COR_VCTR_0, \
> > +
> 	_ERR_STAT_GT_COR_VCTR_1))
> > +
> > +#define ERR_STAT_GT_FATAL_VCTR_LEN      (8)
> > +#define _ERR_STAT_GT_FATAL_VCTR_0       0x100260
> > +#define _ERR_STAT_GT_FATAL_VCTR_1       0x100264
> > +#define _ERR_STAT_GT_FATAL_VCTR_2       0x100268
> > +#define _ERR_STAT_GT_FATAL_VCTR_3       0x10026c
> > +#define _ERR_STAT_GT_FATAL_VCTR_4       0x100270
> > +#define _ERR_STAT_GT_FATAL_VCTR_5       0x100274
> > +#define _ERR_STAT_GT_FATAL_VCTR_6       0x100278
> > +#define _ERR_STAT_GT_FATAL_VCTR_7       0x10027c
> > +#define ERR_STAT_GT_FATAL_VCTR_REG(x)   _MMIO(_PICK_EVEN((x), \
> > +					_ERR_STAT_GT_FATAL_VCTR_0, \
> > +					_ERR_STAT_GT_FATAL_VCTR_1))
> > +
> >  #define _ERR_STAT_GT_COR                0x100160
> >  #define _ERR_STAT_GT_NONFATAL           0x100164
> >  #define _ERR_STAT_GT_FATAL              0x100168
> > @@ -137,7 +173,42 @@ enum hardware_error {
> >  						_ERR_STAT_GT_COR, \
> >  						_ERR_STAT_GT_NONFATAL))
> >
> > -#define  EU_GRF_ERROR                   REG_BIT(15)
> > -#define  EU_IC_ERROR                    REG_BIT(14)
> > -
> > +#define  EU_GRF_COR_ERR                 (15)
> > +#define  EU_IC_COR_ERR                  (14)
> > +#define  SLM_COR_ERR                    (13)
> > +#define  SAMPLER_COR_ERR                (12)
> > +#define  GUC_COR_ERR                    (1)
> > +#define  L3_SNG_COR_ERR                 (0)
> > +
> > +#define PVC_COR_ERR_MASK \
> > +		(REG_BIT(GUC_COR_ERR) | \
> > +		 REG_BIT(SLM_COR_ERR) | \
> > +		 REG_BIT(EU_IC_COR_ERR) | \
> > +		 REG_BIT(EU_GRF_COR_ERR))
> > +
> > +#define EU_GRF_FAT_ERR                  (15)
> > +#define EU_IC_FAT_ERR                   (14)
> > +#define SLM_FAT_ERR                     (13)
> > +#define SAMPLER_FAT_ERR                 (12)
> > +#define SQIDI_FAT_ERR                   (9)
> > +#define IDI_PAR_FAT_ERR                 (8)
> > +#define GUC_FAT_ERR                     (6)
> > +#define L3_ECC_CHK_FAT_ERR              (5)
> > +#define L3_DOUBLE_FAT_ERR               (4)
> > +#define FPU_UNCORR_FAT_ERR              (3)
> > +#define ARRAY_BIST_FAT_ERR              (1)
> > +
> > +#define PVC_FAT_ERR_MASK \
> > +		(REG_BIT(FPU_UNCORR_FAT_ERR) | \
> > +		 REG_BIT(GUC_FAT_ERR)  | \
> > +		 REG_BIT(SLM_FAT_ERR)  | \
> > +		 REG_BIT(EU_GRF_FAT_ERR))
> > +
> > +#define GT_HW_ERROR_MAX_ERR_BITS        16
> > +
> > +#define _SLM_ECC_ERROR_CNT              0xe7f4
> > +#define _SLM_UNCORR_ECC_ERROR_CNT       0xe7c0
> > +#define SLM_ECC_ERROR_CNTR(x)           _MMIO((x) ==
> HARDWARE_ERROR_CORRECTABLE ? \
> > +						_SLM_ECC_ERROR_CNT : \
> > +
> 	_SLM_UNCORR_ECC_ERROR_CNT)
> >  #endif
> > diff --git a/drivers/gpu/drm/xe/xe_device_types.h
> > b/drivers/gpu/drm/xe/xe_device_types.h
> > index 88f863edc41c..ecabf4d6690d 100644
> > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > @@ -99,6 +99,8 @@ struct xe_device {
> >  		bool has_link_copy_engine;
> >  		/** @enable_display: display enabled */
> >  		bool enable_display;
> > +		/** @has_gt_error_vectors: whether platform supports
> ERROR VECTORS */
> > +		bool has_gt_error_vectors;
> >
> >  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
> >  		struct xe_device_display_info {
> > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > index bc821f431c45..ce9ce2748394 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.c
> > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > @@ -44,6 +44,35 @@
> >  #include "xe_wa.h"
> >  #include "xe_wopcm.h"
> >
> > +static const char * const xe_gt_driver_errors_to_str[] = {
> > +	[INTEL_GT_DRIVER_ERROR_INTERRUPT] = "INTERRUPT", };
> > +
> > +void xe_gt_log_driver_error(struct xe_gt *gt,
> 
> It looks like the things this function gets used for fall into one of two
> categories:
> 
>  - Software assertions that we don't expect to happen unless we screwed
>    up and missed something in the code.
>  - Hardware is doing something out of spec (e.g., raising interrupt bits
>    that aren't documented).
> 
> For the first one, I don't see a reason to be counting up such errors; they're
> just there for early developers and such mistakes should not be present in
> the driver anymore by the time it gets into real users'
> hands.  For the latter, classifying these as "driver errors" is incorrect since it's
> the hardware misbehaving, not the driver.
Yes you are right. Will address this in next patch
> 
> All of this new infrastructure seems pretty questionable at the moment.
> We're doing extra work to count up errors, but then never doing anything
> with the counts.  You mention in the cover letter that these will be exposed
> to userspace eventually, but what's the benefit of that?  Which userspace
> component is going to actually use this information?  What do you expect
> userspace to do if it finds out there's been a fatal or correctable error in
> some low-level hardware unit?  Generally userspace shouldn't even need to
> care about the really low-level hardware details; if something has truly gone
> fatally wrong, it's game over for userspace and it probably doesn't matter
> exactly where in the hardware things are actually busted.
> 
> Without some extra justification from the userspace point of view, it feels
> like we're just adding a bunch of code that doesn't have a real-world
> purpose.
The error counters  exposed by KMD will be used by sysman 
They will be categorized to specific category of error in sysman:
 https://spec.oneapi.io/level-zero/latest/sysman/api.html#ras
> 
> > +			    const enum xe_gt_driver_errors error,
> > +			    const char *fmt, ...)
> > +{
> > +	struct va_format vaf;
> > +	va_list args;
> > +
> > +	va_start(args, fmt);
> > +	vaf.fmt = fmt;
> > +	vaf.va = &args;
> > +
> > +	BUILD_BUG_ON(ARRAY_SIZE(xe_gt_driver_errors_to_str) !=
> > +		     INTEL_GT_DRIVER_ERROR_COUNT);
> > +
> > +	WARN_ON_ONCE(error >= INTEL_GT_DRIVER_ERROR_COUNT);
> > +
> > +	gt->errors.driver[error]++;
> > +
> > +	drm_err_ratelimited(&gt_to_xe(gt)->drm, "GT%u [%s] %pV",
> > +			    gt->info.id,
> > +			    xe_gt_driver_errors_to_str[error],
> > +			    &vaf);
> > +	va_end(args);
> > +}
> > +
> >  struct xe_gt *xe_find_full_gt(struct xe_gt *gt)  {
> >  	struct xe_gt *search;
> > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
> > b/drivers/gpu/drm/xe/xe_gt_types.h
> > index 8f29aba455e0..9580a40c0142 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > @@ -33,6 +33,43 @@ enum xe_gt_type {
> >  typedef unsigned long xe_dss_mask_t[BITS_TO_LONGS(32 *
> > XE_MAX_DSS_FUSE_REGS)];  typedef unsigned long
> > xe_eu_mask_t[BITS_TO_LONGS(32 * XE_MAX_EU_FUSE_REGS)];
> >
> > +/* Count of GT Correctable and FATAL HW ERRORS */ enum
> > +intel_gt_hw_errors {
> > +	INTEL_GT_HW_ERROR_COR_SUBSLICE = 0,
> > +	INTEL_GT_HW_ERROR_COR_L3BANK,
> > +	INTEL_GT_HW_ERROR_COR_L3_SNG,
> > +	INTEL_GT_HW_ERROR_COR_GUC,
> > +	INTEL_GT_HW_ERROR_COR_SAMPLER,
> > +	INTEL_GT_HW_ERROR_COR_SLM,
> > +	INTEL_GT_HW_ERROR_COR_EU_IC,
> > +	INTEL_GT_HW_ERROR_COR_EU_GRF,
> > +	INTEL_GT_HW_ERROR_FAT_SUBSLICE,
> > +	INTEL_GT_HW_ERROR_FAT_L3BANK,
> > +	INTEL_GT_HW_ERROR_FAT_ARR_BIST,
> > +	INTEL_GT_HW_ERROR_FAT_FPU,
> > +	INTEL_GT_HW_ERROR_FAT_L3_DOUB,
> > +	INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK,
> > +	INTEL_GT_HW_ERROR_FAT_GUC,
> > +	INTEL_GT_HW_ERROR_FAT_IDI_PAR,
> > +	INTEL_GT_HW_ERROR_FAT_SQIDI,
> > +	INTEL_GT_HW_ERROR_FAT_SAMPLER,
> > +	INTEL_GT_HW_ERROR_FAT_SLM,
> > +	INTEL_GT_HW_ERROR_FAT_EU_IC,
> > +	INTEL_GT_HW_ERROR_FAT_EU_GRF,
> > +	INTEL_GT_HW_ERROR_FAT_TLB,
> > +	INTEL_GT_HW_ERROR_FAT_L3_FABRIC,
> > +	INTEL_GT_HW_ERROR_COUNT
> > +};
> > +
> > +enum xe_gt_driver_errors {
> > +	INTEL_GT_DRIVER_ERROR_INTERRUPT = 0,
> > +	INTEL_GT_DRIVER_ERROR_COUNT
> > +};
> > +
> > +void xe_gt_log_driver_error(struct xe_gt *gt,
> > +			    const enum xe_gt_driver_errors error,
> > +			    const char *fmt, ...);
> > +
> >  struct xe_mmio_range {
> >  	u32 start;
> >  	u32 end;
> > @@ -357,6 +394,12 @@ struct xe_gt {
> >  	 *    of a steered operation
> >  	 */
> >  	spinlock_t mcr_lock;
> > +
> > +	struct intel_hw_errors {
> > +		unsigned long hw[INTEL_GT_HW_ERROR_COUNT];
> > +		unsigned long driver[INTEL_GT_DRIVER_ERROR_COUNT];
> > +	} errors;
> > +
> >  };
> >
> >  #endif
> > diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
> > index 6b922332bff1..4626f7280aaf 100644
> > --- a/drivers/gpu/drm/xe/xe_irq.c
> > +++ b/drivers/gpu/drm/xe/xe_irq.c
> > @@ -19,6 +19,7 @@
> >  #include "xe_hw_engine.h"
> >  #include "xe_mmio.h"
> >
> > +#define HAS_GT_ERROR_VECTORS(xe) ((xe)->info.has_gt_error_vectors)
> >  static void gen3_assert_iir_is_zero(struct xe_gt *gt, i915_reg_t reg)
> > {
> >  	u32 val = xe_mmio_read32(gt, reg.reg); @@ -359,44 +360,281 @@
> > hardware_error_type_to_str(const enum hardware_error hw_err)
> >  	}
> >  }
> >
> > +#define xe_gt_hw_err(gt, fmt, ...) \
> > +	drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR "GT%d detected "
> fmt, \
> > +			(gt)->info.id, ##__VA_ARGS__)
> 
> As on the previous patch, it looks like we're printing error-level kernel
> messages for correctable errors (i.e., things the hardware caught and fixed
> internally like ECC).  Generally those kind of things shouldn't be putting
> errors in the kernel log because there's no actual problem from the end user
> perspective.
Agreed. Should we use warning for correctable errors ?
> 
> > +
> >  static void
> > -xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error
> > hw_err)
> > +xe_gt_correctable_hw_error_stats_update(struct xe_gt  *gt, unsigned
> > +long errstat)
> >  {
> > -	const char *hw_err_str = hardware_error_type_to_str(hw_err);
> > -	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
> > -	u32 errstat;
> > +	u32 errbit, cnt;
> >
> > -	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
> > +	if (!errstat && HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
> > +		return;
> >
> > -	errstat = xe_mmio_read32(gt, ERR_STAT_GT_REG(hw_err).reg);
> > +	for_each_set_bit(errbit, &errstat, GT_HW_ERROR_MAX_ERR_BITS) {
> > +		if (gt->xe->info.platform == XE_PVC && !(REG_BIT(errbit) &
> PVC_COR_ERR_MASK)) {
> > +			xe_gt_log_driver_error(gt,
> INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > +					       "UNKNOWN CORRECTABLE
> error\n");
> > +			continue;
> > +		}
> >
> > -	if (unlikely(!errstat)) {
> > -		DRM_ERROR("ERR_STAT_GT_REG_%s blank!\n",
> hw_err_str);
> > -		return;
> > +	switch (errbit) {
> > +	case L3_SNG_COR_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_L3_SNG]++;
> > +		xe_gt_hw_err(gt, "L3 SINGLE CORRECTABLE error\n");
> > +		break;
> > +	case GUC_COR_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_GUC]++;
> > +		xe_gt_hw_err(gt, "SINGLE BIT GUC SRAM CORRECTABLE
> error\n");
> > +		break;
> > +	case SAMPLER_COR_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_SAMPLER]++;
> > +		xe_gt_hw_err(gt, "SINGLE BIT SAMPLER CORRECTABLE
> error\n");
> > +		break;
> > +	case SLM_COR_ERR:
> > +		cnt = xe_mmio_read32(gt,
> SLM_ECC_ERROR_CNTR(HARDWARE_ERROR_CORRECTABLE).reg);
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_SLM] = cnt;
> > +		xe_gt_hw_err(gt, "%u SINGLE BIT SLM CORRECTABLE
> error\n", cnt);
> > +		break;
> > +	case EU_IC_COR_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_EU_IC]++;
> > +		xe_gt_hw_err(gt, "SINGLE BIT EU IC CORRECTABLE error\n");
> > +		break;
> > +	case EU_GRF_COR_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_EU_GRF]++;
> > +		xe_gt_hw_err(gt, "SINGLE BIT EU GRF CORRECTABLE
> error\n");
> > +		break;
> > +	default:
> > +		xe_gt_log_driver_error(gt,
> INTEL_GT_DRIVER_ERROR_INTERRUPT, "UNKNOWN CORRECTABLE error\n");
> > +		break;
> > +	}
> >  	}
> > +}
> >
> > -	/*
> > -	 * TODO: The GT Non Fatal Error Status Register
> > -	 * only has reserved bitfields defined.
> > -	 * Remove once there is something to service.
> > -	 */
> > -	if (hw_err == HARDWARE_ERROR_NONFATAL) {
> > -		DRM_ERROR("detected Non-Fatal error\n");
> > -		xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg,
> errstat);
> > +static void xe_gt_fatal_hw_error_stats_update(struct xe_gt *gt,
> > +unsigned long errstat) {
> > +	u32 errbit, cnt;
> > +
> > +	if (!errstat && HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
> >  		return;
> > +
> > +	for_each_set_bit(errbit, &errstat, GT_HW_ERROR_MAX_ERR_BITS) {
> > +		if (gt->xe->info.platform == XE_PVC && !(REG_BIT(errbit) &
> PVC_FAT_ERR_MASK)) {
> > +			xe_gt_log_driver_error(gt,
> INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > +					       "UNKNOWN FATAL error\n");
> > +			continue;
> > +		}
> > +
> > +	switch (errbit) {
> > +	case ARRAY_BIST_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_ARR_BIST]++;
> > +		xe_gt_hw_err(gt, "Array BIST FATAL error\n");
> > +		break;
> > +	case FPU_UNCORR_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_FPU]++;
> > +		xe_gt_hw_err(gt, "FPU FATAL error\n");
> > +		break;
> > +	case L3_DOUBLE_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_L3_DOUB]++;
> > +		xe_gt_hw_err(gt, "L3 Double FATAL error\n");
> > +		break;
> > +	case L3_ECC_CHK_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK]++;
> > +		xe_gt_hw_err(gt, "L3 ECC Checker FATAL error\n");
> > +		break;
> > +	case GUC_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_GUC]++;
> > +		xe_gt_hw_err(gt, "GUC SRAM FATAL error\n");
> > +		break;
> > +	case IDI_PAR_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_IDI_PAR]++;
> > +		xe_gt_hw_err(gt, "IDI PARITY FATAL error\n");
> > +		break;
> > +	case SQIDI_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SQIDI]++;
> > +		xe_gt_hw_err(gt, "SQIDI FATAL error\n");
> > +		break;
> > +	case SAMPLER_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SAMPLER]++;
> > +		xe_gt_hw_err(gt, "SAMPLER FATAL error\n");
> > +		break;
> > +	case SLM_FAT_ERR:
> > +		cnt = xe_mmio_read32(gt,
> SLM_ECC_ERROR_CNTR(HARDWARE_ERROR_FATAL).reg);
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SLM] = cnt;
> > +		xe_gt_hw_err(gt, "%u SLM FATAL error\n", cnt);
> > +		break;
> > +	case EU_IC_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_EU_IC]++;
> > +		xe_gt_hw_err(gt, "EU IC FATAL error\n");
> > +		break;
> > +	case EU_GRF_FAT_ERR:
> > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_EU_GRF]++;
> > +		xe_gt_hw_err(gt, "EU GRF FATAL error\n");
> > +		break;
> > +	default:
> > +		xe_gt_log_driver_error(gt,
> INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > +				       "UNKNOWN FATAL error\n");
> > +		break;
> > +	}
> >  	}
> > +}
> >
> > -	/*
> > -	 * TODO: The remaining GT errors don't have a
> > -	 * need for targeted logging at the moment. We
> > -	 * still want to log detection of these errors, but
> > -	 * let's aggregate them until someone has a need for them.
> > -	 */
> > -	if (errstat & other_errors)
> > -		DRM_ERROR("detected hardware error(s) in
> ERR_STAT_GT_REG_%s: 0x%08x\n",
> > -			  hw_err_str, errstat & other_errors);
> > +static void
> > +xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error
> > +hw_err) {
> > +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
> > +	unsigned long errstat;
> > +
> > +	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
> >
> > -	xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg, errstat);
> > +	if (!HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
> > +		errstat = xe_mmio_read32(gt,
> ERR_STAT_GT_REG(hw_err).reg);
> > +		if (unlikely(!errstat)) {
> > +			xe_gt_log_driver_error(gt,
> INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > +					       "ERR_STAT_GT_REG_%s
> blank!\n", hw_err_str);
> > +			return;
> > +		}
> > +	}
> > +
> > +	switch (hw_err) {
> > +	case HARDWARE_ERROR_CORRECTABLE:
> > +		if (HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
> > +			bool error = false;
> > +			int i;
> > +
> > +			errstat = 0;
> > +			for (i = 0; i < ERR_STAT_GT_COR_VCTR_LEN; i++) {
> > +				u32 err_type =
> ERR_STAT_GT_COR_VCTR_LEN;
> > +				unsigned long vctr;
> > +				const char *name;
> > +
> > +				vctr = xe_mmio_read32(gt,
> ERR_STAT_GT_COR_VCTR_REG(i).reg);
> > +				if (!vctr)
> > +					continue;
> > +
> > +				switch (i) {
> > +				case ERR_STAT_GT_VCTR0:
> > +				case ERR_STAT_GT_VCTR1:
> > +					err_type =
> INTEL_GT_HW_ERROR_COR_SUBSLICE;
> > +					gt->errors.hw[err_type] +=
> hweight32(vctr);
> > +					name = "SUBSLICE";
> > +
> > +					/* Avoid second read/write to error
> status register*/
> > +					if (errstat)
> > +						break;
> > +
> > +					errstat = xe_mmio_read32(gt,
> ERR_STAT_GT_REG(hw_err).reg);
> > +					xe_gt_hw_err(gt,
> "ERR_STAT_GT_CORRECTABLE:0x%08lx\n",
> > +						     errstat);
> > +
> 	xe_gt_correctable_hw_error_stats_update(gt, errstat);
> > +					if (errstat)
> > +						xe_mmio_write32(gt,
> ERR_STAT_GT_REG(hw_err).reg,
> > +								errstat);
> > +					break;
> > +
> > +				case ERR_STAT_GT_VCTR2:
> > +				case ERR_STAT_GT_VCTR3:
> > +					err_type =
> INTEL_GT_HW_ERROR_COR_L3BANK;
> > +					gt->errors.hw[err_type] +=
> hweight32(vctr);
> > +					name = "L3 BANK";
> > +					break;
> > +				default:
> > +					name = "UNKNOWN";
> > +					break;
> > +				}
> > +				xe_mmio_write32(gt,
> ERR_STAT_GT_COR_VCTR_REG(i).reg, vctr);
> > +				xe_gt_hw_err(gt, "%s CORRECTABLE error,
> ERR_VECT_GT_CORRECTABLE_%d:0x%08lx\n",
> > +					     name, i, vctr);
> > +				error = true;
> > +			}
> > +
> > +			if (!error)
> > +				xe_gt_hw_err(gt, "UNKNOWN CORRECTABLE
> error\n");
> > +		} else {
> > +			xe_gt_correctable_hw_error_stats_update(gt,
> errstat);
> > +			xe_gt_hw_err(gt,
> "ERR_STAT_GT_CORRECTABLE:0x%08lx\n", errstat);
> > +		}
> > +		break;
> > +	case HARDWARE_ERROR_NONFATAL:
> > +	      /*
> > +	       * TODO: The GT Non Fatal Error Status Register
> > +	       * only has reserved bitfields defined.
> > +	       * Remove once there is something to service.
> > +	       */
> > +		drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR "detected
> Non-Fatal error\n");
> > +		break;
> > +	case HARDWARE_ERROR_FATAL:
> > +		if (HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
> > +			bool error = false;
> > +			int i;
> > +
> > +			errstat = 0;
> > +			for (i = 0; i < ERR_STAT_GT_FATAL_VCTR_LEN; i++) {
> > +				u32 err_type =
> ERR_STAT_GT_FATAL_VCTR_LEN;
> > +				unsigned long vctr;
> > +				const char *name;
> > +
> > +				vctr = xe_mmio_read32(gt,
> ERR_STAT_GT_FATAL_VCTR_REG(i).reg);
> > +				if (!vctr)
> > +					continue;
> > +
> > +				/* i represents the vector register index */
> > +				switch (i) {
> > +				case ERR_STAT_GT_VCTR0:
> > +				case ERR_STAT_GT_VCTR1:
> > +					err_type =
> INTEL_GT_HW_ERROR_FAT_SUBSLICE;
> > +					gt->errors.hw[err_type] +=
> hweight32(vctr);
> > +					name = "SUBSLICE";
> > +
> > +					/*Avoid second read/write to error
> status register.*/
> > +					if (errstat)
> > +						break;
> > +
> > +					errstat = xe_mmio_read32(gt,
> ERR_STAT_GT_REG(hw_err).reg);
> > +					xe_gt_hw_err(gt,
> "ERR_STAT_GT_FATAL:0x%08lx\n", errstat);
> > +
> 	xe_gt_fatal_hw_error_stats_update(gt, errstat);
> > +					if (errstat)
> > +						xe_mmio_write32(gt,
> ERR_STAT_GT_REG(hw_err).reg,
> > +								errstat);
> > +					break;
> > +
> > +				case ERR_STAT_GT_VCTR2:
> > +				case ERR_STAT_GT_VCTR3:
> > +					err_type =
> INTEL_GT_HW_ERROR_FAT_L3BANK;
> > +					gt->errors.hw[err_type] +=
> hweight32(vctr);
> > +					name = "L3 BANK";
> > +					break;
> > +				case ERR_STAT_GT_VCTR6:
> > +					gt-
> >errors.hw[INTEL_GT_HW_ERROR_FAT_TLB] += hweight16(vctr);
> > +					name = "TLB";
> > +					break;
> > +				case ERR_STAT_GT_VCTR7:
> > +					gt-
> >errors.hw[INTEL_GT_HW_ERROR_FAT_L3_FABRIC] += hweight8(vctr);
> > +					name = "L3 FABRIC";
> > +					break;
> > +				default:
> > +					name = "UNKNOWN";
> > +					break;
> > +				}
> > +				xe_mmio_write32(gt,
> ERR_STAT_GT_FATAL_VCTR_REG(i).reg, vctr);
> > +				xe_gt_hw_err(gt, "%s FATAL error,
> ERR_VECT_GT_FATAL_%d:0x%08lx\n",
> > +					     name, i, vctr);
> > +				error = true;
> > +			}
> > +			if (!error)
> > +				xe_gt_hw_err(gt, "UNKNOWN FATAL
> error\n");
> > +		} else {
> > +			xe_gt_fatal_hw_error_stats_update(gt, errstat);
> > +			xe_gt_hw_err(gt, "ERR_STAT_GT_FATAL:0x%08lx\n",
> errstat);
> > +		}
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +
> > +	if (!HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
> > +		xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg,
> errstat);
> >  }
> >
> >  static void
> > @@ -409,7 +647,8 @@ xe_hw_error_source_handler(struct xe_gt *gt,
> const enum hardware_error hw_err)
> >  	spin_lock_irqsave(&gt_to_xe(gt)->irq.lock, flags);
> >  	errsrc = xe_mmio_read32(gt, DEV_ERR_STAT_REG(hw_err).reg);
> >  	if (unlikely(!errsrc)) {
> > -		DRM_ERROR("DEV_ERR_STAT_REG_%s blank!\n",
> hw_err_str);
> > +		xe_gt_log_driver_error(gt,
> INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > +				       "DEV_ERR_STAT_REG_%s blank!\n",
> hw_err_str);
> >  		goto out_unlock;
> >  	}
> >
> > @@ -417,8 +656,9 @@ xe_hw_error_source_handler(struct xe_gt *gt,
> const enum hardware_error hw_err)
> >  		xe_gt_hw_error_handler(gt, hw_err);
> >
> >  	if (errsrc & ~DEV_ERR_STAT_GT_ERROR)
> > -		DRM_ERROR("non-GT hardware error(s) in
> DEV_ERR_STAT_REG_%s: 0x%08x\n",
> > -			  hw_err_str, errsrc & ~DEV_ERR_STAT_GT_ERROR);
> > +		xe_gt_log_driver_error(gt,
> INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > +				       "non-GT hardware error(s) in
> DEV_ERR_STAT_REG_%s: 0x%08x\n",
> > +				       hw_err_str, errsrc &
> ~DEV_ERR_STAT_GT_ERROR);
> >
> >  	xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err).reg, errsrc);
> >
> > @@ -634,12 +874,44 @@ static void irq_uninstall(struct drm_device *drm,
> void *arg)
> >  		pci_disable_msi(pdev);
> >  }
> >
> > +/**
> > + * process_hw_errors - checks for the occurrence of HW errors
> > + *
> > + * This checks for the HW Errors including FATAL error that might
> > + * have occurred in the previous boot of the driver which will
> > + * initiate PCIe FLR reset of the device and cause the
> > + * driver to reload.
> 
> Is this saying that there's already been a PCIe FLR and you're trying to read
> the registers after that reset has happened?  The bspec indicates that these
> registers have 'DEV' style reset, so they wouldn't be able to preserve their
> values across a reset.
Registers preserve the value across reset.
BSPEC: 50875
> 
> > + */
> > +static void process_hw_errors(struct xe_device *xe) {
> > +	struct xe_gt *gt0 = xe_device_get_gt(xe, 0);
> > +	u32 dev_pcieerr_status, master_ctl;
> > +	struct xe_gt *gt;
> > +	int i;
> > +
> > +	dev_pcieerr_status = xe_mmio_read32(gt0,
> DEV_PCIEERR_STATUS.reg);
> > +
> > +	for_each_gt(gt, xe, i) {
> > +		if (dev_pcieerr_status & DEV_PCIEERR_IS_FATAL(i))
> > +			xe_hw_error_source_handler(gt,
> HARDWARE_ERROR_FATAL);
> > +
> > +		master_ctl = xe_mmio_read32(gt,
> GEN11_GFX_MSTR_IRQ.reg);
> > +		xe_mmio_write32(gt, GEN11_GFX_MSTR_IRQ.reg,
> master_ctl);
> > +		xe_hw_error_irq_handler(gt, master_ctl);
> > +	}
> > +	if (dev_pcieerr_status)
> > +		xe_mmio_write32(gt, DEV_PCIEERR_STATUS.reg,
> dev_pcieerr_status); }
> > +
> >  int xe_irq_install(struct xe_device *xe)  {
> >  	int irq = to_pci_dev(xe->drm.dev)->irq;
> >  	irq_handler_t irq_handler;
> >  	int err;
> >
> > +	if (IS_DGFX(xe))
> > +		process_hw_errors(xe);
> 
> Why is this conditional on DGFX?  From what I can see this also applies to
> integrated platforms like MTL too.
>From RAS perspective report of error is required on DGFX only.
sysman don't use these counters from IGFX
> 
> 
> Matt
> 
> > +
> >  	irq_handler = xe_irq_handler(xe);
> >  	if (!irq_handler) {
> >  		drm_err(&xe->drm, "No supported interrupt handler"); diff --
> git
> > a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index
> > 1844cff8fba8..69098194cef8 100644
> > --- a/drivers/gpu/drm/xe/xe_pci.c
> > +++ b/drivers/gpu/drm/xe/xe_pci.c
> > @@ -73,6 +73,7 @@ struct xe_device_desc {
> >  	bool has_range_tlb_invalidation;
> >  	bool has_asid;
> >  	bool has_link_copy_engine;
> > +	bool has_gt_error_vectors;
> >  };
> >
> >  __diag_push();
> > @@ -232,6 +233,7 @@ static const struct xe_device_desc pvc_desc = {
> >  	.supports_usm = true,
> >  	.has_asid = true,
> >  	.has_link_copy_engine = true,
> > +	.has_gt_error_vectors = true,
> >  };
> >
> >  #define MTL_MEDIA_ENGINES \
> > @@ -418,6 +420,7 @@ static int xe_pci_probe(struct pci_dev *pdev, const
> struct pci_device_id *ent)
> >  	xe->info.vm_max_level = desc->vm_max_level;
> >  	xe->info.supports_usm = desc->supports_usm;
> >  	xe->info.has_asid = desc->has_asid;
> > +	xe->info.has_gt_error_vectors = desc->has_gt_error_vectors;
> >  	xe->info.has_flat_ccs = desc->has_flat_ccs;
> >  	xe->info.has_4tile = desc->has_4tile;
> >  	xe->info.has_range_tlb_invalidation =
> > desc->has_range_tlb_invalidation;
> > --
> > 2.25.1
> >
> 
> --
> Matt Roper
> Graphics Software Engineer
> Linux GPU Platform Enablement
> Intel Corporation