[Intel-xe] [PATCH 2/4] drm/xe/ras: Log the GT hw errors.

Matt Roper matthew.d.roper at intel.com
Thu May 4 00:02:53 UTC 2023


On Fri, Apr 28, 2023 at 01:00:04AM -0700, Ghimiray, Himal Prasad wrote:
...
> > All of this new infrastructure seems pretty questionable at the moment.
> > We're doing extra work to count up errors, but then never doing anything
> > with the counts.  You mention in the cover letter that these will be exposed
> > to userspace eventually, but what's the benefit of that?  Which userspace
> > component is going to actually use this information?  What do you expect
> > userspace to do if it finds out there's been a fatal or correctable error in
> > some low-level hardware unit?  Generally userspace shouldn't even need to
> > care about the really low-level hardware details; if something has truly gone
> > fatally wrong, it's game over for userspace and it probably doesn't matter
> > exactly where in the hardware things are actually busted.
> > 
> > Without some extra justification from the userspace point of view, it feels
> > like we're just adding a bunch of code that doesn't have a real-world
> > purpose.
> The error counters  exposed by KMD will be used by sysman 
> They will be categorized to specific category of error in sysman:
>  https://spec.oneapi.io/level-zero/latest/sysman/api.html#ras

L0 sysman looks sort of like a complicated libdrm replacement.  I.e.,
it's just a wrapper library over various uapi interfaces, but isn't
really a "consumer" of that uapi itself.  We need to understand the
whole top-to-bottom stack to make sure that whatever interfaces and
representation are selected (both at the Xe and Sysman levels) actually
makes sense and satisfies the full stack needs.

Is the actual reporting of these errors going to be done via the
standard Linux RAS/EDAC interfaces?  I.e., what's documented at
https://www.kernel.org/doc/html/v6.3/admin-guide/ras.html ?  If so, then
there's already a bunch of real userspace tools that work with that, so
that would probably help justify the work here, and make it clear that
we're not just reinventing the wheel.  If we're not tying into that,
then we probably need to justify clearly why it can't be used.

> > 
> > > +			    const enum xe_gt_driver_errors error,
> > > +			    const char *fmt, ...)
> > > +{
> > > +	struct va_format vaf;
> > > +	va_list args;
> > > +
> > > +	va_start(args, fmt);
> > > +	vaf.fmt = fmt;
> > > +	vaf.va = &args;
> > > +
> > > +	BUILD_BUG_ON(ARRAY_SIZE(xe_gt_driver_errors_to_str) !=
> > > +		     INTEL_GT_DRIVER_ERROR_COUNT);
> > > +
> > > +	WARN_ON_ONCE(error >= INTEL_GT_DRIVER_ERROR_COUNT);
> > > +
> > > +	gt->errors.driver[error]++;
> > > +
> > > +	drm_err_ratelimited(&gt_to_xe(gt)->drm, "GT%u [%s] %pV",
> > > +			    gt->info.id,
> > > +			    xe_gt_driver_errors_to_str[error],
> > > +			    &vaf);
> > > +	va_end(args);
> > > +}
> > > +
> > >  struct xe_gt *xe_find_full_gt(struct xe_gt *gt)  {
> > >  	struct xe_gt *search;
> > > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
> > > b/drivers/gpu/drm/xe/xe_gt_types.h
> > > index 8f29aba455e0..9580a40c0142 100644
> > > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > > @@ -33,6 +33,43 @@ enum xe_gt_type {
> > >  typedef unsigned long xe_dss_mask_t[BITS_TO_LONGS(32 *
> > > XE_MAX_DSS_FUSE_REGS)];  typedef unsigned long
> > > xe_eu_mask_t[BITS_TO_LONGS(32 * XE_MAX_EU_FUSE_REGS)];
> > >
> > > +/* Count of GT Correctable and FATAL HW ERRORS */ enum
> > > +intel_gt_hw_errors {
> > > +	INTEL_GT_HW_ERROR_COR_SUBSLICE = 0,
> > > +	INTEL_GT_HW_ERROR_COR_L3BANK,
> > > +	INTEL_GT_HW_ERROR_COR_L3_SNG,
> > > +	INTEL_GT_HW_ERROR_COR_GUC,
> > > +	INTEL_GT_HW_ERROR_COR_SAMPLER,
> > > +	INTEL_GT_HW_ERROR_COR_SLM,
> > > +	INTEL_GT_HW_ERROR_COR_EU_IC,
> > > +	INTEL_GT_HW_ERROR_COR_EU_GRF,
> > > +	INTEL_GT_HW_ERROR_FAT_SUBSLICE,
> > > +	INTEL_GT_HW_ERROR_FAT_L3BANK,
> > > +	INTEL_GT_HW_ERROR_FAT_ARR_BIST,
> > > +	INTEL_GT_HW_ERROR_FAT_FPU,
> > > +	INTEL_GT_HW_ERROR_FAT_L3_DOUB,
> > > +	INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK,
> > > +	INTEL_GT_HW_ERROR_FAT_GUC,
> > > +	INTEL_GT_HW_ERROR_FAT_IDI_PAR,
> > > +	INTEL_GT_HW_ERROR_FAT_SQIDI,
> > > +	INTEL_GT_HW_ERROR_FAT_SAMPLER,
> > > +	INTEL_GT_HW_ERROR_FAT_SLM,
> > > +	INTEL_GT_HW_ERROR_FAT_EU_IC,
> > > +	INTEL_GT_HW_ERROR_FAT_EU_GRF,
> > > +	INTEL_GT_HW_ERROR_FAT_TLB,
> > > +	INTEL_GT_HW_ERROR_FAT_L3_FABRIC,
> > > +	INTEL_GT_HW_ERROR_COUNT
> > > +};
> > > +
> > > +enum xe_gt_driver_errors {
> > > +	INTEL_GT_DRIVER_ERROR_INTERRUPT = 0,
> > > +	INTEL_GT_DRIVER_ERROR_COUNT
> > > +};
> > > +
> > > +void xe_gt_log_driver_error(struct xe_gt *gt,
> > > +			    const enum xe_gt_driver_errors error,
> > > +			    const char *fmt, ...);
> > > +
> > >  struct xe_mmio_range {
> > >  	u32 start;
> > >  	u32 end;
> > > @@ -357,6 +394,12 @@ struct xe_gt {
> > >  	 *    of a steered operation
> > >  	 */
> > >  	spinlock_t mcr_lock;
> > > +
> > > +	struct intel_hw_errors {
> > > +		unsigned long hw[INTEL_GT_HW_ERROR_COUNT];
> > > +		unsigned long driver[INTEL_GT_DRIVER_ERROR_COUNT];
> > > +	} errors;
> > > +
> > >  };
> > >
> > >  #endif
> > > diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
> > > index 6b922332bff1..4626f7280aaf 100644
> > > --- a/drivers/gpu/drm/xe/xe_irq.c
> > > +++ b/drivers/gpu/drm/xe/xe_irq.c
> > > @@ -19,6 +19,7 @@
> > >  #include "xe_hw_engine.h"
> > >  #include "xe_mmio.h"
> > >
> > > +#define HAS_GT_ERROR_VECTORS(xe) ((xe)->info.has_gt_error_vectors)
> > >  static void gen3_assert_iir_is_zero(struct xe_gt *gt, i915_reg_t reg)
> > > {
> > >  	u32 val = xe_mmio_read32(gt, reg.reg); @@ -359,44 +360,281 @@
> > > hardware_error_type_to_str(const enum hardware_error hw_err)
> > >  	}
> > >  }
> > >
> > > +#define xe_gt_hw_err(gt, fmt, ...) \
> > > +	drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR "GT%d detected "
> > fmt, \
> > > +			(gt)->info.id, ##__VA_ARGS__)
> > 
> > As on the previous patch, it looks like we're printing error-level kernel
> > messages for correctable errors (i.e., things the hardware caught and fixed
> > internally like ECC).  Generally those kind of things shouldn't be putting
> > errors in the kernel log because there's no actual problem from the end user
> > perspective.
> Agreed. Should we use warning for correctable errors ?

Presumably we shouldn't be reporting them in dmesg at all by default.
It looks like the EDAC subsystem has its own ways of controlling if/when 
stuff shows up in dmesg.

> > 
> > > +
> > >  static void
> > > -xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error
> > > hw_err)
> > > +xe_gt_correctable_hw_error_stats_update(struct xe_gt  *gt, unsigned
> > > +long errstat)
> > >  {
> > > -	const char *hw_err_str = hardware_error_type_to_str(hw_err);
> > > -	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
> > > -	u32 errstat;
> > > +	u32 errbit, cnt;
> > >
> > > -	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
> > > +	if (!errstat && HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
> > > +		return;
> > >
> > > -	errstat = xe_mmio_read32(gt, ERR_STAT_GT_REG(hw_err).reg);
> > > +	for_each_set_bit(errbit, &errstat, GT_HW_ERROR_MAX_ERR_BITS) {
> > > +		if (gt->xe->info.platform == XE_PVC && !(REG_BIT(errbit) &
> > PVC_COR_ERR_MASK)) {
> > > +			xe_gt_log_driver_error(gt,
> > INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > > +					       "UNKNOWN CORRECTABLE
> > error\n");
> > > +			continue;
> > > +		}
> > >
> > > -	if (unlikely(!errstat)) {
> > > -		DRM_ERROR("ERR_STAT_GT_REG_%s blank!\n",
> > hw_err_str);
> > > -		return;
> > > +	switch (errbit) {
> > > +	case L3_SNG_COR_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_L3_SNG]++;
> > > +		xe_gt_hw_err(gt, "L3 SINGLE CORRECTABLE error\n");
> > > +		break;
> > > +	case GUC_COR_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_GUC]++;
> > > +		xe_gt_hw_err(gt, "SINGLE BIT GUC SRAM CORRECTABLE
> > error\n");
> > > +		break;
> > > +	case SAMPLER_COR_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_SAMPLER]++;
> > > +		xe_gt_hw_err(gt, "SINGLE BIT SAMPLER CORRECTABLE
> > error\n");
> > > +		break;
> > > +	case SLM_COR_ERR:
> > > +		cnt = xe_mmio_read32(gt,
> > SLM_ECC_ERROR_CNTR(HARDWARE_ERROR_CORRECTABLE).reg);
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_SLM] = cnt;
> > > +		xe_gt_hw_err(gt, "%u SINGLE BIT SLM CORRECTABLE
> > error\n", cnt);
> > > +		break;
> > > +	case EU_IC_COR_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_EU_IC]++;
> > > +		xe_gt_hw_err(gt, "SINGLE BIT EU IC CORRECTABLE error\n");
> > > +		break;
> > > +	case EU_GRF_COR_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_EU_GRF]++;
> > > +		xe_gt_hw_err(gt, "SINGLE BIT EU GRF CORRECTABLE
> > error\n");
> > > +		break;
> > > +	default:
> > > +		xe_gt_log_driver_error(gt,
> > INTEL_GT_DRIVER_ERROR_INTERRUPT, "UNKNOWN CORRECTABLE error\n");
> > > +		break;
> > > +	}
> > >  	}
> > > +}
> > >
> > > -	/*
> > > -	 * TODO: The GT Non Fatal Error Status Register
> > > -	 * only has reserved bitfields defined.
> > > -	 * Remove once there is something to service.
> > > -	 */
> > > -	if (hw_err == HARDWARE_ERROR_NONFATAL) {
> > > -		DRM_ERROR("detected Non-Fatal error\n");
> > > -		xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg,
> > errstat);
> > > +static void xe_gt_fatal_hw_error_stats_update(struct xe_gt *gt,
> > > +unsigned long errstat) {
> > > +	u32 errbit, cnt;
> > > +
> > > +	if (!errstat && HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
> > >  		return;
> > > +
> > > +	for_each_set_bit(errbit, &errstat, GT_HW_ERROR_MAX_ERR_BITS) {
> > > +		if (gt->xe->info.platform == XE_PVC && !(REG_BIT(errbit) &
> > PVC_FAT_ERR_MASK)) {
> > > +			xe_gt_log_driver_error(gt,
> > INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > > +					       "UNKNOWN FATAL error\n");
> > > +			continue;
> > > +		}
> > > +
> > > +	switch (errbit) {
> > > +	case ARRAY_BIST_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_ARR_BIST]++;
> > > +		xe_gt_hw_err(gt, "Array BIST FATAL error\n");
> > > +		break;
> > > +	case FPU_UNCORR_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_FPU]++;
> > > +		xe_gt_hw_err(gt, "FPU FATAL error\n");
> > > +		break;
> > > +	case L3_DOUBLE_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_L3_DOUB]++;
> > > +		xe_gt_hw_err(gt, "L3 Double FATAL error\n");
> > > +		break;
> > > +	case L3_ECC_CHK_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK]++;
> > > +		xe_gt_hw_err(gt, "L3 ECC Checker FATAL error\n");
> > > +		break;
> > > +	case GUC_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_GUC]++;
> > > +		xe_gt_hw_err(gt, "GUC SRAM FATAL error\n");
> > > +		break;
> > > +	case IDI_PAR_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_IDI_PAR]++;
> > > +		xe_gt_hw_err(gt, "IDI PARITY FATAL error\n");
> > > +		break;
> > > +	case SQIDI_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SQIDI]++;
> > > +		xe_gt_hw_err(gt, "SQIDI FATAL error\n");
> > > +		break;
> > > +	case SAMPLER_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SAMPLER]++;
> > > +		xe_gt_hw_err(gt, "SAMPLER FATAL error\n");
> > > +		break;
> > > +	case SLM_FAT_ERR:
> > > +		cnt = xe_mmio_read32(gt,
> > SLM_ECC_ERROR_CNTR(HARDWARE_ERROR_FATAL).reg);
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SLM] = cnt;
> > > +		xe_gt_hw_err(gt, "%u SLM FATAL error\n", cnt);
> > > +		break;
> > > +	case EU_IC_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_EU_IC]++;
> > > +		xe_gt_hw_err(gt, "EU IC FATAL error\n");
> > > +		break;
> > > +	case EU_GRF_FAT_ERR:
> > > +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_EU_GRF]++;
> > > +		xe_gt_hw_err(gt, "EU GRF FATAL error\n");
> > > +		break;
> > > +	default:
> > > +		xe_gt_log_driver_error(gt,
> > INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > > +				       "UNKNOWN FATAL error\n");
> > > +		break;
> > > +	}
> > >  	}
> > > +}
> > >
> > > -	/*
> > > -	 * TODO: The remaining GT errors don't have a
> > > -	 * need for targeted logging at the moment. We
> > > -	 * still want to log detection of these errors, but
> > > -	 * let's aggregate them until someone has a need for them.
> > > -	 */
> > > -	if (errstat & other_errors)
> > > -		DRM_ERROR("detected hardware error(s) in
> > ERR_STAT_GT_REG_%s: 0x%08x\n",
> > > -			  hw_err_str, errstat & other_errors);
> > > +static void
> > > +xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error
> > > +hw_err) {
> > > +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
> > > +	unsigned long errstat;
> > > +
> > > +	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
> > >
> > > -	xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg, errstat);
> > > +	if (!HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
> > > +		errstat = xe_mmio_read32(gt,
> > ERR_STAT_GT_REG(hw_err).reg);
> > > +		if (unlikely(!errstat)) {
> > > +			xe_gt_log_driver_error(gt,
> > INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > > +					       "ERR_STAT_GT_REG_%s
> > blank!\n", hw_err_str);
> > > +			return;
> > > +		}
> > > +	}
> > > +
> > > +	switch (hw_err) {
> > > +	case HARDWARE_ERROR_CORRECTABLE:
> > > +		if (HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
> > > +			bool error = false;
> > > +			int i;
> > > +
> > > +			errstat = 0;
> > > +			for (i = 0; i < ERR_STAT_GT_COR_VCTR_LEN; i++) {
> > > +				u32 err_type =
> > ERR_STAT_GT_COR_VCTR_LEN;
> > > +				unsigned long vctr;
> > > +				const char *name;
> > > +
> > > +				vctr = xe_mmio_read32(gt,
> > ERR_STAT_GT_COR_VCTR_REG(i).reg);
> > > +				if (!vctr)
> > > +					continue;
> > > +
> > > +				switch (i) {
> > > +				case ERR_STAT_GT_VCTR0:
> > > +				case ERR_STAT_GT_VCTR1:
> > > +					err_type =
> > INTEL_GT_HW_ERROR_COR_SUBSLICE;
> > > +					gt->errors.hw[err_type] +=
> > hweight32(vctr);
> > > +					name = "SUBSLICE";
> > > +
> > > +					/* Avoid second read/write to error
> > status register*/
> > > +					if (errstat)
> > > +						break;
> > > +
> > > +					errstat = xe_mmio_read32(gt,
> > ERR_STAT_GT_REG(hw_err).reg);
> > > +					xe_gt_hw_err(gt,
> > "ERR_STAT_GT_CORRECTABLE:0x%08lx\n",
> > > +						     errstat);
> > > +
> > 	xe_gt_correctable_hw_error_stats_update(gt, errstat);
> > > +					if (errstat)
> > > +						xe_mmio_write32(gt,
> > ERR_STAT_GT_REG(hw_err).reg,
> > > +								errstat);
> > > +					break;
> > > +
> > > +				case ERR_STAT_GT_VCTR2:
> > > +				case ERR_STAT_GT_VCTR3:
> > > +					err_type =
> > INTEL_GT_HW_ERROR_COR_L3BANK;
> > > +					gt->errors.hw[err_type] +=
> > hweight32(vctr);
> > > +					name = "L3 BANK";
> > > +					break;
> > > +				default:
> > > +					name = "UNKNOWN";
> > > +					break;
> > > +				}
> > > +				xe_mmio_write32(gt,
> > ERR_STAT_GT_COR_VCTR_REG(i).reg, vctr);
> > > +				xe_gt_hw_err(gt, "%s CORRECTABLE error,
> > ERR_VECT_GT_CORRECTABLE_%d:0x%08lx\n",
> > > +					     name, i, vctr);
> > > +				error = true;
> > > +			}
> > > +
> > > +			if (!error)
> > > +				xe_gt_hw_err(gt, "UNKNOWN CORRECTABLE
> > error\n");
> > > +		} else {
> > > +			xe_gt_correctable_hw_error_stats_update(gt,
> > errstat);
> > > +			xe_gt_hw_err(gt,
> > "ERR_STAT_GT_CORRECTABLE:0x%08lx\n", errstat);
> > > +		}
> > > +		break;
> > > +	case HARDWARE_ERROR_NONFATAL:
> > > +	      /*
> > > +	       * TODO: The GT Non Fatal Error Status Register
> > > +	       * only has reserved bitfields defined.
> > > +	       * Remove once there is something to service.
> > > +	       */
> > > +		drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR "detected
> > Non-Fatal error\n");
> > > +		break;
> > > +	case HARDWARE_ERROR_FATAL:
> > > +		if (HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
> > > +			bool error = false;
> > > +			int i;
> > > +
> > > +			errstat = 0;
> > > +			for (i = 0; i < ERR_STAT_GT_FATAL_VCTR_LEN; i++) {
> > > +				u32 err_type =
> > ERR_STAT_GT_FATAL_VCTR_LEN;
> > > +				unsigned long vctr;
> > > +				const char *name;
> > > +
> > > +				vctr = xe_mmio_read32(gt,
> > ERR_STAT_GT_FATAL_VCTR_REG(i).reg);
> > > +				if (!vctr)
> > > +					continue;
> > > +
> > > +				/* i represents the vector register index */
> > > +				switch (i) {
> > > +				case ERR_STAT_GT_VCTR0:
> > > +				case ERR_STAT_GT_VCTR1:
> > > +					err_type =
> > INTEL_GT_HW_ERROR_FAT_SUBSLICE;
> > > +					gt->errors.hw[err_type] +=
> > hweight32(vctr);
> > > +					name = "SUBSLICE";
> > > +
> > > +					/*Avoid second read/write to error
> > status register.*/
> > > +					if (errstat)
> > > +						break;
> > > +
> > > +					errstat = xe_mmio_read32(gt,
> > ERR_STAT_GT_REG(hw_err).reg);
> > > +					xe_gt_hw_err(gt,
> > "ERR_STAT_GT_FATAL:0x%08lx\n", errstat);
> > > +
> > 	xe_gt_fatal_hw_error_stats_update(gt, errstat);
> > > +					if (errstat)
> > > +						xe_mmio_write32(gt,
> > ERR_STAT_GT_REG(hw_err).reg,
> > > +								errstat);
> > > +					break;
> > > +
> > > +				case ERR_STAT_GT_VCTR2:
> > > +				case ERR_STAT_GT_VCTR3:
> > > +					err_type =
> > INTEL_GT_HW_ERROR_FAT_L3BANK;
> > > +					gt->errors.hw[err_type] +=
> > hweight32(vctr);
> > > +					name = "L3 BANK";
> > > +					break;
> > > +				case ERR_STAT_GT_VCTR6:
> > > +					gt-
> > >errors.hw[INTEL_GT_HW_ERROR_FAT_TLB] += hweight16(vctr);
> > > +					name = "TLB";
> > > +					break;
> > > +				case ERR_STAT_GT_VCTR7:
> > > +					gt-
> > >errors.hw[INTEL_GT_HW_ERROR_FAT_L3_FABRIC] += hweight8(vctr);
> > > +					name = "L3 FABRIC";
> > > +					break;
> > > +				default:
> > > +					name = "UNKNOWN";
> > > +					break;
> > > +				}
> > > +				xe_mmio_write32(gt,
> > ERR_STAT_GT_FATAL_VCTR_REG(i).reg, vctr);
> > > +				xe_gt_hw_err(gt, "%s FATAL error,
> > ERR_VECT_GT_FATAL_%d:0x%08lx\n",
> > > +					     name, i, vctr);
> > > +				error = true;
> > > +			}
> > > +			if (!error)
> > > +				xe_gt_hw_err(gt, "UNKNOWN FATAL
> > error\n");
> > > +		} else {
> > > +			xe_gt_fatal_hw_error_stats_update(gt, errstat);
> > > +			xe_gt_hw_err(gt, "ERR_STAT_GT_FATAL:0x%08lx\n",
> > errstat);
> > > +		}
> > > +		break;
> > > +	default:
> > > +		break;
> > > +	}
> > > +
> > > +	if (!HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
> > > +		xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg,
> > errstat);
> > >  }
> > >
> > >  static void
> > > @@ -409,7 +647,8 @@ xe_hw_error_source_handler(struct xe_gt *gt,
> > const enum hardware_error hw_err)
> > >  	spin_lock_irqsave(&gt_to_xe(gt)->irq.lock, flags);
> > >  	errsrc = xe_mmio_read32(gt, DEV_ERR_STAT_REG(hw_err).reg);
> > >  	if (unlikely(!errsrc)) {
> > > -		DRM_ERROR("DEV_ERR_STAT_REG_%s blank!\n",
> > hw_err_str);
> > > +		xe_gt_log_driver_error(gt,
> > INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > > +				       "DEV_ERR_STAT_REG_%s blank!\n",
> > hw_err_str);
> > >  		goto out_unlock;
> > >  	}
> > >
> > > @@ -417,8 +656,9 @@ xe_hw_error_source_handler(struct xe_gt *gt,
> > const enum hardware_error hw_err)
> > >  		xe_gt_hw_error_handler(gt, hw_err);
> > >
> > >  	if (errsrc & ~DEV_ERR_STAT_GT_ERROR)
> > > -		DRM_ERROR("non-GT hardware error(s) in
> > DEV_ERR_STAT_REG_%s: 0x%08x\n",
> > > -			  hw_err_str, errsrc & ~DEV_ERR_STAT_GT_ERROR);
> > > +		xe_gt_log_driver_error(gt,
> > INTEL_GT_DRIVER_ERROR_INTERRUPT,
> > > +				       "non-GT hardware error(s) in
> > DEV_ERR_STAT_REG_%s: 0x%08x\n",
> > > +				       hw_err_str, errsrc &
> > ~DEV_ERR_STAT_GT_ERROR);
> > >
> > >  	xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err).reg, errsrc);
> > >
> > > @@ -634,12 +874,44 @@ static void irq_uninstall(struct drm_device *drm,
> > void *arg)
> > >  		pci_disable_msi(pdev);
> > >  }
> > >
> > > +/**
> > > + * process_hw_errors - checks for the occurrence of HW errors
> > > + *
> > > + * This checks for the HW Errors including FATAL error that might
> > > + * have occurred in the previous boot of the driver which will
> > > + * initiate PCIe FLR reset of the device and cause the
> > > + * driver to reload.
> > 
> > Is this saying that there's already been a PCIe FLR and you're trying to read
> > the registers after that reset has happened?  The bspec indicates that these
> > registers have 'DEV' style reset, so they wouldn't be able to preserve their
> > values across a reset.
> Registers preserve the value across reset.
> BSPEC: 50875
> > 
> > > + */
> > > +static void process_hw_errors(struct xe_device *xe) {
> > > +	struct xe_gt *gt0 = xe_device_get_gt(xe, 0);
> > > +	u32 dev_pcieerr_status, master_ctl;
> > > +	struct xe_gt *gt;
> > > +	int i;
> > > +
> > > +	dev_pcieerr_status = xe_mmio_read32(gt0,
> > DEV_PCIEERR_STATUS.reg);
> > > +
> > > +	for_each_gt(gt, xe, i) {
> > > +		if (dev_pcieerr_status & DEV_PCIEERR_IS_FATAL(i))
> > > +			xe_hw_error_source_handler(gt,
> > HARDWARE_ERROR_FATAL);
> > > +
> > > +		master_ctl = xe_mmio_read32(gt,
> > GEN11_GFX_MSTR_IRQ.reg);
> > > +		xe_mmio_write32(gt, GEN11_GFX_MSTR_IRQ.reg,
> > master_ctl);
> > > +		xe_hw_error_irq_handler(gt, master_ctl);
> > > +	}
> > > +	if (dev_pcieerr_status)
> > > +		xe_mmio_write32(gt, DEV_PCIEERR_STATUS.reg,
> > dev_pcieerr_status); }
> > > +
> > >  int xe_irq_install(struct xe_device *xe)  {
> > >  	int irq = to_pci_dev(xe->drm.dev)->irq;
> > >  	irq_handler_t irq_handler;
> > >  	int err;
> > >
> > > +	if (IS_DGFX(xe))
> > > +		process_hw_errors(xe);
> > 
> > Why is this conditional on DGFX?  From what I can see this also applies to
> > integrated platforms like MTL too.
> From RAS perspective report of error is required on DGFX only.
> sysman don't use these counters from IGFX

I don't think we should add artificial limitations to the kernel driver
just because one wrapper library (which probably won't even get used by
most users) has them.  If the hardware can report errors, and if users
can bypass sysman and just use EDAC stuff, then I don't see a reason to
hide the errors on integrated platforms?


Matt

> > 
> > 
> > Matt
> > 
> > > +
> > >  	irq_handler = xe_irq_handler(xe);
> > >  	if (!irq_handler) {
> > >  		drm_err(&xe->drm, "No supported interrupt handler"); diff --
> > git
> > > a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index
> > > 1844cff8fba8..69098194cef8 100644
> > > --- a/drivers/gpu/drm/xe/xe_pci.c
> > > +++ b/drivers/gpu/drm/xe/xe_pci.c
> > > @@ -73,6 +73,7 @@ struct xe_device_desc {
> > >  	bool has_range_tlb_invalidation;
> > >  	bool has_asid;
> > >  	bool has_link_copy_engine;
> > > +	bool has_gt_error_vectors;
> > >  };
> > >
> > >  __diag_push();
> > > @@ -232,6 +233,7 @@ static const struct xe_device_desc pvc_desc = {
> > >  	.supports_usm = true,
> > >  	.has_asid = true,
> > >  	.has_link_copy_engine = true,
> > > +	.has_gt_error_vectors = true,
> > >  };
> > >
> > >  #define MTL_MEDIA_ENGINES \
> > > @@ -418,6 +420,7 @@ static int xe_pci_probe(struct pci_dev *pdev, const
> > struct pci_device_id *ent)
> > >  	xe->info.vm_max_level = desc->vm_max_level;
> > >  	xe->info.supports_usm = desc->supports_usm;
> > >  	xe->info.has_asid = desc->has_asid;
> > > +	xe->info.has_gt_error_vectors = desc->has_gt_error_vectors;
> > >  	xe->info.has_flat_ccs = desc->has_flat_ccs;
> > >  	xe->info.has_4tile = desc->has_4tile;
> > >  	xe->info.has_range_tlb_invalidation =
> > > desc->has_range_tlb_invalidation;
> > > --
> > > 2.25.1
> > >
> > 
> > --
> > Matt Roper
> > Graphics Software Engineer
> > Linux GPU Platform Enablement
> > Intel Corporation

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation


More information about the Intel-xe mailing list