[Intel-xe] [PATCH 2/4] drm/xe/ras: Log the GT hw errors.

Fri May 5 07:24:20 UTC 2023

On 04-05-2023 05:32, Matt Roper wrote:
> On Fri, Apr 28, 2023 at 01:00:04AM -0700, Ghimiray, Himal Prasad wrote:
> ...
>>> All of this new infrastructure seems pretty questionable at the moment.
>>> We're doing extra work to count up errors, but then never doing anything
>>> with the counts.  You mention in the cover letter that these will be exposed
>>> to userspace eventually, but what's the benefit of that?  Which userspace
>>> component is going to actually use this information?  What do you expect
>>> userspace to do if it finds out there's been a fatal or correctable error in
>>> some low-level hardware unit?  Generally userspace shouldn't even need to
>>> care about the really low-level hardware details; if something has truly gone
>>> fatally wrong, it's game over for userspace and it probably doesn't matter
>>> exactly where in the hardware things are actually busted.
>>>
>>> Without some extra justification from the userspace point of view, it feels
>>> like we're just adding a bunch of code that doesn't have a real-world
>>> purpose.
>> The error counters  exposed by KMD will be used by sysman 
>> They will be categorized to specific category of error in sysman:
>>  https://spec.oneapi.io/level-zero/latest/sysman/api.html#ras
> 
> L0 sysman looks sort of like a complicated libdrm replacement.  I.e.,
> it's just a wrapper library over various uapi interfaces, but isn't
> really a "consumer" of that uapi itself.  We need to understand the
> whole top-to-bottom stack to make sure that whatever interfaces and
> representation are selected (both at the Xe and Sysman levels) actually
> makes sense and satisfies the full stack needs.
> 
> Is the actual reporting of these errors going to be done via the
> standard Linux RAS/EDAC interfaces?  I.e., what's documented at
> https://www.kernel.org/doc/html/v6.3/admin-guide/ras.html ?  If so, then
> there's already a bunch of real userspace tools that work with that, so
> that would probably help justify the work here, and make it clear that
> we're not just reinventing the wheel.  If we're not tying into that,
> then we probably need to justify clearly why it can't be used.

I don't think we can use EDAC, IIUC it expects errors to be reported via
MCA(Machine Check Architecute)/MCE and our HW doesn't do that. the
correctable and non fatal errors are reported via MSI and fatal as a
PCI_ERR message.

Thanks,
Aravind.
> 
>>>
>>>> +			    const enum xe_gt_driver_errors error,
>>>> +			    const char *fmt, ...)
>>>> +{
>>>> +	struct va_format vaf;
>>>> +	va_list args;
>>>> +
>>>> +	va_start(args, fmt);
>>>> +	vaf.fmt = fmt;
>>>> +	vaf.va = &args;
>>>> +
>>>> +	BUILD_BUG_ON(ARRAY_SIZE(xe_gt_driver_errors_to_str) !=
>>>> +		     INTEL_GT_DRIVER_ERROR_COUNT);
>>>> +
>>>> +	WARN_ON_ONCE(error >= INTEL_GT_DRIVER_ERROR_COUNT);
>>>> +
>>>> +	gt->errors.driver[error]++;
>>>> +
>>>> +	drm_err_ratelimited(&gt_to_xe(gt)->drm, "GT%u [%s] %pV",
>>>> +			    gt->info.id,
>>>> +			    xe_gt_driver_errors_to_str[error],
>>>> +			    &vaf);
>>>> +	va_end(args);
>>>> +}
>>>> +
>>>>  struct xe_gt *xe_find_full_gt(struct xe_gt *gt)  {
>>>>  	struct xe_gt *search;
>>>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
>>>> b/drivers/gpu/drm/xe/xe_gt_types.h
>>>> index 8f29aba455e0..9580a40c0142 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>>>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>>>> @@ -33,6 +33,43 @@ enum xe_gt_type {
>>>>  typedef unsigned long xe_dss_mask_t[BITS_TO_LONGS(32 *
>>>> XE_MAX_DSS_FUSE_REGS)];  typedef unsigned long
>>>> xe_eu_mask_t[BITS_TO_LONGS(32 * XE_MAX_EU_FUSE_REGS)];
>>>>
>>>> +/* Count of GT Correctable and FATAL HW ERRORS */ enum
>>>> +intel_gt_hw_errors {
>>>> +	INTEL_GT_HW_ERROR_COR_SUBSLICE = 0,
>>>> +	INTEL_GT_HW_ERROR_COR_L3BANK,
>>>> +	INTEL_GT_HW_ERROR_COR_L3_SNG,
>>>> +	INTEL_GT_HW_ERROR_COR_GUC,
>>>> +	INTEL_GT_HW_ERROR_COR_SAMPLER,
>>>> +	INTEL_GT_HW_ERROR_COR_SLM,
>>>> +	INTEL_GT_HW_ERROR_COR_EU_IC,
>>>> +	INTEL_GT_HW_ERROR_COR_EU_GRF,
>>>> +	INTEL_GT_HW_ERROR_FAT_SUBSLICE,
>>>> +	INTEL_GT_HW_ERROR_FAT_L3BANK,
>>>> +	INTEL_GT_HW_ERROR_FAT_ARR_BIST,
>>>> +	INTEL_GT_HW_ERROR_FAT_FPU,
>>>> +	INTEL_GT_HW_ERROR_FAT_L3_DOUB,
>>>> +	INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK,
>>>> +	INTEL_GT_HW_ERROR_FAT_GUC,
>>>> +	INTEL_GT_HW_ERROR_FAT_IDI_PAR,
>>>> +	INTEL_GT_HW_ERROR_FAT_SQIDI,
>>>> +	INTEL_GT_HW_ERROR_FAT_SAMPLER,
>>>> +	INTEL_GT_HW_ERROR_FAT_SLM,
>>>> +	INTEL_GT_HW_ERROR_FAT_EU_IC,
>>>> +	INTEL_GT_HW_ERROR_FAT_EU_GRF,
>>>> +	INTEL_GT_HW_ERROR_FAT_TLB,
>>>> +	INTEL_GT_HW_ERROR_FAT_L3_FABRIC,
>>>> +	INTEL_GT_HW_ERROR_COUNT
>>>> +};
>>>> +
>>>> +enum xe_gt_driver_errors {
>>>> +	INTEL_GT_DRIVER_ERROR_INTERRUPT = 0,
>>>> +	INTEL_GT_DRIVER_ERROR_COUNT
>>>> +};
>>>> +
>>>> +void xe_gt_log_driver_error(struct xe_gt *gt,
>>>> +			    const enum xe_gt_driver_errors error,
>>>> +			    const char *fmt, ...);
>>>> +
>>>>  struct xe_mmio_range {
>>>>  	u32 start;
>>>>  	u32 end;
>>>> @@ -357,6 +394,12 @@ struct xe_gt {
>>>>  	 *    of a steered operation
>>>>  	 */
>>>>  	spinlock_t mcr_lock;
>>>> +
>>>> +	struct intel_hw_errors {
>>>> +		unsigned long hw[INTEL_GT_HW_ERROR_COUNT];
>>>> +		unsigned long driver[INTEL_GT_DRIVER_ERROR_COUNT];
>>>> +	} errors;
>>>> +
>>>>  };
>>>>
>>>>  #endif
>>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>>> index 6b922332bff1..4626f7280aaf 100644
>>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>>> @@ -19,6 +19,7 @@
>>>>  #include "xe_hw_engine.h"
>>>>  #include "xe_mmio.h"
>>>>
>>>> +#define HAS_GT_ERROR_VECTORS(xe) ((xe)->info.has_gt_error_vectors)
>>>>  static void gen3_assert_iir_is_zero(struct xe_gt *gt, i915_reg_t reg)
>>>> {
>>>>  	u32 val = xe_mmio_read32(gt, reg.reg); @@ -359,44 +360,281 @@
>>>> hardware_error_type_to_str(const enum hardware_error hw_err)
>>>>  	}
>>>>  }
>>>>
>>>> +#define xe_gt_hw_err(gt, fmt, ...) \
>>>> +	drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR "GT%d detected "
>>> fmt, \
>>>> +			(gt)->info.id, ##__VA_ARGS__)
>>>
>>> As on the previous patch, it looks like we're printing error-level kernel
>>> messages for correctable errors (i.e., things the hardware caught and fixed
>>> internally like ECC).  Generally those kind of things shouldn't be putting
>>> errors in the kernel log because there's no actual problem from the end user
>>> perspective.
>> Agreed. Should we use warning for correctable errors ?
> 
> Presumably we shouldn't be reporting them in dmesg at all by default.
> It looks like the EDAC subsystem has its own ways of controlling if/when 
> stuff shows up in dmesg.
> 
>>>
>>>> +
>>>>  static void
>>>> -xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error
>>>> hw_err)
>>>> +xe_gt_correctable_hw_error_stats_update(struct xe_gt  *gt, unsigned
>>>> +long errstat)
>>>>  {
>>>> -	const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>>> -	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
>>>> -	u32 errstat;
>>>> +	u32 errbit, cnt;
>>>>
>>>> -	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
>>>> +	if (!errstat && HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
>>>> +		return;
>>>>
>>>> -	errstat = xe_mmio_read32(gt, ERR_STAT_GT_REG(hw_err).reg);
>>>> +	for_each_set_bit(errbit, &errstat, GT_HW_ERROR_MAX_ERR_BITS) {
>>>> +		if (gt->xe->info.platform == XE_PVC && !(REG_BIT(errbit) &
>>> PVC_COR_ERR_MASK)) {
>>>> +			xe_gt_log_driver_error(gt,
>>> INTEL_GT_DRIVER_ERROR_INTERRUPT,
>>>> +					       "UNKNOWN CORRECTABLE
>>> error\n");
>>>> +			continue;
>>>> +		}
>>>>
>>>> -	if (unlikely(!errstat)) {
>>>> -		DRM_ERROR("ERR_STAT_GT_REG_%s blank!\n",
>>> hw_err_str);
>>>> -		return;
>>>> +	switch (errbit) {
>>>> +	case L3_SNG_COR_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_L3_SNG]++;
>>>> +		xe_gt_hw_err(gt, "L3 SINGLE CORRECTABLE error\n");
>>>> +		break;
>>>> +	case GUC_COR_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_GUC]++;
>>>> +		xe_gt_hw_err(gt, "SINGLE BIT GUC SRAM CORRECTABLE
>>> error\n");
>>>> +		break;
>>>> +	case SAMPLER_COR_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_SAMPLER]++;
>>>> +		xe_gt_hw_err(gt, "SINGLE BIT SAMPLER CORRECTABLE
>>> error\n");
>>>> +		break;
>>>> +	case SLM_COR_ERR:
>>>> +		cnt = xe_mmio_read32(gt,
>>> SLM_ECC_ERROR_CNTR(HARDWARE_ERROR_CORRECTABLE).reg);
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_SLM] = cnt;
>>>> +		xe_gt_hw_err(gt, "%u SINGLE BIT SLM CORRECTABLE
>>> error\n", cnt);
>>>> +		break;
>>>> +	case EU_IC_COR_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_EU_IC]++;
>>>> +		xe_gt_hw_err(gt, "SINGLE BIT EU IC CORRECTABLE error\n");
>>>> +		break;
>>>> +	case EU_GRF_COR_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_COR_EU_GRF]++;
>>>> +		xe_gt_hw_err(gt, "SINGLE BIT EU GRF CORRECTABLE
>>> error\n");
>>>> +		break;
>>>> +	default:
>>>> +		xe_gt_log_driver_error(gt,
>>> INTEL_GT_DRIVER_ERROR_INTERRUPT, "UNKNOWN CORRECTABLE error\n");
>>>> +		break;
>>>> +	}
>>>>  	}
>>>> +}
>>>>
>>>> -	/*
>>>> -	 * TODO: The GT Non Fatal Error Status Register
>>>> -	 * only has reserved bitfields defined.
>>>> -	 * Remove once there is something to service.
>>>> -	 */
>>>> -	if (hw_err == HARDWARE_ERROR_NONFATAL) {
>>>> -		DRM_ERROR("detected Non-Fatal error\n");
>>>> -		xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg,
>>> errstat);
>>>> +static void xe_gt_fatal_hw_error_stats_update(struct xe_gt *gt,
>>>> +unsigned long errstat) {
>>>> +	u32 errbit, cnt;
>>>> +
>>>> +	if (!errstat && HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
>>>>  		return;
>>>> +
>>>> +	for_each_set_bit(errbit, &errstat, GT_HW_ERROR_MAX_ERR_BITS) {
>>>> +		if (gt->xe->info.platform == XE_PVC && !(REG_BIT(errbit) &
>>> PVC_FAT_ERR_MASK)) {
>>>> +			xe_gt_log_driver_error(gt,
>>> INTEL_GT_DRIVER_ERROR_INTERRUPT,
>>>> +					       "UNKNOWN FATAL error\n");
>>>> +			continue;
>>>> +		}
>>>> +
>>>> +	switch (errbit) {
>>>> +	case ARRAY_BIST_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_ARR_BIST]++;
>>>> +		xe_gt_hw_err(gt, "Array BIST FATAL error\n");
>>>> +		break;
>>>> +	case FPU_UNCORR_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_FPU]++;
>>>> +		xe_gt_hw_err(gt, "FPU FATAL error\n");
>>>> +		break;
>>>> +	case L3_DOUBLE_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_L3_DOUB]++;
>>>> +		xe_gt_hw_err(gt, "L3 Double FATAL error\n");
>>>> +		break;
>>>> +	case L3_ECC_CHK_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK]++;
>>>> +		xe_gt_hw_err(gt, "L3 ECC Checker FATAL error\n");
>>>> +		break;
>>>> +	case GUC_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_GUC]++;
>>>> +		xe_gt_hw_err(gt, "GUC SRAM FATAL error\n");
>>>> +		break;
>>>> +	case IDI_PAR_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_IDI_PAR]++;
>>>> +		xe_gt_hw_err(gt, "IDI PARITY FATAL error\n");
>>>> +		break;
>>>> +	case SQIDI_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SQIDI]++;
>>>> +		xe_gt_hw_err(gt, "SQIDI FATAL error\n");
>>>> +		break;
>>>> +	case SAMPLER_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SAMPLER]++;
>>>> +		xe_gt_hw_err(gt, "SAMPLER FATAL error\n");
>>>> +		break;
>>>> +	case SLM_FAT_ERR:
>>>> +		cnt = xe_mmio_read32(gt,
>>> SLM_ECC_ERROR_CNTR(HARDWARE_ERROR_FATAL).reg);
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_SLM] = cnt;
>>>> +		xe_gt_hw_err(gt, "%u SLM FATAL error\n", cnt);
>>>> +		break;
>>>> +	case EU_IC_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_EU_IC]++;
>>>> +		xe_gt_hw_err(gt, "EU IC FATAL error\n");
>>>> +		break;
>>>> +	case EU_GRF_FAT_ERR:
>>>> +		gt->errors.hw[INTEL_GT_HW_ERROR_FAT_EU_GRF]++;
>>>> +		xe_gt_hw_err(gt, "EU GRF FATAL error\n");
>>>> +		break;
>>>> +	default:
>>>> +		xe_gt_log_driver_error(gt,
>>> INTEL_GT_DRIVER_ERROR_INTERRUPT,
>>>> +				       "UNKNOWN FATAL error\n");
>>>> +		break;
>>>> +	}
>>>>  	}
>>>> +}
>>>>
>>>> -	/*
>>>> -	 * TODO: The remaining GT errors don't have a
>>>> -	 * need for targeted logging at the moment. We
>>>> -	 * still want to log detection of these errors, but
>>>> -	 * let's aggregate them until someone has a need for them.
>>>> -	 */
>>>> -	if (errstat & other_errors)
>>>> -		DRM_ERROR("detected hardware error(s) in
>>> ERR_STAT_GT_REG_%s: 0x%08x\n",
>>>> -			  hw_err_str, errstat & other_errors);
>>>> +static void
>>>> +xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error
>>>> +hw_err) {
>>>> +	const char *hw_err_str = hardware_error_type_to_str(hw_err);
>>>> +	unsigned long errstat;
>>>> +
>>>> +	lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
>>>>
>>>> -	xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg, errstat);
>>>> +	if (!HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
>>>> +		errstat = xe_mmio_read32(gt,
>>> ERR_STAT_GT_REG(hw_err).reg);
>>>> +		if (unlikely(!errstat)) {
>>>> +			xe_gt_log_driver_error(gt,
>>> INTEL_GT_DRIVER_ERROR_INTERRUPT,
>>>> +					       "ERR_STAT_GT_REG_%s
>>> blank!\n", hw_err_str);
>>>> +			return;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	switch (hw_err) {
>>>> +	case HARDWARE_ERROR_CORRECTABLE:
>>>> +		if (HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
>>>> +			bool error = false;
>>>> +			int i;
>>>> +
>>>> +			errstat = 0;
>>>> +			for (i = 0; i < ERR_STAT_GT_COR_VCTR_LEN; i++) {
>>>> +				u32 err_type =
>>> ERR_STAT_GT_COR_VCTR_LEN;
>>>> +				unsigned long vctr;
>>>> +				const char *name;
>>>> +
>>>> +				vctr = xe_mmio_read32(gt,
>>> ERR_STAT_GT_COR_VCTR_REG(i).reg);
>>>> +				if (!vctr)
>>>> +					continue;
>>>> +
>>>> +				switch (i) {
>>>> +				case ERR_STAT_GT_VCTR0:
>>>> +				case ERR_STAT_GT_VCTR1:
>>>> +					err_type =
>>> INTEL_GT_HW_ERROR_COR_SUBSLICE;
>>>> +					gt->errors.hw[err_type] +=
>>> hweight32(vctr);
>>>> +					name = "SUBSLICE";
>>>> +
>>>> +					/* Avoid second read/write to error
>>> status register*/
>>>> +					if (errstat)
>>>> +						break;
>>>> +
>>>> +					errstat = xe_mmio_read32(gt,
>>> ERR_STAT_GT_REG(hw_err).reg);
>>>> +					xe_gt_hw_err(gt,
>>> "ERR_STAT_GT_CORRECTABLE:0x%08lx\n",
>>>> +						     errstat);
>>>> +
>>> 	xe_gt_correctable_hw_error_stats_update(gt, errstat);
>>>> +					if (errstat)
>>>> +						xe_mmio_write32(gt,
>>> ERR_STAT_GT_REG(hw_err).reg,
>>>> +								errstat);
>>>> +					break;
>>>> +
>>>> +				case ERR_STAT_GT_VCTR2:
>>>> +				case ERR_STAT_GT_VCTR3:
>>>> +					err_type =
>>> INTEL_GT_HW_ERROR_COR_L3BANK;
>>>> +					gt->errors.hw[err_type] +=
>>> hweight32(vctr);
>>>> +					name = "L3 BANK";
>>>> +					break;
>>>> +				default:
>>>> +					name = "UNKNOWN";
>>>> +					break;
>>>> +				}
>>>> +				xe_mmio_write32(gt,
>>> ERR_STAT_GT_COR_VCTR_REG(i).reg, vctr);
>>>> +				xe_gt_hw_err(gt, "%s CORRECTABLE error,
>>> ERR_VECT_GT_CORRECTABLE_%d:0x%08lx\n",
>>>> +					     name, i, vctr);
>>>> +				error = true;
>>>> +			}
>>>> +
>>>> +			if (!error)
>>>> +				xe_gt_hw_err(gt, "UNKNOWN CORRECTABLE
>>> error\n");
>>>> +		} else {
>>>> +			xe_gt_correctable_hw_error_stats_update(gt,
>>> errstat);
>>>> +			xe_gt_hw_err(gt,
>>> "ERR_STAT_GT_CORRECTABLE:0x%08lx\n", errstat);
>>>> +		}
>>>> +		break;
>>>> +	case HARDWARE_ERROR_NONFATAL:
>>>> +	      /*
>>>> +	       * TODO: The GT Non Fatal Error Status Register
>>>> +	       * only has reserved bitfields defined.
>>>> +	       * Remove once there is something to service.
>>>> +	       */
>>>> +		drm_err_ratelimited(&gt_to_xe(gt)->drm, HW_ERR "detected
>>> Non-Fatal error\n");
>>>> +		break;
>>>> +	case HARDWARE_ERROR_FATAL:
>>>> +		if (HAS_GT_ERROR_VECTORS(gt_to_xe(gt))) {
>>>> +			bool error = false;
>>>> +			int i;
>>>> +
>>>> +			errstat = 0;
>>>> +			for (i = 0; i < ERR_STAT_GT_FATAL_VCTR_LEN; i++) {
>>>> +				u32 err_type =
>>> ERR_STAT_GT_FATAL_VCTR_LEN;
>>>> +				unsigned long vctr;
>>>> +				const char *name;
>>>> +
>>>> +				vctr = xe_mmio_read32(gt,
>>> ERR_STAT_GT_FATAL_VCTR_REG(i).reg);
>>>> +				if (!vctr)
>>>> +					continue;
>>>> +
>>>> +				/* i represents the vector register index */
>>>> +				switch (i) {
>>>> +				case ERR_STAT_GT_VCTR0:
>>>> +				case ERR_STAT_GT_VCTR1:
>>>> +					err_type =
>>> INTEL_GT_HW_ERROR_FAT_SUBSLICE;
>>>> +					gt->errors.hw[err_type] +=
>>> hweight32(vctr);
>>>> +					name = "SUBSLICE";
>>>> +
>>>> +					/*Avoid second read/write to error
>>> status register.*/
>>>> +					if (errstat)
>>>> +						break;
>>>> +
>>>> +					errstat = xe_mmio_read32(gt,
>>> ERR_STAT_GT_REG(hw_err).reg);
>>>> +					xe_gt_hw_err(gt,
>>> "ERR_STAT_GT_FATAL:0x%08lx\n", errstat);
>>>> +
>>> 	xe_gt_fatal_hw_error_stats_update(gt, errstat);
>>>> +					if (errstat)
>>>> +						xe_mmio_write32(gt,
>>> ERR_STAT_GT_REG(hw_err).reg,
>>>> +								errstat);
>>>> +					break;
>>>> +
>>>> +				case ERR_STAT_GT_VCTR2:
>>>> +				case ERR_STAT_GT_VCTR3:
>>>> +					err_type =
>>> INTEL_GT_HW_ERROR_FAT_L3BANK;
>>>> +					gt->errors.hw[err_type] +=
>>> hweight32(vctr);
>>>> +					name = "L3 BANK";
>>>> +					break;
>>>> +				case ERR_STAT_GT_VCTR6:
>>>> +					gt-
>>>> errors.hw[INTEL_GT_HW_ERROR_FAT_TLB] += hweight16(vctr);
>>>> +					name = "TLB";
>>>> +					break;
>>>> +				case ERR_STAT_GT_VCTR7:
>>>> +					gt-
>>>> errors.hw[INTEL_GT_HW_ERROR_FAT_L3_FABRIC] += hweight8(vctr);
>>>> +					name = "L3 FABRIC";
>>>> +					break;
>>>> +				default:
>>>> +					name = "UNKNOWN";
>>>> +					break;
>>>> +				}
>>>> +				xe_mmio_write32(gt,
>>> ERR_STAT_GT_FATAL_VCTR_REG(i).reg, vctr);
>>>> +				xe_gt_hw_err(gt, "%s FATAL error,
>>> ERR_VECT_GT_FATAL_%d:0x%08lx\n",
>>>> +					     name, i, vctr);
>>>> +				error = true;
>>>> +			}
>>>> +			if (!error)
>>>> +				xe_gt_hw_err(gt, "UNKNOWN FATAL
>>> error\n");
>>>> +		} else {
>>>> +			xe_gt_fatal_hw_error_stats_update(gt, errstat);
>>>> +			xe_gt_hw_err(gt, "ERR_STAT_GT_FATAL:0x%08lx\n",
>>> errstat);
>>>> +		}
>>>> +		break;
>>>> +	default:
>>>> +		break;
>>>> +	}
>>>> +
>>>> +	if (!HAS_GT_ERROR_VECTORS(gt_to_xe(gt)))
>>>> +		xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg,
>>> errstat);
>>>>  }
>>>>
>>>>  static void
>>>> @@ -409,7 +647,8 @@ xe_hw_error_source_handler(struct xe_gt *gt,
>>> const enum hardware_error hw_err)
>>>>  	spin_lock_irqsave(&gt_to_xe(gt)->irq.lock, flags);
>>>>  	errsrc = xe_mmio_read32(gt, DEV_ERR_STAT_REG(hw_err).reg);
>>>>  	if (unlikely(!errsrc)) {
>>>> -		DRM_ERROR("DEV_ERR_STAT_REG_%s blank!\n",
>>> hw_err_str);
>>>> +		xe_gt_log_driver_error(gt,
>>> INTEL_GT_DRIVER_ERROR_INTERRUPT,
>>>> +				       "DEV_ERR_STAT_REG_%s blank!\n",
>>> hw_err_str);
>>>>  		goto out_unlock;
>>>>  	}
>>>>
>>>> @@ -417,8 +656,9 @@ xe_hw_error_source_handler(struct xe_gt *gt,
>>> const enum hardware_error hw_err)
>>>>  		xe_gt_hw_error_handler(gt, hw_err);
>>>>
>>>>  	if (errsrc & ~DEV_ERR_STAT_GT_ERROR)
>>>> -		DRM_ERROR("non-GT hardware error(s) in
>>> DEV_ERR_STAT_REG_%s: 0x%08x\n",
>>>> -			  hw_err_str, errsrc & ~DEV_ERR_STAT_GT_ERROR);
>>>> +		xe_gt_log_driver_error(gt,
>>> INTEL_GT_DRIVER_ERROR_INTERRUPT,
>>>> +				       "non-GT hardware error(s) in
>>> DEV_ERR_STAT_REG_%s: 0x%08x\n",
>>>> +				       hw_err_str, errsrc &
>>> ~DEV_ERR_STAT_GT_ERROR);
>>>>
>>>>  	xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err).reg, errsrc);
>>>>
>>>> @@ -634,12 +874,44 @@ static void irq_uninstall(struct drm_device *drm,
>>> void *arg)
>>>>  		pci_disable_msi(pdev);
>>>>  }
>>>>
>>>> +/**
>>>> + * process_hw_errors - checks for the occurrence of HW errors
>>>> + *
>>>> + * This checks for the HW Errors including FATAL error that might
>>>> + * have occurred in the previous boot of the driver which will
>>>> + * initiate PCIe FLR reset of the device and cause the
>>>> + * driver to reload.
>>>
>>> Is this saying that there's already been a PCIe FLR and you're trying to read
>>> the registers after that reset has happened?  The bspec indicates that these
>>> registers have 'DEV' style reset, so they wouldn't be able to preserve their
>>> values across a reset.
>> Registers preserve the value across reset.
>> BSPEC: 50875
>>>
>>>> + */
>>>> +static void process_hw_errors(struct xe_device *xe) {
>>>> +	struct xe_gt *gt0 = xe_device_get_gt(xe, 0);
>>>> +	u32 dev_pcieerr_status, master_ctl;
>>>> +	struct xe_gt *gt;
>>>> +	int i;
>>>> +
>>>> +	dev_pcieerr_status = xe_mmio_read32(gt0,
>>> DEV_PCIEERR_STATUS.reg);
>>>> +
>>>> +	for_each_gt(gt, xe, i) {
>>>> +		if (dev_pcieerr_status & DEV_PCIEERR_IS_FATAL(i))
>>>> +			xe_hw_error_source_handler(gt,
>>> HARDWARE_ERROR_FATAL);
>>>> +
>>>> +		master_ctl = xe_mmio_read32(gt,
>>> GEN11_GFX_MSTR_IRQ.reg);
>>>> +		xe_mmio_write32(gt, GEN11_GFX_MSTR_IRQ.reg,
>>> master_ctl);
>>>> +		xe_hw_error_irq_handler(gt, master_ctl);
>>>> +	}
>>>> +	if (dev_pcieerr_status)
>>>> +		xe_mmio_write32(gt, DEV_PCIEERR_STATUS.reg,
>>> dev_pcieerr_status); }
>>>> +
>>>>  int xe_irq_install(struct xe_device *xe)  {
>>>>  	int irq = to_pci_dev(xe->drm.dev)->irq;
>>>>  	irq_handler_t irq_handler;
>>>>  	int err;
>>>>
>>>> +	if (IS_DGFX(xe))
>>>> +		process_hw_errors(xe);
>>>
>>> Why is this conditional on DGFX?  From what I can see this also applies to
>>> integrated platforms like MTL too.
>> From RAS perspective report of error is required on DGFX only.
>> sysman don't use these counters from IGFX
> 
> I don't think we should add artificial limitations to the kernel driver
> just because one wrapper library (which probably won't even get used by
> most users) has them.  If the hardware can report errors, and if users
> can bypass sysman and just use EDAC stuff, then I don't see a reason to
> hide the errors on integrated platforms?
> 
> 
> Matt
> 
>>>
>>>
>>> Matt
>>>
>>>> +
>>>>  	irq_handler = xe_irq_handler(xe);
>>>>  	if (!irq_handler) {
>>>>  		drm_err(&xe->drm, "No supported interrupt handler"); diff --
>>> git
>>>> a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index
>>>> 1844cff8fba8..69098194cef8 100644
>>>> --- a/drivers/gpu/drm/xe/xe_pci.c
>>>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>>>> @@ -73,6 +73,7 @@ struct xe_device_desc {
>>>>  	bool has_range_tlb_invalidation;
>>>>  	bool has_asid;
>>>>  	bool has_link_copy_engine;
>>>> +	bool has_gt_error_vectors;
>>>>  };
>>>>
>>>>  __diag_push();
>>>> @@ -232,6 +233,7 @@ static const struct xe_device_desc pvc_desc = {
>>>>  	.supports_usm = true,
>>>>  	.has_asid = true,
>>>>  	.has_link_copy_engine = true,
>>>> +	.has_gt_error_vectors = true,
>>>>  };
>>>>
>>>>  #define MTL_MEDIA_ENGINES \
>>>> @@ -418,6 +420,7 @@ static int xe_pci_probe(struct pci_dev *pdev, const
>>> struct pci_device_id *ent)
>>>>  	xe->info.vm_max_level = desc->vm_max_level;
>>>>  	xe->info.supports_usm = desc->supports_usm;
>>>>  	xe->info.has_asid = desc->has_asid;
>>>> +	xe->info.has_gt_error_vectors = desc->has_gt_error_vectors;
>>>>  	xe->info.has_flat_ccs = desc->has_flat_ccs;
>>>>  	xe->info.has_4tile = desc->has_4tile;
>>>>  	xe->info.has_range_tlb_invalidation =
>>>> desc->has_range_tlb_invalidation;
>>>> --
>>>> 2.25.1
>>>>
>>>
>>> --
>>> Matt Roper
>>> Graphics Software Engineer
>>> Linux GPU Platform Enablement
>>> Intel Corporation
>