[Intel-xe] [PATCH V11 1/1] drm/xe: Introduce and update counter for low level driver errors

Wed Nov 22 18:28:36 UTC 2023

On 20.11.2023 13:46, Matthew Brost wrote:
> On Tue, Oct 31, 2023 at 04:05:25PM +0530, Tejas Upadhyay wrote:
>> Introduce low level driver error counters and increment them on
>> each occurrence. The focus is on errors that do not functionally
>> affect the system and might otherwise go unnoticed and cause
>> power/performance regressions, so checking the error
>> counters should help.
>>
>> Importantly the intention is not to go adding new error checks,
>> but to make sure the existing important error conditions are
>> propagated in terms of counter under respective categories like
>> below :
>> - GT
>>   - GUC COMMUNICATION
>>   - ENGINE OTHER
>>   - GT OTHER
>>
>> - Tile
>>   - GTT
>>   - INTERRUPT
>>
>> Currently this is just a counting of errors, later these
>> counters will be reported through netlink interface when it is
>> implemented and ready.
>>
>> V11:
>>   - Unify tlb invalidation timeout errs - Michal
>>   - Improve kernel doc comments - Michal
>>   - Improve logging output message - Michal
>> V10:
>>   - Report and count errors from common place i.e caller - Michal
>>   - Fixed some minor nits - Michal
>> V9:
>>   - Make one patch for API and counter update - Michal
>>   - Remove counter from places where driver load will fail - Michal
>>   - Remove extra \n from logging
>>   - Improve commit message - Aravind/Michal
>> V8:
>>   - Correct missed ret value handling
>> V7:
>>   - removed double counting of err - Michal
>> V6:
>>   - move drm_err to gt and tile specific err API - Aravind
>>   - Use GTT naming instead of GGTT - Aravind/Niranjana
>> V5:
>>   - Dump err_type in string format
>> V4:
>>   - dump err_type in drm_err log - Himal
>> V2:
>>   - Use modified APIs
>>
>> Signed-off-by: Tejas Upadhyay <tejas.upadhyay at intel.com>
>> ---
>>  drivers/gpu/drm/xe/xe_device_types.h        | 16 ++++++++
>>  drivers/gpu/drm/xe/xe_gt.c                  | 41 ++++++++++++++++++++
>>  drivers/gpu/drm/xe/xe_gt.h                  |  4 ++
>>  drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 13 ++++---
>>  drivers/gpu/drm/xe/xe_gt_types.h            | 18 +++++++++
>>  drivers/gpu/drm/xe/xe_guc.c                 | 16 +++++++-
>>  drivers/gpu/drm/xe/xe_guc_ct.c              | 43 ++++++++++++++++++---
>>  drivers/gpu/drm/xe/xe_irq.c                 |  6 ++-
>>  drivers/gpu/drm/xe/xe_reg_sr.c              | 22 +++++------
>>  drivers/gpu/drm/xe/xe_tile.c                | 41 ++++++++++++++++++++
>>  drivers/gpu/drm/xe/xe_tile.h                |  3 ++
>>  11 files changed, 198 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> index cb537cac1ef9..0626640f06d9 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -61,6 +61,19 @@ struct xe_pat_ops;
>>  		 const struct xe_tile * : (const struct xe_device *)((tile__)->xe),	\
>>  		 struct xe_tile * : (tile__)->xe)
>>  
>> +/**
>> + * enum xe_tile_drv_err_type - Types of tile level errors
>> + * @XE_TILE_DRV_ERR_GTT: Error type for all PPGTT and GTT errors
>> + * @XE_TILE_DRV_ERR_INTR: Interrupt errors
>> + * @__XE_TILE_DRV_ERR_MAX: Number of defined error types, keep this last
>> + */
>> +enum xe_tile_drv_err_type {
>> +	XE_TILE_DRV_ERR_GTT,
>> +	XE_TILE_DRV_ERR_INTR,
>> +	/* private: number of defined error types, keep this last */
>> +	__XE_TILE_DRV_ERR_MAX
>> +};
>> +
>>  /**
>>   * struct xe_mem_region - memory region structure
>>   * This is used to describe a memory region in xe
>> @@ -190,6 +203,9 @@ struct xe_tile {
>>  
>>  	/** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>>  	struct kobject *sysfs;
>> +
>> +	/** @drv_err_cnt: driver error counter for this tile */
>> +	u32 drv_err_cnt[__XE_TILE_DRV_ERR_MAX];
>>  };
>>  
>>  /**
>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>> index d380f67b3365..30a4b837f01a 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.c
>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>> @@ -49,6 +49,47 @@
>>  #include "xe_wa.h"
>>  #include "xe_wopcm.h"
>>  
>> +static const char *const xe_gt_drv_err_to_str[] = {
>> +	[XE_GT_DRV_ERR_GUC_COMM] = "GUC COMMUNICATION",
>> +	[XE_GT_DRV_ERR_ENGINE] = "ENGINE OTHER",
>> +	[XE_GT_DRV_ERR_OTHERS] = "GT OTHER"
>> +};
>> +
>> +/**
>> + * xe_gt_report_driver_error - Count driver error for GT
>> + * @gt: GT to count error for
>> + * @err: enum error type
>> + * @fmt: debug message format to print error
>> + * @...: variable args to print error
>> + *
>> + * Increment the driver error counter in respective error
>> + * category for this GT.
>> + *
>> + * Return: void.
>> + */
>> +void xe_gt_report_driver_error(struct xe_gt *gt,
>> +			       const enum xe_gt_drv_err_type err,
>> +			       const char *fmt, ...)
>> +{
>> +	struct va_format vaf;
>> +	va_list args;
>> +
>> +	BUILD_BUG_ON(ARRAY_SIZE(xe_gt_drv_err_to_str) !=
>> +		     __XE_GT_DRV_ERR_MAX);
>> +
>> +	xe_gt_assert(gt, err >= 0);
>> +	xe_gt_assert(gt, err < __XE_GT_DRV_ERR_MAX);
>> +	WRITE_ONCE(gt->drv_err_cnt[err],
>> +		   READ_ONCE(gt->drv_err_cnt[err]) + 1);
>> +
>> +	va_start(args, fmt);
>> +	vaf.fmt = fmt;
>> +	vaf.va = &args;
>> +
>> +	xe_gt_err(gt, "[%s] %pV\n", xe_gt_drv_err_to_str[err], &vaf);
>> +	va_end(args);
>> +}
>> +
>>  struct xe_gt *xe_gt_alloc(struct xe_tile *tile)
>>  {
>>  	struct xe_gt *gt;
>> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
>> index caded203a8a0..efd83707b367 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.h
>> +++ b/drivers/gpu/drm/xe/xe_gt.h
>> @@ -67,4 +67,8 @@ static inline bool xe_gt_is_usm_hwe(struct xe_gt *gt, struct xe_hw_engine *hwe)
>>  		hwe->instance == gt->usm.reserved_bcs_instance;
>>  }
>>  
>> +void xe_gt_report_driver_error(struct xe_gt *gt,
>> +			       const enum xe_gt_drv_err_type err,
>> +			       const char *fmt, ...);
>> +
>>  #endif
>> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>> index bd6005b9d498..8db4018a2c3a 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>> @@ -9,6 +9,7 @@
>>  #include "xe_gt.h"
>>  #include "xe_guc.h"
>>  #include "xe_guc_ct.h"
>> +#include "xe_tile.h"
>>  #include "xe_trace.h"
>>  
>>  #define TLB_TIMEOUT	(HZ / 4)
>> @@ -35,8 +36,10 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
>>  			break;
>>  
>>  		trace_xe_gt_tlb_invalidation_fence_timeout(fence);
>> -		drm_err(&gt_to_xe(gt)->drm, "gt%d: TLB invalidation fence timeout, seqno=%d recv=%d",
>> -			gt->info.id, fence->seqno, gt->tlb_invalidation.seqno_recv);
>> +		xe_tile_report_driver_error(gt_to_tile(gt), XE_TILE_DRV_ERR_GTT,
>> +					    "GT%u: TLB invalidation time'd out, seqno=%d recv=%d",
>> +					    gt->info.id, fence->seqno,
>> +					    gt->tlb_invalidation.seqno_recv);
>>  
>>  		list_del(&fence->link);
>>  		fence->base.error = -ETIME;
>> @@ -317,7 +320,6 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
>>   */
>>  int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)
>>  {
>> -	struct xe_device *xe = gt_to_xe(gt);
>>  	struct xe_guc *guc = &gt->uc.guc;
>>  	int ret;
>>  
>> @@ -329,8 +331,9 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)
>>  				 tlb_invalidation_seqno_past(gt, seqno),
>>  				 TLB_TIMEOUT);
>>  	if (!ret) {
>> -		drm_err(&xe->drm, "gt%d: TLB invalidation time'd out, seqno=%d, recv=%d\n",
>> -			gt->info.id, seqno, gt->tlb_invalidation.seqno_recv);
>> +		xe_tile_report_driver_error(gt_to_tile(gt), XE_TILE_DRV_ERR_GTT,
>> +					    "GT%u: TLB invalidation time'd out, seqno=%d, recv=%d",
>> +					    gt->info.id, seqno, gt->tlb_invalidation.seqno_recv);
>>  		return -ETIME;
>>  	}
>>  
>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
>> index d3f2793684e2..5bb7a2c6ecc2 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>> @@ -24,6 +24,21 @@ enum xe_gt_type {
>>  	XE_GT_TYPE_MEDIA,
>>  };
>>  
>> +/**
>> + * enum xe_gt_drv_err_type - Types of GT level errors
>> + * @XE_GT_DRV_ERR_GUC_COMM: Driver guc communication errors
>> + * @XE_GT_DRV_ERR_ENGINE: Engine execution errors
>> + * @XE_GT_DRV_ERR_OTHERS: Other errors like error during save/restore registers
>> + * @__XE_GT_DRV_ERR_MAX: Number of defined error types, keep this last
>> + */
>> +enum xe_gt_drv_err_type {
>> +	XE_GT_DRV_ERR_GUC_COMM,
>> +	XE_GT_DRV_ERR_ENGINE,
>> +	XE_GT_DRV_ERR_OTHERS,
>> +	/* private: number of defined error types, keep this last */
>> +	__XE_GT_DRV_ERR_MAX
>> +};
>> +
>>  #define XE_MAX_DSS_FUSE_REGS	3
>>  #define XE_MAX_EU_FUSE_REGS	1
>>  
>> @@ -347,6 +362,9 @@ struct xe_gt {
>>  		/** @oob: bitmap with active OOB workaroudns */
>>  		unsigned long *oob;
>>  	} wa_active;
>> +
>> +	/** @drv_err_cnt: driver error counter for this GT */
>> +	u32 drv_err_cnt[__XE_GT_DRV_ERR_MAX];
>>  };
>>  
>>  #endif
>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>> index 84f0b5488783..d25fb605e9dc 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.c
>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>> @@ -619,8 +619,8 @@ int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr)
>>  	return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
>>  }
>>  
>> -int xe_guc_mmio_send_recv(struct xe_guc *guc, const u32 *request,
>> -			  u32 len, u32 *response_buf)
>> +static int __xe_guc_mmio_send_recv(struct xe_guc *guc, const u32 *request,
>> +				   u32 len, u32 *response_buf)
>>  {
>>  	struct xe_device *xe = guc_to_xe(guc);
>>  	struct xe_gt *gt = guc_to_gt(guc);
>> @@ -724,6 +724,18 @@ int xe_guc_mmio_send_recv(struct xe_guc *guc, const u32 *request,
>>  	return FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, header);
>>  }
>>  
>> +int xe_guc_mmio_send_recv(struct xe_guc *guc, const u32 *request,
>> +			  u32 len, u32 *response_buf)
>> +{
>> +	int ret = __xe_guc_mmio_send_recv(guc, request, len, response_buf);
>> +
>> +	if (ret < 0)
>> +		xe_gt_report_driver_error(guc_to_gt(guc), XE_GT_DRV_ERR_GUC_COMM,
>> +					  "MMIO send failed (%pe)",
>> +					  ERR_PTR(ret));
>> +	return ret;
>> +}
>> +
>>  int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len)
>>  {
>>  	return xe_guc_mmio_send_recv(guc, request, len, NULL);
>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
>> index a84e111bb36a..ce8ba923a4cd 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>> @@ -547,9 +547,9 @@ static void kick_reset(struct xe_guc_ct *ct)
>>  
>>  static int dequeue_one_g2h(struct xe_guc_ct *ct);
>>  
>> -static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
>> -			      u32 g2h_len, u32 num_g2h,
>> -			      struct g2h_fence *g2h_fence)
>> +static int _guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
>> +			       u32 g2h_len, u32 num_g2h,
>> +			       struct g2h_fence *g2h_fence)
>>  {
>>  	struct drm_device *drm = &ct_to_xe(ct)->drm;
>>  	struct drm_printer p = drm_info_printer(drm->dev);
>> @@ -621,6 +621,20 @@ static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
>>  	return -EDEADLK;
>>  }
>>  
>> +static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
>> +			      u32 g2h_len, u32 num_g2h,
>> +			      struct g2h_fence *g2h_fence)
>> +{
>> +	int ret = _guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, g2h_fence);
>> +
>> +	if (ret < 0)
> 
> I think this can fail in a way (the CT is disabled and returns
> -ENODEV) that is harmless during a GT reset. This might make CI unhappy
> if this case pops up. If the device is suspended and the CT is disabled,
> this is likely an error.

But shouldn't we still report this -ENODEV error regardless of CI
happiness?

Using the CTB during a reset (even if we recover) is a bad thing, no?

> 
>> +		xe_gt_report_driver_error(ct_to_gt(ct),
>> +					  XE_GT_DRV_ERR_GUC_COMM,
>> +					  "CTB send failed (%pe)",
>> +					  ERR_PTR(ret));
>> +	return ret;
>> +}
>> +
>>  static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
>>  		       u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence)
>>  {
>> @@ -690,8 +704,8 @@ static bool retry_failure(struct xe_guc_ct *ct, int ret)
>>  	return true;
>>  }
>>  
>> -static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
>> -			    u32 *response_buffer, bool no_fail)
>> +static int __guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
>> +			      u32 *response_buffer, bool no_fail)
>>  {
>>  	struct xe_device *xe = ct_to_xe(ct);
>>  	struct g2h_fence g2h_fence;
>> @@ -755,6 +769,19 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
>>  	return ret > 0 ? 0 : ret;
>>  }
>>  
>> +static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
>> +			    u32 *response_buffer, bool no_fail)
>> +{
>> +	int ret = __guc_ct_send_recv(ct, action, len, response_buffer, no_fail);
>> +
>> +	if (ret < 0)
> 
> This is actually probably an error for the case mentioned above. I'd
> have to double check the use case, but my feeling is that the caller
> should ensure a GT reset isn't occurring if it needs to receive some
> information, or at the very least retry after the reset is complete.

We might not have any ready-to-use mechanism to tell whether a reset is
ongoing, and even if we had one it would likely be useless, as a reset
could happen at any time after the caller checks for it.

Even if a caller retries, IMO we should report the error.

And likely we should introduce different error codes to distinguish
between: disabled/permanent vs. disabled/in_recovery/reset.

> 
>> +		xe_gt_report_driver_error(ct_to_gt(ct),
>> +					  XE_GT_DRV_ERR_GUC_COMM,
>> +					  "CTB send failed (%pe)",
>> +					  ERR_PTR(ret));
>> +	return ret;
>> +}
>> +
>>  int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
>>  			u32 *response_buffer)
>>  {
>> @@ -1133,6 +1160,12 @@ static void g2h_worker_func(struct work_struct *w)
>>  		ret = dequeue_one_g2h(ct);
>>  		mutex_unlock(&ct->lock);
>>  
>> +		if (ret < 0)
> 
> Same as the first comment: this is not really an error for -ENODEV if a
> GT reset is occurring, but it is likely an error for the suspend case.

So should the CTB be aware of the ongoing reset and return a different
code (-ECANCELED instead of -ENODEV) to allow filtering, or just rely on
its own "enabled" flag and always report an error?

Michal

> 
> Matt 
> 
>> +			xe_gt_report_driver_error(ct_to_gt(ct),
>> +						  XE_GT_DRV_ERR_GUC_COMM,
>> +						  "CTB receive failed (%pe)",
>> +						  ERR_PTR(ret));
>> +
>>  		if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) {
>>  			struct drm_device *drm = &ct_to_xe(ct)->drm;
>>  			struct drm_printer p = drm_info_printer(drm->dev);
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>> index 21d5273d7e61..29a22cb64825 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -18,6 +18,7 @@
>>  #include "xe_guc.h"
>>  #include "xe_hw_engine.h"
>>  #include "xe_mmio.h"
>> +#include "xe_tile.h"
>>  
>>  /*
>>   * Interrupt registers for a unit are always consecutive and ordered
>> @@ -227,8 +228,9 @@ gt_engine_identity(struct xe_device *xe,
>>  		 !time_after32(local_clock() >> 10, timeout_ts));
>>  
>>  	if (unlikely(!(ident & INTR_DATA_VALID))) {
>> -		drm_err(&xe->drm, "INTR_IDENTITY_REG%u:%u 0x%08x not valid!\n",
>> -			bank, bit, ident);
>> +		xe_tile_report_driver_error(gt_to_tile(mmio), XE_TILE_DRV_ERR_INTR,
>> +					    "INTR_IDENTITY_REG%u:%u 0x%08x not valid!",
>> +					    bank, bit, ident);
>>  		return 0;
>>  	}
>>  
>> diff --git a/drivers/gpu/drm/xe/xe_reg_sr.c b/drivers/gpu/drm/xe/xe_reg_sr.c
>> index 87adefb56024..f2e20e10d927 100644
>> --- a/drivers/gpu/drm/xe/xe_reg_sr.c
>> +++ b/drivers/gpu/drm/xe/xe_reg_sr.c
>> @@ -125,12 +125,12 @@ int xe_reg_sr_add(struct xe_reg_sr *sr,
>>  	return 0;
>>  
>>  fail:
>> -	xe_gt_err(gt,
>> -		  "discarding save-restore reg %04lx (clear: %08x, set: %08x, masked: %s, mcr: %s): ret=%d\n",
>> -		  idx, e->clr_bits, e->set_bits,
>> -		  str_yes_no(e->reg.masked),
>> -		  str_yes_no(e->reg.mcr),
>> -		  ret);
>> +	xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS,
>> +				  "discarding save-restore reg %04lx (clear: %08x, set: %08x, masked: %s, mcr: %s): ret=%d",
>> +				  idx, e->clr_bits, e->set_bits,
>> +				  str_yes_no(e->reg.masked),
>> +				  str_yes_no(e->reg.mcr),
>> +				  ret);
>>  	reg_sr_inc_error(sr);
>>  
>>  	return ret;
>> @@ -207,7 +207,7 @@ void xe_reg_sr_apply_mmio(struct xe_reg_sr *sr, struct xe_gt *gt)
>>  	return;
>>  
>>  err_force_wake:
>> -	xe_gt_err(gt, "Failed to apply, err=%d\n", err);
>> +	xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS, "Failed to apply, err=%d", err);
>>  }
>>  
>>  void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe)
>> @@ -234,9 +234,9 @@ void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe)
>>  	p = drm_debug_printer(KBUILD_MODNAME);
>>  	xa_for_each(&sr->xa, reg, entry) {
>>  		if (slot == RING_MAX_NONPRIV_SLOTS) {
>> -			xe_gt_err(gt,
>> -				  "hwe %s: maximum register whitelist slots (%d) reached, refusing to add more\n",
>> -				  hwe->name, RING_MAX_NONPRIV_SLOTS);
>> +			xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_ENGINE,
>> +						  "hwe %s: maximum register whitelist slots (%d) reached, refusing to add more",
>> +						  hwe->name, RING_MAX_NONPRIV_SLOTS);
>>  			break;
>>  		}
>>  
>> @@ -259,7 +259,7 @@ void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe)
>>  	return;
>>  
>>  err_force_wake:
>> -	drm_err(&xe->drm, "Failed to apply, err=%d\n", err);
>> +	xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS, "Failed to apply, err=%d", err);
>>  }
>>  
>>  /**
>> diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
>> index 131752a57f65..4de9b5e558a2 100644
>> --- a/drivers/gpu/drm/xe/xe_tile.c
>> +++ b/drivers/gpu/drm/xe/xe_tile.c
>> @@ -71,6 +71,47 @@
>>   *  - MOCS and PAT programming
>>   */
>>  
>> +static const char *const xe_tile_drv_err_to_str[] = {
>> +	[XE_TILE_DRV_ERR_GTT] = "GTT",
>> +	[XE_TILE_DRV_ERR_INTR] = "INTERRUPT"
>> +};
>> +
>> +/**
>> + * xe_tile_report_driver_error - Count driver error for tile
>> + * @tile: tile to count error for
>> + * @err: Enum error type
>> + * @fmt: debug message format to print error
>> + * @...: variable args to print error
>> + *
>> + * Increment the driver error counter in respective error
>> + * category for this tile.
>> + *
>> + * Return: void.
>> + */
>> +void xe_tile_report_driver_error(struct xe_tile *tile,
>> +				 const enum xe_tile_drv_err_type err,
>> +				 const char *fmt, ...)
>> +{
>> +	struct va_format vaf;
>> +	va_list args;
>> +
>> +	BUILD_BUG_ON(ARRAY_SIZE(xe_tile_drv_err_to_str) !=
>> +		     __XE_TILE_DRV_ERR_MAX);
>> +
>> +	xe_tile_assert(tile, err >= 0);
>> +	xe_tile_assert(tile, err < __XE_TILE_DRV_ERR_MAX);
>> +	WRITE_ONCE(tile->drv_err_cnt[err],
>> +		   READ_ONCE(tile->drv_err_cnt[err]) + 1);
>> +
>> +	va_start(args, fmt);
>> +	vaf.fmt = fmt;
>> +	vaf.va = &args;
>> +
>> +	drm_err(&tile->xe->drm, "TILE%u [%s] %pV\n",
>> +		tile->id, xe_tile_drv_err_to_str[err], &vaf);
>> +	va_end(args);
>> +}
>> +
>>  /**
>>   * xe_tile_alloc - Perform per-tile memory allocation
>>   * @tile: Tile to perform allocations for
>> diff --git a/drivers/gpu/drm/xe/xe_tile.h b/drivers/gpu/drm/xe/xe_tile.h
>> index 782c47f8bd45..c79108fb9579 100644
>> --- a/drivers/gpu/drm/xe/xe_tile.h
>> +++ b/drivers/gpu/drm/xe/xe_tile.h
>> @@ -14,5 +14,8 @@ int xe_tile_alloc(struct xe_tile *tile);
>>  int xe_tile_init_noalloc(struct xe_tile *tile);
>>  
>>  void xe_tile_migrate_wait(struct xe_tile *tile);
>> +void xe_tile_report_driver_error(struct xe_tile *tile,
>> +				 const enum xe_tile_drv_err_type err,
>> +				 const char *fmt, ...);
>>  
>>  #endif
>> -- 
>> 2.25.1
>>