[Intel-xe] [PATCH V9 1/1] drm/xe: Introduce and update counter for low level driver errors

Fri Oct 27 15:54:36 UTC 2023

> -----Original Message-----
> From: Wajdeczko, Michal <Michal.Wajdeczko at intel.com>
> Sent: Friday, October 27, 2023 7:09 PM
> To: Upadhyay, Tejas <tejas.upadhyay at intel.com>; intel-
> xe at lists.freedesktop.org
> Subject: Re: [Intel-xe] [PATCH V9 1/1] drm/xe: Introduce and update counter
> for low level driver errors
> 
> 
> 
> On 27.10.2023 15:02, Upadhyay, Tejas wrote:
> >
> >
> >> -----Original Message-----
> >> From: Wajdeczko, Michal <Michal.Wajdeczko at intel.com>
> >> Sent: Thursday, October 26, 2023 3:02 AM
> >> To: Upadhyay, Tejas <tejas.upadhyay at intel.com>; intel-
> >> xe at lists.freedesktop.org
> >> Subject: Re: [Intel-xe] [PATCH V9 1/1] drm/xe: Introduce and update
> >> counter for low level driver errors
> >>
> >>
> >>
> >> On 20.10.2023 12:03, Tejas Upadhyay wrote:
> >>> we added a low level driver error counter and incrementing on
> >>
> >> nit: "added" - stale comment ?
> >>
> >>> each occurrence. Focus is on errors that are not functionally
> >>> affecting the system and might otherwise go unnoticed and cause
> >>> power/performance regressions, so checking for the error counters
> >>> should help.
> >>>
> >>> Importantly the intention is not to go adding new error checks, but
> >>> to make sure the existing important error conditions are propagated
> >>> in terms of counter under respective categories like below :
> >>> - GT
> >>>   - GUC COMMUNICATION
> >>>   - ENGINE OTHER
> >>>   - GT OTHER
> >>>
> >>> - Tile
> >>>   - GTT
> >>>   - INTERRUPT
> >>>
> >>> Currently this is just a counting of errors, later these counters
> >>> will be reported through netlink interface when it is implemented
> >>> and ready.
> >>>
> >>> V9:
> >>>   - Make one patch for API and counter update - Michal
> >>>   - Remove counter from places where driver load will fail - Michal
> >>>   - Remove extra \n from logging
> >>>   - Improve commit message - Aravind/Michal
> >>> V8:
> >>>   - Correct missed ret value handling
> >>> V7:
> >>>   - removed double couting of err - Michal
> >>> V6:
> >>>   - move drm_err to gt and tile specific err API - Aravind
> >>>   - Use GTT naming instead of GGTT - Aravind/Niranjana
> >>> V5:
> >>>   - Dump err_type in string format
> >>> V4:
> >>>   - dump err_type in drm_err log - Himal
> >>> V2:
> >>>   - Use modified APIs
> >>>
> >>> Signed-off-by: Tejas Upadhyay <tejas.upadhyay at intel.com>
> >>> ---
> >>>  drivers/gpu/drm/xe/xe_device_types.h        | 15 +++++++
> >>>  drivers/gpu/drm/xe/xe_gt.c                  | 41 +++++++++++++++++++
> >>>  drivers/gpu/drm/xe/xe_gt.h                  |  4 ++
> >>>  drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 13 +++---
> >>>  drivers/gpu/drm/xe/xe_gt_types.h            | 17 ++++++++
> >>>  drivers/gpu/drm/xe/xe_guc.c                 |  9 ++---
> >>>  drivers/gpu/drm/xe/xe_guc_ct.c              | 44 +++++++++++----------
> >>>  drivers/gpu/drm/xe/xe_guc_pc.c              | 12 ++++--
> >>>  drivers/gpu/drm/xe/xe_guc_submit.c          |  5 ++-
> >>>  drivers/gpu/drm/xe/xe_irq.c                 |  6 ++-
> >>>  drivers/gpu/drm/xe/xe_reg_sr.c              | 22 +++++------
> >>>  drivers/gpu/drm/xe/xe_tile.c                | 41 +++++++++++++++++++
> >>>  drivers/gpu/drm/xe/xe_tile.h                |  3 ++
> >>>  13 files changed, 183 insertions(+), 49 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h
> >>> b/drivers/gpu/drm/xe/xe_device_types.h
> >>> index 44d622d4cc3a..da88e69beeaf 100644
> >>> --- a/drivers/gpu/drm/xe/xe_device_types.h
> >>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> >>> @@ -61,6 +61,18 @@ struct xe_pat_ops;
> >>>  		 const struct xe_tile * : (const struct xe_device
> >>> *)((tile__)->xe),
> >> 	\
> >>>  		 struct xe_tile * : (tile__)->xe)
> >>>
> >>> +/** @enum xe_tile_drv_err_type
> >>
> >> this doesn't look like a well formatted kernel-doc same with enum
> >> values below
> >>
> >>> + *  enum for different types of tile level errors in driver  */
> >>> +enum xe_tile_drv_err_type {
> >>> +	/* Error type for all PPGTT and GTT errors */
> >>> +	XE_TILE_DRV_ERR_GTT,
> >>> +	/* Interrupt errors */
> >>> +	XE_TILE_DRV_ERR_INTR,
> >>> +	/* Max defined error types, keep this last */
> >>
> >> nit: better:
> >>
> >> 	/* private: number of defined error types, keep this last */
> >>
> >>> +	__XE_TILE_DRV_ERR_MAX
> >>> +};
> >>> +
> >>>  /**
> >>>   * struct xe_mem_region - memory region structure
> >>>   * This is used to describe a memory region in xe @@ -190,6 +202,9
> >>> @@ struct xe_tile {
> >>>
> >>>  	/** @sysfs: sysfs' kobj used by xe_tile_sysfs */
> >>>  	struct kobject *sysfs;
> >>> +
> >>> +	/** @drv_err_cnt: driver error counter for this tile */
> >>> +	u32 drv_err_cnt[__XE_TILE_DRV_ERR_MAX];
> >>>  };
> >>>
> >>>  /**
> >>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> >>> index 74e1f47bd401..993f2a9240ae 100644
> >>> --- a/drivers/gpu/drm/xe/xe_gt.c
> >>> +++ b/drivers/gpu/drm/xe/xe_gt.c
> >>> @@ -48,6 +48,47 @@
> >>>  #include "xe_wa.h"
> >>>  #include "xe_wopcm.h"
> >>>
> >>> +static const char *const xe_gt_drv_err_to_str[] = {
> >>> +	[XE_GT_DRV_ERR_GUC_COMM] = "GUC COMMUNICATION",
> >>> +	[XE_GT_DRV_ERR_ENGINE] = "ENGINE OTHER",
> >>> +	[XE_GT_DRV_ERR_OTHERS] = "GT OTHER"
> >>> +};
> >>> +
> >>> +/**
> >>> + * xe_gt_report_driver_error - Count driver error for gt
> >>
> >> s/gt/GT
> >>
> >>> + * @gt: GT to count error for
> >>> + * @err: enum error type
> >>> + * @fmt: debug message format to print error
> >>> + * @...: variable args to print error
> >>> + *
> >>> + * Increment the driver error counter in respective error
> >>> + * category for this GT.
> >>> + *
> >>> + * Return: void.
> >>> + */
> >>> +void xe_gt_report_driver_error(struct xe_gt *gt,
> >>> +			       const enum xe_gt_drv_err_type err,
> >>> +			       const char *fmt, ...)
> >>> +{
> >>> +	struct va_format vaf;
> >>> +	va_list args;
> >>> +
> >>> +	BUILD_BUG_ON(ARRAY_SIZE(xe_gt_drv_err_to_str) !=
> >>> +		     __XE_GT_DRV_ERR_MAX);
> >>> +
> >>> +	xe_gt_assert(gt, err >= 0);
> >>> +	xe_gt_assert(gt, err < __XE_GT_DRV_ERR_MAX);
> >>> +	WRITE_ONCE(gt->drv_err_cnt[err],
> >>> +		   READ_ONCE(gt->drv_err_cnt[err]) + 1);
> >>> +
> >>> +	va_start(args, fmt);
> >>> +	vaf.fmt = fmt;
> >>> +	vaf.va = &args;
> >>> +
> >>> +	xe_gt_err(gt, "[%s] %pV\n", xe_gt_drv_err_to_str[err], &vaf);
> >>> +	va_end(args);
> >>> +}
> >>> +
> >>>  struct xe_gt *xe_gt_alloc(struct xe_tile *tile)  {
> >>>  	struct xe_gt *gt;
> >>> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> >>> index caded203a8a0..efd83707b367 100644
> >>> --- a/drivers/gpu/drm/xe/xe_gt.h
> >>> +++ b/drivers/gpu/drm/xe/xe_gt.h
> >>> @@ -67,4 +67,8 @@ static inline bool xe_gt_is_usm_hwe(struct xe_gt
> >>> *gt,
> >> struct xe_hw_engine *hwe)
> >>>  		hwe->instance == gt->usm.reserved_bcs_instance;  }
> >>>
> >>> +void xe_gt_report_driver_error(struct xe_gt *gt,
> >>> +			       const enum xe_gt_drv_err_type err,
> >>> +			       const char *fmt, ...);
> >>> +
> >>>  #endif
> >>> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> >>> b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> >>> index bd6005b9d498..6af2f7b01f7b 100644
> >>> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> >>> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> >>> @@ -9,6 +9,7 @@
> >>>  #include "xe_gt.h"
> >>>  #include "xe_guc.h"
> >>>  #include "xe_guc_ct.h"
> >>> +#include "xe_tile.h"
> >>>  #include "xe_trace.h"
> >>>
> >>>  #define TLB_TIMEOUT	(HZ / 4)
> >>> @@ -35,8 +36,10 @@ static void xe_gt_tlb_fence_timeout(struct
> >> work_struct *work)
> >>>  			break;
> >>>
> >>>  		trace_xe_gt_tlb_invalidation_fence_timeout(fence);
> >>> -		drm_err(&gt_to_xe(gt)->drm, "gt%d: TLB invalidation fence
> >> timeout, seqno=%d recv=%d",
> >>> -			gt->info.id, fence->seqno, gt-
> >>> tlb_invalidation.seqno_recv);
> >>> +		xe_tile_report_driver_error(gt_to_tile(gt),
> >> XE_TILE_DRV_ERR_GTT,
> >>> +					    "GT%u: TLB invalidation fence
> >> timeout, seqno=%d recv=%d",
> >>
> >> this message
> >>
> >>> +					    gt->info.id, fence->seqno,
> >>> +					    gt->tlb_invalidation.seqno_recv);
> >>>
> >>>  		list_del(&fence->link);
> >>>  		fence->base.error = -ETIME;
> >>> @@ -317,7 +320,6 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
> >>>   */
> >>>  int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)  {
> >>> -	struct xe_device *xe = gt_to_xe(gt);
> >>>  	struct xe_guc *guc = &gt->uc.guc;
> >>>  	int ret;
> >>>
> >>> @@ -329,8 +331,9 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt
> >>> *gt, int
> >> seqno)
> >>>  				 tlb_invalidation_seqno_past(gt, seqno),
> >>>  				 TLB_TIMEOUT);
> >>>  	if (!ret) {
> >>> -		drm_err(&xe->drm, "gt%d: TLB invalidation time'd out,
> >> seqno=%d, recv=%d\n",
> >>> -			gt->info.id, seqno, gt->tlb_invalidation.seqno_recv);
> >>> +		xe_tile_report_driver_error(gt_to_tile(gt),
> >> XE_TILE_DRV_ERR_GTT,
> >>> +					    "GT%u: TLB invalidation time'd out,
> >> seqno=%d, recv=%d",
> >>
> >> and this message are about the same thing but are little different
> >>
> >> can we print that from one place (using helper):
> >>
> >> static void tlb_seqno_error(gt, seq)
> >> {
> >> 	xe_tile_report_driver_error(
> >> 		gt_to_tile(gt),
> >> 		XE_TILE_DRV_ERR_GTT,
> >> 		"GT%u: TLB invalidation failed for seqno=%d,
> >> 		last received=%d",
> >> 		seqno,
> >> 		gt->tlb_invalidation.seqno_recv);
> >> }
> >>
> >> or at least unify the message?
> >
> > TLB invalidation seqno and fence seqno are different, atleast we need those
> differentiation in error messages so it looks ok in my opinion.
> 
> hmm, are you sure these are different ?
> 
> see send_tlb_invalidation():
> 
> 	seqno = gt->tlb_invalidation.seqno;
> 	...
> 	fence->seqno = seqno;

Are you sure? Then why we have different members in tlb_invalidation struct {. Also I think it is tlb invalidations in sequence like,

Tblinv1, Tblinv2, Tblinv3, Tblinv4, fence, Tblinv6, Tblinv7, Tblinv8, Tblinv9, fence ....

So although they are in sequence, fence is not actually tlb invalidation its flushing of previous tlb invalidations in sequence.

Don’t know if I am making sense, but this is how I understood it.

Thanks,
Tejas
> 
> >
> > Thanks,
> > Tejas
> >>
> >>> +					    gt->info.id, seqno, gt-
> >>> tlb_invalidation.seqno_recv);
> >>>  		return -ETIME;
> >>>  	}
> >>>
> >>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
> >>> b/drivers/gpu/drm/xe/xe_gt_types.h
> >>> index d3f2793684e2..6a6609162b3d 100644
> >>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> >>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> >>> @@ -24,6 +24,20 @@ enum xe_gt_type {
> >>>  	XE_GT_TYPE_MEDIA,
> >>>  };
> >>>
> >>> +/** @enum xe_gt_drv_err_type
> >>> + *  enum for different types of gt level errors in driver
> >>
> >> ditto, not a proper kernel-doc
> >>
> >>> + */
> >>> +enum xe_gt_drv_err_type {
> >>> +	/* Driver guc communication errors */
> >>> +	XE_GT_DRV_ERR_GUC_COMM,
> >>> +	/* Engine execution errors */
> >>> +	XE_GT_DRV_ERR_ENGINE,
> >>> +	/* Other errors like error during save/restore registers */
> >>> +	XE_GT_DRV_ERR_OTHERS,
> >>> +	/* Max defined error types, keep this last */
> >>> +	__XE_GT_DRV_ERR_MAX
> >>> +};
> >>> +
> >>>  #define XE_MAX_DSS_FUSE_REGS	3
> >>>  #define XE_MAX_EU_FUSE_REGS	1
> >>>
> >>> @@ -347,6 +361,9 @@ struct xe_gt {
> >>>  		/** @oob: bitmap with active OOB workaroudns */
> >>>  		unsigned long *oob;
> >>>  	} wa_active;
> >>> +
> >>> +	/** @drv_err_cnt: driver error counter for this GT */
> >>> +	u32 drv_err_cnt[__XE_GT_DRV_ERR_MAX];
> >>>  };
> >>>
> >>>  #endif
> >>> diff --git a/drivers/gpu/drm/xe/xe_guc.c
> >>> b/drivers/gpu/drm/xe/xe_guc.c index 84f0b5488783..aeb9cf814440
> >>> 100644
> >>> --- a/drivers/gpu/drm/xe/xe_guc.c
> >>> +++ b/drivers/gpu/drm/xe/xe_guc.c
> >>> @@ -663,8 +663,7 @@ int xe_guc_mmio_send_recv(struct xe_guc *guc,
> >> const u32 *request,
> >>>  			     50000, &reply, false);
> >>>  	if (ret) {
> >>>  timeout:
> >>> -		drm_err(&xe->drm, "mmio request %#x: no reply %#x\n",
> >>> -			request[0], reply);
> >>> +		xe_gt_err(gt, "mmio request %#x: no reply %#x\n", request[0],
> >>> +reply);
> >>
> >> good but unrelated change for this series
> >>
> >>>  		return ret;
> >>>  	}
> >>>
> >>> @@ -697,16 +696,14 @@ int xe_guc_mmio_send_recv(struct xe_guc
> *guc,
> >> const u32 *request,
> >>>  		u32 hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT,
> >> header);
> >>>  		u32 error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR,
> >> header);
> >>>
> >>> -		drm_err(&xe->drm, "mmio request %#x: failure %#x/%#x\n",
> >>> -			request[0], error, hint);
> >>> +		xe_gt_err(gt, "mmio request %#x: failure %#x/%#x\n",
> >> request[0],
> >>> +error, hint);
> >>
> >> ditto
> >>
> >>>  		return -ENXIO;
> >>>  	}
> >>>
> >>>  	if (FIELD_GET(GUC_HXG_MSG_0_TYPE, header) !=
> >>>  	    GUC_HXG_TYPE_RESPONSE_SUCCESS) {
> >>>  proto:
> >>> -		drm_err(&xe->drm, "mmio request %#x: unexpected reply
> >> %#x\n",
> >>> -			request[0], header);
> >>> +		xe_gt_err(gt, "mmio request %#x: unexpected reply %#x\n",
> >>> +request[0], header);
> >>
> >> ditto
> >>
> >>>  		return -EPROTO;
> >>>  	}
> >>>
> >>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c
> >>> b/drivers/gpu/drm/xe/xe_guc_ct.c index 8b686c8b3339..52dddf23a3f7
> >>> 100644
> >>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> >>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> >>> @@ -732,8 +732,9 @@ static int guc_ct_send_recv(struct xe_guc_ct
> >>> *ct, const u32 *action, u32 len,
> >>>
> >>>  	ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
> >>>  	if (!ret) {
> >>
> >> this
> >>
> >>> -		drm_err(&xe->drm, "Timed out wait for G2H, fence %u,
> >> action %04x",
> >>> -			g2h_fence.seqno, action[0]);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "Timed out wait for G2H, fence %u,
> >> action %04x",
> >>> +					  g2h_fence.seqno, action[0]);
> >>>  		xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno);
> >>>  		return -ETIME;
> >>>  	}
> >>> @@ -744,8 +745,9 @@ static int guc_ct_send_recv(struct xe_guc_ct
> >>> *ct,
> >> const u32 *action, u32 len,
> >>>  		goto retry;
> >>>  	}
> >>>  	if (g2h_fence.fail) {
> >>
> >> and this are just two points where guc_ct_send_recv() prints the
> >> error message on failure, but actually there are few more error
> >> returns that were just not covered with any diagnostic message.
> >>
> >> so maybe instead of partially covering only selected failures, better
> >> option will be to report GUC_COMM_ERROR once at single exit point or
> >> at the next level caller function (still inside CTB component) ?
> >>
> >> 	ret = guc_ct_send_recv(...)
> >> 	if (ret < 0)
> >> 		xe_gt_report_driver_error(ct_to_gt(ct),
> >> 			XE_GT_DRV_ERR_GUC_COMM,
> >> 			"CTB communication failed on send (%pe)",
> >> 			ERR_PTR(ret));
> >> 	return ret;
> >>
> >>> -		drm_err(&xe->drm, "Send failed, action 0x%04x, error %d,
> >> hint %d",
> >>> -			action[0], g2h_fence.error, g2h_fence.hint);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "Send failed, action 0x%04x, error
> >> %d, hint %d",
> >>> +					  action[0], g2h_fence.error,
> >> g2h_fence.hint);
> >>>  		ret = -EIO;
> >>>  	}
> >>>
> >>> @@ -829,7 +831,6 @@ static int parse_g2h_response(struct xe_guc_ct
> >>> *ct, u32 *msg, u32 len)
> >>>
> >>>  static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)  {
> >>> -	struct xe_device *xe = ct_to_xe(ct);
> >>>  	u32 hxg, origin, type;
> >>>  	int ret;
> >>>
> >>> @@ -839,9 +840,9 @@ static int parse_g2h_msg(struct xe_guc_ct *ct,
> >>> u32 *msg, u32 len)
> >>>
> >>>  	origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg);
> >>>  	if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) {
> >>> -		drm_err(&xe->drm,
> >>> -			"G2H channel broken on read, origin=%d, reset
> >> required\n",
> >>> -			origin);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "G2H channel broken on read,
> >> origin=%d, reset required",
> >>> +					  origin);
> >>
> >> CTB channel can be also broken on send, but error reporting was
> >> missing there, so maybe again, one place to report all errors on read:
> >>
> >> 	ret = dequeue_one_g2h(...)
> >> 	if (ret < 0)
> >> 		xe_gt_report_driver_error(ct_to_gt(ct),
> >> 			XE_GT_DRV_ERR_GUC_COMM,
> >> 			"CTB communication failed on read (%pe)",
> >> 			ERR_PTR(ret));
> >>
> >>>  		ct->ctbs.g2h.info.broken = true;
> >>>
> >>>  		return -EPROTO;
> >>> @@ -858,9 +859,9 @@ static int parse_g2h_msg(struct xe_guc_ct *ct,
> >>> u32
> >> *msg, u32 len)
> >>>  		ret = parse_g2h_response(ct, msg, len);
> >>>  		break;
> >>>  	default:
> >>> -		drm_err(&xe->drm,
> >>> -			"G2H channel broken on read, type=%d, reset
> >> required\n",
> >>> -			type);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "G2H channel broken on read,
> >> type=%d, reset required",
> >>
> >> this was wrong message, it needs to be updated separately to
> >>
> >> 	"Unexpected G2H message type %#x\n",
> >>
> >> also, I'm not sure that this warrants a reset, but that's other story
> >>
> >>> +					  type);
> >>>  		ct->ctbs.g2h.info.broken = true;
> >>>
> >>>  		ret = -EOPNOTSUPP;
> >>> @@ -871,7 +872,6 @@ static int parse_g2h_msg(struct xe_guc_ct *ct,
> >>> u32 *msg, u32 len)
> >>>
> >>>  static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
> >>> {
> >>> -	struct xe_device *xe = ct_to_xe(ct);
> >>>  	struct xe_guc *guc = ct_to_guc(ct);
> >>>  	u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1]);
> >>>  	u32 *payload = msg + GUC_CTB_HXG_MSG_MIN_LEN; @@ -918,12
> >> +918,15 @@
> >>> static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
> >>>  							   adj_len);
> >>>  		break;
> >>>  	default:
> >>> -		drm_err(&xe->drm, "unexpected action 0x%04x\n", action);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "unexpected action 0x%04x",
> >>> +					  action);
> >>
> >> it seems that there is a bug here, we should have here -EOPNOTSUPP"
> >> so it would fall into below error reporting, but as said above, maybe
> >> better to have one place to report any error on read (we will avoid
> >> any duplicate reports and also make sure to catch all issues)
> >>
> >>>  	}
> >>>
> >>>  	if (ret)
> >>> -		drm_err(&xe->drm, "action 0x%04x failed processing,
> >> ret=%d\n",
> >>> -			action, ret);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "action 0x%04x failed processing,
> >> ret=%d",
> >>> +					  action, ret);
> >>>
> >>>  	return 0;
> >>>  }
> >>> @@ -957,9 +960,9 @@ static int g2h_read(struct xe_guc_ct *ct, u32
> >>> *msg,
> >> bool fast_path)
> >>>  			   sizeof(u32));
> >>>  	len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) +
> >> GUC_CTB_MSG_MIN_LEN;
> >>>  	if (len > avail) {
> >>> -		drm_err(&xe->drm,
> >>> -			"G2H channel broken on read, avail=%d, len=%d,
> >> reset required\n",
> >>> -			avail, len);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "G2H channel broken on read,
> >> avail=%d, len=%d, reset required",
> >>> +					  avail, len);
> >>>  		g2h->info.broken = true;
> >>>
> >>>  		return -EPROTO;
> >>> @@ -1027,8 +1030,9 @@ static void g2h_fast_path(struct xe_guc_ct
> >>> *ct, u32
> >> *msg, u32 len)
> >>>  	}
> >>>
> >>>  	if (ret)
> >>> -		drm_err(&xe->drm, "action 0x%04x failed processing,
> >> ret=%d\n",
> >>> -			action, ret);
> >>> +		xe_gt_report_driver_error(ct_to_gt(ct),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +					  "action 0x%04x failed processing,
> >> ret=%d",
> >>> +					  action, ret);
> >>>  }
> >>>
> >>>  /**
> >>> diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c
> >>> b/drivers/gpu/drm/xe/xe_guc_pc.c index d9375d1d582f..137c7ce44f47
> >>> 100644
> >>> --- a/drivers/gpu/drm/xe/xe_guc_pc.c
> >>> +++ b/drivers/gpu/drm/xe/xe_guc_pc.c
> >>> @@ -218,9 +218,15 @@ static int pc_action_set_param(struct xe_guc_pc
> >> *pc, u8 id, u32 value)
> >>>  		return -EAGAIN;
> >>>
> >>>  	ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
> >>> -	if (ret)
> >>> -		drm_err(&pc_to_xe(pc)->drm, "GuC PC set param failed:
> >> %pe",
> >>> -			ERR_PTR(ret));
> >>> +	if (ret) {
> >>> +		/* GT reset already triggered due to deadlock, lets not report
> >> this error */
> >>> +		if (ret != -EDEADLK)
> >>> +			xe_gt_report_driver_error(pc_to_gt(pc),
> >> XE_GT_DRV_ERR_GUC_COMM,
> >>> +						  "GuC PC set param failed:
> >> %pe",
> >>> +						  ERR_PTR(ret));
> >>> +		else
> >>> +			xe_gt_err(pc_to_gt(pc), "GuC PC set param failed:
> >> %pe\n",
> >>> +ERR_PTR(ret));
> >>
> >> this selective reporting makes things unreliable and error prone
> >> (ouch!)
> >>
> >> note that you still didn't avoid duplication, as this error was
> >> reported as xe_gt_report_driver_error("Send failed, action 0x%04x,
> >> ...")
> >>
> >> IMO better to report GuC comm send/recv errors in one place (in CTB
> >> layer)
> >>
> >> btw, since you care about "power/performance regressions" then maybe
> >> errors from GUC_PC deserve it's own category for easier diagnostics ?
> >>
> >>> +	}
> >>>
> >>>  	return ret;
> >>>  }
> >>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> >>> b/drivers/gpu/drm/xe/xe_guc_submit.c
> >>> index 870dc5c532fa..283832d518b1 100644
> >>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> >>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> >>> @@ -1681,8 +1681,9 @@ int
> >> xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg,
> >> u32 le
> >>>  	reason = msg[2];
> >>>
> >>>  	/* Unexpected failure of a hardware feature, log an actual error */
> >>> -	drm_err(&xe->drm, "GuC engine reset request failed on %d:%d
> >> because 0x%08X",
> >>> -		guc_class, instance, reason);
> >>> +	xe_gt_report_driver_error(guc_to_gt(guc), XE_GT_DRV_ERR_ENGINE,
> >>> +				  "GuC engine reset request failed on %d:%d
> >> because 0x%08X",
> >>> +				  guc_class, instance, reason);
> >>>
> >>>  	xe_gt_reset_async(guc_to_gt(guc));
> >>>
> >>> diff --git a/drivers/gpu/drm/xe/xe_irq.c
> >>> b/drivers/gpu/drm/xe/xe_irq.c index 61350ed32c61..2b20582b0425
> >>> 100644
> >>> --- a/drivers/gpu/drm/xe/xe_irq.c
> >>> +++ b/drivers/gpu/drm/xe/xe_irq.c
> >>> @@ -18,6 +18,7 @@
> >>>  #include "xe_guc.h"
> >>>  #include "xe_hw_engine.h"
> >>>  #include "xe_mmio.h"
> >>> +#include "xe_tile.h"
> >>>
> >>>  /*
> >>>   * Interrupt registers for a unit are always consecutive and
> >>> ordered @@ -227,8 +228,9 @@ gt_engine_identity(struct xe_device *xe,
> >>>  		 !time_after32(local_clock() >> 10, timeout_ts));
> >>>
> >>>  	if (unlikely(!(ident & INTR_DATA_VALID))) {
> >>> -		drm_err(&xe->drm, "INTR_IDENTITY_REG%u:%u 0x%08x not
> >> valid!\n",
> >>> -			bank, bit, ident);
> >>> +		xe_tile_report_driver_error(gt_to_tile(mmio),
> >> XE_TILE_DRV_ERR_INTR,
> >>> +					    "INTR_IDENTITY_REG%u:%u
> >> 0x%08x not valid!",
> >>> +					    bank, bit, ident);
> >>>  		return 0;
> >>>  	}
> >>>
> >>> diff --git a/drivers/gpu/drm/xe/xe_reg_sr.c
> >>> b/drivers/gpu/drm/xe/xe_reg_sr.c index 87adefb56024..f2e20e10d927
> >>> 100644
> >>> --- a/drivers/gpu/drm/xe/xe_reg_sr.c
> >>> +++ b/drivers/gpu/drm/xe/xe_reg_sr.c
> >>> @@ -125,12 +125,12 @@ int xe_reg_sr_add(struct xe_reg_sr *sr,
> >>>  	return 0;
> >>>
> >>>  fail:
> >>> -	xe_gt_err(gt,
> >>> -		  "discarding save-restore reg %04lx (clear: %08x, set: %08x,
> >> masked: %s, mcr: %s): ret=%d\n",
> >>> -		  idx, e->clr_bits, e->set_bits,
> >>> -		  str_yes_no(e->reg.masked),
> >>> -		  str_yes_no(e->reg.mcr),
> >>> -		  ret);
> >>> +	xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS,
> >>> +				  "discarding save-restore reg %04lx (clear:
> >> %08x, set: %08x, masked: %s, mcr: %s): ret=%d",
> >>> +				  idx, e->clr_bits, e->set_bits,
> >>> +				  str_yes_no(e->reg.masked),
> >>> +				  str_yes_no(e->reg.mcr),
> >>> +				  ret);
> >>>  	reg_sr_inc_error(sr);
> >>>
> >>>  	return ret;
> >>> @@ -207,7 +207,7 @@ void xe_reg_sr_apply_mmio(struct xe_reg_sr *sr,
> >> struct xe_gt *gt)
> >>>  	return;
> >>>
> >>>  err_force_wake:
> >>> -	xe_gt_err(gt, "Failed to apply, err=%d\n", err);
> >>> +	xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS, "Failed to
> >>> +apply, err=%d", err);
> >>>  }
> >>>
> >>>  void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe) @@ -234,9
> >>> +234,9 @@ void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe)
> >>>  	p = drm_debug_printer(KBUILD_MODNAME);
> >>>  	xa_for_each(&sr->xa, reg, entry) {
> >>>  		if (slot == RING_MAX_NONPRIV_SLOTS) {
> >>> -			xe_gt_err(gt,
> >>> -				  "hwe %s: maximum register whitelist slots
> >> (%d) reached, refusing to add more\n",
> >>> -				  hwe->name, RING_MAX_NONPRIV_SLOTS);
> >>> +			xe_gt_report_driver_error(gt,
> >> XE_GT_DRV_ERR_ENGINE,
> >>> +						  "hwe %s: maximum register
> >> whitelist slots (%d) reached, refusing to add more",
> >>> +						  hwe->name,
> >> RING_MAX_NONPRIV_SLOTS);
> >>>  			break;
> >>>  		}
> >>>
> >>> @@ -259,7 +259,7 @@ void xe_reg_sr_apply_whitelist(struct
> >>> xe_hw_engine
> >> *hwe)
> >>>  	return;
> >>>
> >>>  err_force_wake:
> >>> -	drm_err(&xe->drm, "Failed to apply, err=%d\n", err);
> >>> +	xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS, "Failed to
> >>> +apply, err=%d", err);
> >>>  }
> >>>
> >>>  /**
> >>> diff --git a/drivers/gpu/drm/xe/xe_tile.c
> >>> b/drivers/gpu/drm/xe/xe_tile.c index 131752a57f65..364ec9628a41
> >>> 100644
> >>> --- a/drivers/gpu/drm/xe/xe_tile.c
> >>> +++ b/drivers/gpu/drm/xe/xe_tile.c
> >>> @@ -71,6 +71,47 @@
> >>>   *  - MOCS and PAT programming
> >>>   */
> >>>
> >>> +static const char *const xe_tile_drv_err_to_str[] = {
> >>> +	[XE_TILE_DRV_ERR_GTT] = "GTT",
> >>> +	[XE_TILE_DRV_ERR_INTR] = "INTERRUPT"
> >>> +};
> >>> +
> >>> +/**
> >>> + * xe_tile_report_driver_error - Count driver error for tile
> >>> + * @tile: Tile to count error for
> >>> + * @err: Enum error type
> >>> + * @fmt: debug message format to print error
> >>> + * @...: variable args to print error
> >>> + *
> >>> + * Increment the driver error counter in respective error
> >>> + * category for this tile.
> >>> + *
> >>> + * Return: void.
> >>> + */
> >>> +void xe_tile_report_driver_error(struct xe_tile *tile,
> >>> +				 const enum xe_tile_drv_err_type err,
> >>> +				 const char *fmt, ...)
> >>> +{
> >>> +	struct va_format vaf;
> >>> +	va_list args;
> >>> +
> >>> +	BUILD_BUG_ON(ARRAY_SIZE(xe_tile_drv_err_to_str) !=
> >>> +		     __XE_TILE_DRV_ERR_MAX);
> >>> +
> >>> +	xe_assert(tile_to_xe(tile), err >= 0);
> >>> +	xe_assert(tile_to_xe(tile), err < __XE_TILE_DRV_ERR_MAX);
> >>
> >> you can use xe_tile_assert() here
> >>
> >>> +	WRITE_ONCE(tile->drv_err_cnt[err],
> >>> +		   READ_ONCE(tile->drv_err_cnt[err]) + 1);
> >>> +
> >>> +	va_start(args, fmt);
> >>> +	vaf.fmt = fmt;
> >>> +	vaf.va = &args;
> >>> +
> >>> +	drm_err(&tile->xe->drm, "TILE%u [%s] %pV\n",
> >>> +		tile->id, xe_tile_drv_err_to_str[err], &vaf);
> >>> +	va_end(args);
> >>> +}
> >>> +
> >>>  /**
> >>>   * xe_tile_alloc - Perform per-tile memory allocation
> >>>   * @tile: Tile to perform allocations for diff --git
> >>> a/drivers/gpu/drm/xe/xe_tile.h b/drivers/gpu/drm/xe/xe_tile.h index
> >>> 782c47f8bd45..c79108fb9579 100644
> >>> --- a/drivers/gpu/drm/xe/xe_tile.h
> >>> +++ b/drivers/gpu/drm/xe/xe_tile.h
> >>> @@ -14,5 +14,8 @@ int xe_tile_alloc(struct xe_tile *tile);  int
> >>> xe_tile_init_noalloc(struct xe_tile *tile);
> >>>
> >>>  void xe_tile_migrate_wait(struct xe_tile *tile);
> >>> +void xe_tile_report_driver_error(struct xe_tile *tile,
> >>> +				 const enum xe_tile_drv_err_type err,
> >>> +				 const char *fmt, ...);
> >>>
> >>>  #endif