[PATCH v3 2/5] drm/xe: Add infrastructure for Device OOB workarounds

Matt Roper matthew.d.roper at intel.com
Wed Jun 25 20:39:27 UTC 2025


On Wed, Jun 25, 2025 at 12:51:04PM -0700, Matt Roper wrote:
> On Wed, Jun 25, 2025 at 12:33:37PM -0700, Matt Atwood wrote:
> > Some workarounds need to be able to be applied ahead of any GT
> > initialization for example 15015404425. This patch creates XE_DEVICE_WA
> > macro, in the same vein as XE_WA. This macro can be used ahead of GT
> > initialization, and can be tracked in sysfs. This should alleviate some
> > of the complexities that exist in i915.
> 
> I'd be inclined to break this into two patches:
> 
>  * A patch to RTP that adds devices as a new type of RTP context.  This
>    should also add WARN_ON guards for all of the existing RTP rules that
>    require a gt (similar to those that exist for engines).
> 
>  * A patch that adds the the the tracking information to xe_device and
>    calls the init/process functions at the appropriate place.
> 
> > 
> > v2: name change SoC to Device, address style issues
> > 
> > Signed-off-by: Matt Atwood <matthew.s.atwood at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_device_types.h |  9 ++++++
> >  drivers/gpu/drm/xe/xe_pci.c          |  4 +++
> >  drivers/gpu/drm/xe/xe_rtp.c          |  5 +++
> >  drivers/gpu/drm/xe/xe_rtp.h          |  3 +-
> >  drivers/gpu/drm/xe/xe_rtp_types.h    |  2 ++
> >  drivers/gpu/drm/xe/xe_wa.c           | 46 ++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_wa.h           | 18 +++++++++--
> >  7 files changed, 83 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> > index 6aca4b1a2824..62f09d2923c3 100644
> > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > @@ -360,6 +360,15 @@ struct xe_device {
> >  		u8 skip_pcode:1;
> >  	} info;
> >  
> > +	/** @xe_active.oob: bitmap with active OOB workarounds */
> > +	unsigned long *oob;
> > +	/**
> > +	 *  @xe_active.oob_initialized: mark oob as initialized to help
> > +	 *  detecting misuse of XE_SOC_WA() - it can only be called on
> > +	 *  initialization after SOC OOB WAs have been processed
> > +	 */
> > +	bool oob_initialized;
> 
> Did you mean to wrap these in a sub-struct?  Because right now the
> kerneldocs don't match the actual placement.  Also the "SOC" stuff is
> outdated.
> 
> > +
> >  	/** @survivability: survivability information for device */
> >  	struct xe_survivability survivability;
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> > index 824461c31288..a9f708608f3e 100644
> > --- a/drivers/gpu/drm/xe/xe_pci.c
> > +++ b/drivers/gpu/drm/xe/xe_pci.c
> > @@ -32,6 +32,7 @@
> >  #include "xe_step.h"
> >  #include "xe_survivability_mode.h"
> >  #include "xe_tile.h"
> > +#include "xe_wa.h"
> >  
> >  enum toggle_d3cold {
> >  	D3COLD_DISABLE,
> > @@ -832,6 +833,9 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> >  	if (err)
> >  		return err;
> >  
> > +	xe_device_wa_init(xe);
> > +	xe_device_wa_process_oob(xe);
> > +
> >  	err = xe_device_probe_early(xe);
> >  	/*
> >  	 * In Boot Survivability mode, no drm card is exposed and driver
> > diff --git a/drivers/gpu/drm/xe/xe_rtp.c b/drivers/gpu/drm/xe/xe_rtp.c
> > index 29e694bb1219..2a0b0a1a8969 100644
> > --- a/drivers/gpu/drm/xe/xe_rtp.c
> > +++ b/drivers/gpu/drm/xe/xe_rtp.c
> > @@ -196,6 +196,11 @@ static void rtp_get_context(struct xe_rtp_process_ctx *ctx,
> >  		*gt = (*hwe)->gt;
> >  		*xe = gt_to_xe(*gt);
> >  		break;
> > +	case XE_RTP_PROCESS_TYPE_DEVICE:
> > +		*hwe = NULL;
> > +		*gt = NULL;
> > +		*xe = ctx->xe;
> > +		break;
> >  	}
> >  }
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_rtp.h b/drivers/gpu/drm/xe/xe_rtp.h
> > index 4fe736a11c42..ac260feaabef 100644
> > --- a/drivers/gpu/drm/xe/xe_rtp.h
> > +++ b/drivers/gpu/drm/xe/xe_rtp.h
> > @@ -422,7 +422,8 @@ struct xe_reg_sr;
> >  
> >  #define XE_RTP_PROCESS_CTX_INITIALIZER(arg__) _Generic((arg__),							\
> >  	struct xe_hw_engine * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_ENGINE },	\
> > -	struct xe_gt * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_GT })
> > +	struct xe_gt * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_GT },	\
> > +	struct xe_device * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_DEVICE })
> >  
> >  void xe_rtp_process_ctx_enable_active_tracking(struct xe_rtp_process_ctx *ctx,
> >  					       unsigned long *active_entries,
> > diff --git a/drivers/gpu/drm/xe/xe_rtp_types.h b/drivers/gpu/drm/xe/xe_rtp_types.h
> > index 1b76b947c706..123579ebf2cf 100644
> > --- a/drivers/gpu/drm/xe/xe_rtp_types.h
> > +++ b/drivers/gpu/drm/xe/xe_rtp_types.h
> > @@ -112,12 +112,14 @@ struct xe_rtp_entry {
> >  enum xe_rtp_process_type {
> >  	XE_RTP_PROCESS_TYPE_GT,
> >  	XE_RTP_PROCESS_TYPE_ENGINE,
> > +	XE_RTP_PROCESS_TYPE_DEVICE,
> >  };
> >  
> >  struct xe_rtp_process_ctx {
> >  	union {
> >  		struct xe_gt *gt;
> >  		struct xe_hw_engine *hwe;
> > +		struct xe_device *xe;
> >  	};
> >  	enum xe_rtp_process_type type;
> >  	unsigned long *active_entries;
> > diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> > index f51218a7a580..2cb2fab20070 100644
> > --- a/drivers/gpu/drm/xe/xe_wa.c
> > +++ b/drivers/gpu/drm/xe/xe_wa.c
> > @@ -11,6 +11,7 @@
> >  #include <linux/fault-inject.h>
> >  
> >  #include <generated/xe_wa_oob.h>
> > +#include <generated/xe_device_wa_oob.h>
> >  
> >  #include "regs/xe_engine_regs.h"
> >  #include "regs/xe_gt_regs.h"
> > @@ -876,6 +877,13 @@ static __maybe_unused const struct xe_rtp_entry oob_was[] = {
> >  
> >  static_assert(ARRAY_SIZE(oob_was) - 1 == _XE_WA_OOB_COUNT);
> >  
> > +static __maybe_unused const struct xe_rtp_entry device_oob_was[] = {
> > +#include <generated/xe_device_wa_oob.c>
> > +	{}
> > +};
> > +
> > +static_assert(ARRAY_SIZE(device_oob_was) - 1 == _XE_DEVICE_WA_OOB_COUNT);
> > +
> >  __diag_pop();
> >  
> >  /**
> > @@ -895,6 +903,39 @@ void xe_wa_process_oob(struct xe_gt *gt)
> >  	xe_rtp_process(&ctx, oob_was);
> >  }
> >  
> > +int xe_device_wa_init(struct xe_device *xe)
> 
> We've been trying to be better about naming functions from xe_foo.c as
> xe_foo_*().  So these new functions might be better with names like
> xe_wa_device_init() and xe_wa_process_device_oob() and such.
> 
> At some point we should also probably go back and rename the existing GT
> functions to include 'gt' in their names at the corresponding places.
> 
> > +{
> > +	unsigned long *p;
> > +
> > +	p = drmm_kzalloc(&xe->drm,
> > +			 sizeof(xe->oob) * BITS_TO_LONGS(ARRAY_SIZE(device_oob_was)),

Overlooked this before...this winds up working because the size of a
pointer (xe->oob) is the same as the size of an unsigned long.  But
logically we want to know the allocation size of each long in the array
here, so passing either *p or *xe->oob would be better (preferably *p
since that matches the coding pattern used pretty much everywhere in the
kernel).


Matt

> > +			 GFP_KERNEL);
> > +
> > +	if (!p)
> > +		return -ENOMEM;
> > +
> > +	xe->oob = p;
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * xe_device_wa_process_oob - process OOB workaround table
> > + * @xe: xe device instance process workarounds for
> > + *
> > + * Process OOB workaround table for this platform, marking in @xe the
> > + * workarounds that are active.
> > + */
> > +void xe_device_wa_process_oob(struct xe_device *xe)
> > +{
> > +	struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(xe);
> > +
> > +	xe_rtp_process_ctx_enable_active_tracking(&ctx, xe->oob, ARRAY_SIZE(device_oob_was));
> > +
> > +	xe->oob_initialized = true;
> > +	xe_rtp_process(&ctx, device_oob_was);
> > +}
> > +
> >  /**
> >   * xe_wa_process_gt - process GT workaround table
> >   * @gt: GT instance to process workarounds for
> > @@ -1000,6 +1041,11 @@ void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p)
> >  	for_each_set_bit(idx, gt->wa_active.oob, ARRAY_SIZE(oob_was))
> >  		if (oob_was[idx].name)
> >  			drm_printf_indent(p, 1, "%s\n", oob_was[idx].name);
> > +
> > +	drm_printf(p, "\nDevice OOB Workarounds\n");
> > +	for_each_set_bit(idx, gt_to_xe(gt)->oob, ARRAY_SIZE(device_oob_was))
> > +		if (device_oob_was[idx].name)
> > +			drm_printf_indent(p, 1, "%s\n", device_oob_was[idx].name);
> 
> This function generates the contents of a per-GT debugfs node.  We don't
> want to stick device workarounds at the bottom of a per-GT entry; we
> should probably have a different debugfs entry outside the GT hierarchy
> to report these.  That would also ensure that we can track device
> workarounds on platforms with no GTs and no GT debugfs (that's not
> something that exists today, but there's been a lot of interest in
> letting igpu's run in a "not GT" mode for debug purposes, so we'll
> probably have it eventually).
> 
> >  }
> >  
> >  /*
> > diff --git a/drivers/gpu/drm/xe/xe_wa.h b/drivers/gpu/drm/xe/xe_wa.h
> > index 52337405b5bc..09d065e0abb5 100644
> > --- a/drivers/gpu/drm/xe/xe_wa.h
> > +++ b/drivers/gpu/drm/xe/xe_wa.h
> > @@ -14,7 +14,9 @@ struct xe_hw_engine;
> >  struct xe_tile;
> >  
> >  int xe_wa_init(struct xe_gt *gt);
> > +int xe_device_wa_init(struct xe_device *xe);
> >  void xe_wa_process_oob(struct xe_gt *gt);
> > +void xe_device_wa_process_oob(struct xe_device *xe);
> >  void xe_wa_process_gt(struct xe_gt *gt);
> >  void xe_wa_process_engine(struct xe_hw_engine *hwe);
> >  void xe_wa_process_lrc(struct xe_hw_engine *hwe);
> > @@ -22,14 +24,24 @@ void xe_wa_apply_tile_workarounds(struct xe_tile *tile);
> >  void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p);
> >  
> >  /**
> > - * XE_WA - Out-of-band workarounds, that don't fit the lifecycle any
> > - *         other more specific type
> > + * XE_WA - Out-of-band GT workarounds, that don't fit the lifecycle any
> > + *         other more specific type,
> >   * @gt__: gt instance
> > - * @id__: XE_OOB_<id__>, as generated by build system in generated/xe_wa_oob.h
> > + * @id__: XE_WA_OOB_<id__>, as generated by build system in generated/xe_wa_oob.h
> >   */
> >  #define XE_WA(gt__, id__) ({						\
> >  	xe_gt_assert(gt__, (gt__)->wa_active.oob_initialized);		\
> >  	test_bit(XE_WA_OOB_ ## id__, (gt__)->wa_active.oob);		\
> >  })
> >  
> > +/**
> > + * XE_DEVICE_WA - Out-of-band Device workarounds, that don't fit the lifecycle any
> > + *         other more specific type
> > + * @xe__: xe_device
> > + * @id__: XE_SOC_WA_OOB_<id__>, as generated by build system in generated/xe_wa_oob.h
> 
> The "SOC" here is outdated.  Also the wrong header is referenced.
> 
> > + */
> > +#define XE_DEVICE_WA(xe__, id__) ({					\
> > +	xe_assert(xe__, (xe__)->oob_initialized);			\
> > +	test_bit(XE_DEVICE_WA_OOB_ ## id__, (xe__)->oob);			\
> > +})
> >  #endif
> 
> This is probably fine short-term, but I think eventually we should just
> have a single XE_WA() call that _Generic()'s on its parameter to do the
> lookup in the proper place (gt or device).  If we don't want to _Generic
> it for some reason, then the existing macro should probably get renamed
> to XE_GT_WA() to clarify its purpose.
> 
> 
> Matt
> 
> > -- 
> > 2.49.0
> > 
> 
> -- 
> Matt Roper
> Graphics Software Engineer
> Linux GPU Platform Enablement
> Intel Corporation
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation


More information about the Intel-xe mailing list