[PATCH v3 2/5] drm/xe: Add infrastructure for Device OOB workarounds

Matt Roper matthew.d.roper at intel.com
Wed Jun 25 19:51:04 UTC 2025


On Wed, Jun 25, 2025 at 12:33:37PM -0700, Matt Atwood wrote:
> Some workarounds need to be able to be applied ahead of any GT
> initialization for example 15015404425. This patch creates XE_DEVICE_WA
> macro, in the same vein as XE_WA. This macro can be used ahead of GT
> initialization, and can be tracked in sysfs. This should alleviate some
> of the complexities that exist in i915.

I'd be inclined to break this into two patches:

 * A patch to RTP that adds devices as a new type of RTP context.  This
   should also add WARN_ON guards for all of the existing RTP rules that
   require a gt (similar to those that exist for engines).

 * A patch that adds the the the tracking information to xe_device and
   calls the init/process functions at the appropriate place.

> 
> v2: name change SoC to Device, address style issues
> 
> Signed-off-by: Matt Atwood <matthew.s.atwood at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device_types.h |  9 ++++++
>  drivers/gpu/drm/xe/xe_pci.c          |  4 +++
>  drivers/gpu/drm/xe/xe_rtp.c          |  5 +++
>  drivers/gpu/drm/xe/xe_rtp.h          |  3 +-
>  drivers/gpu/drm/xe/xe_rtp_types.h    |  2 ++
>  drivers/gpu/drm/xe/xe_wa.c           | 46 ++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_wa.h           | 18 +++++++++--
>  7 files changed, 83 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 6aca4b1a2824..62f09d2923c3 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -360,6 +360,15 @@ struct xe_device {
>  		u8 skip_pcode:1;
>  	} info;
>  
> +	/** @xe_active.oob: bitmap with active OOB workarounds */
> +	unsigned long *oob;
> +	/**
> +	 *  @xe_active.oob_initialized: mark oob as initialized to help
> +	 *  detecting misuse of XE_SOC_WA() - it can only be called on
> +	 *  initialization after SOC OOB WAs have been processed
> +	 */
> +	bool oob_initialized;

Did you mean to wrap these in a sub-struct?  Because right now the
kerneldocs don't match the actual placement.  Also the "SOC" stuff is
outdated.

> +
>  	/** @survivability: survivability information for device */
>  	struct xe_survivability survivability;
>  
> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> index 824461c31288..a9f708608f3e 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -32,6 +32,7 @@
>  #include "xe_step.h"
>  #include "xe_survivability_mode.h"
>  #include "xe_tile.h"
> +#include "xe_wa.h"
>  
>  enum toggle_d3cold {
>  	D3COLD_DISABLE,
> @@ -832,6 +833,9 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>  	if (err)
>  		return err;
>  
> +	xe_device_wa_init(xe);
> +	xe_device_wa_process_oob(xe);
> +
>  	err = xe_device_probe_early(xe);
>  	/*
>  	 * In Boot Survivability mode, no drm card is exposed and driver
> diff --git a/drivers/gpu/drm/xe/xe_rtp.c b/drivers/gpu/drm/xe/xe_rtp.c
> index 29e694bb1219..2a0b0a1a8969 100644
> --- a/drivers/gpu/drm/xe/xe_rtp.c
> +++ b/drivers/gpu/drm/xe/xe_rtp.c
> @@ -196,6 +196,11 @@ static void rtp_get_context(struct xe_rtp_process_ctx *ctx,
>  		*gt = (*hwe)->gt;
>  		*xe = gt_to_xe(*gt);
>  		break;
> +	case XE_RTP_PROCESS_TYPE_DEVICE:
> +		*hwe = NULL;
> +		*gt = NULL;
> +		*xe = ctx->xe;
> +		break;
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_rtp.h b/drivers/gpu/drm/xe/xe_rtp.h
> index 4fe736a11c42..ac260feaabef 100644
> --- a/drivers/gpu/drm/xe/xe_rtp.h
> +++ b/drivers/gpu/drm/xe/xe_rtp.h
> @@ -422,7 +422,8 @@ struct xe_reg_sr;
>  
>  #define XE_RTP_PROCESS_CTX_INITIALIZER(arg__) _Generic((arg__),							\
>  	struct xe_hw_engine * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_ENGINE },	\
> -	struct xe_gt * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_GT })
> +	struct xe_gt * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_GT },	\
> +	struct xe_device * :	(struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_DEVICE })
>  
>  void xe_rtp_process_ctx_enable_active_tracking(struct xe_rtp_process_ctx *ctx,
>  					       unsigned long *active_entries,
> diff --git a/drivers/gpu/drm/xe/xe_rtp_types.h b/drivers/gpu/drm/xe/xe_rtp_types.h
> index 1b76b947c706..123579ebf2cf 100644
> --- a/drivers/gpu/drm/xe/xe_rtp_types.h
> +++ b/drivers/gpu/drm/xe/xe_rtp_types.h
> @@ -112,12 +112,14 @@ struct xe_rtp_entry {
>  enum xe_rtp_process_type {
>  	XE_RTP_PROCESS_TYPE_GT,
>  	XE_RTP_PROCESS_TYPE_ENGINE,
> +	XE_RTP_PROCESS_TYPE_DEVICE,
>  };
>  
>  struct xe_rtp_process_ctx {
>  	union {
>  		struct xe_gt *gt;
>  		struct xe_hw_engine *hwe;
> +		struct xe_device *xe;
>  	};
>  	enum xe_rtp_process_type type;
>  	unsigned long *active_entries;
> diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> index f51218a7a580..2cb2fab20070 100644
> --- a/drivers/gpu/drm/xe/xe_wa.c
> +++ b/drivers/gpu/drm/xe/xe_wa.c
> @@ -11,6 +11,7 @@
>  #include <linux/fault-inject.h>
>  
>  #include <generated/xe_wa_oob.h>
> +#include <generated/xe_device_wa_oob.h>
>  
>  #include "regs/xe_engine_regs.h"
>  #include "regs/xe_gt_regs.h"
> @@ -876,6 +877,13 @@ static __maybe_unused const struct xe_rtp_entry oob_was[] = {
>  
>  static_assert(ARRAY_SIZE(oob_was) - 1 == _XE_WA_OOB_COUNT);
>  
> +static __maybe_unused const struct xe_rtp_entry device_oob_was[] = {
> +#include <generated/xe_device_wa_oob.c>
> +	{}
> +};
> +
> +static_assert(ARRAY_SIZE(device_oob_was) - 1 == _XE_DEVICE_WA_OOB_COUNT);
> +
>  __diag_pop();
>  
>  /**
> @@ -895,6 +903,39 @@ void xe_wa_process_oob(struct xe_gt *gt)
>  	xe_rtp_process(&ctx, oob_was);
>  }
>  
> +int xe_device_wa_init(struct xe_device *xe)

We've been trying to be better about naming functions from xe_foo.c as
xe_foo_*().  So these new functions might be better with names like
xe_wa_device_init() and xe_wa_process_device_oob() and such.

At some point we should also probably go back and rename the existing GT
functions to include 'gt' in their names at the corresponding places.

> +{
> +	unsigned long *p;
> +
> +	p = drmm_kzalloc(&xe->drm,
> +			 sizeof(xe->oob) * BITS_TO_LONGS(ARRAY_SIZE(device_oob_was)),
> +			 GFP_KERNEL);
> +
> +	if (!p)
> +		return -ENOMEM;
> +
> +	xe->oob = p;
> +
> +	return 0;
> +}
> +
> +/**
> + * xe_device_wa_process_oob - process OOB workaround table
> + * @xe: xe device instance process workarounds for
> + *
> + * Process OOB workaround table for this platform, marking in @xe the
> + * workarounds that are active.
> + */
> +void xe_device_wa_process_oob(struct xe_device *xe)
> +{
> +	struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(xe);
> +
> +	xe_rtp_process_ctx_enable_active_tracking(&ctx, xe->oob, ARRAY_SIZE(device_oob_was));
> +
> +	xe->oob_initialized = true;
> +	xe_rtp_process(&ctx, device_oob_was);
> +}
> +
>  /**
>   * xe_wa_process_gt - process GT workaround table
>   * @gt: GT instance to process workarounds for
> @@ -1000,6 +1041,11 @@ void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p)
>  	for_each_set_bit(idx, gt->wa_active.oob, ARRAY_SIZE(oob_was))
>  		if (oob_was[idx].name)
>  			drm_printf_indent(p, 1, "%s\n", oob_was[idx].name);
> +
> +	drm_printf(p, "\nDevice OOB Workarounds\n");
> +	for_each_set_bit(idx, gt_to_xe(gt)->oob, ARRAY_SIZE(device_oob_was))
> +		if (device_oob_was[idx].name)
> +			drm_printf_indent(p, 1, "%s\n", device_oob_was[idx].name);

This function generates the contents of a per-GT debugfs node.  We don't
want to stick device workarounds at the bottom of a per-GT entry; we
should probably have a different debugfs entry outside the GT hierarchy
to report these.  That would also ensure that we can track device
workarounds on platforms with no GTs and no GT debugfs (that's not
something that exists today, but there's been a lot of interest in
letting igpu's run in a "not GT" mode for debug purposes, so we'll
probably have it eventually).

>  }
>  
>  /*
> diff --git a/drivers/gpu/drm/xe/xe_wa.h b/drivers/gpu/drm/xe/xe_wa.h
> index 52337405b5bc..09d065e0abb5 100644
> --- a/drivers/gpu/drm/xe/xe_wa.h
> +++ b/drivers/gpu/drm/xe/xe_wa.h
> @@ -14,7 +14,9 @@ struct xe_hw_engine;
>  struct xe_tile;
>  
>  int xe_wa_init(struct xe_gt *gt);
> +int xe_device_wa_init(struct xe_device *xe);
>  void xe_wa_process_oob(struct xe_gt *gt);
> +void xe_device_wa_process_oob(struct xe_device *xe);
>  void xe_wa_process_gt(struct xe_gt *gt);
>  void xe_wa_process_engine(struct xe_hw_engine *hwe);
>  void xe_wa_process_lrc(struct xe_hw_engine *hwe);
> @@ -22,14 +24,24 @@ void xe_wa_apply_tile_workarounds(struct xe_tile *tile);
>  void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p);
>  
>  /**
> - * XE_WA - Out-of-band workarounds, that don't fit the lifecycle any
> - *         other more specific type
> + * XE_WA - Out-of-band GT workarounds, that don't fit the lifecycle any
> + *         other more specific type,
>   * @gt__: gt instance
> - * @id__: XE_OOB_<id__>, as generated by build system in generated/xe_wa_oob.h
> + * @id__: XE_WA_OOB_<id__>, as generated by build system in generated/xe_wa_oob.h
>   */
>  #define XE_WA(gt__, id__) ({						\
>  	xe_gt_assert(gt__, (gt__)->wa_active.oob_initialized);		\
>  	test_bit(XE_WA_OOB_ ## id__, (gt__)->wa_active.oob);		\
>  })
>  
> +/**
> + * XE_DEVICE_WA - Out-of-band Device workarounds, that don't fit the lifecycle any
> + *         other more specific type
> + * @xe__: xe_device
> + * @id__: XE_SOC_WA_OOB_<id__>, as generated by build system in generated/xe_wa_oob.h

The "SOC" here is outdated.  Also the wrong header is referenced.

> + */
> +#define XE_DEVICE_WA(xe__, id__) ({					\
> +	xe_assert(xe__, (xe__)->oob_initialized);			\
> +	test_bit(XE_DEVICE_WA_OOB_ ## id__, (xe__)->oob);			\
> +})
>  #endif

This is probably fine short-term, but I think eventually we should just
have a single XE_WA() call that _Generic()'s on its parameter to do the
lookup in the proper place (gt or device).  If we don't want to _Generic
it for some reason, then the existing macro should probably get renamed
to XE_GT_WA() to clarify its purpose.


Matt

> -- 
> 2.49.0
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation


More information about the Intel-xe mailing list