[PATCH v3] drm/xe: Apply Wa_16023105232

Matt Roper matthew.d.roper at intel.com
Tue Mar 11 21:33:03 UTC 2025


On Mon, Mar 10, 2025 at 11:55:52AM -0700, Vinay Belgaumkar wrote:
> The WA requires KMD to disable DOP clock gating during a semaphore
> wait and also ensure that idle delay for every CS is lower than the
> idle wait time in the PWRCTX_MAXCNT register. Default values for these
> registers already comply with this restriction.

I'm not sure about the default value statement here.  For 0xD00, the
default value is already correct for Xe3, but it is _not_ correct for
Xe2.  For the delay values, the default for MAXCNT is ~6us, but there's
no default value documented for IDLEDLY, so we can't really say whether
it will be higher or lower by default (and it may even vary by platform,
SKU, or firmware version).

> 
> v2: Store timestamp_base in gt info and other comments (Daniele)
> v3: Skip WA check for VF
> 
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar at intel.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_engine_regs.h |  2 ++
>  drivers/gpu/drm/xe/xe_gt_clock.c         | 38 +++++++++++++++++++++---
>  drivers/gpu/drm/xe/xe_gt_types.h         |  2 ++
>  drivers/gpu/drm/xe/xe_hw_engine.c        | 31 +++++++++++++++++++
>  drivers/gpu/drm/xe/xe_wa_oob.rules       |  2 ++
>  5 files changed, 71 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> index 4f372dc2cb89..067468c62adb 100644
> --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> @@ -130,6 +130,8 @@
>  #define RING_EXECLIST_STATUS_LO(base)		XE_REG((base) + 0x234)
>  #define RING_EXECLIST_STATUS_HI(base)		XE_REG((base) + 0x234 + 4)
>  
> +#define RING_IDLEDLY(base)			XE_REG((base) + 0x23C)

Nitpick: for consistency with other definitions we should use lowercase
hex (0x23c).

> +
>  #define RING_CONTEXT_CONTROL(base)		XE_REG((base) + 0x244, XE_REG_OPTION_MASKED)
>  #define	  CTX_CTRL_PXP_ENABLE			REG_BIT(10)
>  #define	  CTX_CTRL_OAC_CONTEXT_ENABLE		REG_BIT(8)
> diff --git a/drivers/gpu/drm/xe/xe_gt_clock.c b/drivers/gpu/drm/xe/xe_gt_clock.c
> index 2a958c92d8ea..0394292e777e 100644
> --- a/drivers/gpu/drm/xe/xe_gt_clock.c
> +++ b/drivers/gpu/drm/xe/xe_gt_clock.c
> @@ -16,12 +16,13 @@
>  #include "xe_macros.h"
>  #include "xe_mmio.h"
>  
> +#define f19_2_mhz	19200000
> +#define f24_mhz		24000000
> +#define f25_mhz		25000000
> +#define f38_4_mhz	38400000
> +
>  static u32 get_crystal_clock_freq(u32 rpm_config_reg)
>  {
> -	const u32 f19_2_mhz = 19200000;
> -	const u32 f24_mhz = 24000000;
> -	const u32 f25_mhz = 25000000;
> -	const u32 f38_4_mhz = 38400000;
>  	u32 crystal_clock = REG_FIELD_GET(RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_MASK,
>  					  rpm_config_reg);
>  
> @@ -40,10 +41,37 @@ static u32 get_crystal_clock_freq(u32 rpm_config_reg)
>  	}
>  }
>  
> +/**
> + * xe_gt_timestamp_base - Return the GT timestamp base
> + *
> + * @crystal_clock_freq: clock freq from 0xd00 register
> + * Returns: rounded down time in nsec
> + */
> +static u32 xe_gt_timestamp_base(u32 crystal_clock_freq)
> +{
> +	const u32 ts_base_83 = 83333;
> +	const u32 ts_base_52 = 52083;
> +	const u32 ts_base_80 = 80000;
> +
> +	switch (crystal_clock_freq) {
> +	case f24_mhz:
> +		return DIV_ROUND_CLOSEST(ts_base_83, 1000);
> +	case f19_2_mhz:
> +	case f38_4_mhz:
> +		return DIV_ROUND_CLOSEST(ts_base_52, 1000);
> +	case f25_mhz:
> +		return DIV_ROUND_CLOSEST(ts_base_80, 1000);

Do we want to truncate these values down to full nanoseconds here?  It
seems like that might cause a small amount of unnecessary rounding error
when we later use the value in multiplication.  Maybe it would be better
to leave this in picoseconds for now, and only convert to nanoseconds
later after we're done doing the math?

> +	default:
> +		XE_WARN_ON("NOT_POSSIBLE");
> +		return 0;
> +	}
> +}
> +
>  int xe_gt_clock_init(struct xe_gt *gt)
>  {
>  	u32 c0 = xe_mmio_read32(&gt->mmio, RPM_CONFIG0);
>  	u32 freq = 0;
> +	u32 timestamp_base = 0;
>  
>  	/*
>  	 * CTC_MODE[0] = 1 is definitely not supported for Xe2 and later
> @@ -59,6 +87,7 @@ int xe_gt_clock_init(struct xe_gt *gt)
>  		xe_gt_warn(gt, "CTC_MODE[0] is set; this is unexpected and undocumented\n");
>  
>  	freq = get_crystal_clock_freq(c0);
> +	timestamp_base = xe_gt_timestamp_base(freq);

Would it be simpler to combine these two functions that both read from
0xd00 into a single call that returns both values from the table on
bspec 72056?  E.g.,

        read_crystal_clock(c0, &freq, &gt->info.timestamp_base);

>  
>  	/*
>  	 * Now figure out how the command stream's timestamp
> @@ -68,6 +97,7 @@ int xe_gt_clock_init(struct xe_gt *gt)
>  	freq >>= 3 - REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, c0);
>  
>  	gt->info.reference_clock = freq;
> +	gt->info.timestamp_base = timestamp_base;
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index e3cfb026ac88..7def0959da35 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -121,6 +121,8 @@ struct xe_gt {
>  		enum xe_gt_type type;
>  		/** @info.reference_clock: clock frequency */
>  		u32 reference_clock;
> +		/** @info.timestamp_base: GT timestamp base */
> +		u32 timestamp_base;
>  		/**
>  		 * @info.engine_mask: mask of engines present on GT. Some of
>  		 * them may be reserved in runtime and not available for user.
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index 223b95de388c..47dfc403b71d 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -8,7 +8,9 @@
>  #include <linux/nospec.h>
>  
>  #include <drm/drm_managed.h>
> +#include <drm/drm_print.h>
>  #include <uapi/drm/xe_drm.h>
> +#include <generated/xe_wa_oob.h>
>  
>  #include "regs/xe_engine_regs.h"
>  #include "regs/xe_gt_regs.h"
> @@ -21,6 +23,7 @@
>  #include "xe_gsc.h"
>  #include "xe_gt.h"
>  #include "xe_gt_ccs_mode.h"
> +#include "xe_gt_clock.h"
>  #include "xe_gt_printk.h"
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_topology.h"
> @@ -458,6 +461,12 @@ hw_engine_setup_default_state(struct xe_hw_engine *hwe)
>  		  XE_RTP_ACTIONS(SET(CSFE_CHICKEN1(0), CS_PRIORITY_MEM_READ,
>  				     XE_RTP_ACTION_FLAG(ENGINE_BASE)))
>  		},
> +		{ XE_RTP_NAME("Disable DOP clk gating"),
> +		  XE_RTP_RULES(MEDIA_VERSION_RANGE(1301, 3000), OR,
> +			       GRAPHICS_VERSION_RANGE(2001, 3001)),
> +		  XE_RTP_ACTIONS(SET(RING_PSMI_CTL(0), RC_SEMA_IDLE_MSG_DISABLE,
> +				     XE_RTP_ACTION_FLAG(ENGINE_BASE)))
> +		},

Since this is part of an actual workaround we should put this in xe_wa.c
rather than here.  Also, this is only needed on Xe2 platforms; on Xe3
the register already has the necessary value as long as we don't touch
it.

>  	};
>  
>  	xe_rtp_process_to_sr(&ctx, engine_entries, ARRAY_SIZE(engine_entries), &hwe->reg_sr);
> @@ -564,6 +573,24 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  	xe_reg_whitelist_process_engine(hwe);
>  }
>  
> +static void check_wa_16023105232(struct xe_gt *gt, struct xe_hw_engine *hwe)

"check" makes me think this is going to be a function that returns
true/false and has no side effects, but that's not the case here.  It
would be better to name this function for the action it's actually
taking, e.g., "adjust_idledly" or "set_idledy_lower_than_maxcnt."  Also
we should just be able to pass just the hwe as a parameter since we can
obtain the GT from that.


> +{
> +	u32 idledly, maxcnt;
> +	u32 idledly_units_ns = 8 * gt->info.timestamp_base;
> +	u32 maxcnt_units_ns = 640;
> +
> +	if (XE_WA(gt, 16023105232)) {
> +		idledly = xe_mmio_read32(&gt->mmio, RING_IDLEDLY(hwe->mmio_base));
> +		maxcnt = xe_mmio_read32(&gt->mmio, RING_PWRCTX_MAXCNT(hwe->mmio_base));
> +
> +		if (drm_WARN_ON(&gt_to_xe(gt)->drm,

We can use xe_gt_WARN_ON() for better GT-specific output.  But why do we
need a warning at all?  This is basically what this workaround is asking
us to do --- reprogram IDLEDLY such that its value is lower than
MAXCNT's --- so I'd expect there to be legitimate cases where this isn't
true before we apply the workaround, otherwise they wouldn't have needed
to create the workaround.

> +				(idledly * idledly_units_ns) >= (maxcnt * maxcnt_units_ns))) {
> +			xe_mmio_write32(&gt->mmio, RING_IDLEDLY(hwe->mmio_base),
> +					((maxcnt - 1) * maxcnt_units_ns) / idledly_units_ns);
> +		}
> +	}
> +}
> +
>  static int hw_engine_init(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  			  enum xe_hw_engine_id id)
>  {
> @@ -604,6 +631,10 @@ static int hw_engine_init(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  	if (xe->info.has_usm && hwe->class == XE_ENGINE_CLASS_COPY)
>  		gt->usm.reserved_bcs_instance = hwe->instance;
>  
> +	/* Ensure IDLEDLY is lower than MAXCNT */
> +	if (!IS_SRIOV_VF(xe))
> +		check_wa_16023105232(gt, hwe);

As noted above, we can adjust the name of the function to indicate what
it actually does, and we can also move the XE_WA check to here:

        if (XE_WA(gt, 16023105232))
                adjust_idledly(hwe);

I believe the not-VF check should already be handled by the XE_WA check.


Matt

> +
>  	return devm_add_action_or_reset(xe->drm.dev, hw_engine_fini, hwe);
>  
>  err_hwsp:
> diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
> index e0c5fa460487..0c738af24f7c 100644
> --- a/drivers/gpu/drm/xe/xe_wa_oob.rules
> +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
> @@ -53,3 +53,5 @@ no_media_l3	MEDIA_VERSION(3000)
>  		GRAPHICS_VERSION_RANGE(1270, 1274)
>  1508761755	GRAPHICS_VERSION(1255)
>  		GRAPHICS_VERSION(1260), GRAPHICS_STEP(A0, B0)
> +16023105232	GRAPHICS_VERSION_RANGE(2001, 3001)
> +		MEDIA_VERSION_RANGE(1301, 3000)
> -- 
> 2.38.1
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation


More information about the Intel-xe mailing list