[PATCH 2/3] drm/xe/xelp: Implement HSDES#16010904313 workarounds

Matt Roper matthew.d.roper at intel.com
Fri May 16 19:00:00 UTC 2025


On Fri, May 16, 2025 at 08:44:11AM +0100, Tvrtko Ursulin wrote:
> Add XeLP workarounds specified in HSDES#16010904313.
> 
> To do this we add the context indirect workaround page to the context
> state and set it up using the i915 as the programming reference.

We should probably split the general indirect context setup/prep into a
separate patch from specific workarounds like Wa_16010904313.

> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> ---
>  drivers/gpu/drm/xe/regs/xe_engine_regs.h |   1 +
>  drivers/gpu/drm/xe/regs/xe_lrc_layout.h  |   7 ++
>  drivers/gpu/drm/xe/xe_lrc.c              | 115 +++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_lrc_types.h        |   3 +-
>  4 files changed, 125 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> index 7ade41e2b7b3..d1d2592e010d 100644
> --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> @@ -75,6 +75,7 @@
>  #define RING_ACTHD(base)			XE_REG((base) + 0x74)
>  #define RING_DMA_FADD(base)			XE_REG((base) + 0x78)
>  #define RING_HWS_PGA(base)			XE_REG((base) + 0x80)
> +#define RING_CMD_BUF_CCTL(base)			XE_REG((base) + 0x84)
>  #define RING_HWSTAM(base)			XE_REG((base) + 0x98)
>  #define RING_MI_MODE(base)			XE_REG((base) + 0x9c)
>  #define RING_NOPID(base)			XE_REG((base) + 0x94)
> diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> index 994af591a2e8..82723f9783b1 100644
> --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> @@ -12,6 +12,8 @@
>  #define CTX_RING_START			(0x08 + 1)
>  #define CTX_RING_CTL			(0x0a + 1)
>  #define CTX_BB_PER_CTX_PTR		(0x12 + 1)
> +#define CTX_CS_INDIRECT_CTX		(0x14 + 1)
> +#define CTX_CS_INDIRECT_CTX_OFFSET	(0x16 + 1)
>  #define CTX_TIMESTAMP			(0x22 + 1)
>  #define CTX_TIMESTAMP_UDW		(0x24 + 1)
>  #define CTX_INDIRECT_RING_STATE		(0x26 + 1)
> @@ -30,10 +32,15 @@
>  #define CTX_CS_INT_VEC_REG		0x5a
>  #define CTX_CS_INT_VEC_DATA		(CTX_CS_INT_VEC_REG + 1)
>  
> +#define CTX_GPR0			(0x74 + 1)
> +#define CTX_CMD_BUF_CCTL		(0xb6 + 1)
> +
>  #define INDIRECT_CTX_RING_HEAD		(0x02 + 1)
>  #define INDIRECT_CTX_RING_TAIL		(0x04 + 1)
>  #define INDIRECT_CTX_RING_START		(0x06 + 1)
>  #define INDIRECT_CTX_RING_START_UDW	(0x08 + 1)
>  #define INDIRECT_CTX_RING_CTL		(0x0a + 1)
>  
> +#define XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT	0xd
> +
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> index 08835e36a60d..6022068ce6b6 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.c
> +++ b/drivers/gpu/drm/xe/xe_lrc.c
> @@ -39,6 +39,7 @@
>  #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
>  
>  #define LRC_PPHWSP_SIZE				SZ_4K
> +#define LRC_INDIRECT_CTX_SIZE			SZ_4K
>  #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
>  
>  static struct xe_device *
> @@ -47,6 +48,11 @@ lrc_to_xe(struct xe_lrc *lrc)
>  	return gt_to_xe(lrc->fence_ctx.gt);
>  }
>  
> +static bool xe_needs_indirect_ctx(struct xe_device *xe)
> +{
> +	return GRAPHICS_VERx100(xe) >= 1200 && GRAPHICS_VERx100(xe) <= 1210;

I don't think any platform/IP inherently "needs" indirect context usage;
it's just that certain workarounds are implemented using it.  So it would
be best if we implemented each of those workarounds through the OOB
workaround infrastructure.  Then this function should just || together
XE_WA checks for all the workarounds that result in indirect context
usage.
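
As a rough sketch, assuming Wa_16010904313 gets an entry in
xe_wa_oob.rules (note this also means taking a gt rather than an
xe_device so we can do the XE_WA lookup):

static bool xe_lrc_needs_indirect_ctx(struct xe_gt *gt)
{
	/* || together every OOB workaround implemented via INDIRECT_CTX */
	return XE_WA(gt, 16010904313);
}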


> +}
> +
>  size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
>  {
>  	struct xe_device *xe = gt_to_xe(gt);
> @@ -79,6 +85,9 @@ size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
>  		size += 1 * SZ_4K;
>  	}
>  
> +	if (xe_needs_indirect_ctx(xe))
> +		size += LRC_INDIRECT_CTX_SIZE;
> +
>  	/* Add indirect ring state page */
>  	if (xe_gt_has_indirect_ring_state(gt))
>  		size += LRC_INDIRECT_RING_STATE_SIZE;
> @@ -974,6 +983,106 @@ static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
>  
>  }
>  
> +static u32 *
> +xelp_emit_timestamp_wa(struct xe_lrc *lrc, u32 *cmd)
> +{
> +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
> +	*cmd++ = CS_GPR_REG(0, 0).addr;
> +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_TIMESTAMP * sizeof(u32);
> +	*cmd++ = 0;
> +
> +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
> +	*cmd++ = CS_GPR_REG(0, 0).addr;
> +	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
> +
> +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
> +	*cmd++ = CS_GPR_REG(0, 0).addr;
> +	*cmd++ = RING_CTX_TIMESTAMP(0).addr;

This doesn't seem to actually match the documented workaround details.
The workaround says we need to do three back-to-back MI_LRM instructions
that reload CTX_TIMESTAMP from $lrc+0x108C.  The first two are supposed
to have dw0[21] (i.e., async mode) enabled, and the third should have it
disabled.  I'm guessing this got implemented in i915 before the formal
workaround description was finalized, or the hardware people updated the
description after we'd already implemented the workaround.

Following the documented sequence will also prevent us from clobbering
the GPR register, which will let us drop the restore_scratch function.
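
An untested sketch of the documented sequence; MI_LRM_ASYNC is an
assumed name for dw0[21], which we don't currently define in xe:

#define MI_LRM_ASYNC			REG_BIT(21) /* assumed; dw0[21] */

static u32 *emit_wa_16010904313(struct xe_lrc *lrc, u32 *cmd)
{
	const u32 ts = __xe_lrc_regs_ggtt_addr(lrc) +
		       CTX_TIMESTAMP * sizeof(u32);
	int i;

	/* Three back-to-back LRMs: first two async, third synchronous */
	for (i = 0; i < 3; i++) {
		*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT |
			 MI_LRI_LRM_CS_MMIO |
			 (i < 2 ? MI_LRM_ASYNC : 0);
		*cmd++ = RING_CTX_TIMESTAMP(0).addr;
		*cmd++ = ts;
		*cmd++ = 0;
	}

	return cmd;
}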

> +
> +	return cmd;
> +}
> +
> +static u32 *
> +xelp_emit_restore_scratch(struct xe_lrc *lrc, u32 *cmd)
> +{
> +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
> +	*cmd++ = CS_GPR_REG(0, 0).addr;
> +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_GPR0 * sizeof(u32);
> +	*cmd++ = 0;
> +
> +	return cmd;
> +}
> +
> +static u32 *
> +xelp_emit_cmd_buf_wa(struct xe_lrc *lrc, u32 *cmd)
> +{
> +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
> +	*cmd++ = CS_GPR_REG(0, 0).addr;
> +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_CMD_BUF_CCTL * sizeof(u32);
> +	*cmd++ = 0;
> +
> +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRR_DST_CS_MMIO;
> +	*cmd++ = CS_GPR_REG(0, 0).addr;
> +	*cmd++ = RING_CMD_BUF_CCTL(0).addr;

I don't see any documented workaround asking us to do this restoration
of the CMD_BUF_CCTL register.  I suspect the i915 implementation was
either something for pre-production hardware that never should have
stuck around in the driver, or it was hacking around some other driver
bug.  Are you finding this to actually be necessary for something today?

> +
> +	return cmd;
> +}
> +
> +static u32 *
> +xelp_setup_indirect_ctx_rcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> +			    u32 *cmd)
> +{
> +	/* HSDES#16010904313 */
> +	cmd = xelp_emit_timestamp_wa(lrc, cmd);
> +	cmd = xelp_emit_cmd_buf_wa(lrc, cmd);
> +	cmd = xelp_emit_restore_scratch(lrc, cmd);
> +
> +	return cmd;
> +}
> +
> +static u32 *
> +xelp_setup_indirect_ctx_xcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> +			    u32 *cmd)
> +{
> +	/* HSDES#16010904313 */
> +	cmd = xelp_emit_timestamp_wa(lrc, cmd);
> +	cmd = xelp_emit_restore_scratch(lrc, cmd);
> +
> +	return cmd;
> +}
> +
> +static void
> +xelp_setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
> +{
> +	u32 *start, *cmd, *regs;
> +	struct iosys_map map;
> +	u32 ggtt, offset;
> +
> +	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
> +		return;
> +
> +	offset = lrc->size - LRC_INDIRECT_CTX_SIZE;
> +	ggtt = xe_bo_ggtt_addr(lrc->bo) + offset;
> +
> +	map = lrc->bo->vmap;
> +	iosys_map_incr(&map, offset);
> +	start = cmd = map.vaddr;

Isn't this going to be a problem if the iosys_map is holding iomem?  If
we're going to build the indirect context batch directly with CPU
pointer accesses as the functions above do, don't we need to do that in
a separate buffer and then copy it into the iosys_map?
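
E.g., a minimal sketch using the existing xe_map_memcpy_to() helper
(allocation and error handling here are just illustrative, and the
MI_NOOP padding would need to happen in the temporary buffer too):

	u32 *buf, *cmd;

	buf = kmalloc(LRC_INDIRECT_CTX_SIZE, GFP_KERNEL);
	if (!buf)
		return;

	cmd = hwe->class == XE_ENGINE_CLASS_RENDER ?
		xelp_setup_indirect_ctx_rcs(lrc, hwe, buf) :
		xelp_setup_indirect_ctx_xcs(lrc, hwe, buf);

	while ((cmd - buf) % 16)	/* pad to a 64B cacheline */
		*cmd++ = MI_NOOP;

	/* Copy through the iosys_map so iomem is handled correctly */
	xe_map_memcpy_to(lrc_to_xe(lrc), &map, 0, buf,
			 (cmd - buf) * sizeof(u32));
	kfree(buf);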


> +
> +	if (hwe->class == XE_ENGINE_CLASS_RENDER)
> +		cmd = xelp_setup_indirect_ctx_rcs(lrc, hwe, cmd);

This also deviates from the documented workaround.  According to the
workaround database, INDIRECT_CTX should be used for RCS/CCS, but we're
supposed to use BB_PER_CTX_PTR instead for the other engine classes
(both are similar mechanisms, but INDIRECT_CTX happens somewhere in the
middle of a context restore, whereas BB_PER_CTX_PTR happens at the very
end of the context restore).
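
For the non-RCS/CCS engines that would look roughly like the following,
borrowing i915's PER_CTX_BB_* bit names (which xe doesn't define yet);
note that, unlike the INDIRECT_CTX buffer, I believe a per-context
batch also has to be terminated with MI_BATCH_BUFFER_END:

#define PER_CTX_BB_VALID		BIT(0)
#define PER_CTX_BB_FORCE		BIT(2)

	*cmd++ = MI_BATCH_BUFFER_END;

	regs[CTX_BB_PER_CTX_PTR] = ggtt | PER_CTX_BB_FORCE |
				   PER_CTX_BB_VALID;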


Matt

> +	else
> +		cmd = xelp_setup_indirect_ctx_xcs(lrc, hwe, cmd);
> +
> +	while ((unsigned long)cmd & 0x3f) /* Align to 64B cacheline. */
> +		*cmd++ = MI_NOOP;
> +
> +	map = __xe_lrc_regs_map(lrc);
> +	regs = map.vaddr;
> +
> +	regs[CTX_CS_INDIRECT_CTX] = ggtt | ((cmd - start) * sizeof(u32) / 64); /* Size in CLs. */
> +	regs[CTX_CS_INDIRECT_CTX_OFFSET] = XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
> +}
> +
>  #define PVC_CTX_ASID		(0x2e + 1)
>  #define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
>  
> @@ -997,6 +1106,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>  	lrc->size = ring_size + lrc_size;
>  	lrc->ring.size = ring_size;
>  	lrc->ring.tail = 0;
> +
> +	if (xe_needs_indirect_ctx(xe))
> +		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> +
>  	if (xe_gt_has_indirect_ring_state(gt))
>  		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
>  
> @@ -1127,6 +1240,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>  
>  	xe_lrc_setup_utilization(lrc);
>  
> +	xelp_setup_indirect_ctx(lrc, hwe);
> +
>  	return 0;
>  
>  err_lrc_finish:
> diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
> index 559c7c831212..9ce7d02ef210 100644
> --- a/drivers/gpu/drm/xe/xe_lrc_types.h
> +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
> @@ -29,7 +29,8 @@ struct xe_lrc {
>  	struct xe_gt *gt;
>  
>  	/** @flags: LRC flags */
> -#define XE_LRC_FLAG_INDIRECT_RING_STATE		0x1
> +#define XE_LRC_FLAG_INDIRECT_CTX		0x1
> +#define XE_LRC_FLAG_INDIRECT_RING_STATE		0x2
>  	u32 flags;
>  
>  	/** @refcount: ref count of this lrc */
> -- 
> 2.48.0
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation

