[PATCH 2/3] drm/xe/xelp: Implement HSDES#16010904313 workarounds

Mon May 19 20:00:01 UTC 2025

On Mon, May 19, 2025 at 09:10:51AM +0100, Tvrtko Ursulin wrote:
> 
> On 16/05/2025 20:00, Matt Roper wrote:
> > On Fri, May 16, 2025 at 08:44:11AM +0100, Tvrtko Ursulin wrote:
> > > Add XeLP workarounds specified in HSDES#16010904313.
> > > 
> > > To do this we add the context indirect workaround page to the context
> > > state and set it up using the i915 as the programming reference.
> > 
> > We should probably split the general indirect context setup/prep into a
> > separate patch from specific workarounds like Wa_16010904313 and
> 
> I considered it but thought someone might complain why I am adding unused
> code. Anyway, I can split it out, no problem.
> 
> > > 
> > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> > > ---
> > >   drivers/gpu/drm/xe/regs/xe_engine_regs.h |   1 +
> > >   drivers/gpu/drm/xe/regs/xe_lrc_layout.h  |   7 ++
> > >   drivers/gpu/drm/xe/xe_lrc.c              | 115 +++++++++++++++++++++++
> > >   drivers/gpu/drm/xe/xe_lrc_types.h        |   3 +-
> > >   4 files changed, 125 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > > index 7ade41e2b7b3..d1d2592e010d 100644
> > > --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > > +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > > @@ -75,6 +75,7 @@
> > >   #define RING_ACTHD(base)			XE_REG((base) + 0x74)
> > >   #define RING_DMA_FADD(base)			XE_REG((base) + 0x78)
> > >   #define RING_HWS_PGA(base)			XE_REG((base) + 0x80)
> > > +#define RING_CMD_BUF_CCTL(base)			XE_REG((base) + 0x84)
> > >   #define RING_HWSTAM(base)			XE_REG((base) + 0x98)
> > >   #define RING_MI_MODE(base)			XE_REG((base) + 0x9c)
> > >   #define RING_NOPID(base)			XE_REG((base) + 0x94)
> > > diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > > index 994af591a2e8..82723f9783b1 100644
> > > --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > > +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
> > > @@ -12,6 +12,8 @@
> > >   #define CTX_RING_START			(0x08 + 1)
> > >   #define CTX_RING_CTL			(0x0a + 1)
> > >   #define CTX_BB_PER_CTX_PTR		(0x12 + 1)
> > > +#define CTX_CS_INDIRECT_CTX		(0x14 + 1)
> > > +#define CTX_CS_INDIRECT_CTX_OFFSET	(0x16 + 1)
> > >   #define CTX_TIMESTAMP			(0x22 + 1)
> > >   #define CTX_TIMESTAMP_UDW		(0x24 + 1)
> > >   #define CTX_INDIRECT_RING_STATE		(0x26 + 1)
> > > @@ -30,10 +32,15 @@
> > >   #define CTX_CS_INT_VEC_REG		0x5a
> > >   #define CTX_CS_INT_VEC_DATA		(CTX_CS_INT_VEC_REG + 1)
> > > +#define CTX_GPR0			(0x74 + 1)
> > > +#define CTX_CMD_BUF_CCTL		(0xb6 + 1)
> > > +
> > >   #define INDIRECT_CTX_RING_HEAD		(0x02 + 1)
> > >   #define INDIRECT_CTX_RING_TAIL		(0x04 + 1)
> > >   #define INDIRECT_CTX_RING_START		(0x06 + 1)
> > >   #define INDIRECT_CTX_RING_START_UDW	(0x08 + 1)
> > >   #define INDIRECT_CTX_RING_CTL		(0x0a + 1)
> > > +#define XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT	0xd
> > > +
> > >   #endif
> > > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
> > > index 08835e36a60d..6022068ce6b6 100644
> > > --- a/drivers/gpu/drm/xe/xe_lrc.c
> > > +++ b/drivers/gpu/drm/xe/xe_lrc.c
> > > @@ -39,6 +39,7 @@
> > >   #define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
> > >   #define LRC_PPHWSP_SIZE				SZ_4K
> > > +#define LRC_INDIRECT_CTX_SIZE			SZ_4K
> > >   #define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
> > >   static struct xe_device *
> > > @@ -47,6 +48,11 @@ lrc_to_xe(struct xe_lrc *lrc)
> > >   	return gt_to_xe(lrc->fence_ctx.gt);
> > >   }
> > > +static bool xe_needs_indirect_ctx(struct xe_device *xe)
> > > +{
> > > +	return GRAPHICS_VERx100(xe) >= 1200 && GRAPHICS_VERx100(xe) <= 1210;
> > 
> > I don't think any platform/IP inherently "needs" indirect context usage;
> > it's just certain workarounds are implemented using it.  So it would be
> > best if we implement each of those workarounds through the OOB
> > workaround infrastructure.  Then this function should just || together
> > XE_WA checks for all the workarounds that result in indirect context
> > usage.
> 
> Ack. I was't sure if HSDES#16010904313 was an actual defined workaround
> given there was no marking in the code, just the HSD reference in the
> commit.

Yeah, that's the workaround's lineage number, so the proper name is
Wa_16010904313.  It applies to all TGL, RLK, ADL, and DG1 (i.e.,
graphics versions 12.00 and 12.10).  It also applied to some
pre-production steppings of DG2 and PVC, but we don't support those
steppings anymore.

> 
> > > +}
> > > +
> > >   size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
> > >   {
> > >   	struct xe_device *xe = gt_to_xe(gt);
> > > @@ -79,6 +85,9 @@ size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
> > >   		size += 1 * SZ_4K;
> > >   	}
> > > +	if (xe_needs_indirect_ctx(xe))
> > > +		size += LRC_INDIRECT_CTX_SIZE;
> > > +
> > >   	/* Add indirect ring state page */
> > >   	if (xe_gt_has_indirect_ring_state(gt))
> > >   		size += LRC_INDIRECT_RING_STATE_SIZE;
> > > @@ -974,6 +983,106 @@ static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
> > >   }
> > > +static u32 *
> > > +xelp_emit_timestamp_wa(struct xe_lrc *lrc, u32 *cmd)
> > > +{
> > > +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
> > > +	*cmd++ = CS_GPR_REG(0, 0).addr;
> > > +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_TIMESTAMP * sizeof(u32);
> > > +	*cmd++ = 0;
> > > +
> > > +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
> > > +	*cmd++ = CS_GPR_REG(0, 0).addr;
> > > +	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
> > > +
> > > +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
> > > +	*cmd++ = CS_GPR_REG(0, 0).addr;
> > > +	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
> > 
> > This doesn't seem to actually match the documented workaround details.
> > The workaround says we need to do three back-to-back MI_LRM instructions
> > to read CTX_TIMESTAMP into $lrc+0x108C.  The first two are supposed to
> > have dw0[21] (i.e., async mode) enabled, and the third should have it
> > disabled.  I'm guessing this got implemented in i915 before the formal
> > workaround description was finalized, or the hardware people updated the
> > description after we'd already implemented the workaround.
> > 
> > That will also prevent us from clobbering the GPR register, which will
> > let us drop the restore_scratch function.
> 
> Nice, thanks for looking it up!
> 
> > > +
> > > +	return cmd;
> > > +}
> > > +
> > > +static u32 *
> > > +xelp_emit_restore_scratch(struct xe_lrc *lrc, u32 *cmd)
> > > +{
> > > +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
> > > +	*cmd++ = CS_GPR_REG(0, 0).addr;
> > > +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_GPR0 * sizeof(u32);
> > > +	*cmd++ = 0;
> > > +
> > > +	return cmd;
> > > +}
> > > +
> > > +static u32 *
> > > +xelp_emit_cmd_buf_wa(struct xe_lrc *lrc, u32 *cmd)
> > > +{
> > > +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
> > > +	*cmd++ = CS_GPR_REG(0, 0).addr;
> > > +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc)+ CTX_CMD_BUF_CCTL * sizeof(u32);
> > > +	*cmd++ = 0;
> > > +
> > > +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRR_DST_CS_MMIO;
> > > +	*cmd++ = CS_GPR_REG(0, 0).addr;
> > > +	*cmd++ = RING_CMD_BUF_CCTL(0).addr;
> > 
> > I don't see any documented workaround asking us to do this restoration
> > of the CMD_BUF_CCTL register.  I suspect the i915 implementation was
> > either something for pre-production hardware that never should have
> > stuck around in the driver, or it was hacking around some other driver
> > bug.  Are you finding this to actually be necessary for something today?
> 
> I am just copying from i915 where it looked b8a1181122f7 ("drm/i915: Use
> indirect ctx bb to mend CMD_BUF_CCTL") landed suspiciously close to
> 685d21096f6c ("drm/i915: Add per ctx batchbuffer wa for timestamp"), and as
> former had no WA or HSD mentions I suspected both could be part of the same,
> potentially security sensitive, work.
> 
> Would it be possible to search the WA database for workarounds touching
> CMD_BUF_CCTL? In case it has it's own designation which was not documented
> in the code.

I tracked it down; this was a pre-production workaround (officially
known as Wa_1607225878) that applied to A-steppings of both TGL and DG1.
It never applied to production steppings of those platforms, and never
applied to any steppings of the other gen12 platforms (RKL and ADL).

It does apply to production steppings of ICL (gen11), but that's not
relevant to the Xe driver.

> 
> > > +
> > > +	return cmd;
> > > +}
> > > +
> > > +static u32 *
> > > +xelp_setup_indirect_ctx_rcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > > +			    u32 *cmd)
> > > +{
> > > +	/* HSDES#16010904313 */
> > > +	cmd = xelp_emit_timestamp_wa(lrc, cmd);
> > > +	cmd = xelp_emit_cmd_buf_wa(lrc, cmd);
> > > +	cmd = xelp_emit_restore_scratch(lrc, cmd);
> > > +
> > > +	return cmd;
> > > +}
> > > +
> > > +static u32 *
> > > +xelp_setup_indirect_ctx_xcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > > +			    u32 *cmd)
> > > +{
> > > +	/* HSDES#16010904313 */
> > > +	cmd = xelp_emit_timestamp_wa(lrc, cmd);
> > > +	cmd = xelp_emit_restore_scratch(lrc, cmd);
> > > +
> > > +	return cmd;
> > > +}
> > > +
> > > +static void
> > > +xelp_setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
> > > +{
> > > +	u32 *start, *cmd, *regs;
> > > +	struct iosys_map map;
> > > +	u32 ggtt, offset;
> > > +
> > > +	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
> > > +		return;
> > > +
> > > +	offset = lrc->size - LRC_INDIRECT_CTX_SIZE;
> > > +	ggtt = xe_bo_ggtt_addr(lrc->bo) + offset;
> > > +
> > > +	map = lrc->bo->vmap;
> > > +	iosys_map_incr(&map, offset);
> > > +	start = cmd = map.vaddr;
> > 
> > Isn't this going to be a problem if the iosys_map is holding iomem?  If
> > we're going to build the indirect context batch directly with CPU
> > pointer accesses as the functions above do, don't we need to do that in
> > a separate buffer and then copy it into the iosys_map?
> 
> I thought it was good enough for now given it is only for integrated GPUs.

Your patch as written is matching on graphics version 12.10, which means
DG1 is included as well as the igpus.

Even if we ignore DG1, it's probably best not to break the iosys_map
abstraction for igpus (or at least not without big comment discalimers
on the code) because it's the kind of code that someone may reuse and or
copy/paste in the future to implement some other workaround without
realizing that it's a problem for iomem.

> 
> > > +
> > > +	if (hwe->class == XE_ENGINE_CLASS_RENDER)
> > > +		cmd = xelp_setup_indirect_ctx_rcs(lrc, hwe, cmd);
> > 
> > This also deviates from the documented workaround.  According to the
> > workaround database, INDIRECT_CTX should be used for RCS/CCS, but we're
> > supposed to use BB_PER_CTX_PTR instead for RCS/CCS (both are similar
> > mechanisms, but INDIRECT_CTX happens somewhere in the middle of a
> > context restore, whereas BB_PER_CTX_PTR happens at the very end of the
> > context restore).
> 
> For 16010904313 specifically? And did you mean to describe RCS/CCS vs other
> engines should use a different mechanism? If so which for which?

Woops sorry, I mistyped.  For Wa_16010904313, the workaround says:

    For BCS/VCS/VECS:
      In the Per-Context WABB
    For RCS/CCS:
      In the Indirect Context Pointer

Matt

> 
> Regards,
> 
> Tvrtko
> 
> > > +	else
> > > +		cmd = xelp_setup_indirect_ctx_xcs(lrc, hwe, cmd);
> > > +
> > > +	while ((unsigned long)cmd & 0x3f) /* Align to 64B cacheline. */
> > > +		*cmd++ = MI_NOOP;
> > > +
> > > +	map = __xe_lrc_regs_map(lrc);
> > > +	regs = map.vaddr;
> > > +
> > > +	regs[CTX_CS_INDIRECT_CTX] = ggtt | ((cmd - start) * sizeof(u32) / 64); /* Size in CLs. */
> > > +	regs[CTX_CS_INDIRECT_CTX_OFFSET] = XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
> > > +}
> > > +
> > >   #define PVC_CTX_ASID		(0x2e + 1)
> > >   #define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
> > > @@ -997,6 +1106,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > >   	lrc->size = ring_size + lrc_size;
> > >   	lrc->ring.size = ring_size;
> > >   	lrc->ring.tail = 0;
> > > +
> > > +	if (xe_needs_indirect_ctx(xe))
> > > +		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
> > > +
> > >   	if (xe_gt_has_indirect_ring_state(gt))
> > >   		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
> > > @@ -1127,6 +1240,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
> > >   	xe_lrc_setup_utilization(lrc);
> > > +	xelp_setup_indirect_ctx(lrc, hwe);
> > > +
> > >   	return 0;
> > >   err_lrc_finish:
> > > diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
> > > index 559c7c831212..9ce7d02ef210 100644
> > > --- a/drivers/gpu/drm/xe/xe_lrc_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
> > > @@ -29,7 +29,8 @@ struct xe_lrc {
> > >   	struct xe_gt *gt;
> > >   	/** @flags: LRC flags */
> > > -#define XE_LRC_FLAG_INDIRECT_RING_STATE		0x1
> > > +#define XE_LRC_FLAG_INDIRECT_CTX		0x1
> > > +#define XE_LRC_FLAG_INDIRECT_RING_STATE		0x2
> > >   	u32 flags;
> > >   	/** @refcount: ref count of this lrc */
> > > -- 
> > > 2.48.0
> > > 
> > 
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation