[PATCH 2/3] drm/xe/xelp: Implement HSDES#16010904313 workarounds
Tvrtko Ursulin
tvrtko.ursulin at igalia.com
Mon May 19 08:10:51 UTC 2025
On 16/05/2025 20:00, Matt Roper wrote:
> On Fri, May 16, 2025 at 08:44:11AM +0100, Tvrtko Ursulin wrote:
>> Add XeLP workarounds specified in HSDES#16010904313.
>>
>> To do this we add the context indirect workaround page to the context
>> state and set it up using the i915 as the programming reference.
>
> We should probably split the general indirect context setup/prep into a
> separate patch from specific workarounds like Wa_16010904313 and
I considered it but thought someone might complain why I am adding
unused code. Anyway, I can split it out, no problem.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>> ---
>> drivers/gpu/drm/xe/regs/xe_engine_regs.h | 1 +
>> drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 7 ++
>> drivers/gpu/drm/xe/xe_lrc.c | 115 +++++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_lrc_types.h | 3 +-
>> 4 files changed, 125 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> index 7ade41e2b7b3..d1d2592e010d 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> @@ -75,6 +75,7 @@
>> #define RING_ACTHD(base) XE_REG((base) + 0x74)
>> #define RING_DMA_FADD(base) XE_REG((base) + 0x78)
>> #define RING_HWS_PGA(base) XE_REG((base) + 0x80)
>> +#define RING_CMD_BUF_CCTL(base) XE_REG((base) + 0x84)
>> #define RING_HWSTAM(base) XE_REG((base) + 0x98)
>> #define RING_MI_MODE(base) XE_REG((base) + 0x9c)
>> #define RING_NOPID(base) XE_REG((base) + 0x94)
>> diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> index 994af591a2e8..82723f9783b1 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> @@ -12,6 +12,8 @@
>> #define CTX_RING_START (0x08 + 1)
>> #define CTX_RING_CTL (0x0a + 1)
>> #define CTX_BB_PER_CTX_PTR (0x12 + 1)
>> +#define CTX_CS_INDIRECT_CTX (0x14 + 1)
>> +#define CTX_CS_INDIRECT_CTX_OFFSET (0x16 + 1)
>> #define CTX_TIMESTAMP (0x22 + 1)
>> #define CTX_TIMESTAMP_UDW (0x24 + 1)
>> #define CTX_INDIRECT_RING_STATE (0x26 + 1)
>> @@ -30,10 +32,15 @@
>> #define CTX_CS_INT_VEC_REG 0x5a
>> #define CTX_CS_INT_VEC_DATA (CTX_CS_INT_VEC_REG + 1)
>>
>> +#define CTX_GPR0 (0x74 + 1)
>> +#define CTX_CMD_BUF_CCTL (0xb6 + 1)
>> +
>> #define INDIRECT_CTX_RING_HEAD (0x02 + 1)
>> #define INDIRECT_CTX_RING_TAIL (0x04 + 1)
>> #define INDIRECT_CTX_RING_START (0x06 + 1)
>> #define INDIRECT_CTX_RING_START_UDW (0x08 + 1)
>> #define INDIRECT_CTX_RING_CTL (0x0a + 1)
>>
>> +#define XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0xd
>> +
>> #endif
>> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>> index 08835e36a60d..6022068ce6b6 100644
>> --- a/drivers/gpu/drm/xe/xe_lrc.c
>> +++ b/drivers/gpu/drm/xe/xe_lrc.c
>> @@ -39,6 +39,7 @@
>> #define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
>>
>> #define LRC_PPHWSP_SIZE SZ_4K
>> +#define LRC_INDIRECT_CTX_SIZE SZ_4K
>> #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
>>
>> static struct xe_device *
>> @@ -47,6 +48,11 @@ lrc_to_xe(struct xe_lrc *lrc)
>> return gt_to_xe(lrc->fence_ctx.gt);
>> }
>>
>> +static bool xe_needs_indirect_ctx(struct xe_device *xe)
>> +{
>> + return GRAPHICS_VERx100(xe) >= 1200 && GRAPHICS_VERx100(xe) <= 1210;
>
> I don't think any platform/IP inherently "needs" indirect context usage;
> it's just certain workarounds are implemented using it. So it would be
> best if we implement each of those workarounds through the OOB
> workaround infrastructure. Then this function should just || together
> XE_WA checks for all the workarounds that result in indirect context
> usage.
Ack. I was't sure if HSDES#16010904313 was an actual defined workaround
given there was no marking in the code, just the HSD reference in the
commit.
>> +}
>> +
>> size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
>> {
>> struct xe_device *xe = gt_to_xe(gt);
>> @@ -79,6 +85,9 @@ size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
>> size += 1 * SZ_4K;
>> }
>>
>> + if (xe_needs_indirect_ctx(xe))
>> + size += LRC_INDIRECT_CTX_SIZE;
>> +
>> /* Add indirect ring state page */
>> if (xe_gt_has_indirect_ring_state(gt))
>> size += LRC_INDIRECT_RING_STATE_SIZE;
>> @@ -974,6 +983,106 @@ static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
>>
>> }
>>
>> +static u32 *
>> +xelp_emit_timestamp_wa(struct xe_lrc *lrc, u32 *cmd)
>> +{
>> + *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
>> + *cmd++ = CS_GPR_REG(0, 0).addr;
>> + *cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_TIMESTAMP * sizeof(u32);
>> + *cmd++ = 0;
>> +
>> + *cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
>> + *cmd++ = CS_GPR_REG(0, 0).addr;
>> + *cmd++ = RING_CTX_TIMESTAMP(0).addr;
>> +
>> + *cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
>> + *cmd++ = CS_GPR_REG(0, 0).addr;
>> + *cmd++ = RING_CTX_TIMESTAMP(0).addr;
>
> This doesn't seem to actually match the documented workaround details.
> The workaround says we need to do three back-to-back MI_LRM instructions
> to read CTX_TIMESTAMP into $lrc+0x108C. The first two are supposed to
> have dw0[21] (i.e., async mode) enabled, and the third should have it
> disabled. I'm guessing this got implemented in i915 before the formal
> workaround description was finalized, or the hardware people updated the
> description after we'd already implemented the workaround.
>
> That will also prevent us from clobbering the GPR register, which will
> let us drop the restore_scratch function.
Nice, thanks for looking it up!
>> +
>> + return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_emit_restore_scratch(struct xe_lrc *lrc, u32 *cmd)
>> +{
>> + *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
>> + *cmd++ = CS_GPR_REG(0, 0).addr;
>> + *cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_GPR0 * sizeof(u32);
>> + *cmd++ = 0;
>> +
>> + return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_emit_cmd_buf_wa(struct xe_lrc *lrc, u32 *cmd)
>> +{
>> + *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
>> + *cmd++ = CS_GPR_REG(0, 0).addr;
>> + *cmd++ = __xe_lrc_regs_ggtt_addr(lrc)+ CTX_CMD_BUF_CCTL * sizeof(u32);
>> + *cmd++ = 0;
>> +
>> + *cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRR_DST_CS_MMIO;
>> + *cmd++ = CS_GPR_REG(0, 0).addr;
>> + *cmd++ = RING_CMD_BUF_CCTL(0).addr;
>
> I don't see any documented workaround asking us to do this restoration
> of the CMD_BUF_CCTL register. I suspect the i915 implementation was
> either something for pre-production hardware that never should have
> stuck around in the driver, or it was hacking around some other driver
> bug. Are you finding this to actually be necessary for something today?
I am just copying from i915 where it looked b8a1181122f7 ("drm/i915: Use
indirect ctx bb to mend CMD_BUF_CCTL") landed suspiciously close to
685d21096f6c ("drm/i915: Add per ctx batchbuffer wa for timestamp"), and
as former had no WA or HSD mentions I suspected both could be part of
the same, potentially security sensitive, work.
Would it be possible to search the WA database for workarounds touching
CMD_BUF_CCTL? In case it has it's own designation which was not
documented in the code.
>> +
>> + return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_setup_indirect_ctx_rcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>> + u32 *cmd)
>> +{
>> + /* HSDES#16010904313 */
>> + cmd = xelp_emit_timestamp_wa(lrc, cmd);
>> + cmd = xelp_emit_cmd_buf_wa(lrc, cmd);
>> + cmd = xelp_emit_restore_scratch(lrc, cmd);
>> +
>> + return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_setup_indirect_ctx_xcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>> + u32 *cmd)
>> +{
>> + /* HSDES#16010904313 */
>> + cmd = xelp_emit_timestamp_wa(lrc, cmd);
>> + cmd = xelp_emit_restore_scratch(lrc, cmd);
>> +
>> + return cmd;
>> +}
>> +
>> +static void
>> +xelp_setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>> +{
>> + u32 *start, *cmd, *regs;
>> + struct iosys_map map;
>> + u32 ggtt, offset;
>> +
>> + if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
>> + return;
>> +
>> + offset = lrc->size - LRC_INDIRECT_CTX_SIZE;
>> + ggtt = xe_bo_ggtt_addr(lrc->bo) + offset;
>> +
>> + map = lrc->bo->vmap;
>> + iosys_map_incr(&map, offset);
>> + start = cmd = map.vaddr;
>
> Isn't this going to be a problem if the iosys_map is holding iomem? If
> we're going to build the indirect context batch directly with CPU
> pointer accesses as the functions above do, don't we need to do that in
> a separate buffer and then copy it into the iosys_map?
I thought it was good enough for now given it is only for integrated GPUs.
>> +
>> + if (hwe->class == XE_ENGINE_CLASS_RENDER)
>> + cmd = xelp_setup_indirect_ctx_rcs(lrc, hwe, cmd);
>
> This also deviates from the documented workaround. According to the
> workaround database, INDIRECT_CTX should be used for RCS/CCS, but we're
> supposed to use BB_PER_CTX_PTR instead for RCS/CCS (both are similar
> mechanisms, but INDIRECT_CTX happens somewhere in the middle of a
> context restore, whereas BB_PER_CTX_PTR happens at the very end of the
> context restore).
For 16010904313 specifically? And did you mean to describe RCS/CCS vs
other engines should use a different mechanism? If so which for which?
Regards,
Tvrtko
>> + else
>> + cmd = xelp_setup_indirect_ctx_xcs(lrc, hwe, cmd);
>> +
>> + while ((unsigned long)cmd & 0x3f) /* Align to 64B cacheline. */
>> + *cmd++ = MI_NOOP;
>> +
>> + map = __xe_lrc_regs_map(lrc);
>> + regs = map.vaddr;
>> +
>> + regs[CTX_CS_INDIRECT_CTX] = ggtt | ((cmd - start) * sizeof(u32) / 64); /* Size in CLs. */
>> + regs[CTX_CS_INDIRECT_CTX_OFFSET] = XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
>> +}
>> +
>> #define PVC_CTX_ASID (0x2e + 1)
>> #define PVC_CTX_ACC_CTR_THOLD (0x2a + 1)
>>
>> @@ -997,6 +1106,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>> lrc->size = ring_size + lrc_size;
>> lrc->ring.size = ring_size;
>> lrc->ring.tail = 0;
>> +
>> + if (xe_needs_indirect_ctx(xe))
>> + lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
>> +
>> if (xe_gt_has_indirect_ring_state(gt))
>> lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
>>
>> @@ -1127,6 +1240,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>
>> xe_lrc_setup_utilization(lrc);
>>
>> + xelp_setup_indirect_ctx(lrc, hwe);
>> +
>> return 0;
>>
>> err_lrc_finish:
>> diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
>> index 559c7c831212..9ce7d02ef210 100644
>> --- a/drivers/gpu/drm/xe/xe_lrc_types.h
>> +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
>> @@ -29,7 +29,8 @@ struct xe_lrc {
>> struct xe_gt *gt;
>>
>> /** @flags: LRC flags */
>> -#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1
>> +#define XE_LRC_FLAG_INDIRECT_CTX 0x1
>> +#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x2
>> u32 flags;
>>
>> /** @refcount: ref count of this lrc */
>> --
>> 2.48.0
>>
>
More information about the Intel-xe
mailing list