[PATCH 2/3] drm/xe/xelp: Implement HSDES#16010904313 workarounds

Tvrtko Ursulin tvrtko.ursulin at igalia.com
Fri May 23 10:33:16 UTC 2025


Hi Matt,

One clarification below:

On 16/05/2025 20:00, Matt Roper wrote:
> On Fri, May 16, 2025 at 08:44:11AM +0100, Tvrtko Ursulin wrote:
>> +static u32 *
>> +xelp_emit_timestamp_wa(struct xe_lrc *lrc, u32 *cmd)
>> +{
>> +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
>> +	*cmd++ = CS_GPR_REG(0, 0).addr;
>> +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_TIMESTAMP * sizeof(u32);
>> +	*cmd++ = 0;
>> +
>> +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
>> +	*cmd++ = CS_GPR_REG(0, 0).addr;
>> +	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
>> +
>> +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRI_LRM_CS_MMIO;
>> +	*cmd++ = CS_GPR_REG(0, 0).addr;
>> +	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
> 
> This doesn't seem to actually match the documented workaround details.
> The workaround says we need to do three back-to-back MI_LRM instructions
> to read CTX_TIMESTAMP into $lrc+0x108C.  The first two are supposed to
> have dw0[21] (i.e., async mode) enabled, and the third should have it
> disabled.  I'm guessing this got implemented in i915 before the formal
> workaround description was finalized, or the hardware people updated the
> description after we'd already implemented the workaround.
> 
> That will also prevent us from clobbering the GPR register, which will
> let us drop the restore_scratch function.

I only realized this was unclear when I went to work on a respin to 
address it.

Are you saying the workaround specifies reading from the LRC 
CTX_TIMESTAMP location into LRC+0x108c? That wouldn't be MI_LRM though, 
since that is a memory-to-memory copy.

Or reading from the RING_CTX_TIMESTAMP register into LRC+0x108c? That 
wouldn't be MI_LRM either - register to memory is MI_SRM.

So I don't know which three back-to-back MI_LRMs the workaround actually 
asks for.

Especially since it says the first two should have async mode enabled. So 
maybe the reads are not _to_ but _from_ LRC+0x108c? That is, into 
RING_CTX_TIMESTAMP, three times?
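
If that is the intended reading, I would expect something along these 
lines (purely illustrative - MI_LRM_ASYNC is a made-up name for dw0[21] 
since we have no define for it yet, and I am simply taking the 0x108c 
offset from the workaround text, counted from what I assume is the 
pphwsp base):

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT |
		 MI_LRI_LRM_CS_MMIO | MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = xe_lrc_ggtt_addr(lrc) + 0x108c;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT |
		 MI_LRI_LRM_CS_MMIO | MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = xe_lrc_ggtt_addr(lrc) + 0x108c;
	*cmd++ = 0;

	/* Third and final load with the async bit cleared. */
	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT |
		 MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = xe_lrc_ggtt_addr(lrc) + 0x108c;
	*cmd++ = 0;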

In which case the question would be why LRC+0x108c. That is where the GPR 
registers are saved, right? But it is not GPR0, and which one it is I am 
not sure, because I am confused by the fact the TGL PRM says there are 64 
dwords in the LRCA for 16 64-bit registers, which should really be 32 
dwords, no?

Even in this case I don't see how writing a random GPR value into 
RING_CTX_TIMESTAMP would make sense.

Regards,

Tvrtko

> 
>> +
>> +	return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_emit_restore_scratch(struct xe_lrc *lrc, u32 *cmd)
>> +{
>> +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
>> +	*cmd++ = CS_GPR_REG(0, 0).addr;
>> +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_GPR0 * sizeof(u32);
>> +	*cmd++ = 0;
>> +
>> +	return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_emit_cmd_buf_wa(struct xe_lrc *lrc, u32 *cmd)
>> +{
>> +	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
>> +	*cmd++ = CS_GPR_REG(0, 0).addr;
>> +	*cmd++ = __xe_lrc_regs_ggtt_addr(lrc) + CTX_CMD_BUF_CCTL * sizeof(u32);
>> +	*cmd++ = 0;
>> +
>> +	*cmd++ = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO | MI_LRR_DST_CS_MMIO;
>> +	*cmd++ = CS_GPR_REG(0, 0).addr;
>> +	*cmd++ = RING_CMD_BUF_CCTL(0).addr;
> 
> I don't see any documented workaround asking us to do this restoration
> of the CMD_BUF_CCTL register.  I suspect the i915 implementation was
> either something for pre-production hardware that never should have
> stuck around in the driver, or it was hacking around some other driver
> bug.  Are you finding this to actually be necessary for something today?
> 
>> +
>> +	return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_setup_indirect_ctx_rcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>> +			    u32 *cmd)
>> +{
>> +	/* HSDES#16010904313 */
>> +	cmd = xelp_emit_timestamp_wa(lrc, cmd);
>> +	cmd = xelp_emit_cmd_buf_wa(lrc, cmd);
>> +	cmd = xelp_emit_restore_scratch(lrc, cmd);
>> +
>> +	return cmd;
>> +}
>> +
>> +static u32 *
>> +xelp_setup_indirect_ctx_xcs(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>> +			    u32 *cmd)
>> +{
>> +	/* HSDES#16010904313 */
>> +	cmd = xelp_emit_timestamp_wa(lrc, cmd);
>> +	cmd = xelp_emit_restore_scratch(lrc, cmd);
>> +
>> +	return cmd;
>> +}
>> +
>> +static void
>> +xelp_setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>> +{
>> +	u32 *start, *cmd, *regs;
>> +	struct iosys_map map;
>> +	u32 ggtt, offset;
>> +
>> +	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
>> +		return;
>> +
>> +	offset = lrc->size - LRC_INDIRECT_CTX_SIZE;
>> +	ggtt = xe_bo_ggtt_addr(lrc->bo) + offset;
>> +
>> +	map = lrc->bo->vmap;
>> +	iosys_map_incr(&map, offset);
>> +	start = cmd = map.vaddr;
> 
> Isn't this going to be a problem if the iosys_map is holding iomem?  If
> we're going to build the indirect context batch directly with CPU
> pointer accesses as the functions above do, don't we need to do that in
> a separate buffer and then copy it into the iosys_map?
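
Right, if the vmap can end up as iomem we cannot build the batch in 
place like this. A minimal sketch of the alternative, assuming 
xe_map_memcpy_to() is usable here and with error handling omitted, 
would be to build into a temporary buffer and copy it over:

	u32 *buf, *cmd;

	buf = kmalloc(LRC_INDIRECT_CTX_SIZE, GFP_KERNEL);
	if (!buf)
		return;

	cmd = buf;
	if (hwe->class == XE_ENGINE_CLASS_RENDER)
		cmd = xelp_setup_indirect_ctx_rcs(lrc, hwe, cmd);
	else
		cmd = xelp_setup_indirect_ctx_xcs(lrc, hwe, cmd);

	/* Pad to a 64B cacheline based on the offset, not the CPU pointer. */
	while (((cmd - buf) * sizeof(u32)) & 0x3f)
		*cmd++ = MI_NOOP;

	xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap, offset, buf,
			 (cmd - buf) * sizeof(u32));
	kfree(buf);
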
> 
> 
>> +
>> +	if (hwe->class == XE_ENGINE_CLASS_RENDER)
>> +		cmd = xelp_setup_indirect_ctx_rcs(lrc, hwe, cmd);
> 
> This also deviates from the documented workaround.  According to the
> workaround database, we're supposed to use BB_PER_CTX_PTR rather than
> INDIRECT_CTX for RCS/CCS (both are similar mechanisms, but INDIRECT_CTX
> happens somewhere in the middle of a context restore, whereas
> BB_PER_CTX_PTR happens at the very end of the context restore).
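
Understood on the difference between the two mechanisms. For reference, 
hooking the same buffer up as a per-context BB would presumably be just 
the pointer plus an enable bit in the ring context, instead of the 
INDIRECT_CTX address/size and offset pair the patch currently programs. 
Roughly (CTX_BB_PER_CTX_PTR is a placeholder - xe does not define that 
dword today - and I am assuming the low bit is the valid/enable bit as 
i915 uses it):

	/* Per-context BB, runs at the very end of the context restore: */
	regs[CTX_BB_PER_CTX_PTR] = ggtt | BIT(0);

versus what the patch sets up now:

	/* Indirect context, runs in the middle of the context restore: */
	regs[CTX_CS_INDIRECT_CTX] = ggtt | ((cmd - start) * sizeof(u32) / 64);
	regs[CTX_CS_INDIRECT_CTX_OFFSET] =
		XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
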
> 
> 
> Matt
> 
>> +	else
>> +		cmd = xelp_setup_indirect_ctx_xcs(lrc, hwe, cmd);
>> +
>> +	while ((unsigned long)cmd & 0x3f) /* Align to 64B cacheline. */
>> +		*cmd++ = MI_NOOP;
>> +
>> +	map = __xe_lrc_regs_map(lrc);
>> +	regs = map.vaddr;
>> +
>> +	regs[CTX_CS_INDIRECT_CTX] = ggtt | ((cmd - start) * sizeof(u32) / 64); /* Size in CLs. */
>> +	regs[CTX_CS_INDIRECT_CTX_OFFSET] = XELP_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
>> +}
>> +
>>   #define PVC_CTX_ASID		(0x2e + 1)
>>   #define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
>>   
>> @@ -997,6 +1106,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>   	lrc->size = ring_size + lrc_size;
>>   	lrc->ring.size = ring_size;
>>   	lrc->ring.tail = 0;
>> +
>> +	if (xe_needs_indirect_ctx(xe))
>> +		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
>> +
>>   	if (xe_gt_has_indirect_ring_state(gt))
>>   		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
>>   
>> @@ -1127,6 +1240,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
>>   
>>   	xe_lrc_setup_utilization(lrc);
>>   
>> +	xelp_setup_indirect_ctx(lrc, hwe);
>> +
>>   	return 0;
>>   
>>   err_lrc_finish:
>> diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
>> index 559c7c831212..9ce7d02ef210 100644
>> --- a/drivers/gpu/drm/xe/xe_lrc_types.h
>> +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
>> @@ -29,7 +29,8 @@ struct xe_lrc {
>>   	struct xe_gt *gt;
>>   
>>   	/** @flags: LRC flags */
>> -#define XE_LRC_FLAG_INDIRECT_RING_STATE		0x1
>> +#define XE_LRC_FLAG_INDIRECT_CTX		0x1
>> +#define XE_LRC_FLAG_INDIRECT_RING_STATE		0x2
>>   	u32 flags;
>>   
>>   	/** @refcount: ref count of this lrc */
>> -- 
>> 2.48.0
>>
> 


