[PATCH v9 6/8] drm/xe: Add plumbing for indirect context workarounds
Tvrtko Ursulin
tvrtko.ursulin at igalia.com
Fri Jul 11 14:36:57 UTC 2025
On 11/07/2025 04:28, Lucas De Marchi wrote:
> On Wed, Jul 09, 2025 at 11:54:53AM +0100, Tvrtko Ursulin wrote:
>> Some upcoming workarounds need to be emitted from the indirect workaround
>> context so lets add some plumbing where they will be able to easily slot
>> in.
>>
>> No functional changes for now since everything is still deactivated.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>> Cc: Matt Roper <matthew.d.roper at intel.com>
>> ---
>> drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 5 ++
>> drivers/gpu/drm/xe/xe_lrc.c | 86 ++++++++++++++++++++++++-
>> drivers/gpu/drm/xe/xe_lrc_types.h | 3 +-
>> 3 files changed, 91 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/
>> drm/xe/regs/xe_lrc_layout.h
>> index 994af591a2e8..e0437d0454e6 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> @@ -12,6 +12,8 @@
>> #define CTX_RING_START (0x08 + 1)
>> #define CTX_RING_CTL (0x0a + 1)
>> #define CTX_BB_PER_CTX_PTR (0x12 + 1)
>> +#define CTX_CS_INDIRECT_CTX (0x14 + 1)
>> +#define CTX_CS_INDIRECT_CTX_OFFSET (0x16 + 1)
>> #define CTX_TIMESTAMP (0x22 + 1)
>> #define CTX_TIMESTAMP_UDW (0x24 + 1)
>> #define CTX_INDIRECT_RING_STATE (0x26 + 1)
>> @@ -36,4 +38,7 @@
>> #define INDIRECT_CTX_RING_START_UDW (0x08 + 1)
>> #define INDIRECT_CTX_RING_CTL (0x0a + 1)
>>
>> +#define CTX_INDIRECT_OFFSET_MASK REG_GENMASK(15, 6)
>> +#define CTX_INDIRECT_CTX_OFFSET_DEFAULT
>> REG_FIELD_PREP(CTX_INDIRECT_OFFSET_MASK, 0xd)
>> +
>> #endif
>> diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>> index 768cb9de48d6..508bf032e06e 100644
>> --- a/drivers/gpu/drm/xe/xe_lrc.c
>> +++ b/drivers/gpu/drm/xe/xe_lrc.c
>> @@ -39,6 +39,8 @@
>> #define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
>>
>> #define LRC_PPHWSP_SIZE SZ_4K
>> +#define LRC_INDIRECT_CTX_SIZE (63 * 64) /* bspec 45954: 63
>> cachelines */
>> +#define LRC_INDIRECT_CTX_BO_SIZE SZ_4K
>> #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
>> #define LRC_WA_BB_SIZE SZ_4K
>>
>> @@ -48,6 +50,12 @@ lrc_to_xe(struct xe_lrc *lrc)
>> return gt_to_xe(lrc->fence_ctx.gt);
>> }
>>
>> +static bool
>> +gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class
>> class)
>> +{
>> + return false;
>> +}
>> +
>> size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
>> {
>> struct xe_device *xe = gt_to_xe(gt);
>> @@ -717,7 +725,18 @@ static u32
>> __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
>>
>> static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
>> {
>> - return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
>> LRC_INDIRECT_RING_STATE_SIZE;
>> + u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
>> + LRC_INDIRECT_RING_STATE_SIZE;
>> +
>> + if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
>> + offset -= LRC_INDIRECT_CTX_BO_SIZE;
>> +
>> + return offset;
>> +}
>> +
>> +static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
>> +{
>> + return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
>> LRC_INDIRECT_CTX_BO_SIZE;
>> }
>>
>> static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
>> @@ -1077,6 +1096,59 @@ static int setup_wa_bb(struct xe_lrc *lrc,
>> struct xe_hw_engine *hwe)
>> return 0;
>> }
>>
>> +static int
>> +setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>> +{
>> + static struct bo_setup rcs_funcs[] = {
>> + };
>> + struct bo_setup_state state = {
>> + .lrc = lrc,
>> + .hwe = hwe,
>> + .max_size = LRC_INDIRECT_CTX_SIZE,
>
> instead of the define, I'd just do
> LRC_INDIRECT_CTX_SIZE - 64, then the comment here... Because it may be
> tempting to use the wrong macro in other places, but this is the only
> one where it makes sense.
>
>> + .reserve_dw = 15,
>
> we don't need reserve dwords... we only need to make sure we write N
> cache lines, because we are passing to HW the size in terms of cache
> lines.
>
> without much refactoring I think it's sufficient to just pass
> reserve_dw = 0 and then in this function you just continue
> aligning it as you are doing. But then it probably means we don't even
> need reserve_dw (previous commit)... just hardcoding the 1 would be ok
> for simplicity.
I think I kind of get it what you are after.
Problem is reserve_dw in the case of indirect_ctx is a "maybe". Ie. it
wouldn't be wrong to use up to max_size, if it is exactly max size.
But with the "LRC_INDIRECT_CTX_SIZE - 64" approach hardcoding reserve of
1 dword would be wrong too.
The reserve_dw = 0 idea sounds acceptable. Only weakness of it is that
it limits the indirectx ctx usable size to a tiny bit less than what it
could hold but we don't need that much and I will put a comment to
record it.
Regards,
Tvrtko
>
> Lucas De Marchi
>
>> + .offset = __xe_lrc_indirect_ctx_offset(lrc),
>> + };
>> + int ret;
>> +
>> + if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
>> + return 0;
>> +
>> + if (hwe->class == XE_ENGINE_CLASS_RENDER ||
>> + hwe->class == XE_ENGINE_CLASS_COMPUTE) {
>> + state.funcs = rcs_funcs;
>> + state.num_funcs = ARRAY_SIZE(rcs_funcs);
>> + }
>> +
>> + if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
>> + return 0;
>> +
>> + ret = setup_bo(&state);
>> + if (ret)
>> + return ret;
>> +
>> + /*
>> + * Align to 64B cacheline so there's no garbage at the end for CS to
>> + * execute: size for indirect ctx must be a multiple of 64.
>> + */
>> + while (state.written & 0xf) {
>> + *state.ptr++ = MI_NOOP;
>> + state.written++;
>> + }
>> +
>> + finish_bo(&state);
>> +
>> + xe_lrc_write_ctx_reg(lrc,
>> + CTX_CS_INDIRECT_CTX,
>> + (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
>> + /* Size in CLs. */
>> + (state.written * sizeof(u32) / 64));
>> + xe_lrc_write_ctx_reg(lrc,
>> + CTX_CS_INDIRECT_CTX_OFFSET,
>> + CTX_INDIRECT_CTX_OFFSET_DEFAULT);
>> +
>> + return 0;
>> +}
>> +
>> #define PVC_CTX_ASID (0x2e + 1)
>> #define PVC_CTX_ACC_CTR_THOLD (0x2a + 1)
>>
>> @@ -1086,7 +1158,7 @@ static int xe_lrc_init(struct xe_lrc *lrc,
>> struct xe_hw_engine *hwe,
>> {
>> struct xe_gt *gt = hwe->gt;
>> const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
>> - const u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
>> + u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
>> struct xe_tile *tile = gt_to_tile(gt);
>> struct xe_device *xe = gt_to_xe(gt);
>> struct iosys_map map;
>> @@ -1101,6 +1173,12 @@ static int xe_lrc_init(struct xe_lrc *lrc,
>> struct xe_hw_engine *hwe,
>> lrc->flags = 0;
>> lrc->ring.size = ring_size;
>> lrc->ring.tail = 0;
>> +
>> + if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
>> + lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
>> + bo_size += LRC_INDIRECT_CTX_BO_SIZE;
>> + }
>> +
>> if (xe_gt_has_indirect_ring_state(gt))
>> lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
>>
>> @@ -1225,6 +1303,10 @@ static int xe_lrc_init(struct xe_lrc *lrc,
>> struct xe_hw_engine *hwe,
>> if (err)
>> goto err_lrc_finish;
>>
>> + err = setup_indirect_ctx(lrc, hwe);
>> + if (err)
>> + goto err_lrc_finish;
>> +
>> return 0;
>>
>> err_lrc_finish:
>> diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/
>> xe_lrc_types.h
>> index 2c7c81079801..e9883706e004 100644
>> --- a/drivers/gpu/drm/xe/xe_lrc_types.h
>> +++ b/drivers/gpu/drm/xe/xe_lrc_types.h
>> @@ -29,7 +29,8 @@ struct xe_lrc {
>> struct xe_gt *gt;
>>
>> /** @flags: LRC flags */
>> -#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1
>> +#define XE_LRC_FLAG_INDIRECT_CTX 0x1
>> +#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x2
>> u32 flags;
>>
>> /** @refcount: ref count of this lrc */
>> --
>> 2.48.0
>>
More information about the Intel-xe
mailing list