[PATCH v8 5/7] drm/xe: Add plumbing for indirect context workarounds
Lucas De Marchi
lucas.demarchi at intel.com
Wed Jul 9 13:30:22 UTC 2025
On Wed, Jul 09, 2025 at 11:44:41AM +0100, Tvrtko Ursulin wrote:
>
>On 09/07/2025 00:11, Lucas De Marchi wrote:
>>On Thu, Jul 03, 2025 at 09:20:57AM +0100, Tvrtko Ursulin wrote:
>>>Some upcoming workarounds need to be emitted from the indirect workaround
>>>context so lets add some plumbing where they will be able to easily slot
>>>in.
>>>
>>>No functional changes for now since everything is still deactivated.
>>>
>>>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
>>>Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>>>Cc: Matt Roper <matthew.d.roper at intel.com>
>>>---
>>>drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 4 ++
>>>drivers/gpu/drm/xe/xe_lrc.c | 80 ++++++++++++++++++++++++-
>>>drivers/gpu/drm/xe/xe_lrc_types.h | 3 +-
>>>3 files changed, 84 insertions(+), 3 deletions(-)
>>>
>>>diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>>>b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>>>index 994af591a2e8..cfa6db15488e 100644
>>>--- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>>>+++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>>>@@ -12,6 +12,8 @@
>>>#define CTX_RING_START (0x08 + 1)
>>>#define CTX_RING_CTL (0x0a + 1)
>>>#define CTX_BB_PER_CTX_PTR (0x12 + 1)
>>>+#define CTX_CS_INDIRECT_CTX (0x14 + 1)
>>>+#define CTX_CS_INDIRECT_CTX_OFFSET (0x16 + 1)
>>>#define CTX_TIMESTAMP (0x22 + 1)
>>>#define CTX_TIMESTAMP_UDW (0x24 + 1)
>>>#define CTX_INDIRECT_RING_STATE (0x26 + 1)
>>>@@ -36,4 +38,6 @@
>>>#define INDIRECT_CTX_RING_START_UDW (0x08 + 1)
>>>#define INDIRECT_CTX_RING_CTL (0x0a + 1)
>>>
>>>+#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0xd
>>
>>this is already the default, so we can probably skip writing it?
>>If we are writing it, then we should use a REG_FIELD_PREP so
>>the `<< 6` doesn't feel like just a magic number.
>
>Done.
>
>>>+
>>>#endif
>>>diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>>>index e278c12a41f9..70db4c5764e6 100644
>>>--- a/drivers/gpu/drm/xe/xe_lrc.c
>>>+++ b/drivers/gpu/drm/xe/xe_lrc.c
>>>@@ -39,6 +39,7 @@
>>>#define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
>>>
>>>#define LRC_PPHWSP_SIZE SZ_4K
>>>+#define LRC_INDIRECT_CTX_SIZE SZ_4K
>>>#define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
>>>#define LRC_WA_BB_SIZE SZ_4K
>>>
>>>@@ -48,6 +49,12 @@ lrc_to_xe(struct xe_lrc *lrc)
>>> return gt_to_xe(lrc->fence_ctx.gt);
>>>}
>>>
>>>+static bool
>>>+gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum
>>>xe_engine_class class)
>>>+{
>>>+ return false;
>>>+}
>>>+
>>>size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
>>>{
>>> struct xe_device *xe = gt_to_xe(gt);
>>>@@ -717,7 +724,18 @@ static u32
>>>__xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
>>>
>>>static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
>>>{
>>>- return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
>>>LRC_INDIRECT_RING_STATE_SIZE;
>>>+ u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
>>>+ LRC_INDIRECT_RING_STATE_SIZE;
>>>+
>>>+ if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
>>>+ offset -= LRC_INDIRECT_CTX_SIZE;
>>>+
>>>+ return offset;
>>>+}
>>>+
>>>+static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
>>>+{
>>>+ return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_SIZE;
>>>}
>>>
>>>static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
>>>@@ -1065,6 +1083,54 @@ static int setup_wa_bb(struct xe_lrc *lrc,
>>>struct xe_hw_engine *hwe)
>>> return 0;
>>>}
>>>
>>>+static int
>>>+setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
>>>+{
>>>+ static struct bo_setup rcs_funcs[] = {
>>>+ };
>>>+ unsigned int offset, num_funcs, written = 0;
>>>+ struct bo_setup *funcs = NULL;
>>>+ u32 *cmd, *buf = NULL;
>>>+
>>>+ if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
>>>+ return 0;
>>>+
>>>+ if (hwe->class == XE_ENGINE_CLASS_RENDER ||
>>>+ hwe->class == XE_ENGINE_CLASS_COMPUTE) {
>>>+ funcs = rcs_funcs;
>>>+ num_funcs = ARRAY_SIZE(rcs_funcs);
>>>+ }
>>>+
>>>+ if (xe_gt_WARN_ON(lrc->gt, !funcs))
>>>+ return 0;
>>>+
>>>+ offset = __xe_lrc_indirect_ctx_offset(lrc);
>>>+
>>>+ cmd = setup_bo(lrc, hwe, LRC_INDIRECT_CTX_SIZE, 15, offset, funcs,
>>
>>this 15 is directly related to the 64B align below, right? It's not
>>something in addition, but rather the size alignment, since the size we
>>give to INDIRECT_CTX is in # of cachelines (and we don't want to execute
>>garbage).
>>
>>Maximum size per bspec 45954 is 63 cachelines: by passing max_size
>>== LRC_INDIRECT_CTX_SIZE and reserved_dw = 15 dwords,
>>we would write up to 4036 bytes, and the alignment below will make the
>>size go up to 64 cachelines instead of 63, since the max size is then
>>off by 4 bytes.
>>Or did I miscalculate?
>>
>>instead of passing max_size plus a "reserved_dw", why are we not just
>>passing the max number of dwords, calculated here? This would also help
>>stop mixing the units.
>
>I am not sure there was a bug with the alignment code (if there was
>exactly 4036 of content, then align loop wouldn't run, no?), but
Below it does:
while ((unsigned long)cmd & 0x3f) {
and `(4036 & 0x3f) == 4`. So it will align it to 4096 bytes (64 cache lines)
rather than 4032 bytes (63 cache lines).
>anyway, I also did not know about the bspec 45954. So in the respin I
>have changed it to separate the BO size from the max indirect context
>size.
>
>See how you like the latest.
thanks, will take a look
Lucas De Marchi
>
>Regards,
>
>Tvrtko
>
>>
>>>+ num_funcs, &buf, &written);
>>>+ if (IS_ERR(cmd))
>>>+ return PTR_ERR(cmd);
>>>+
>>>+ /* Align to 64B cacheline. */
>>
>> /*
>> * Align to 64B cacheline so there's no garbage at the end for
>> * CS to execute: size for indirect ctx must be a multiple of
>> * 64.
>> */
>>
>>Lucas De Marchi
>>
>>>+ while ((unsigned long)cmd & 0x3f) {
>>>+ *cmd++ = MI_NOOP;
>>>+ written++;
>>>+ }
>>>+
>>>+ finish_bo(lrc, offset, written, buf);
>>>+
>>>+ xe_lrc_write_ctx_reg(lrc,
>>>+ CTX_CS_INDIRECT_CTX,
>>>+ (xe_bo_ggtt_addr(lrc->bo) + offset) |
>>>+ /* Size in CLs. */
>>>+ (written * sizeof(u32) / 64));
>>>+ xe_lrc_write_ctx_reg(lrc,
>>>+ CTX_CS_INDIRECT_CTX_OFFSET,
>>>+ CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6);
>>>+
>>>+ return 0;
>>>+}
>>>+
>>>#define PVC_CTX_ASID (0x2e + 1)
>>>#define PVC_CTX_ACC_CTR_THOLD (0x2a + 1)
>>>
>>>@@ -1074,7 +1140,7 @@ static int xe_lrc_init(struct xe_lrc *lrc,
>>>struct xe_hw_engine *hwe,
>>>{
>>> struct xe_gt *gt = hwe->gt;
>>> const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
>>>- const u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
>>>+ u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
>>> struct xe_tile *tile = gt_to_tile(gt);
>>> struct xe_device *xe = gt_to_xe(gt);
>>> struct iosys_map map;
>>>@@ -1089,6 +1155,12 @@ static int xe_lrc_init(struct xe_lrc *lrc,
>>>struct xe_hw_engine *hwe,
>>> lrc->flags = 0;
>>> lrc->ring.size = ring_size;
>>> lrc->ring.tail = 0;
>>>+
>>>+ if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
>>>+ lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
>>>+ bo_size += LRC_INDIRECT_CTX_SIZE;
>>>+ }
>>>+
>>> if (xe_gt_has_indirect_ring_state(gt))
>>> lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
>>>
>>>@@ -1213,6 +1285,10 @@ static int xe_lrc_init(struct xe_lrc *lrc,
>>>struct xe_hw_engine *hwe,
>>> if (err)
>>> goto err_lrc_finish;
>>>
>>>+ err = setup_indirect_ctx(lrc, hwe);
>>>+ if (err)
>>>+ goto err_lrc_finish;
>>>+
>>> return 0;
>>>
>>>err_lrc_finish:
>>>diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h
>>>b/drivers/gpu/drm/xe/xe_lrc_types.h
>>>index 2c7c81079801..e9883706e004 100644
>>>--- a/drivers/gpu/drm/xe/xe_lrc_types.h
>>>+++ b/drivers/gpu/drm/xe/xe_lrc_types.h
>>>@@ -29,7 +29,8 @@ struct xe_lrc {
>>> struct xe_gt *gt;
>>>
>>> /** @flags: LRC flags */
>>>-#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1
>>>+#define XE_LRC_FLAG_INDIRECT_CTX 0x1
>>>+#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x2
>>> u32 flags;
>>>
>>> /** @refcount: ref count of this lrc */
>>>--
>>>2.48.0
>>>
>
More information about the Intel-xe
mailing list