[PATCH 04/12] drm/xe: Add ring buffer handling for AuxCCS
Tvrtko Ursulin
tvrtko.ursulin at igalia.com
Fri Feb 21 10:17:23 UTC 2025
Align the ring buffer handling of required AuxCCS flushes and
invalidations with the reference implementation from i915.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
---
.../gpu/drm/xe/instructions/xe_gpu_commands.h | 1 +
.../gpu/drm/xe/instructions/xe_mi_commands.h | 6 +
drivers/gpu/drm/xe/regs/xe_gt_regs.h | 1 +
drivers/gpu/drm/xe/xe_ring_ops.c | 173 ++++++++++--------
drivers/gpu/drm/xe/xe_ring_ops_types.h | 2 +-
5 files changed, 106 insertions(+), 77 deletions(-)
diff --git a/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h b/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
index a255946b6f77..93e4687feb71 100644
--- a/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
+++ b/drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
@@ -45,6 +45,7 @@
#define PIPE_CONTROL_COMMAND_CACHE_INVALIDATE (1<<29)
#define PIPE_CONTROL_TILE_CACHE_FLUSH (1<<28)
+#define PIPE_CONTROL_FLUSH_L3 (1<<27)
#define PIPE_CONTROL_AMFS_FLUSH (1<<25)
#define PIPE_CONTROL_GLOBAL_GTT_IVB (1<<24)
#define PIPE_CONTROL_LRI_POST_SYNC BIT(23)
diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
index 167fb0f742de..1695656d1dc1 100644
--- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
+++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
@@ -71,4 +71,10 @@
#define MI_SET_APPID_SESSION_ID_MASK REG_GENMASK(6, 0)
#define MI_SET_APPID_SESSION_ID(x) REG_FIELD_PREP(MI_SET_APPID_SESSION_ID_MASK, x)
+#define MI_SEMAPHORE_WAIT_TOKEN (__MI_INSTR(0x1c) | XE_INSTR_NUM_DW(3)) /* XeLP+ */
+#define MI_SEMAPHORE_REGISTER_POLL REG_BIT(16)
+#define MI_SEMAPHORE_POLL REG_BIT(15)
+#define MI_SEMAPHORE_CMP_OP_MASK REG_GENMASK(14, 12)
+#define MI_SEMAPHORE_SAD_EQ_SDD REG_FIELD_PREP(MI_SEMAPHORE_CMP_OP_MASK, 4)
+
#endif
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index bc7bfffabd0b..9c074d32eb6a 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -89,6 +89,7 @@
#define CCS_AUX_INV XE_REG(0x4208)
#define VD0_AUX_INV XE_REG(0x4218)
+#define BCS_AUX_INV XE_REG(0x4248)
#define VE0_AUX_INV XE_REG(0x4238)
#define VE1_AUX_INV XE_REG(0x42b8)
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index 21520ff97599..84c02a310079 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -56,7 +56,14 @@ static int emit_aux_table_inv(struct xe_gt *gt, struct xe_reg reg,
dw[i++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1) | MI_LRI_MMIO_REMAP_EN;
dw[i++] = reg.addr + gt->mmio.adj_offset;
dw[i++] = AUX_INV;
- dw[i++] = MI_NOOP;
+ dw[i++] = MI_SEMAPHORE_WAIT_TOKEN |
+ MI_SEMAPHORE_REGISTER_POLL |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ dw[i++] = 0;
+ dw[i++] = reg.addr + gt->mmio.adj_offset;
+ dw[i++] = 0;
+ dw[i++] = 0;
return i;
}
@@ -167,9 +174,11 @@ static int emit_store_imm_ppgtt_posted(u64 addr, u64 value,
return i;
}
-static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i)
+static int emit_render_cache_flush(struct xe_sched_job *job, bool flush_l3,
+ u32 *dw, int i)
{
struct xe_gt *gt = job->q->gt;
+ struct xe_device *xe = gt_to_xe(gt);
bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
u32 flags;
@@ -180,6 +189,16 @@ static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i)
PIPE_CONTROL_DC_FLUSH_ENABLE |
PIPE_CONTROL_FLUSH_ENABLE);
+ /*
+ * L3 fabric flush is needed for AUX CCS invalidation
+ * which happens as part of pipe-control so we can
+ * ignore PIPE_CONTROL_FLUSH_L3. Also PIPE_CONTROL_FLUSH_L3
+ * deals with Protected Memory which is not needed for
+ * AUX CCS invalidation and leads to unwanted side effects.
+ */
+ if (flush_l3 && GRAPHICS_VERx100(xe) < 1270)
+ flags |= PIPE_CONTROL_FLUSH_L3;
+
if (XE_WA(gt, 1409600907))
flags |= PIPE_CONTROL_DEPTH_STALL;
@@ -236,44 +255,6 @@ static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i)
return i;
}
-/* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
-static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
- u64 batch_addr, u32 seqno)
-{
- u32 dw[MAX_JOB_SIZE_DW], i = 0;
- u32 ppgtt_flag = get_ppgtt_flag(job);
- struct xe_gt *gt = job->q->gt;
-
- i = emit_copy_timestamp(lrc, dw, i);
-
- if (job->ring_ops_flush_tlb) {
- dw[i++] = preparser_disable(true);
- i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
- seqno, MI_INVALIDATE_TLB, dw, i);
- dw[i++] = preparser_disable(false);
- } else {
- i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
- seqno, dw, i);
- }
-
- i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
-
- if (job->user_fence.used) {
- i = emit_flush_dw(dw, i);
- i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
- job->user_fence.value,
- dw, i);
- }
-
- i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, 0, dw, i);
-
- i = emit_user_interrupt(dw, i);
-
- xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);
-
- xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
-}
-
static bool has_aux_ccs(struct xe_device *xe)
{
/*
@@ -288,36 +269,51 @@ static bool has_aux_ccs(struct xe_device *xe)
return !xe->info.has_flat_ccs;
}
-static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
- u64 batch_addr, u32 seqno)
+static void __emit_job_gen12_xcs(struct xe_sched_job *job, struct xe_lrc *lrc,
+ u64 batch_addr, u32 seqno)
{
u32 dw[MAX_JOB_SIZE_DW], i = 0;
u32 ppgtt_flag = get_ppgtt_flag(job);
struct xe_gt *gt = job->q->gt;
struct xe_device *xe = gt_to_xe(gt);
- bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
+ const unsigned int class = job->q->class;
+ const bool aux_ccs = has_aux_ccs(xe) &&
+ (class == XE_ENGINE_CLASS_COPY ||
+ class == XE_ENGINE_CLASS_VIDEO_DECODE ||
+ class == XE_ENGINE_CLASS_VIDEO_ENHANCE);
+ const bool invalidate_tlb = aux_ccs || job->ring_ops_flush_tlb;
i = emit_copy_timestamp(lrc, dw, i);
- dw[i++] = preparser_disable(true);
-
- /* hsdes: 1809175790 */
- if (has_aux_ccs(xe)) {
- if (decode)
- i = emit_aux_table_inv(gt, VD0_AUX_INV, dw, i);
- else
- i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
- }
-
- if (job->ring_ops_flush_tlb)
+ if (invalidate_tlb) {
+ dw[i++] = preparser_disable(true);
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
- seqno, MI_INVALIDATE_TLB, dw, i);
+ seqno,
+ MI_INVALIDATE_TLB |
+ (aux_ccs ? MI_FLUSH_DW_CCS : 0),
+ dw, i);
+ /* hsdes: 1809175790 */
+ if (aux_ccs) {
+ struct xe_reg reg;
- dw[i++] = preparser_disable(false);
+ switch (job->q->class) {
+ case XE_ENGINE_CLASS_COPY:
+ reg = BCS_AUX_INV;
+ break;
+ case XE_ENGINE_CLASS_VIDEO_DECODE:
+ reg = VD0_AUX_INV;
+ break;
+ default:
+ reg = VE0_AUX_INV;
+ };
- if (!job->ring_ops_flush_tlb)
+ i = emit_aux_table_inv(gt, reg, dw, i);
+ }
+ dw[i++] = preparser_disable(false);
+ } else {
i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
seqno, dw, i);
+ }
i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
@@ -346,10 +342,16 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
struct xe_gt *gt = job->q->gt;
struct xe_device *xe = gt_to_xe(gt);
bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
+ const bool aux_ccs = has_aux_ccs(xe);
+ const bool invalidate_tlb = aux_ccs || job->ring_ops_flush_tlb;
u32 mask_flags = 0;
i = emit_copy_timestamp(lrc, dw, i);
+ /* hsdes: 1809175790 */
+ if (aux_ccs)
+ i = emit_render_cache_flush(job, 0, dw, i);
+
dw[i++] = preparser_disable(true);
if (lacks_render)
mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS;
@@ -357,10 +359,10 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
- i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
+ i = emit_pipe_invalidate(mask_flags, invalidate_tlb, dw, i);
/* hsdes: 1809175790 */
- if (has_aux_ccs(xe))
+ if (aux_ccs)
i = emit_aux_table_inv(gt, CCS_AUX_INV, dw, i);
dw[i++] = preparser_disable(false);
@@ -370,7 +372,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
- i = emit_render_cache_flush(job, dw, i);
+ i = emit_render_cache_flush(job, true, dw, i);
if (job->user_fence.used)
i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
@@ -391,12 +393,29 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
static void emit_migration_job_gen12(struct xe_sched_job *job,
struct xe_lrc *lrc, u32 seqno)
{
+ struct xe_gt *gt = job->q->gt;
+ struct xe_device *xe = gt_to_xe(gt);
+ const bool aux_ccs = has_aux_ccs(xe);
+ const bool invalidate_tlb = aux_ccs || job->ring_ops_flush_tlb;
u32 dw[MAX_JOB_SIZE_DW], i = 0;
i = emit_copy_timestamp(lrc, dw, i);
- i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
- seqno, dw, i);
+ if (invalidate_tlb) {
+ dw[i++] = preparser_disable(true);
+ i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
+ seqno,
+ MI_INVALIDATE_TLB |
+ (aux_ccs ? MI_FLUSH_DW_CCS : 0),
+ dw, i);
+ /* hsdes: 1809175790 */
+ if (aux_ccs)
+ i = emit_aux_table_inv(gt, BCS_AUX_INV, dw, i);
+ dw[i++] = preparser_disable(false);
+ } else {
+ i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
+ seqno, dw, i);
+ }
dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */
@@ -406,16 +425,18 @@ static void emit_migration_job_gen12(struct xe_sched_job *job,
/* XXX: Do we need this? Leaving for now. */
dw[i++] = preparser_disable(true);
i = emit_flush_invalidate(dw, i);
+ if (aux_ccs)
+ i = emit_aux_table_inv(gt, BCS_AUX_INV, dw, i);
dw[i++] = preparser_disable(false);
}
i = emit_bb_start(job->ptrs[1].batch_addr, BIT(8), dw, i);
- dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
- MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW;
- dw[i++] = xe_lrc_seqno_ggtt_addr(lrc) | MI_FLUSH_DW_USE_GTT;
- dw[i++] = 0;
- dw[i++] = seqno; /* value */
+ i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno,
+ MI_INVALIDATE_TLB |
+ (aux_ccs ? MI_FLUSH_DW_CCS : 0) |
+ job->migrate_flush_flags,
+ dw, i);
i = emit_user_interrupt(dw, i);
@@ -430,9 +451,9 @@ static void emit_job_gen12_gsc(struct xe_sched_job *job)
xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */
- __emit_job_gen12_simple(job, job->q->lrc[0],
- job->ptrs[0].batch_addr,
- xe_sched_job_lrc_seqno(job));
+ __emit_job_gen12_xcs(job, job->q->lrc[0],
+ job->ptrs[0].batch_addr,
+ xe_sched_job_lrc_seqno(job));
}
static void emit_job_gen12_copy(struct xe_sched_job *job)
@@ -446,9 +467,9 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
}
for (i = 0; i < job->q->width; ++i)
- __emit_job_gen12_simple(job, job->q->lrc[i],
- job->ptrs[i].batch_addr,
- xe_sched_job_lrc_seqno(job));
+ __emit_job_gen12_xcs(job, job->q->lrc[i],
+ job->ptrs[i].batch_addr,
+ xe_sched_job_lrc_seqno(job));
}
static void emit_job_gen12_video(struct xe_sched_job *job)
@@ -457,9 +478,9 @@ static void emit_job_gen12_video(struct xe_sched_job *job)
/* FIXME: Not doing parallel handshake for now */
for (i = 0; i < job->q->width; ++i)
- __emit_job_gen12_video(job, job->q->lrc[i],
- job->ptrs[i].batch_addr,
- xe_sched_job_lrc_seqno(job));
+ __emit_job_gen12_xcs(job, job->q->lrc[i],
+ job->ptrs[i].batch_addr,
+ xe_sched_job_lrc_seqno(job));
}
static void emit_job_gen12_render_compute(struct xe_sched_job *job)
diff --git a/drivers/gpu/drm/xe/xe_ring_ops_types.h b/drivers/gpu/drm/xe/xe_ring_ops_types.h
index 1ae56e2ee7b4..d7e3e150a9a5 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops_types.h
+++ b/drivers/gpu/drm/xe/xe_ring_ops_types.h
@@ -8,7 +8,7 @@
struct xe_sched_job;
-#define MAX_JOB_SIZE_DW 48
+#define MAX_JOB_SIZE_DW 58
#define MAX_JOB_SIZE_BYTES (MAX_JOB_SIZE_DW * 4)
/**
--
2.48.0
More information about the Intel-xe
mailing list