[PATCH 49/53] drm/i915/gt: Use client timeline address for seqno writes

Sat Jan 2 12:38:34 UTC 2021

If we allow for per-client timelines, even with legacy ring submission,
we open the door to a world full of possibilities [scheduling and
semaphores].

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/gen2_engine_cs.c      | 45 +++++++++-
 drivers/gpu/drm/i915/gt/gen2_engine_cs.h      |  2 +-
 drivers/gpu/drm/i915/gt/gen6_engine_cs.c      | 89 +++++++++++++------
 drivers/gpu/drm/i915/gt/gen8_engine_cs.c      | 23 ++---
 .../gpu/drm/i915/gt/intel_ring_submission.c   |  2 +-
 drivers/gpu/drm/i915/i915_request.h           | 13 +++
 6 files changed, 127 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
index b491a64919c8..cc5de8038e02 100644
--- a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
@@ -172,9 +172,50 @@ u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
 }
 
-u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+static u32 *__gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs,
+				   int flush, int post)
+{
+	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
+	u32 offset = __i915_request_hwsp_offset(rq);
+
+	GEM_BUG_ON(tl->mode == INTEL_TIMELINE_RELATIVE_CONTEXT);
+
+	*cs++ = MI_FLUSH;
+
+	while (flush--) {
+		*cs++ = MI_STORE_DWORD_INDEX;
+		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
+		*cs++ = rq->fence.seqno;
+	}
+
+	if (intel_timeline_is_relative(tl)) {
+		offset = offset_in_page(offset);
+		while (post--) {
+			*cs++ = MI_STORE_DWORD_INDEX;
+			*cs++ = offset;
+			*cs++ = rq->fence.seqno;
+			*cs++ = MI_NOOP;
+		}
+	} else {
+		while (post--) {
+			*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*cs++ = 0;
+			*cs++ = offset;
+			*cs++ = rq->fence.seqno;
+		}
+	}
+
+	*cs++ = MI_USER_INTERRUPT;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
+}
+
+u32 *gen5_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 {
-	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
+	return __gen5_emit_breadcrumb(rq, cs, 8, 8);
 }
 
 /* Just userspace ABI convention to limit the wa batch bo to a resonable size */
diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.h b/drivers/gpu/drm/i915/gt/gen2_engine_cs.h
index a5cd64a65c9e..b56b64646d0c 100644
--- a/drivers/gpu/drm/i915/gt/gen2_engine_cs.h
+++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.h
@@ -16,7 +16,7 @@ int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode);
 int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode);
 
 u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs);
-u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs);
+u32 *gen5_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs);
 
 int i830_emit_bb_start(struct i915_request *rq,
 		       u64 offset, u32 len,
diff --git a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
index 2f59dd3bdc18..14cab4c726ce 100644
--- a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
@@ -141,6 +141,12 @@ int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
 
 u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
+	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
+	u32 offset = __i915_request_hwsp_offset(rq);
+	unsigned int flags;
+
+	GEM_BUG_ON(tl->mode == INTEL_TIMELINE_RELATIVE_CONTEXT);
+
 	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
 	*cs++ = GFX_OP_PIPE_CONTROL(4);
 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
@@ -154,15 +160,22 @@ u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 		PIPE_CONTROL_GLOBAL_GTT;
 	*cs++ = 0;
 
-	/* Finally we can flush and with it emit the breadcrumb */
-	*cs++ = GFX_OP_PIPE_CONTROL(4);
-	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+	flags = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
 		 PIPE_CONTROL_QW_WRITE |
 		 PIPE_CONTROL_CS_STALL);
-	*cs++ = i915_request_active_timeline(rq)->ggtt_offset |
-		PIPE_CONTROL_GLOBAL_GTT;
+	if (intel_timeline_is_relative(tl)) {
+		offset = offset_in_page(offset);
+		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+	}
+	if (!intel_timeline_in_context(tl))
+		offset |= PIPE_CONTROL_GLOBAL_GTT;
+
+	/* Finally we can flush and with it emit the breadcrumb */
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = flags;
+	*cs++ = offset;
 	*cs++ = rq->fence.seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
@@ -351,15 +364,28 @@ int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
 
 u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
-	*cs++ = GFX_OP_PIPE_CONTROL(4);
-	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
+	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
+	u32 offset = __i915_request_hwsp_offset(rq);
+	unsigned int flags;
+
+	GEM_BUG_ON(tl->mode == INTEL_TIMELINE_RELATIVE_CONTEXT);
+
+	flags = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
 		 PIPE_CONTROL_FLUSH_ENABLE |
 		 PIPE_CONTROL_QW_WRITE |
-		 PIPE_CONTROL_GLOBAL_GTT_IVB |
 		 PIPE_CONTROL_CS_STALL);
-	*cs++ = i915_request_active_timeline(rq)->ggtt_offset;
+	if (intel_timeline_is_relative(tl)) {
+		offset = offset_in_page(offset);
+		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+	}
+	if (!intel_timeline_in_context(tl))
+		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = flags;
+	*cs++ = offset;
 	*cs++ = rq->fence.seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
@@ -373,11 +399,21 @@ u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 
 u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 {
-	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
-	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
+	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
+	u32 offset = __i915_request_hwsp_offset(rq);
+	unsigned int flags = 0;
+
+	GEM_BUG_ON(tl->mode == INTEL_TIMELINE_RELATIVE_CONTEXT);
 
-	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
-	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
+	if (intel_timeline_is_relative(tl)) {
+		offset = offset_in_page(offset);
+		flags |= MI_FLUSH_DW_STORE_INDEX;
+	}
+	if (!intel_timeline_in_context(tl))
+		offset |= MI_FLUSH_DW_USE_GTT;
+
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | flags;
+	*cs++ = offset;
 	*cs++ = rq->fence.seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
@@ -391,28 +427,31 @@ u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 #define GEN7_XCS_WA 32
 u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 {
+	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
+	u32 offset = __i915_request_hwsp_offset(rq);
+	u32 cmd = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
 	int i;
 
-	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
-	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
+	GEM_BUG_ON(tl->mode == INTEL_TIMELINE_RELATIVE_CONTEXT);
 
-	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
-		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
-	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
+	if (intel_timeline_is_relative(tl)) {
+		offset = offset_in_page(offset);
+		cmd |= MI_FLUSH_DW_STORE_INDEX;
+	}
+	if (!intel_timeline_in_context(tl))
+		offset |= MI_FLUSH_DW_USE_GTT;
+
+	*cs++ = cmd;
+	*cs++ = offset;
 	*cs++ = rq->fence.seqno;
 
 	for (i = 0; i < GEN7_XCS_WA; i++) {
-		*cs++ = MI_STORE_DWORD_INDEX;
-		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
+		*cs++ = cmd;
+		*cs++ = offset;
 		*cs++ = rq->fence.seqno;
 	}
 
-	*cs++ = MI_FLUSH_DW;
-	*cs++ = 0;
-	*cs++ = 0;
-
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index 5f0485cb9e3b..171874589174 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -336,19 +336,6 @@ static inline u32 preempt_address(struct intel_engine_cs *engine)
 		I915_GEM_HWS_PREEMPT_ADDR);
 }
 
-static u32 hwsp_offset(const struct i915_request *rq)
-{
-	const struct intel_timeline_cacheline *cl;
-
-	/* Before the request is executed, the timeline/cachline is fixed */
-
-	cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
-	if (cl)
-		return cl->ggtt_offset;
-
-	return rcu_dereference_protected(rq->timeline, 1)->ggtt_offset;
-}
-
 int gen8_emit_init_breadcrumb(struct i915_request *rq)
 {
 	u32 *cs;
@@ -371,7 +358,7 @@ int gen8_emit_init_breadcrumb(struct i915_request *rq)
 	*cs++ = MI_NOOP;
 
 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
-	*cs++ = hwsp_offset(rq);
+	*cs++ = __i915_request_hwsp_offset(rq);
 	*cs++ = 0;
 	*cs++ = rq->fence.seqno - 1;
 
@@ -504,7 +491,7 @@ static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
 	unsigned int flags = MI_FLUSH_DW_OP_STOREDW;
-	u32 offset = hwsp_offset(rq);
+	u32 offset = __i915_request_hwsp_offset(rq);
 
 	if (intel_timeline_is_relative(tl)) {
 		offset = offset_in_page(offset);
@@ -526,7 +513,7 @@ u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
 	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
 	unsigned int flags = PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL;
-	u32 offset = hwsp_offset(rq);
+	u32 offset = __i915_request_hwsp_offset(rq);
 
 	if (intel_timeline_is_relative(tl)) {
 		offset = offset_in_page(offset);
@@ -551,7 +538,7 @@ u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
 	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
-	u32 offset = hwsp_offset(rq);
+	u32 offset = __i915_request_hwsp_offset(rq);
 	unsigned int flags;
 
 	flags = (PIPE_CONTROL_CS_STALL |
@@ -633,7 +620,7 @@ u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 {
 	struct intel_timeline *tl = rcu_dereference_protected(rq->timeline, 1);
-	u32 offset = hwsp_offset(rq);
+	u32 offset = __i915_request_hwsp_offset(rq);
 	unsigned int flags;
 
 	flags = (PIPE_CONTROL_CS_STALL |
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index cab775a78912..4c65772ff8dc 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -1122,7 +1122,7 @@ static void setup_common(struct intel_engine_cs *engine)
 	 */
 	engine->emit_fini_breadcrumb = gen3_emit_breadcrumb;
 	if (IS_GEN(i915, 5))
-		engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
+		engine->emit_fini_breadcrumb = gen5_emit_breadcrumb_xcs;
 
 	engine->set_default_submission = i9xx_set_default_submission;
 
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index adfe863f778e..560a0ef0c80d 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -615,4 +615,17 @@ i915_request_active_timeline(const struct i915_request *rq)
 					 lockdep_is_held(&rq->engine->active.lock));
 }
 
+static inline u32 __i915_request_hwsp_offset(const struct i915_request *rq)
+{
+	const struct intel_timeline_cacheline *cl;
+
+	/* Before the request is executed, the timeline/cacheline is fixed */
+
+	cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
+	if (cl)
+		return cl->ggtt_offset;
+
+	return rcu_dereference_protected(rq->timeline, 1)->ggtt_offset;
+}
+
 #endif /* I915_REQUEST_H */
-- 
2.20.1