[PATCH 78/80] drm/i915/gt: Stall around xcs invalidations on tgl

Chris Wilson chris at chris-wilson.co.uk
Fri Aug 7 23:23:29 UTC 2020


Whether this is an arbitrary stall or a vital ingredient, nevertheless the
impact is noticeable. If we do not have the stall around the xcs
invalidation before a request, writes within that request sometimes go
astray.

v2: Split between flush/invalidate, as it seems we can then beat the
mysterious incoherency at a fraction of the cost; note that this only
reduces the risk (albeit substantially) rather than eliminating it.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2169
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Acked-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 57 ++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 4b46b112cd42..961c73ff0d82 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -4158,6 +4158,24 @@ static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
 }
 
+static u32 *emit_mi_flush(u32 *cs, u32 flags)
+{
+	*cs++ = (MI_FLUSH_DW + 1) | flags;
+	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
+	*cs++ = 0; /* upper addr */
+	*cs++ = 0; /* value */
+
+	return cs;
+}
+
+static u32 *emit_xcs_invalidate(u32 *cs)
+{
+	return emit_mi_flush(cs,
+			     MI_FLUSH_DW_STORE_INDEX |
+			     MI_FLUSH_DW_OP_STOREDW |
+			     MI_INVALIDATE_TLB);
+}
+
 static int gen8_emit_flush(struct i915_request *request, u32 mode)
 {
 	u32 cmd, *cs;
@@ -4166,14 +4184,13 @@ static int gen8_emit_flush(struct i915_request *request, u32 mode)
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
-	cmd = MI_FLUSH_DW + 1;
-
-	/* We always require a command barrier so that subsequent
+	/*
+	 * We always require a command barrier so that subsequent
 	 * commands, such as breadcrumb interrupts, are strictly ordered
 	 * wrt the contents of the write cache being flushed to memory
 	 * (and thus being coherent from the CPU).
 	 */
-	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
+	cmd = MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
 
 	if (mode & EMIT_INVALIDATE) {
 		cmd |= MI_INVALIDATE_TLB;
@@ -4181,10 +4198,8 @@ static int gen8_emit_flush(struct i915_request *request, u32 mode)
 			cmd |= MI_INVALIDATE_BSD;
 	}
 
-	*cs++ = cmd;
-	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
-	*cs++ = 0; /* upper addr */
-	*cs++ = 0; /* value */
+	cs = emit_mi_flush(cs, cmd);
+
 	intel_ring_advance(request, cs);
 
 	return 0;
@@ -4423,10 +4438,12 @@ static int gen12_emit_flush_render(struct i915_request *request,
 
 static int gen12_emit_flush(struct i915_request *request, u32 mode)
 {
+#define WA_CNT 2 /* Magic delay? */
 	intel_engine_mask_t aux_inv = 0;
 	u32 cmd, *cs;
+	int n;
 
-	cmd = 4;
+	cmd = 4 * WA_CNT;
 	if (mode & EMIT_INVALIDATE)
 		cmd += 2;
 	if (mode & EMIT_INVALIDATE)
@@ -4441,25 +4458,21 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode)
 	if (mode & EMIT_INVALIDATE)
 		*cs++ = preparser_disable(true);
 
-	cmd = MI_FLUSH_DW + 1;
-
-	/* We always require a command barrier so that subsequent
+	/*
+	 * We always require a command barrier so that subsequent
 	 * commands, such as breadcrumb interrupts, are strictly ordered
 	 * wrt the contents of the write cache being flushed to memory
 	 * (and thus being coherent from the CPU).
 	 */
-	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
-
+	cmd = MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
 	if (mode & EMIT_INVALIDATE) {
 		cmd |= MI_INVALIDATE_TLB;
 		if (request->engine->class == VIDEO_DECODE_CLASS)
 			cmd |= MI_INVALIDATE_BSD;
 	}
 
-	*cs++ = cmd;
-	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
-	*cs++ = 0; /* upper addr */
-	*cs++ = 0; /* value */
+	for (n = 0; n < WA_CNT; n++)
+		cs = emit_mi_flush(cs, cmd);
 
 	if (aux_inv) { /* hsdes: 1809175790 */
 		struct intel_engine_cs *engine;
@@ -4480,6 +4493,7 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode)
 	intel_ring_advance(request, cs);
 
 	return 0;
+#undef WA_CNT
 }
 
 static void assert_request_valid(struct i915_request *rq)
@@ -4633,7 +4647,14 @@ gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
 
 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
 {
+#define WA_CNT 2
+	int i;
+
+	for (i = 0; i < WA_CNT; i++)
+		cs = emit_xcs_invalidate(cs);
+
 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
+#undef WA_CNT
 }
 
 static u32 *
-- 
2.20.1



More information about the Intel-gfx-trybot mailing list