[Intel-gfx] [PATCH 1/2] drm/i915: Engine relative MMIO

John.C.Harrison at Intel.com John.C.Harrison at Intel.com
Thu Sep 19 06:47:09 UTC 2019


From: John Harrison <John.C.Harrison at Intel.com>

With virtual engines, it is no longer possible to know which specific
physical engine a given request will be executed on at the time that
request is generated. This means that the request itself must be engine
agnostic - any direct register writes must be relative to the engine
and not absolute addresses.

The LRI command has support for engine relative addressing. However,
the mechanism is not transparent to the driver. The scheme for Gen11
(MI_LRI_ADD_CS_MMIO_START) requires the LRI address to have no
absolute engine base component. The hardware then adds on the correct
engine offset at execution time.

Due to the non-trivial and differing schemes on different hardware, it
is not possible to simply set a remap flag and let the hardware get on
with it. Instead, this patch adds 'lri_cmd_mode' and 'lri_mmio_base'
fields to the engine structure. The former should be ORd in when
creating an LRI command. The latter should be used as the register
base address (as opposed to just engine->mmio_base). They are set per
engine according to the currently available hardware scheme.

v2: Fix build break in GVT. Remove flags parameter [review feedback
from Chris W].

v3: Fix build break in selftest. Rebase to newer base tree and fix
merge conflict.

v4: More rebasing. Rmoved relative addressing support from Gen7-9 only
code paths [review feedback from Chris W].

v5: More rebasing (new 'gt' directory). Fix white space issue. Use
COPY class rather than BCS0 id for checking against BCS engine.

v6: Change to MI_LRI for generic code and __MI_LRI for legacy code.
MI_LRI is now a macro wrapper around an engine function pointer to
reduce runtime checks on LRI relative support. [review feedback from
Tvrtko]

v7: Replace engine function pointer with a per engine flags field that
is added by a common function. [Daniele]

v8: Re-write on top of patch series by Mika - greatly simplified to
not bother with any of the abstraction of the previous version.

Bspec: 45606
Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
CC: Rodrigo Vivi <rodrigo.vivi at intel.com>
CC: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
CC: Chris P Wilson <chris.p.wilson at intel.com>
CC: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c  |  7 ++--
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    | 24 ++++++++++++
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  3 ++
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  7 +++-
 drivers/gpu/drm/i915/gt/intel_lrc.c          | 40 ++++++++++----------
 drivers/gpu/drm/i915/gt/intel_mocs.c         |  6 +--
 drivers/gpu/drm/i915/gt/intel_ringbuffer.c   | 12 +++---
 drivers/gpu/drm/i915/gt/intel_workarounds.c  |  2 +-
 drivers/gpu/drm/i915/i915_perf.c             |  8 +++-
 9 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index f1c0e5d958f3..a17aa78017cb 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -980,7 +980,7 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
 {
 	struct i915_address_space *vm = rq->hw_context->vm;
 	struct intel_engine_cs *engine = rq->engine;
-	u32 base = engine->mmio_base;
+	u32 base = engine->lri_mmio_base;
 	u32 *cs;
 	int i;
 
@@ -992,7 +992,7 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
 		if (IS_ERR(cs))
 			return PTR_ERR(cs);
 
-		*cs++ = MI_LOAD_REGISTER_IMM(2);
+		*cs++ = MI_LOAD_REGISTER_IMM(2) | engine->lri_cmd_mode;
 
 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
 		*cs++ = upper_32_bits(pd_daddr);
@@ -1014,7 +1014,8 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
 		if (IS_ERR(cs))
 			return PTR_ERR(cs);
 
-		*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
+		*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) |
+			MI_LRI_FORCE_POSTED | engine->lri_cmd_mode;
 		for (i = GEN8_3LVL_PDPES; i--; ) {
 			const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 3c176b0f4b45..6e51269ab2ec 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -236,6 +236,28 @@ static u32 __engine_mmio_base(struct drm_i915_private *i915,
 	return bases[i].base;
 }
 
+static bool i915_engine_has_relative_lri(const struct intel_engine_cs *engine)
+{
+	if (INTEL_GEN(engine->i915) < 11)
+		return false;
+
+	if (engine->class == COPY_ENGINE_CLASS)
+		return false;
+
+	return true;
+}
+
+static void lri_init(struct intel_engine_cs *engine)
+{
+	if (i915_engine_has_relative_lri(engine)) {
+		engine->lri_cmd_mode = MI_LRI_ADD_CS_MMIO_START_GEN11;
+		engine->lri_mmio_base = 0;
+	} else {
+		engine->lri_cmd_mode = 0;
+		engine->lri_mmio_base = engine->mmio_base;
+	}
+}
+
 static void __sprint_engine_name(struct intel_engine_cs *engine)
 {
 	/*
@@ -320,6 +342,8 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
 	/* Nothing to do here, execute in order of dependencies */
 	engine->schedule = NULL;
 
+	lri_init(engine);
+
 	seqlock_init(&engine->stats.lock);
 
 	ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 943f0663837e..38f486f7f7e3 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -306,6 +306,9 @@ struct intel_engine_cs {
 	u32 context_size;
 	u32 mmio_base;
 
+	u32 lri_mmio_base;
+	u32 lri_cmd_mode;
+
 	u32 uabi_capabilities;
 
 	struct rb_node uabi_node;
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index f78b13d74e17..bfb51d8d7ce2 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -133,11 +133,16 @@
  *   simply ignores the register load under certain conditions.
  * - One can actually load arbitrary many arbitrary registers: Simply issue x
  *   address/value pairs. Don't overdue it, though, x <= 2^4 must hold!
+ * - Newer hardware supports engine relative addressing but older hardware does
+ *   not. This is required for hw engine load balancing. Hence the MI_LRI
+ *   instruction itself is prefixed with '__' and should only be used on
+ *   legacy hardware code paths. Generic code must always use the MI_LRI
+ *   and i915_get_lri_reg() helper functions instead.
  */
 #define MI_LOAD_REGISTER_IMM(x)	MI_INSTR(0x22, 2*(x)-1)
 /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
-#define   MI_LRI_CS_MMIO		(1<<19)
 #define   MI_LRI_FORCE_POSTED		(1<<12)
+#define   MI_LRI_ADD_CS_MMIO_START_GEN11	BIT(19)
 #define MI_STORE_REGISTER_MEM        MI_INSTR(0x24, 1)
 #define MI_STORE_REGISTER_MEM_GEN8   MI_INSTR(0x24, 2)
 #define   MI_SRM_LRM_GLOBAL_GTT		(1<<22)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index a99166a2d2eb..d5ea6f7d3b5c 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1986,11 +1986,12 @@ struct lri {
 	u32 value;
 };
 
-static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
+static u32 *emit_lri(struct intel_engine_cs *engine, u32 *batch,
+		     const struct lri *lri, unsigned int count)
 {
 	GEM_BUG_ON(!count || count > 63);
 
-	*batch++ = MI_LOAD_REGISTER_IMM(count);
+	*batch++ = MI_LOAD_REGISTER_IMM(count) | engine->lri_cmd_mode;
 	do {
 		*batch++ = i915_mmio_reg_offset(lri->reg);
 		*batch++ = lri->value;
@@ -2030,7 +2031,7 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
 
-	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
+	batch = emit_lri(engine, batch, lri, ARRAY_SIZE(lri));
 
 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
 	if (HAS_POOLED_EU(engine->i915)) {
@@ -3186,7 +3187,7 @@ static void init_common_reg_state(u32 * const regs,
 				  struct intel_engine_cs *engine,
 				  struct intel_ring *ring)
 {
-	const u32 base = engine->mmio_base;
+	const u32 base = engine->lri_mmio_base;
 
 	CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
 		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
@@ -3211,7 +3212,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
 				 u32 pos_bb_per_ctx)
 {
 	struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
-	const u32 base = engine->mmio_base;
+	const u32 base = engine->lri_mmio_base;
 	const u32 pos_indirect_ctx = pos_bb_per_ctx + 2;
 	const u32 pos_indirect_ctx_offset = pos_indirect_ctx + 2;
 
@@ -3239,6 +3240,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
 }
 
 static void init_ppgtt_reg_state(u32 *regs, u32 base,
+				 struct intel_engine_cs *engine,
 				 struct i915_ppgtt *ppgtt)
 {
 	/* PDP values well be assigned later if needed */
@@ -3280,14 +3282,12 @@ static void gen8_init_reg_state(u32 * const regs,
 {
 	struct i915_ppgtt * const ppgtt = vm_alias(ce->vm);
 	const bool rcs = engine->class == RENDER_CLASS;
-	const u32 base = engine->mmio_base;
-	const u32 lri_base =
-		intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
+	const u32 base = engine->lri_mmio_base;
 
 	regs[CTX_LRI_HEADER_0] =
 		MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
 		MI_LRI_FORCE_POSTED |
-		lri_base;
+		engine->lri_cmd_mode;
 
 	init_common_reg_state(regs, ppgtt, engine, ring);
 	CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
@@ -3299,15 +3299,17 @@ static void gen8_init_reg_state(u32 * const regs,
 	regs[CTX_LRI_HEADER_1] =
 		MI_LOAD_REGISTER_IMM(9) |
 		MI_LRI_FORCE_POSTED |
-		lri_base;
+		engine->lri_cmd_mode;
 
 	CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
 
-	init_ppgtt_reg_state(regs, base, ppgtt);
+	init_ppgtt_reg_state(regs, base, engine, ppgtt);
 
 	if (rcs) {
-		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) | lri_base;
-		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
+		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) |
+					 engine->lri_cmd_mode;
+		CTX_REG(regs, CTX_R_PWR_CLK_STATE,
+			GEN8_R_PWR_CLK_STATE, 0);
 	}
 
 	regs[CTX_END] = MI_BATCH_BUFFER_END;
@@ -3322,14 +3324,12 @@ static void gen12_init_reg_state(u32 * const regs,
 {
 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(ce->vm);
 	const bool rcs = engine->class == RENDER_CLASS;
-	const u32 base = engine->mmio_base;
-	const u32 lri_base =
-		intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
+	const u32 base = engine->lri_mmio_base;
 
 	regs[CTX_LRI_HEADER_0] =
 		MI_LOAD_REGISTER_IMM(rcs ? 11 : 9) |
 		MI_LRI_FORCE_POSTED |
-		lri_base;
+		engine->lri_cmd_mode;
 
 	init_common_reg_state(regs, ppgtt, engine, ring);
 
@@ -3339,15 +3339,15 @@ static void gen12_init_reg_state(u32 * const regs,
 	regs[CTX_LRI_HEADER_1] =
 		MI_LOAD_REGISTER_IMM(9) |
 		MI_LRI_FORCE_POSTED |
-		lri_base;
+		engine->lri_cmd_mode;
 
 	CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
 
-	init_ppgtt_reg_state(regs, base, ppgtt);
+	init_ppgtt_reg_state(regs, base, engine, ppgtt);
 
 	if (rcs) {
 		regs[GEN12_CTX_LRI_HEADER_3] =
-			MI_LOAD_REGISTER_IMM(1) | lri_base;
+			MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
 		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
 
 		/* TODO: oa_init_reg_state ? */
diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c
index 728704bbbe18..8e8fe3deb95c 100644
--- a/drivers/gpu/drm/i915/gt/intel_mocs.c
+++ b/drivers/gpu/drm/i915/gt/intel_mocs.c
@@ -368,9 +368,6 @@ static u32 get_entry_control(const struct drm_i915_mocs_table *table,
 /**
  * intel_mocs_init_engine() - emit the mocs control table
  * @engine:	The engine for whom to emit the registers.
- *
- * This function simply emits a MI_LOAD_REGISTER_IMM command for the
- * given table starting at the given address.
  */
 void intel_mocs_init_engine(struct intel_engine_cs *engine)
 {
@@ -456,7 +453,8 @@ static int emit_mocs_control_table(struct i915_request *rq,
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
-	*cs++ = MI_LOAD_REGISTER_IMM(table->n_entries);
+	*cs++ = MI_LOAD_REGISTER_IMM(table->n_entries) |
+		rq->engine->lri_cmd_mode;
 
 	for (index = 0; index < table->size; index++) {
 		u32 value = get_entry_control(table, index);
diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
index a25b84b12ef1..947f74161d91 100644
--- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
@@ -1535,12 +1535,13 @@ static int load_pd_dir(struct i915_request *rq, const struct i915_ppgtt *ppgtt)
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
-	*cs++ = MI_LOAD_REGISTER_IMM(1);
-	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
+	/* Can these not be merged into a single LRI??? */
+	*cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
+	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->lri_mmio_base));
 	*cs++ = PP_DIR_DCLV_2G;
 
-	*cs++ = MI_LOAD_REGISTER_IMM(1);
-	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
+	*cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
+	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->lri_mmio_base));
 	*cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
 
 	intel_ring_advance(rq, cs);
@@ -1708,7 +1709,8 @@ static int remap_l3_slice(struct i915_request *rq, int slice)
 	 * here because no other code should access these registers other than
 	 * at initialization time.
 	 */
-	*cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
+	*cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE / 4) |
+		rq->engine->lri_cmd_mode;
 	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
 		*cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
 		*cs++ = remap_info[i];
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index 41d0f786e06d..895f12009208 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -633,7 +633,7 @@ int intel_engine_emit_ctx_wa(struct i915_request *rq)
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
-	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
+	*cs++ = MI_LOAD_REGISTER_IMM(wal->count) | rq->engine->lri_cmd_mode;
 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 		*cs++ = i915_mmio_reg_offset(wa->reg);
 		*cs++ = wa->val;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index c1b764233761..8b85cdfada21 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1693,6 +1693,11 @@ gen8_update_reg_state_unlocked(struct i915_perf_stream *stream,
 	};
 	int i;
 
+	/*
+	 * NB: The LRI instruction is generated by the hardware.
+	 * Should we read it in and assert that the offset flag is set?
+	 */
+
 	CTX_REG(reg_state, ctx_oactxctrl, GEN8_OACTXCONTROL,
 		(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
 		(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
@@ -1752,8 +1757,9 @@ gen8_load_flex(struct i915_request *rq,
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
-	*cs++ = MI_LOAD_REGISTER_IMM(count);
+	*cs++ = MI_LOAD_REGISTER_IMM(count) | rq->engine->lri_cmd_mode;
 	do {
+		/* FIXME: Is this table LRI remap/offset friendly? */
 		*cs++ = i915_mmio_reg_offset(flex->reg);
 		*cs++ = flex->value;
 	} while (flex++, --count);
-- 
2.21.0.5.gaeb582a983



More information about the Intel-gfx mailing list