[Intel-gfx] [PATCH 1/2] drm/i915: Engine relative MMIO

Tue Sep 24 09:16:02 UTC 2019

On 24/09/2019 00:51, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
> 
> With virtual engines, it is no longer possible to know which specific
> physical engine a given request will be executed on at the time that
> request is generated. This means that the request itself must be engine
> agnostic - any direct register writes must be relative to the engine
> and not absolute addresses.
> 
> The LRI command has support for engine relative addressing. However,
> the mechanism is not transparent to the driver. The scheme for Gen11
> (MI_LRI_ADD_CS_MMIO_START) requires the LRI address to have no
> absolute engine base component. The hardware then adds on the correct
> engine offset at execution time.
> 
> Due to the non-trivial and differing schemes on different hardware, it
> is not possible to simply update the code that creates the LRI
> commands to set a remap flag and let the hardware get on with it.
> Instead, this patch adds function wrappers for generating the LRI
> command itself and then for constructing the correct address to use
> with the LRI.
> 
> v2: Fix build break in GVT. Remove flags parameter [review feedback
> from Chris W].
> 
> v3: Fix build break in selftest. Rebase to newer base tree and fix
> merge conflict.
> 
> v4: More rebasing. Rmoved relative addressing support from Gen7-9 only
> code paths [review feedback from Chris W].
> 
> v5: More rebasing (new 'gt' directory). Fix white space issue. Use
> COPY class rather than BCS0 id for checking against BCS engine.
> 
> v6: Change to MI_LRI for generic code and __MI_LRI for legacy code.
> MI_LRI is now a macro wrapper around an engine function pointer to
> reduce runtime checks on LRI relative support. [review feedback from
> Tvrtko]
> 
> v7: Replace engine function pointer with a per engine flags field that
> is added by a common function. [Daniele]
> 
> v8: Re-write on top of patch series by Mika.
> 
> v9: Fix workaround IGT failure. Not sure why but my local test runs
> were passing by claiming no workarounds whereas the pre-commit check
> failed. As all the workaround registers are currently engine
> independent, it is safe to simply not add the engine relative flag
> (which is what the selftest was expecting).
> 
> Bspec: 45606
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> CC: Rodrigo Vivi <rodrigo.vivi at intel.com>
> CC: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> CC: Chris P Wilson <chris.p.wilson at intel.com>
> CC: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c  |  7 ++--
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c    | 24 ++++++++++++
>   drivers/gpu/drm/i915/gt/intel_engine_types.h |  3 ++
>   drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  7 +++-
>   drivers/gpu/drm/i915/gt/intel_lrc.c          | 40 ++++++++++----------
>   drivers/gpu/drm/i915/gt/intel_mocs.c         |  6 +--
>   drivers/gpu/drm/i915/gt/intel_ringbuffer.c   | 12 +++---
>   drivers/gpu/drm/i915/i915_perf.c             |  8 +++-
>   8 files changed, 73 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 4a34c4f62065..c8d7075a481d 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -980,7 +980,7 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
>   {
>   	struct i915_address_space *vm = rq->hw_context->vm;
>   	struct intel_engine_cs *engine = rq->engine;
> -	u32 base = engine->mmio_base;
> +	u32 base = engine->lri_mmio_base;
>   	u32 *cs;
>   	int i;
>   
> @@ -992,7 +992,7 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
>   		if (IS_ERR(cs))
>   			return PTR_ERR(cs);
>   
> -		*cs++ = MI_LOAD_REGISTER_IMM(2);
> +		*cs++ = MI_LOAD_REGISTER_IMM(2) | engine->lri_cmd_mode;
>   
>   		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
>   		*cs++ = upper_32_bits(pd_daddr);
> @@ -1014,7 +1014,8 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
>   		if (IS_ERR(cs))
>   			return PTR_ERR(cs);
>   
> -		*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
> +		*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) |
> +			MI_LRI_FORCE_POSTED | engine->lri_cmd_mode;
>   		for (i = GEN8_3LVL_PDPES; i--; ) {
>   			const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index f451d5076bde..5aa58e069806 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -236,6 +236,28 @@ static u32 __engine_mmio_base(struct drm_i915_private *i915,
>   	return bases[i].base;
>   }
>   
> +static bool i915_engine_has_relative_lri(const struct intel_engine_cs *engine)
> +{
> +	if (INTEL_GEN(engine->i915) < 11)
> +		return false;
> +
> +	if (engine->class == COPY_ENGINE_CLASS)
> +		return false;
> +
> +	return true;
> +}
> +
> +static void lri_init(struct intel_engine_cs *engine)
> +{
> +	if (i915_engine_has_relative_lri(engine)) {
> +		engine->lri_cmd_mode = MI_LRI_ADD_CS_MMIO_START_GEN11;
> +		engine->lri_mmio_base = 0;
> +	} else {
> +		engine->lri_cmd_mode = 0;
> +		engine->lri_mmio_base = engine->mmio_base;
> +	}
> +}
> +
>   static void __sprint_engine_name(struct intel_engine_cs *engine)
>   {
>   	/*
> @@ -320,6 +342,8 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
>   	/* Nothing to do here, execute in order of dependencies */
>   	engine->schedule = NULL;
>   
> +	lri_init(engine);
> +
>   	seqlock_init(&engine->stats.lock);
>   
>   	ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 943f0663837e..38f486f7f7e3 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -306,6 +306,9 @@ struct intel_engine_cs {
>   	u32 context_size;
>   	u32 mmio_base;
>   
> +	u32 lri_mmio_base;
> +	u32 lri_cmd_mode;
> +
>   	u32 uabi_capabilities;
>   
>   	struct rb_node uabi_node;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index f78b13d74e17..bfb51d8d7ce2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -133,11 +133,16 @@
>    *   simply ignores the register load under certain conditions.
>    * - One can actually load arbitrary many arbitrary registers: Simply issue x
>    *   address/value pairs. Don't overdue it, though, x <= 2^4 must hold!
> + * - Newer hardware supports engine relative addressing but older hardware does
> + *   not. This is required for hw engine load balancing. Hence the MI_LRI
> + *   instruction itself is prefixed with '__' and should only be used on
> + *   legacy hardware code paths. Generic code must always use the MI_LRI
> + *   and i915_get_lri_reg() helper functions instead.
>    */
>   #define MI_LOAD_REGISTER_IMM(x)	MI_INSTR(0x22, 2*(x)-1)
>   /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
> -#define   MI_LRI_CS_MMIO		(1<<19)
>   #define   MI_LRI_FORCE_POSTED		(1<<12)
> +#define   MI_LRI_ADD_CS_MMIO_START_GEN11	BIT(19)
>   #define MI_STORE_REGISTER_MEM        MI_INSTR(0x24, 1)
>   #define MI_STORE_REGISTER_MEM_GEN8   MI_INSTR(0x24, 2)
>   #define   MI_SRM_LRM_GLOBAL_GTT		(1<<22)
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 6cfdc0f9f2b9..7cd59d29c4e4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2010,11 +2010,12 @@ struct lri {
>   	u32 value;
>   };
>   
> -static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
> +static u32 *emit_lri(struct intel_engine_cs *engine, u32 *batch,
> +		     const struct lri *lri, unsigned int count)
>   {
>   	GEM_BUG_ON(!count || count > 63);
>   
> -	*batch++ = MI_LOAD_REGISTER_IMM(count);
> +	*batch++ = MI_LOAD_REGISTER_IMM(count) | engine->lri_cmd_mode;
>   	do {
>   		*batch++ = i915_mmio_reg_offset(lri->reg);
>   		*batch++ = lri->value;
> @@ -2054,7 +2055,7 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
>   	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
>   	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
>   
> -	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
> +	batch = emit_lri(engine, batch, lri, ARRAY_SIZE(lri));
>   
>   	/* WaMediaPoolStateCmdInWABB:bxt,glk */
>   	if (HAS_POOLED_EU(engine->i915)) {
> @@ -3282,7 +3283,7 @@ static void init_common_reg_state(u32 * const regs,
>   				  struct intel_engine_cs *engine,
>   				  struct intel_ring *ring)
>   {
> -	const u32 base = engine->mmio_base;
> +	const u32 base = engine->lri_mmio_base;
>   
>   	CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
>   		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
> @@ -3307,7 +3308,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
>   				 u32 pos_bb_per_ctx)
>   {
>   	struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
> -	const u32 base = engine->mmio_base;
> +	const u32 base = engine->lri_mmio_base;
>   	const u32 pos_indirect_ctx = pos_bb_per_ctx + 2;
>   	const u32 pos_indirect_ctx_offset = pos_indirect_ctx + 2;
>   
> @@ -3335,6 +3336,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
>   }
>   
>   static void init_ppgtt_reg_state(u32 *regs, u32 base,
> +				 struct intel_engine_cs *engine,
>   				 struct i915_ppgtt *ppgtt)
>   {
>   	/* PDP values well be assigned later if needed */
> @@ -3376,14 +3378,12 @@ static void gen8_init_reg_state(u32 * const regs,
>   {
>   	struct i915_ppgtt * const ppgtt = vm_alias(ce->vm);
>   	const bool rcs = engine->class == RENDER_CLASS;
> -	const u32 base = engine->mmio_base;
> -	const u32 lri_base =
> -		intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
> +	const u32 base = engine->lri_mmio_base;
>   
>   	regs[CTX_LRI_HEADER_0] =
>   		MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
>   		MI_LRI_FORCE_POSTED |
> -		lri_base;
> +		engine->lri_cmd_mode;
>   
>   	init_common_reg_state(regs, ppgtt, engine, ring);
>   	CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
> @@ -3395,15 +3395,17 @@ static void gen8_init_reg_state(u32 * const regs,
>   	regs[CTX_LRI_HEADER_1] =
>   		MI_LOAD_REGISTER_IMM(9) |
>   		MI_LRI_FORCE_POSTED |
> -		lri_base;
> +		engine->lri_cmd_mode;
>   
>   	CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
>   
> -	init_ppgtt_reg_state(regs, base, ppgtt);
> +	init_ppgtt_reg_state(regs, base, engine, ppgtt);

Is passed in base always equal to engine->mmio_base? If so then there is 
redundancy and you could drop this change.

Regards,

Tvrtko

>   
>   	if (rcs) {
> -		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) | lri_base;
> -		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
> +		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) |
> +					 engine->lri_cmd_mode;
> +		CTX_REG(regs, CTX_R_PWR_CLK_STATE,
> +			GEN8_R_PWR_CLK_STATE, 0);
>   	}
>   
>   	regs[CTX_END] = MI_BATCH_BUFFER_END;
> @@ -3418,14 +3420,12 @@ static void gen12_init_reg_state(u32 * const regs,
>   {
>   	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(ce->vm);
>   	const bool rcs = engine->class == RENDER_CLASS;
> -	const u32 base = engine->mmio_base;
> -	const u32 lri_base =
> -		intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
> +	const u32 base = engine->lri_mmio_base;
>   
>   	regs[CTX_LRI_HEADER_0] =
>   		MI_LOAD_REGISTER_IMM(rcs ? 11 : 9) |
>   		MI_LRI_FORCE_POSTED |
> -		lri_base;
> +		engine->lri_cmd_mode;
>   
>   	init_common_reg_state(regs, ppgtt, engine, ring);
>   
> @@ -3435,15 +3435,15 @@ static void gen12_init_reg_state(u32 * const regs,
>   	regs[CTX_LRI_HEADER_1] =
>   		MI_LOAD_REGISTER_IMM(9) |
>   		MI_LRI_FORCE_POSTED |
> -		lri_base;
> +		engine->lri_cmd_mode;
>   
>   	CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
>   
> -	init_ppgtt_reg_state(regs, base, ppgtt);
> +	init_ppgtt_reg_state(regs, base, engine, ppgtt);
>   
>   	if (rcs) {
>   		regs[GEN12_CTX_LRI_HEADER_3] =
> -			MI_LOAD_REGISTER_IMM(1) | lri_base;
> +			MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
>   		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
>   
>   		/* TODO: oa_init_reg_state ? */
> diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c
> index 728704bbbe18..8e8fe3deb95c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_mocs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c
> @@ -368,9 +368,6 @@ static u32 get_entry_control(const struct drm_i915_mocs_table *table,
>   /**
>    * intel_mocs_init_engine() - emit the mocs control table
>    * @engine:	The engine for whom to emit the registers.
> - *
> - * This function simply emits a MI_LOAD_REGISTER_IMM command for the
> - * given table starting at the given address.
>    */
>   void intel_mocs_init_engine(struct intel_engine_cs *engine)
>   {
> @@ -456,7 +453,8 @@ static int emit_mocs_control_table(struct i915_request *rq,
>   	if (IS_ERR(cs))
>   		return PTR_ERR(cs);
>   
> -	*cs++ = MI_LOAD_REGISTER_IMM(table->n_entries);
> +	*cs++ = MI_LOAD_REGISTER_IMM(table->n_entries) |
> +		rq->engine->lri_cmd_mode;
>   
>   	for (index = 0; index < table->size; index++) {
>   		u32 value = get_entry_control(table, index);
> diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
> index 0747b8c9f768..7027367b1f1a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
> @@ -1536,12 +1536,13 @@ static int load_pd_dir(struct i915_request *rq, const struct i915_ppgtt *ppgtt)
>   	if (IS_ERR(cs))
>   		return PTR_ERR(cs);
>   
> -	*cs++ = MI_LOAD_REGISTER_IMM(1);
> -	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
> +	/* Can these not be merged into a single LRI??? */
> +	*cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
> +	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->lri_mmio_base));
>   	*cs++ = PP_DIR_DCLV_2G;
>   
> -	*cs++ = MI_LOAD_REGISTER_IMM(1);
> -	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
> +	*cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
> +	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->lri_mmio_base));
>   	*cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
>   
>   	intel_ring_advance(rq, cs);
> @@ -1709,7 +1710,8 @@ static int remap_l3_slice(struct i915_request *rq, int slice)
>   	 * here because no other code should access these registers other than
>   	 * at initialization time.
>   	 */
> -	*cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
> +	*cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE / 4) |
> +		rq->engine->lri_cmd_mode;
>   	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
>   		*cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
>   		*cs++ = remap_info[i];
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index c1b764233761..8b85cdfada21 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1693,6 +1693,11 @@ gen8_update_reg_state_unlocked(struct i915_perf_stream *stream,
>   	};
>   	int i;
>   
> +	/*
> +	 * NB: The LRI instruction is generated by the hardware.
> +	 * Should we read it in and assert that the offset flag is set?
> +	 */
> +
>   	CTX_REG(reg_state, ctx_oactxctrl, GEN8_OACTXCONTROL,
>   		(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
>   		(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
> @@ -1752,8 +1757,9 @@ gen8_load_flex(struct i915_request *rq,
>   	if (IS_ERR(cs))
>   		return PTR_ERR(cs);
>   
> -	*cs++ = MI_LOAD_REGISTER_IMM(count);
> +	*cs++ = MI_LOAD_REGISTER_IMM(count) | rq->engine->lri_cmd_mode;
>   	do {
> +		/* FIXME: Is this table LRI remap/offset friendly? */
>   		*cs++ = i915_mmio_reg_offset(flex->reg);
>   		*cs++ = flex->value;
>   	} while (flex++, --count);
>