[Intel-gfx] [PATCH 1/2] drm/i915: Engine relative MMIO
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Sep 24 09:16:02 UTC 2019
On 24/09/2019 00:51, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> With virtual engines, it is no longer possible to know which specific
> physical engine a given request will be executed on at the time that
> request is generated. This means that the request itself must be engine
> agnostic - any direct register writes must be relative to the engine
> and not absolute addresses.
>
> The LRI command has support for engine relative addressing. However,
> the mechanism is not transparent to the driver. The scheme for Gen11
> (MI_LRI_ADD_CS_MMIO_START) requires the LRI address to have no
> absolute engine base component. The hardware then adds on the correct
> engine offset at execution time.
>
> Due to the non-trivial and differing schemes on different hardware, it
> is not possible to simply update the code that creates the LRI
> commands to set a remap flag and let the hardware get on with it.
> Instead, this patch adds function wrappers for generating the LRI
> command itself and then for constructing the correct address to use
> with the LRI.
>
> v2: Fix build break in GVT. Remove flags parameter [review feedback
> from Chris W].
>
> v3: Fix build break in selftest. Rebase to newer base tree and fix
> merge conflict.
>
> v4: More rebasing. Rmoved relative addressing support from Gen7-9 only
> code paths [review feedback from Chris W].
>
> v5: More rebasing (new 'gt' directory). Fix white space issue. Use
> COPY class rather than BCS0 id for checking against BCS engine.
>
> v6: Change to MI_LRI for generic code and __MI_LRI for legacy code.
> MI_LRI is now a macro wrapper around an engine function pointer to
> reduce runtime checks on LRI relative support. [review feedback from
> Tvrtko]
>
> v7: Replace engine function pointer with a per engine flags field that
> is added by a common function. [Daniele]
>
> v8: Re-write on top of patch series by Mika.
>
> v9: Fix workaround IGT failure. Not sure why but my local test runs
> were passing by claiming no workarounds whereas the pre-commit check
> failed. As all the workaround registers are currently engine
> independent, it is safe to simply not add the engine relative flag
> (which is what the selftest was expecting).
>
> Bspec: 45606
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> CC: Rodrigo Vivi <rodrigo.vivi at intel.com>
> CC: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> CC: Chris P Wilson <chris.p.wilson at intel.com>
> CC: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> ---
> drivers/gpu/drm/i915/gem/i915_gem_context.c | 7 ++--
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 24 ++++++++++++
> drivers/gpu/drm/i915/gt/intel_engine_types.h | 3 ++
> drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 7 +++-
> drivers/gpu/drm/i915/gt/intel_lrc.c | 40 ++++++++++----------
> drivers/gpu/drm/i915/gt/intel_mocs.c | 6 +--
> drivers/gpu/drm/i915/gt/intel_ringbuffer.c | 12 +++---
> drivers/gpu/drm/i915/i915_perf.c | 8 +++-
> 8 files changed, 73 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 4a34c4f62065..c8d7075a481d 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -980,7 +980,7 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
> {
> struct i915_address_space *vm = rq->hw_context->vm;
> struct intel_engine_cs *engine = rq->engine;
> - u32 base = engine->mmio_base;
> + u32 base = engine->lri_mmio_base;
> u32 *cs;
> int i;
>
> @@ -992,7 +992,7 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> - *cs++ = MI_LOAD_REGISTER_IMM(2);
> + *cs++ = MI_LOAD_REGISTER_IMM(2) | engine->lri_cmd_mode;
>
> *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
> *cs++ = upper_32_bits(pd_daddr);
> @@ -1014,7 +1014,8 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> - *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
> + *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) |
> + MI_LRI_FORCE_POSTED | engine->lri_cmd_mode;
> for (i = GEN8_3LVL_PDPES; i--; ) {
> const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index f451d5076bde..5aa58e069806 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -236,6 +236,28 @@ static u32 __engine_mmio_base(struct drm_i915_private *i915,
> return bases[i].base;
> }
>
> +static bool i915_engine_has_relative_lri(const struct intel_engine_cs *engine)
> +{
> + if (INTEL_GEN(engine->i915) < 11)
> + return false;
> +
> + if (engine->class == COPY_ENGINE_CLASS)
> + return false;
> +
> + return true;
> +}
> +
> +static void lri_init(struct intel_engine_cs *engine)
> +{
> + if (i915_engine_has_relative_lri(engine)) {
> + engine->lri_cmd_mode = MI_LRI_ADD_CS_MMIO_START_GEN11;
> + engine->lri_mmio_base = 0;
> + } else {
> + engine->lri_cmd_mode = 0;
> + engine->lri_mmio_base = engine->mmio_base;
> + }
> +}
> +
> static void __sprint_engine_name(struct intel_engine_cs *engine)
> {
> /*
> @@ -320,6 +342,8 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
> /* Nothing to do here, execute in order of dependencies */
> engine->schedule = NULL;
>
> + lri_init(engine);
> +
> seqlock_init(&engine->stats.lock);
>
> ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 943f0663837e..38f486f7f7e3 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -306,6 +306,9 @@ struct intel_engine_cs {
> u32 context_size;
> u32 mmio_base;
>
> + u32 lri_mmio_base;
> + u32 lri_cmd_mode;
> +
> u32 uabi_capabilities;
>
> struct rb_node uabi_node;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index f78b13d74e17..bfb51d8d7ce2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -133,11 +133,16 @@
> * simply ignores the register load under certain conditions.
> * - One can actually load arbitrary many arbitrary registers: Simply issue x
> * address/value pairs. Don't overdue it, though, x <= 2^4 must hold!
> + * - Newer hardware supports engine relative addressing but older hardware does
> + * not. This is required for hw engine load balancing. Hence the MI_LRI
> + * instruction itself is prefixed with '__' and should only be used on
> + * legacy hardware code paths. Generic code must always use the MI_LRI
> + * and i915_get_lri_reg() helper functions instead.
> */
> #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1)
> /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
> -#define MI_LRI_CS_MMIO (1<<19)
> #define MI_LRI_FORCE_POSTED (1<<12)
> +#define MI_LRI_ADD_CS_MMIO_START_GEN11 BIT(19)
> #define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1)
> #define MI_STORE_REGISTER_MEM_GEN8 MI_INSTR(0x24, 2)
> #define MI_SRM_LRM_GLOBAL_GTT (1<<22)
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 6cfdc0f9f2b9..7cd59d29c4e4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2010,11 +2010,12 @@ struct lri {
> u32 value;
> };
>
> -static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
> +static u32 *emit_lri(struct intel_engine_cs *engine, u32 *batch,
> + const struct lri *lri, unsigned int count)
> {
> GEM_BUG_ON(!count || count > 63);
>
> - *batch++ = MI_LOAD_REGISTER_IMM(count);
> + *batch++ = MI_LOAD_REGISTER_IMM(count) | engine->lri_cmd_mode;
> do {
> *batch++ = i915_mmio_reg_offset(lri->reg);
> *batch++ = lri->value;
> @@ -2054,7 +2055,7 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
> /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
> batch = gen8_emit_flush_coherentl3_wa(engine, batch);
>
> - batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
> + batch = emit_lri(engine, batch, lri, ARRAY_SIZE(lri));
>
> /* WaMediaPoolStateCmdInWABB:bxt,glk */
> if (HAS_POOLED_EU(engine->i915)) {
> @@ -3282,7 +3283,7 @@ static void init_common_reg_state(u32 * const regs,
> struct intel_engine_cs *engine,
> struct intel_ring *ring)
> {
> - const u32 base = engine->mmio_base;
> + const u32 base = engine->lri_mmio_base;
>
> CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
> _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
> @@ -3307,7 +3308,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
> u32 pos_bb_per_ctx)
> {
> struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
> - const u32 base = engine->mmio_base;
> + const u32 base = engine->lri_mmio_base;
> const u32 pos_indirect_ctx = pos_bb_per_ctx + 2;
> const u32 pos_indirect_ctx_offset = pos_indirect_ctx + 2;
>
> @@ -3335,6 +3336,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
> }
>
> static void init_ppgtt_reg_state(u32 *regs, u32 base,
> + struct intel_engine_cs *engine,
> struct i915_ppgtt *ppgtt)
> {
> /* PDP values well be assigned later if needed */
> @@ -3376,14 +3378,12 @@ static void gen8_init_reg_state(u32 * const regs,
> {
> struct i915_ppgtt * const ppgtt = vm_alias(ce->vm);
> const bool rcs = engine->class == RENDER_CLASS;
> - const u32 base = engine->mmio_base;
> - const u32 lri_base =
> - intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
> + const u32 base = engine->lri_mmio_base;
>
> regs[CTX_LRI_HEADER_0] =
> MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
> MI_LRI_FORCE_POSTED |
> - lri_base;
> + engine->lri_cmd_mode;
>
> init_common_reg_state(regs, ppgtt, engine, ring);
> CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
> @@ -3395,15 +3395,17 @@ static void gen8_init_reg_state(u32 * const regs,
> regs[CTX_LRI_HEADER_1] =
> MI_LOAD_REGISTER_IMM(9) |
> MI_LRI_FORCE_POSTED |
> - lri_base;
> + engine->lri_cmd_mode;
>
> CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
>
> - init_ppgtt_reg_state(regs, base, ppgtt);
> + init_ppgtt_reg_state(regs, base, engine, ppgtt);
Is passed in base always equal to engine->mmio_base? If so then there is
redundancy and you could drop this change.
Regards,
Tvrtko
>
> if (rcs) {
> - regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) | lri_base;
> - CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
> + regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) |
> + engine->lri_cmd_mode;
> + CTX_REG(regs, CTX_R_PWR_CLK_STATE,
> + GEN8_R_PWR_CLK_STATE, 0);
> }
>
> regs[CTX_END] = MI_BATCH_BUFFER_END;
> @@ -3418,14 +3420,12 @@ static void gen12_init_reg_state(u32 * const regs,
> {
> struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(ce->vm);
> const bool rcs = engine->class == RENDER_CLASS;
> - const u32 base = engine->mmio_base;
> - const u32 lri_base =
> - intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
> + const u32 base = engine->lri_mmio_base;
>
> regs[CTX_LRI_HEADER_0] =
> MI_LOAD_REGISTER_IMM(rcs ? 11 : 9) |
> MI_LRI_FORCE_POSTED |
> - lri_base;
> + engine->lri_cmd_mode;
>
> init_common_reg_state(regs, ppgtt, engine, ring);
>
> @@ -3435,15 +3435,15 @@ static void gen12_init_reg_state(u32 * const regs,
> regs[CTX_LRI_HEADER_1] =
> MI_LOAD_REGISTER_IMM(9) |
> MI_LRI_FORCE_POSTED |
> - lri_base;
> + engine->lri_cmd_mode;
>
> CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
>
> - init_ppgtt_reg_state(regs, base, ppgtt);
> + init_ppgtt_reg_state(regs, base, engine, ppgtt);
>
> if (rcs) {
> regs[GEN12_CTX_LRI_HEADER_3] =
> - MI_LOAD_REGISTER_IMM(1) | lri_base;
> + MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
> CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
>
> /* TODO: oa_init_reg_state ? */
> diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c
> index 728704bbbe18..8e8fe3deb95c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_mocs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c
> @@ -368,9 +368,6 @@ static u32 get_entry_control(const struct drm_i915_mocs_table *table,
> /**
> * intel_mocs_init_engine() - emit the mocs control table
> * @engine: The engine for whom to emit the registers.
> - *
> - * This function simply emits a MI_LOAD_REGISTER_IMM command for the
> - * given table starting at the given address.
> */
> void intel_mocs_init_engine(struct intel_engine_cs *engine)
> {
> @@ -456,7 +453,8 @@ static int emit_mocs_control_table(struct i915_request *rq,
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> - *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries);
> + *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries) |
> + rq->engine->lri_cmd_mode;
>
> for (index = 0; index < table->size; index++) {
> u32 value = get_entry_control(table, index);
> diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
> index 0747b8c9f768..7027367b1f1a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
> @@ -1536,12 +1536,13 @@ static int load_pd_dir(struct i915_request *rq, const struct i915_ppgtt *ppgtt)
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> - *cs++ = MI_LOAD_REGISTER_IMM(1);
> - *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
> + /* Can these not be merged into a single LRI??? */
> + *cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
> + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->lri_mmio_base));
> *cs++ = PP_DIR_DCLV_2G;
>
> - *cs++ = MI_LOAD_REGISTER_IMM(1);
> - *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
> + *cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
> + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->lri_mmio_base));
> *cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
>
> intel_ring_advance(rq, cs);
> @@ -1709,7 +1710,8 @@ static int remap_l3_slice(struct i915_request *rq, int slice)
> * here because no other code should access these registers other than
> * at initialization time.
> */
> - *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
> + *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE / 4) |
> + rq->engine->lri_cmd_mode;
> for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
> *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
> *cs++ = remap_info[i];
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index c1b764233761..8b85cdfada21 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1693,6 +1693,11 @@ gen8_update_reg_state_unlocked(struct i915_perf_stream *stream,
> };
> int i;
>
> + /*
> + * NB: The LRI instruction is generated by the hardware.
> + * Should we read it in and assert that the offset flag is set?
> + */
> +
> CTX_REG(reg_state, ctx_oactxctrl, GEN8_OACTXCONTROL,
> (stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
> (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
> @@ -1752,8 +1757,9 @@ gen8_load_flex(struct i915_request *rq,
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> - *cs++ = MI_LOAD_REGISTER_IMM(count);
> + *cs++ = MI_LOAD_REGISTER_IMM(count) | rq->engine->lri_cmd_mode;
> do {
> + /* FIXME: Is this table LRI remap/offset friendly? */
> *cs++ = i915_mmio_reg_offset(flex->reg);
> *cs++ = flex->value;
> } while (flex++, --count);
>
More information about the Intel-gfx
mailing list