[Intel-gfx] [PATCH 2/2] drm/i915: Engine relative MMIO for Gen12
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Thu Aug 22 20:44:25 UTC 2019
On 8/22/19 11:02 AM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> Gen12 introduces a completely new and different scheme for
> implementing engine relative MMIO accesses - MI_LRI_MMIO_REMAP. This
> requires using the base address of instance zero of the relevant
> engine class. And then, it is only valid if the register in
> question falls within a certain range as specified by a table.
>
Bspec: 45606
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> CC: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 185 ++++++++++++++++++-
> drivers/gpu/drm/i915/gt/intel_engine_types.h | 7 +
> drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 9 +-
> drivers/gpu/drm/i915/i915_perf.c | 3 +-
> 4 files changed, 195 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index efe1c377d797..a65e8ccd9d8d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -236,6 +236,127 @@ static u32 __engine_mmio_base(struct drm_i915_private *i915,
> return bases[i].base;
> }
>
> +static void lri_init_remap_base(struct intel_engine_cs *engine)
> +{
> + struct intel_engine_cs *remap_engine;
> +
> + engine->lri_mmio_base = 0;
> +
> + if (INTEL_GEN(engine->i915) < 12)
> + return;
> +
> + remap_engine = engine->i915->gt.engine_class[engine->class][0];
you can just use engine->gt. But anyway instance 0 is not guaranteed to
exist as it might've been fused off. if we add a static table of first
instance per class we can then fetch it directly from the info table. e.g.
static const enum first_instance[] = {
[RENDER_CLASS] = RCS0,
...
}
u32 id;
GEM_BUG_ON(engine->class > ARRAY_SIZE(first_instance));
id = first_instance[engine->class];
engine->lri_mmio_base =
__engine_mmio_base(i915, intel_engines[id].mmio_bases);
> + GEM_BUG_ON(!remap_engine);
> +
> + engine->lri_mmio_base = remap_engine->mmio_base; > +}
> +
> +static void lri_add_range(struct intel_engine_cs *engine, u32 min, u32 max)
I think it'd be cleaner at the call point if we go with base and size
instead of min and max.
> +{
> + GEM_BUG_ON(engine->lri_num_ranges >= INTEL_MAX_LRI_RANGES);
> +
> + engine->lri_ranges[engine->lri_num_ranges].min = min;
> + engine->lri_ranges[engine->lri_num_ranges].max = max;
> + engine->lri_num_ranges++;
> +}
> +
> +static void lri_init_remap_ranges(struct intel_engine_cs *engine)
> +{
> + bool has_aux_tables = true; /* Removed after TGL? */
This should really be a device flag. But do we actually need to write
any of the registers in the aux tables from the kernel? if not, maybe we
can get away not setting these in the ranges, adding a nice comment
about it in case we get failures later on. That way we'll also be able
to keep lri_num_ranges fixed at 2 and just use a define for it (we can
use a variable local to this function for lri_add_range)
> + u32 offset;
> +
> + engine->lri_num_ranges = 0;
> +
> + if (INTEL_GEN(engine->i915) < 12)
> + return;
> +
> + switch (engine->class) {
> + case RENDER_CLASS:
> + /* Hardware Front End */
> + lri_add_range(engine, 0x000 + engine->mmio_base,
> + 0x7FF + engine->mmio_base);
> +
> + /* TRTT */
> + lri_add_range(engine, 0x4400, 0x441F);
> +
> + /* Aux Tables - REMOVEDBY(GEN:HAS:1406929672) */
> + if (has_aux_tables)
> + lri_add_range(engine, 0x4200, 0x420F);
> + break;
matches
> +
> + case VIDEO_DECODE_CLASS:
> + lri_add_range(engine, 0x0000 + engine->mmio_base,
> + 0x3FFF + engine->mmio_base);
> +
> + /* TRTT */
> + offset = ((engine->instance & 0x1) * 0x20) +
> + ((engine->instance >> 1) * 0x100);
> + lri_add_range(engine, 0x4420 + offset, 0x443F + offset);
Matches. A bit convoluted, but I can't suggest a better option. Maybe a
comment?
> +
> + /* Aux Tables - REMOVEDBY(GEN:HAS:1406929672) */
> + if (has_aux_tables) {
> + switch (engine->instance) {
> + case 0:
> + lri_add_range(engine, 0x4210, 0x421F);
> + break;
> +
> + case 1:
> + lri_add_range(engine, 0x4220, 0x422F);
> + break;
> +
> + case 2:
> + lri_add_range(engine, 0x4290, 0x429F);
> + break;
> +
> + case 3:
> + lri_add_range(engine, 0x42A0, 0x42AF);
> + break;
> +
> + default:
> + break;
> + }
> + }
> + break;
> +
> + case VIDEO_ENHANCEMENT_CLASS:
> + lri_add_range(engine, 0x0000 + engine->mmio_base,
> + 0x3FFF + engine->mmio_base);
> +
> + /* TRTT */
> + offset = engine->instance * 0x100;
> + lri_add_range(engine, 0x4460 + offset, 0x447F + offset);
matches
> +
> + /* Aux Tables - REMOVEDBY(GEN:HAS:1406929672) */
> + if (has_aux_tables) {
> + switch (engine->instance) {
> + case 0:
> + lri_add_range(engine, 0x4230, 0x423F);
> + break;
> +
> + case 1:
> + lri_add_range(engine, 0x42B0, 0x42BF);
> + break;
> +
> + case 2:
> + lri_add_range(engine, 0x4290, 0x429F);
> + break;
> +
> + case 3:
> + // Same address as instance 1???
> + lri_add_range(engine, 0x42B0, 0x42BF);
> + break;
> +
We do not have this many VECS engines defined.
Also, can we do the check on the updated offset? that way we only need
to record and validate the offsets from instance 0.
> + default:
> + break;
> + }
> + }
> + break;
> +
> + default:
> + break;
> + }
> +}
> +
> static u32 i915_get_lri_cmd_legacy(const struct intel_engine_cs *engine,
> u32 word_count)
> {
> @@ -249,6 +370,27 @@ static u32 i915_get_lri_cmd_add_offset(const struct intel_engine_cs *engine,
> MI_LRI_ADD_CS_MMIO_START_GEN11;
> }
>
> +static u32 i915_get_lri_cmd_remap(const struct intel_engine_cs *engine,
> + u32 word_count)
> +{
> + u32 word;
> +
> + word = __MI_LOAD_REGISTER_IMM(word_count);
> +
> + /* if (lri_is_reg_in_remap_table(engine, reg)) ??? */
> + word |= MI_LRI_MMIO_REMAP_GEN12;
> +
> + /*
> + * NB: To gate this on the reg address will require knowing
> + * all reg addresses in advance. This is not currently the
> + * case as some LRI commands are built from multiple sources.
> + * Also, what if some regs require remap and some do not?
> + * The LRI command would need to be split into multiple pieces.
> + */
All the register in the engine ranges (i.e. the ones repeated for all
engines) are covered, so we should expect that all registers require
remap. If we want to write a register that it's in the global range then
we should use __MI_LOAD_REGISTER_IMM directly. Therefore, I would just do:
return __MI_LOAD_REGISTER_IMM(word_count) | MI_LRI_MMIO_REMAP_GEN12;
> +
> + return word;
> +}
> +
> static bool i915_engine_has_relative_lri(const struct intel_engine_cs *engine)
> {
> if (INTEL_GEN(engine->i915) < 11)
> @@ -262,18 +404,53 @@ static bool i915_engine_has_relative_lri(const struct intel_engine_cs *engine)
>
> static void lri_init(struct intel_engine_cs *engine)
> {
> - if (i915_engine_has_relative_lri(engine))
> - engine->get_lri_cmd = i915_get_lri_cmd_add_offset;
> - else
> + if (i915_engine_has_relative_lri(engine)) {
> + if (INTEL_GEN(engine->i915) < 12)
> + engine->get_lri_cmd = i915_get_lri_cmd_add_offset;
> + else {
> + engine->get_lri_cmd = i915_get_lri_cmd_remap;
the only difference in the cmd between the various modes is the flag we
set, can't we just save that instead of having a full-blown virtual
function?
> +
> + lri_init_remap_base(engine);
> + lri_init_remap_ranges(engine);
> + }
> + } else
> engine->get_lri_cmd = i915_get_lri_cmd_legacy;
> }
>
> +static bool lri_is_reg_in_remap_table(const struct intel_engine_cs *engine,
> + i915_reg_t reg)
> +{
> + int i;
> + u32 offset = i915_mmio_reg_offset(reg);
> +
> + for (i = 0; i < engine->lri_num_ranges; i++) {
> + if (offset < engine->lri_ranges[i].min)
> + continue;
> +
> + if (offset > engine->lri_ranges[i].max)
> + continue;
> +
> + return true;
> + }
> +
> + return false;
> +}
> +
> u32 i915_get_lri_reg(const struct intel_engine_cs *engine, i915_reg_t reg)
> {
> if (!i915_engine_has_relative_lri(engine))
> return i915_mmio_reg_offset(reg);
>
> - return i915_mmio_reg_offset(reg) - engine->mmio_base;
> + if (INTEL_GEN(engine->i915) < 12)
> + return i915_mmio_reg_offset(reg) - engine->mmio_base;
> +
> + if (!WARN_ON(lri_is_reg_in_remap_table(engine, reg))) {
> + /* Is this meant to happen? */
No, so just go with:
GEM_DEBUG_BUG_ON(!lri_is_reg_in_remap_table(engine, reg));
and make all the remap table code conditionally compiled based on
CONFIG_DRM_I915_DEBUG_GEM.
> + return i915_mmio_reg_offset(reg);
> + }
> +
> + return i915_mmio_reg_offset(reg) - engine->mmio_base +
> + engine->lri_mmio_base;
This doesn't work. The reason why we have the new mechanism is that the
register for different instances are no at the same delta from the MMIO
base. E.g. the TRTT registers you have above.
> }
if we set engine->lri_mmio_base appropriately on all platforms, can't we
instead just use that instead of engine->mmio_base when emitting the
registers with the relative LRI? to keep the check we can do something like:
#define lri_reg(engine__, reg__) \
({ \
GEM_DEBUG_BUG_ON(INTEL_GEN(engine->i915) >= 12 && \
!lri_is_reg_in_remap_table(engine__, reg__)); \
reg__; \
})
Daniele
>
> static void __sprint_engine_name(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 7ca6c86a33f6..1e26f668e73b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -306,6 +306,13 @@ struct intel_engine_cs {
> u32 context_size;
> u32 mmio_base;
>
> +#define INTEL_MAX_LRI_RANGES 3
> + struct lri_range {
> + u32 min, max;
> + } lri_ranges[INTEL_MAX_LRI_RANGES];
> + u32 lri_num_ranges;
> + u32 lri_mmio_base;
> +
> u32 (*get_lri_cmd)(const struct intel_engine_cs *engine,
> u32 word_count);
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index eaa019df0ce7..0ee62a61d7b5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -130,14 +130,15 @@
> * simply ignores the register load under certain conditions.
> * - One can actually load arbitrary many arbitrary registers: Simply issue x
> * address/value pairs. Don't overdue it, though, x <= 2^4 must hold!
> - * - Newer hardware supports engine relative addressing but older hardware does
> - * not. This is required for hw engine load balancing. Hence the MI_LRI
> - * instruction itself is prefixed with '__' and should only be used on
> - * legacy hardware code paths. Generic code must always use the MI_LRI
> + * - Newer hardware supports engine relative addressing but using multiple
> + * incompatible schemes. This is required for hw engine load balancing. Hence
> + * the MI_LRI instruction itself is prefixed with '__' and should only be
> + * used on legacy hardware code paths. Generic code must always use the MI_LRI
> * and i915_get_lri_reg() helper functions instead.
> */
> #define __MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1)
> #define MI_LRI_FORCE_POSTED (1<<12)
> +#define MI_LRI_MMIO_REMAP_GEN12 (1<<17)
> #define MI_LRI_ADD_CS_MMIO_START_GEN11 (1<<19)
> #define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1)
> #define MI_STORE_REGISTER_MEM_GEN8 MI_INSTR(0x24, 2)
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 83abdda05ba2..f88642209283 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1694,7 +1694,8 @@ gen8_update_reg_state_unlocked(struct i915_perf_stream *stream,
>
> /*
> * NB: The LRI instruction is generated by the hardware.
> - * Should we read it in and assert that the offset flag is set?
> + * Should we read it in and assert that the appropriate
> + * offset flag is set?
> */
>
> CTX_REG(ce->engine, reg_state, ctx_oactxctrl, GEN8_OACTXCONTROL,
>
More information about the Intel-gfx
mailing list