[Intel-gfx] [PATCH 1/3] drm/i915: Add per ctx batchbuffer wa for timestamp
Chris Wilson
chris at chris-wilson.co.uk
Fri Apr 17 15:07:17 UTC 2020
Quoting Mika Kuoppala (2020-04-17 15:44:27)
> Restoration of a previous timestamp can collide
> with updating the timestamp, causing a value corruption.
>
> Combat this issue by using indirect ctx bb to
> modify the context image during restoring process.
>
> For render engine, we can preload value into
> scratch register. From which we then do the actual
> write with LRR. LRR is faster and thus less error prone.
> For other engines, no scratch is available so we
> must do a more complex sequence of sync and async LRMs.
> As the LRM is slower, the probablity of racy write
> raises and thus we still see corruption sometimes.
>
> References: HSDES#16010904313
> Testcase: igt/i915_selftest/gt_lrc
> Suggested-by: Joseph Koston <joseph.koston at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
>
> bug on fix
> ---
> drivers/gpu/drm/i915/gt/intel_context_types.h | 3 +
> drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 3 +-
> drivers/gpu/drm/i915/gt/intel_lrc.c | 196 ++++++++++++++----
> drivers/gpu/drm/i915/gt/intel_lrc_reg.h | 1 +
> 4 files changed, 165 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 07cb83a0d017..c7573d565f58 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -70,6 +70,9 @@ struct intel_context {
>
> u32 *lrc_reg_state;
> u64 lrc_desc;
> +
> + u32 ctx_bb_offset;
> +
> u32 tag; /* cookie passed to HW to track this context on submission */
>
> /* Time on GPU as tracked by the hw. */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index f04214a54f75..0c2adb4078a7 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -138,7 +138,7 @@
> */
> #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1)
> /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
> -#define MI_LRI_CS_MMIO (1<<19)
> +#define MI_LRI_LRM_CS_MMIO (1<<19)
> #define MI_LRI_FORCE_POSTED (1<<12)
> #define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
> #define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1)
> @@ -155,6 +155,7 @@
> #define MI_FLUSH_DW_USE_PPGTT (0<<2)
> #define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1)
> #define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
> +#define MI_LRM_ASYNC (1<<21)
> #define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1)
> #define MI_BATCH_BUFFER MI_INSTR(0x30, 1)
> #define MI_BATCH_NON_SECURE (1)
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 6fbad5e2343f..531884b9050c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -234,7 +234,7 @@ static void execlists_init_reg_state(u32 *reg_state,
> const struct intel_ring *ring,
> bool close);
> static void
> -__execlists_update_reg_state(const struct intel_context *ce,
> +__execlists_update_reg_state(struct intel_context *ce,
> const struct intel_engine_cs *engine,
> u32 head);
>
> @@ -537,7 +537,7 @@ static void set_offsets(u32 *regs,
> if (flags & POSTED)
> *regs |= MI_LRI_FORCE_POSTED;
> if (INTEL_GEN(engine->i915) >= 11)
> - *regs |= MI_LRI_CS_MMIO;
> + *regs |= MI_LRI_LRM_CS_MMIO;
> regs++;
>
> GEM_BUG_ON(!count);
> @@ -3142,8 +3142,152 @@ static void execlists_context_unpin(struct intel_context *ce)
> i915_gem_object_unpin_map(ce->state->obj);
> }
>
> +static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
> +{
> + u32 indirect_ctx_offset;
> +
> + switch (INTEL_GEN(engine->i915)) {
> + default:
> + MISSING_CASE(INTEL_GEN(engine->i915));
> + fallthrough;
> + case 12:
> + indirect_ctx_offset =
> + GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> + break;
> + case 11:
> + indirect_ctx_offset =
> + GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> + break;
> + case 10:
> + indirect_ctx_offset =
> + GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> + break;
> + case 9:
> + indirect_ctx_offset =
> + GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> + break;
> + case 8:
> + indirect_ctx_offset =
> + GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> + break;
> + }
> +
> + return indirect_ctx_offset;
> +}
> +
> +static u32 *
> +gen12_emit_timestamp_wa_lrm(struct intel_context *ce, u32 *cs)
> +{
> + const u32 lrc_offset = i915_ggtt_offset(ce->state) +
> + LRC_STATE_PN * PAGE_SIZE;
> + const u32 lrm = MI_LOAD_REGISTER_MEM_GEN8 |
> + MI_SRM_LRM_GLOBAL_GTT | MI_LRI_LRM_CS_MMIO;
> +
> + *cs++ = lrm;
> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
> + *cs++ = 0;
> +
> + *cs++ = lrm | MI_LRM_ASYNC;
> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
> + *cs++ = 0;
> +
> + *cs++ = lrm | MI_LRM_ASYNC;
> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
> + *cs++ = 0;
> +
> + *cs++ = lrm;
> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
> + *cs++ = 0;
> +
> + return cs;
> +}
> +
> +static u32 *
> +gen12_emit_timestamp_wa_lrr(struct intel_context *ce, u32 *cs)
> +{
> + const u32 lrc_offset = i915_ggtt_offset(ce->state) +
> + LRC_STATE_PN * PAGE_SIZE;
Repetitive.
> + const u32 lrm = MI_LOAD_REGISTER_MEM_GEN8 |
> + MI_SRM_LRM_GLOBAL_GTT | MI_LRI_LRM_CS_MMIO;
Used once, might as just use the constant inplace.
> + const u32 scratch_reg = 0x2198;
> +
> + *cs++ = lrm;
> + *cs++ = scratch_reg;
> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
> + *cs++ = 0;
> +
> + *cs++ = MI_LOAD_REGISTER_REG | MI_LRI_LRM_CS_MMIO;
> + *cs++ = scratch_reg;
> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
> +
> + *cs++ = MI_LOAD_REGISTER_REG | MI_LRI_LRM_CS_MMIO;
> + *cs++ = scratch_reg;
> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
> +
> + return cs;
> +}
> +
> +static u32 *
> +execlists_emit_ctx_bb(struct intel_context *ce,
> + u32 *(*emit)(struct intel_context *, u32 *))
> +{
> + u32 *cs = (void *)(ce->lrc_reg_state) -
> + LRC_STATE_PN * PAGE_SIZE + ce->ctx_bb_offset;
static u32 *context_indirect_bb(struct intel_context *ce)
{
void *ptr;
GEM_BUG_ON(!intel_context_is_pinned(ce));
GEM_BUG_ON(!ce->ctx_bb_offset);
ptr = ce->lrc_reg_state;
ptr -= LRC_STATE_PN * PAGE_SIZE; /* back to start of context image */
ptr += ce->ctx_bb_offset;
return ptr;
}
> + const u32 * const batch_start = cs;
> +
> + GEM_DEBUG_BUG_ON(!ce->ctx_bb_offset);
> +
> + cs = emit(ce, cs);
> +
> + GEM_DEBUG_BUG_ON(cs - batch_start >
> + (I915_GTT_PAGE_SIZE - 4)/sizeof(*cs));
> +
> + return cs;
> +}
> +
> +static void
> +execlists_setup_indirect_ctx_bb(struct intel_context *ce,
> + u32 *(*emit)(struct intel_context *, u32 *))
> +{
> + const u32 indirect_ptr_offset =
> + INTEL_GEN(ce->engine->i915) >= 12 ?
> + GEN12_CTX_BB_PER_CTX_PTR + 2 :
> + CTX_BB_PER_CTX_PTR + 2;
Do we have this in live_lrc_layout yet?
> + const u32 * const start = (void *)(ce->lrc_reg_state) -
> + LRC_STATE_PN * PAGE_SIZE + ce->ctx_bb_offset;
> + u32 *cs;
> +
> + cs = execlists_emit_ctx_bb(ce, emit);
> +
> + while ((unsigned long)cs % CACHELINE_BYTES)
> + *cs++ = MI_NOOP;
> +
> + ce->lrc_reg_state[indirect_ptr_offset] =
> + (i915_ggtt_offset(ce->state) + ce->ctx_bb_offset) |
> + (cs - start) * sizeof(*cs) /
> + CACHELINE_BYTES;
> +
> + ce->lrc_reg_state[indirect_ptr_offset + 2] =
> + intel_lr_indirect_ctx_offset(ce->engine) << 6;
> +}
> +
> static void
> -__execlists_update_reg_state(const struct intel_context *ce,
> +gen12_setup_timestamp_ctx_wa(struct intel_context *ce)
> +{
> + if (ce->engine->class == RENDER_CLASS)
> + execlists_setup_indirect_ctx_bb(ce,
I'd be tempted to drop the execlists_ at this depth.
> + gen12_emit_timestamp_wa_lrr);
> + else
> + execlists_setup_indirect_ctx_bb(ce,
> + gen12_emit_timestamp_wa_lrm);
u32 *(*fn)(struct intel_context *ce, u32 *cs);
fn = gen12_emit_timestamp_wa_lrm;
if (ce->engine->class == RENDER_CLASS)
fn = gen12_emit_timestamp_wa_lrr;
setup_indirect_ctx_bb(ce, fn);
> +static u32 *
> +gen12_emit_timestamp_wa_lrr(struct intel_context *ce, u32 *cs)
> +}
> +
> +static void
> +__execlists_update_reg_state(struct intel_context *ce,
> const struct intel_engine_cs *engine,
> u32 head)
> {
> @@ -3164,6 +3308,13 @@ __execlists_update_reg_state(const struct intel_context *ce,
> intel_sseu_make_rpcs(engine->i915, &ce->sseu);
>
> i915_oa_init_reg_state(ce, engine);
> +
> + }
> +
> + if (ce->ctx_bb_offset) {
> + /* Mutually exclusive wrt to global indirect bb */
> + GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
> + gen12_setup_timestamp_ctx_wa(ce);
> }
> }
>
> @@ -4667,40 +4818,6 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
> return 0;
> }
>
> -static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
> -{
> - u32 indirect_ctx_offset;
> -
> - switch (INTEL_GEN(engine->i915)) {
> - default:
> - MISSING_CASE(INTEL_GEN(engine->i915));
> - /* fall through */
> - case 12:
> - indirect_ctx_offset =
> - GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> - break;
> - case 11:
> - indirect_ctx_offset =
> - GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> - break;
> - case 10:
> - indirect_ctx_offset =
> - GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> - break;
> - case 9:
> - indirect_ctx_offset =
> - GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> - break;
> - case 8:
> - indirect_ctx_offset =
> - GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
> - break;
> - }
> -
> - return indirect_ctx_offset;
> -}
> -
> -
> static void init_common_reg_state(u32 * const regs,
> const struct intel_engine_cs *engine,
> const struct intel_ring *ring,
> @@ -4867,6 +4984,11 @@ static int __execlists_context_alloc(struct intel_context *ce,
> if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> context_size += I915_GTT_PAGE_SIZE; /* for redzone */
>
> + if (INTEL_GEN(engine->i915) == 12) {
> + ce->ctx_bb_offset = context_size;
> + context_size += PAGE_SIZE;
> + }
> +
> ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
> if (IS_ERR(ctx_obj))
> return PTR_ERR(ctx_obj);
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h
> index d39b72590e40..bb444614f33b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h
> @@ -32,6 +32,7 @@
>
> /* GEN12+ Reg State Context */
> #define GEN12_CTX_BB_PER_CTX_PTR (0x12 + 1)
> +#define CTX_BB_PER_CTX_PTR_VALID BIT(0)
>
> #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \
> u32 *reg_state__ = (reg_state); \
Was I imagining things, but didn't you devise a selftest to prove we
could use the indirect-ctx-bb to do interesting things?
-Chris
More information about the Intel-gfx
mailing list