[Intel-gfx] [PATCH 1/3] drm/i915: Add per ctx batchbuffer wa for timestamp
Mika Kuoppala
mika.kuoppala at linux.intel.com
Tue Apr 21 10:01:32 UTC 2020
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Quoting Mika Kuoppala (2020-04-17 15:44:27)
>> Restoration of a previous timestamp can collide
>> with updating the timestamp, causing a value corruption.
>>
>> Combat this issue by using indirect ctx bb to
>> modify the context image during restoring process.
>>
>> For render engine, we can preload value into
>> scratch register. From which we then do the actual
>> write with LRR. LRR is faster and thus less error prone.
>> For other engines, no scratch is available so we
>> must do a more complex sequence of sync and async LRMs.
>> As the LRM is slower, the probablity of racy write
>> raises and thus we still see corruption sometimes.
>>
>> References: HSDES#16010904313
>> Testcase: igt/i915_selftest/gt_lrc
>> Suggested-by: Joseph Koston <joseph.koston at intel.com>
>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
>>
>> bug on fix
>> ---
>> drivers/gpu/drm/i915/gt/intel_context_types.h | 3 +
>> drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 3 +-
>> drivers/gpu/drm/i915/gt/intel_lrc.c | 196 ++++++++++++++----
>> drivers/gpu/drm/i915/gt/intel_lrc_reg.h | 1 +
>> 4 files changed, 165 insertions(+), 38 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
>> index 07cb83a0d017..c7573d565f58 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>> @@ -70,6 +70,9 @@ struct intel_context {
>>
>> u32 *lrc_reg_state;
>> u64 lrc_desc;
>> +
>> + u32 ctx_bb_offset;
>> +
>> u32 tag; /* cookie passed to HW to track this context on submission */
>>
>> /* Time on GPU as tracked by the hw. */
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> index f04214a54f75..0c2adb4078a7 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> @@ -138,7 +138,7 @@
>> */
>> #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1)
>> /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
>> -#define MI_LRI_CS_MMIO (1<<19)
>> +#define MI_LRI_LRM_CS_MMIO (1<<19)
>> #define MI_LRI_FORCE_POSTED (1<<12)
>> #define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
>> #define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1)
>> @@ -155,6 +155,7 @@
>> #define MI_FLUSH_DW_USE_PPGTT (0<<2)
>> #define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1)
>> #define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
>> +#define MI_LRM_ASYNC (1<<21)
>> #define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1)
>> #define MI_BATCH_BUFFER MI_INSTR(0x30, 1)
>> #define MI_BATCH_NON_SECURE (1)
>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> index 6fbad5e2343f..531884b9050c 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>> @@ -234,7 +234,7 @@ static void execlists_init_reg_state(u32 *reg_state,
>> const struct intel_ring *ring,
>> bool close);
>> static void
>> -__execlists_update_reg_state(const struct intel_context *ce,
>> +__execlists_update_reg_state(struct intel_context *ce,
>> const struct intel_engine_cs *engine,
>> u32 head);
>>
>> @@ -537,7 +537,7 @@ static void set_offsets(u32 *regs,
>> if (flags & POSTED)
>> *regs |= MI_LRI_FORCE_POSTED;
>> if (INTEL_GEN(engine->i915) >= 11)
>> - *regs |= MI_LRI_CS_MMIO;
>> + *regs |= MI_LRI_LRM_CS_MMIO;
>> regs++;
>>
>> GEM_BUG_ON(!count);
>> @@ -3142,8 +3142,152 @@ static void execlists_context_unpin(struct intel_context *ce)
>> i915_gem_object_unpin_map(ce->state->obj);
>> }
>>
>> +static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
>> +{
>> + u32 indirect_ctx_offset;
>> +
>> + switch (INTEL_GEN(engine->i915)) {
>> + default:
>> + MISSING_CASE(INTEL_GEN(engine->i915));
>> + fallthrough;
>> + case 12:
>> + indirect_ctx_offset =
>> + GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> + break;
>> + case 11:
>> + indirect_ctx_offset =
>> + GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> + break;
>> + case 10:
>> + indirect_ctx_offset =
>> + GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> + break;
>> + case 9:
>> + indirect_ctx_offset =
>> + GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> + break;
>> + case 8:
>> + indirect_ctx_offset =
>> + GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> + break;
>> + }
>> +
>> + return indirect_ctx_offset;
>> +}
>> +
>> +static u32 *
>> +gen12_emit_timestamp_wa_lrm(struct intel_context *ce, u32 *cs)
>> +{
>> + const u32 lrc_offset = i915_ggtt_offset(ce->state) +
>> + LRC_STATE_PN * PAGE_SIZE;
>> + const u32 lrm = MI_LOAD_REGISTER_MEM_GEN8 |
>> + MI_SRM_LRM_GLOBAL_GTT | MI_LRI_LRM_CS_MMIO;
>> +
>> + *cs++ = lrm;
>> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
>> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
>> + *cs++ = 0;
>> +
>> + *cs++ = lrm | MI_LRM_ASYNC;
>> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
>> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
>> + *cs++ = 0;
>> +
>> + *cs++ = lrm | MI_LRM_ASYNC;
>> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
>> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
>> + *cs++ = 0;
>> +
>> + *cs++ = lrm;
>> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
>> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
>> + *cs++ = 0;
>> +
>> + return cs;
>> +}
>> +
>> +static u32 *
>> +gen12_emit_timestamp_wa_lrr(struct intel_context *ce, u32 *cs)
>> +{
>> + const u32 lrc_offset = i915_ggtt_offset(ce->state) +
>> + LRC_STATE_PN * PAGE_SIZE;
>
> Repetitive.
I will introduce LRC_STATE_OFFSET driverwide :P
>
>> + const u32 lrm = MI_LOAD_REGISTER_MEM_GEN8 |
>> + MI_SRM_LRM_GLOBAL_GTT | MI_LRI_LRM_CS_MMIO;
>
> Used once, might as just use the constant inplace.
>
>> + const u32 scratch_reg = 0x2198;
>> +
>> + *cs++ = lrm;
>> + *cs++ = scratch_reg;
>> + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32);
>> + *cs++ = 0;
>> +
>> + *cs++ = MI_LOAD_REGISTER_REG | MI_LRI_LRM_CS_MMIO;
>> + *cs++ = scratch_reg;
>> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
>> +
>> + *cs++ = MI_LOAD_REGISTER_REG | MI_LRI_LRM_CS_MMIO;
>> + *cs++ = scratch_reg;
>> + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
>> +
>> + return cs;
>> +}
>> +
>> +static u32 *
>> +execlists_emit_ctx_bb(struct intel_context *ce,
>> + u32 *(*emit)(struct intel_context *, u32 *))
>> +{
>> + u32 *cs = (void *)(ce->lrc_reg_state) -
>> + LRC_STATE_PN * PAGE_SIZE + ce->ctx_bb_offset;
>
> static u32 *context_indirect_bb(struct intel_context *ce)
> {
> void *ptr;
>
> GEM_BUG_ON(!intel_context_is_pinned(ce));
This happens during pinning as we need different
contents for each pinned location.
Apparently the pinned state will be set only
after until all these preparations are done.
So I can't use
GEM_BUG_ON(!intel_context_is_pinned(ce));
on this stage.
> GEM_BUG_ON(!ce->ctx_bb_offset);
>
> ptr = ce->lrc_reg_state;
> ptr -= LRC_STATE_PN * PAGE_SIZE; /* back to start of context image */
> ptr += ce->ctx_bb_offset;
>
> return ptr;
> }
>
>> + const u32 * const batch_start = cs;
>> +
>> + GEM_DEBUG_BUG_ON(!ce->ctx_bb_offset);
>> +
>> + cs = emit(ce, cs);
>> +
>> + GEM_DEBUG_BUG_ON(cs - batch_start >
>> + (I915_GTT_PAGE_SIZE - 4)/sizeof(*cs));
>> +
>> + return cs;
>> +}
>> +
>> +static void
>> +execlists_setup_indirect_ctx_bb(struct intel_context *ce,
>> + u32 *(*emit)(struct intel_context *, u32 *))
>> +{
>> + const u32 indirect_ptr_offset =
>> + INTEL_GEN(ce->engine->i915) >= 12 ?
>> + GEN12_CTX_BB_PER_CTX_PTR + 2 :
>> + CTX_BB_PER_CTX_PTR + 2;
>
> Do we have this in live_lrc_layout yet?
Yes it is there.
>
>> + const u32 * const start = (void *)(ce->lrc_reg_state) -
>> + LRC_STATE_PN * PAGE_SIZE + ce->ctx_bb_offset;
>> + u32 *cs;
>> +
>> + cs = execlists_emit_ctx_bb(ce, emit);
>> +
>> + while ((unsigned long)cs % CACHELINE_BYTES)
>> + *cs++ = MI_NOOP;
>> +
>> + ce->lrc_reg_state[indirect_ptr_offset] =
>> + (i915_ggtt_offset(ce->state) + ce->ctx_bb_offset) |
>> + (cs - start) * sizeof(*cs) /
>> + CACHELINE_BYTES;
>> +
>> + ce->lrc_reg_state[indirect_ptr_offset + 2] =
>> + intel_lr_indirect_ctx_offset(ce->engine) << 6;
>> +}
>> +
>> static void
>> -__execlists_update_reg_state(const struct intel_context *ce,
>> +gen12_setup_timestamp_ctx_wa(struct intel_context *ce)
>> +{
>> + if (ce->engine->class == RENDER_CLASS)
>> + execlists_setup_indirect_ctx_bb(ce,
>
> I'd be tempted to drop the execlists_ at this depth.
Dropped.
>
>> + gen12_emit_timestamp_wa_lrr);
>> + else
>> + execlists_setup_indirect_ctx_bb(ce,
>> + gen12_emit_timestamp_wa_lrm);
>
> u32 *(*fn)(struct intel_context *ce, u32 *cs);
>
> fn = gen12_emit_timestamp_wa_lrm;
> if (ce->engine->class == RENDER_CLASS)
> fn = gen12_emit_timestamp_wa_lrr;
>
> setup_indirect_ctx_bb(ce, fn);
>
>
>> +static u32 *
>> +gen12_emit_timestamp_wa_lrr(struct intel_context *ce, u32 *cs)
>> +}
>> +
>> +static void
>> +__execlists_update_reg_state(struct intel_context *ce,
>> const struct intel_engine_cs *engine,
>> u32 head)
>> {
>> @@ -3164,6 +3308,13 @@ __execlists_update_reg_state(const struct intel_context *ce,
>> intel_sseu_make_rpcs(engine->i915, &ce->sseu);
>>
>> i915_oa_init_reg_state(ce, engine);
>> +
>> + }
>> +
>> + if (ce->ctx_bb_offset) {
>> + /* Mutually exclusive wrt to global indirect bb */
>> + GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
>> + gen12_setup_timestamp_ctx_wa(ce);
>> }
>> }
>>
>> @@ -4667,40 +4818,6 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
>> return 0;
>> }
>>
>> -static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
>> -{
>> - u32 indirect_ctx_offset;
>> -
>> - switch (INTEL_GEN(engine->i915)) {
>> - default:
>> - MISSING_CASE(INTEL_GEN(engine->i915));
>> - /* fall through */
>> - case 12:
>> - indirect_ctx_offset =
>> - GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> - break;
>> - case 11:
>> - indirect_ctx_offset =
>> - GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> - break;
>> - case 10:
>> - indirect_ctx_offset =
>> - GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> - break;
>> - case 9:
>> - indirect_ctx_offset =
>> - GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> - break;
>> - case 8:
>> - indirect_ctx_offset =
>> - GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
>> - break;
>> - }
>> -
>> - return indirect_ctx_offset;
>> -}
>> -
>> -
>> static void init_common_reg_state(u32 * const regs,
>> const struct intel_engine_cs *engine,
>> const struct intel_ring *ring,
>> @@ -4867,6 +4984,11 @@ static int __execlists_context_alloc(struct intel_context *ce,
>> if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
>> context_size += I915_GTT_PAGE_SIZE; /* for redzone */
>>
>> + if (INTEL_GEN(engine->i915) == 12) {
>> + ce->ctx_bb_offset = context_size;
>> + context_size += PAGE_SIZE;
>> + }
>> +
>> ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
>> if (IS_ERR(ctx_obj))
>> return PTR_ERR(ctx_obj);
>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h
>> index d39b72590e40..bb444614f33b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h
>> @@ -32,6 +32,7 @@
>>
>> /* GEN12+ Reg State Context */
>> #define GEN12_CTX_BB_PER_CTX_PTR (0x12 + 1)
>> +#define CTX_BB_PER_CTX_PTR_VALID BIT(0)
>>
>> #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \
>> u32 *reg_state__ = (reg_state); \
>
> Was I imagining things, but didn't you devise a selftest to prove we
> could use the indirect-ctx-bb to do interesting things?
I have simple selftest and then some followup patches to show
how we can mold the context image.
-Mika
> -Chris
More information about the Intel-gfx
mailing list