[Intel-gfx] [PATCH 3/3] drm/i915: Pipeline PDP updates for Braswell
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Dec 7 10:30:53 UTC 2018
On 07/12/2018 09:02, Chris Wilson wrote:
> Currently we face a severe problem on Braswell that manifests as invalid
> ppGTT accesses. The code tries to maintain the PDP (page directory
> pointers) inside the context in two ways, direct write into the context
> and a pipelined LRI update. The direct write into the context is
> fundamentally racy as it is unserialised with any access (read or write)
> the GPU is doing. By asserting that Braswell is not used with vGPU
> (currently an unsupported platform) we can eliminate the dangerous
> direct write into the context image and solely use the pipelined update.
>
> However, the LRI of the PDP fouls up the GPU, causing it to freeze and
> take out the machine with "forcewake ack timeouts". This seems possible
> to workaround by preventing the GPU from sleeping (via means of
> disabling the power-state management interface, i.e. forcing each ring
> to remain awake) around the update.
Changelog is missing.
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108656
> References: https://bugs.freedesktop.org/show_bug.cgi?id=108714
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem_gtt.c | 2 -
> drivers/gpu/drm/i915/intel_lrc.c | 140 ++++++++++++++--------------
> 2 files changed, 69 insertions(+), 73 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index add1fe7aeb93..62bde517d383 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1423,8 +1423,6 @@ static int gen8_ppgtt_alloc_pdp(struct i915_address_space *vm,
> gen8_initialize_pd(vm, pd);
> gen8_ppgtt_set_pdpe(vm, pdp, pd, pdpe);
> GEM_BUG_ON(pdp->used_pdpes > i915_pdpes_per_pdp(vm));
> -
> - mark_tlbs_dirty(i915_vm_to_ppgtt(vm));
> }
>
> ret = gen8_ppgtt_alloc_pd(vm, pd, start, length);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index b1f5db3442eb..c84bdc21bcce 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -363,31 +363,12 @@ execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
> trace_i915_request_out(rq);
> }
>
> -static void
> -execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
> -{
> - ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> - ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> - ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> - ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> -}
> -
> static u64 execlists_update_context(struct i915_request *rq)
> {
> - struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
> struct intel_context *ce = rq->hw_context;
> - u32 *reg_state = ce->lrc_reg_state;
>
> - reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
> -
> - /*
> - * True 32b PPGTT with dynamic page allocation: update PDP
> - * registers and point the unallocated PDPs to scratch page.
> - * PML4 is allocated during ppgtt init, so this is not needed
> - * in 48-bit mode.
> - */
> - if (!i915_vm_is_48bit(&ppgtt->vm))
> - execlists_update_context_pdps(ppgtt, reg_state);
> + ce->lrc_reg_state[CTX_RING_TAIL + 1] =
> + intel_ring_set_tail(rq->ring, rq->tail);
>
> /*
> * Make sure the context image is complete before we submit it to HW.
> @@ -1247,6 +1228,59 @@ execlists_context_pin(struct intel_engine_cs *engine,
> return __execlists_context_pin(engine, ctx, ce);
> }
>
> +static int emit_pdps(struct i915_request *rq)
> +{
> + const struct intel_engine_cs * const engine = rq->engine;
> + struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
> + int err, i;
> + u32 *cs;
> +
> + GEM_BUG_ON(intel_vgpu_active(rq->i915));
> +
> + /*
> + * Beware ye of the dragons, this sequence is magic!
> + *
> + * Small changes to this sequence can cause anything from
> + * GPU hangs to forcewake errors and machine lockups!
> + */
> +
> + /* Flush any residual operations from the context load */
> + err = engine->emit_flush(rq, EMIT_FLUSH);
> + if (err)
> + return err;
> +
> + /* Magic required to prevent forcewake errors! */
> + err = engine->emit_flush(rq, EMIT_INVALIDATE);
> + if (err)
> + return err;
> +
> + cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
> + if (IS_ERR(cs))
> + return PTR_ERR(cs);
> +
> + /* Ensure the LRI have landed before we invalidate & continue */
> + *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
> + for (i = GEN8_3LVL_PDPES; i--; ) {
> + const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> +
> + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> + *cs++ = upper_32_bits(pd_daddr);
> + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> + *cs++ = lower_32_bits(pd_daddr);
> + }
> + *cs++ = MI_NOOP;
> +
> + intel_ring_advance(rq, cs);
> +
> + /* Be doubly sure the LRI have landed before proceeding */
> + err = engine->emit_flush(rq, EMIT_FLUSH);
> + if (err)
> + return err;
> +
> + /* Re-invalidate the TLB for luck */
> + return engine->emit_flush(rq, EMIT_INVALIDATE);
> +}
> +
> static int execlists_request_alloc(struct i915_request *request)
> {
> int ret;
> @@ -1260,11 +1294,6 @@ static int execlists_request_alloc(struct i915_request *request)
> */
> request->reserved_space += EXECLISTS_REQUEST_SIZE;
>
> - /* Unconditionally invalidate GPU caches and TLBs. */
> - ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> - if (ret)
> - return ret;
> -
> /*
> * Note that after this point, we have committed to using
> * this request as it is being used to both track the
> @@ -1273,6 +1302,14 @@ static int execlists_request_alloc(struct i915_request *request)
> * to cancel/unwind this request now.
> */
>
> + /* Unconditionally invalidate GPU caches and TLBs. */
> + if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm))
> + ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> + else
> + ret = emit_pdps(request);
> + if (ret)
> + return ret;
> +
> request->reserved_space -= EXECLISTS_REQUEST_SIZE;
> return 0;
> }
> @@ -1808,56 +1845,11 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
> atomic_read(&execlists->tasklet.count));
> }
>
> -static int intel_logical_ring_emit_pdps(struct i915_request *rq)
> -{
> - struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
> - struct intel_engine_cs *engine = rq->engine;
> - const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
> - u32 *cs;
> - int i;
> -
> - cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
> - if (IS_ERR(cs))
> - return PTR_ERR(cs);
> -
> - *cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
> - for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
> - const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> -
> - *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> - *cs++ = upper_32_bits(pd_daddr);
> - *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> - *cs++ = lower_32_bits(pd_daddr);
> - }
> -
> - *cs++ = MI_NOOP;
> - intel_ring_advance(rq, cs);
> -
> - return 0;
> -}
> -
> static int gen8_emit_bb_start(struct i915_request *rq,
> u64 offset, u32 len,
> const unsigned int flags)
> {
> u32 *cs;
> - int ret;
> -
> - /* Don't rely in hw updating PDPs, specially in lite-restore.
> - * Ideally, we should set Force PD Restore in ctx descriptor,
> - * but we can't. Force Restore would be a second option, but
> - * it is unsafe in case of lite-restore (because the ctx is
> - * not idle). PML4 is allocated during ppgtt init so this is
> - * not needed in 48-bit.*/
> - if ((intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
> - !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
> - !intel_vgpu_active(rq->i915)) {
> - ret = intel_logical_ring_emit_pdps(rq);
> - if (ret)
> - return ret;
> -
> - rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
> - }
>
> cs = intel_ring_begin(rq, 6);
> if (IS_ERR(cs))
> @@ -1890,6 +1882,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>
> *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> *cs++ = MI_NOOP;
> +
> intel_ring_advance(rq, cs);
>
> return 0;
> @@ -2501,6 +2494,11 @@ static void execlists_init_reg_state(u32 *regs,
> * other PDP Descriptors are ignored.
> */
> ASSIGN_CTX_PML4(ctx->ppgtt, regs);
> + } else {
> + ASSIGN_CTX_PDP(ctx->ppgtt, regs, 3);
> + ASSIGN_CTX_PDP(ctx->ppgtt, regs, 2);
> + ASSIGN_CTX_PDP(ctx->ppgtt, regs, 1);
> + ASSIGN_CTX_PDP(ctx->ppgtt, regs, 0);
> }
>
> if (rcs) {
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Is BSW fixed now? Or just a tiny bit better?
Regards,
Tvrtko
More information about the Intel-gfx
mailing list