[Intel-gfx] [PATCH 2/3] drm/i915: Pipeline PDP updates for Braswell

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Dec 6 13:10:41 UTC 2018


On 06/12/2018 08:44, Chris Wilson wrote:
> Currently we face a severe problem on Braswell that manifests as invalid
> ppGTT accesses. The code tries to maintain the PDP (page directory
> pointers) inside the context in two ways, direct write into the context
> and a pipelined LRI update. The direct write into the context is
> fundamentally racy as it is unserialised with any access (read or write)
> the GPU is doing. By asserting that Braswell is not used with vGPU
> (currently an unsupported platform) we can eliminate the dangerous
> direct write into the context image and solely use the pipelined update.
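(For readers following along: the "direct write" here is nothing more than
plain CPU stores into the mapped context image, with nothing ordering them
against the GPU reading the same dwords. From memory the macro is roughly
the below, so treat it as a sketch rather than a verbatim quote:

	#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \
		const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \
		reg_state[CTX_PDP ## n ## _UDW + 1] = upper_32_bits(_addr); \
		reg_state[CTX_PDP ## n ## _LDW + 1] = lower_32_bits(_addr); \
	} while (0)

i.e. nothing stops a running context from sampling a half-updated PDP
entry.)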
> 
> However, the LRI of the PDP fouls up the GPU, causing it to freeze and
> take out the machine with "forcewake ack timeouts". This seems possible
> to work around by preventing the GPU from sleeping (by means of
> disabling the power-state management interface, i.e. forcing each ring
> to remain awake) around the update.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108656
> References: https://bugs.freedesktop.org/show_bug.cgi?id=108714
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_gtt.c     |   2 -
>   drivers/gpu/drm/i915/i915_request.c     |   5 -
>   drivers/gpu/drm/i915/intel_lrc.c        | 143 ++++++++++++------------
>   drivers/gpu/drm/i915/intel_ringbuffer.c |   5 +-
>   4 files changed, 74 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index add1fe7aeb93..62bde517d383 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1423,8 +1423,6 @@ static int gen8_ppgtt_alloc_pdp(struct i915_address_space *vm,
>   			gen8_initialize_pd(vm, pd);
>   			gen8_ppgtt_set_pdpe(vm, pdp, pd, pdpe);
>   			GEM_BUG_ON(pdp->used_pdpes > i915_pdpes_per_pdp(vm));
> -
> -			mark_tlbs_dirty(i915_vm_to_ppgtt(vm));
>   		}
>   
>   		ret = gen8_ppgtt_alloc_pd(vm, pd, start, length);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index ca95ab2f4cfa..8ab8e8e6a086 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -719,11 +719,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	 */
>   	rq->head = rq->ring->emit;
>   
> -	/* Unconditionally invalidate GPU caches and TLBs. */
> -	ret = engine->emit_flush(rq, EMIT_INVALIDATE);
> -	if (ret)
> -		goto err_unwind;
> -
>   	ret = engine->request_alloc(rq);
>   	if (ret)
>   		goto err_unwind;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d7fa301b5ec7..de1e9dc6aec0 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -363,31 +363,12 @@ execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
>   	trace_i915_request_out(rq);
>   }
>   
> -static void
> -execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
> -{
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> -	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> -}
> -
>   static u64 execlists_update_context(struct i915_request *rq)
>   {
> -	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
>   	struct intel_context *ce = rq->hw_context;
> -	u32 *reg_state = ce->lrc_reg_state;
> -
> -	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
>   
> -	/*
> -	 * True 32b PPGTT with dynamic page allocation: update PDP
> -	 * registers and point the unallocated PDPs to scratch page.
> -	 * PML4 is allocated during ppgtt init, so this is not needed
> -	 * in 48-bit mode.
> -	 */
> -	if (!i915_vm_is_48bit(&ppgtt->vm))
> -		execlists_update_context_pdps(ppgtt, reg_state);
> +	ce->lrc_reg_state[CTX_RING_TAIL + 1] =
> +		intel_ring_set_tail(rq->ring, rq->tail);
>   
>   	/*
>   	 * Make sure the context image is complete before we submit it to HW.
> @@ -1242,29 +1223,86 @@ execlists_context_pin(struct intel_engine_cs *engine,
>   	return __execlists_context_pin(engine, ctx, ce);
>   }
>   
> +static int emit_pdps(struct i915_request *rq)
> +{
> +	const struct intel_engine_cs * const engine = rq->engine;
> +	struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
> +	int err, i;
> +	u32 *cs;
> +
> +	/*
> +	 * Beware ye of the dragons, this sequence is magic!
> +	 *
> +	 * Small changes to this sequence can cause anything from
> +	 * GPU hangs to forcewake errors and machine lockups!
> +	 */
> +
> +	err = engine->emit_flush(rq, EMIT_FLUSH);
> +	if (err)
> +		return err;
> +
> +	err = engine->emit_flush(rq, EMIT_INVALIDATE);
> +	if (err)
> +		return err;
> +
> +	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/* Ensure the LRI have landed before we invalidate & continue */
> +	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
> +	for (i = GEN8_3LVL_PDPES; i--; ) {
> +		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> +
> +		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> +		*cs++ = upper_32_bits(pd_daddr);
> +		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> +		*cs++ = lower_32_bits(pd_daddr);
> +	}
> +	*cs++ = MI_NOOP;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	err = engine->emit_flush(rq, EMIT_FLUSH);
> +	if (err)
> +		return err;
> +
> +	return engine->emit_flush(rq, EMIT_INVALIDATE);
> +}
> +
>   static int execlists_request_alloc(struct i915_request *request)
>   {
>   	int ret;
>   
>   	GEM_BUG_ON(!request->hw_context->pin_count);
>   
> -	/* Flush enough space to reduce the likelihood of waiting after
> +	/*
> +	 * Flush enough space to reduce the likelihood of waiting after
>   	 * we start building the request - in which case we will just
>   	 * have to repeat work.
>   	 */
>   	request->reserved_space += EXECLISTS_REQUEST_SIZE;
>   
> -	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
> -	if (ret)
> -		return ret;
> -
> -	/* Note that after this point, we have committed to using
> +	/*
> +	 * Note that after this point, we have committed to using
>   	 * this request as it is being used to both track the
>   	 * state of engine initialisation and liveness of the
>   	 * golden renderstate above. Think twice before you try
>   	 * to cancel/unwind this request now.
>   	 */
>   
> +	/* Unconditionally invalidate GPU caches and TLBs. */
> +	if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm)) {
> +		ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
> +		if (ret)
> +			return ret;
> +	} else {
> +		GEM_BUG_ON(intel_vgpu_active(request->i915));
> +		ret = emit_pdps(request);
> +		if (ret)
> +			return ret;
> +	}

Could pull the "if (ret) return ret;" out after the if/else block to save, 
hm, almost no lines once the added whitespace is counted.
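Untested sketch of what I mean, reusing the code above as-is:

	/* Unconditionally invalidate GPU caches and TLBs. */
	if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm)) {
		ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
	} else {
		GEM_BUG_ON(intel_vgpu_active(request->i915));
		ret = emit_pdps(request);
	}
	if (ret)
		return ret;

Not that it buys much, as said.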

> +
>   	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
>   	return 0;
>   }
> @@ -1836,56 +1874,11 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
>   		  atomic_read(&execlists->tasklet.count));
>   }
>   
> -static int intel_logical_ring_emit_pdps(struct i915_request *rq)
> -{
> -	struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
> -	struct intel_engine_cs *engine = rq->engine;
> -	const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
> -	u32 *cs;
> -	int i;
> -
> -	cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
> -	if (IS_ERR(cs))
> -		return PTR_ERR(cs);
> -
> -	*cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
> -	for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
> -		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
> -
> -		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
> -		*cs++ = upper_32_bits(pd_daddr);
> -		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
> -		*cs++ = lower_32_bits(pd_daddr);
> -	}
> -
> -	*cs++ = MI_NOOP;
> -	intel_ring_advance(rq, cs);
> -
> -	return 0;
> -}
> -
>   static int gen8_emit_bb_start(struct i915_request *rq,
>   			      u64 offset, u32 len,
>   			      const unsigned int flags)
>   {
>   	u32 *cs;
> -	int ret;
> -
> -	/* Don't rely in hw updating PDPs, specially in lite-restore.
> -	 * Ideally, we should set Force PD Restore in ctx descriptor,
> -	 * but we can't. Force Restore would be a second option, but
> -	 * it is unsafe in case of lite-restore (because the ctx is
> -	 * not idle). PML4 is allocated during ppgtt init so this is
> -	 * not needed in 48-bit.*/
> -	if ((intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
> -	    !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
> -	    !intel_vgpu_active(rq->i915)) {
> -		ret = intel_logical_ring_emit_pdps(rq);
> -		if (ret)
> -			return ret;
> -
> -		rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
> -	}
>   
>   	cs = intel_ring_begin(rq, 6);
>   	if (IS_ERR(cs))
> @@ -1918,6 +1911,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>   
>   	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
>   	*cs++ = MI_NOOP;
> +
>   	intel_ring_advance(rq, cs);
>   
>   	return 0;
> @@ -2533,6 +2527,11 @@ static void execlists_init_reg_state(u32 *regs,
>   		 * other PDP Descriptors are ignored.
>   		 */
>   		ASSIGN_CTX_PML4(ctx->ppgtt, regs);
> +	} else {
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 3);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 2);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 1);
> +		ASSIGN_CTX_PDP(ctx->ppgtt, regs, 0);
>   	}
>   
>   	if (rcs) {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index c5eb26a7ee79..ad5b1fc79498 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1826,11 +1826,12 @@ static int ring_request_alloc(struct i915_request *request)
>   	 */
>   	request->reserved_space += LEGACY_REQUEST_SIZE;
>   
> -	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);

Can I convince you to extract this changelet to a separate patch? (For 
both backends.)
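To be explicit, I mean the intel_ring_wait_for_space() removal, i.e. a
small preparatory patch that, roughly, just drops

	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
	if (ret)
		return ret;

(or folds its error path into the next call) from both ring_request_alloc()
and execlists_request_alloc(), with the rest of this patch rebased on top.
Sketch only, no strong feelings about the exact split.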

> +	ret = switch_context(request);
>   	if (ret)
>   		return ret;
>   
> -	ret = switch_context(request);
> +	/* Unconditionally invalidate GPU caches and TLBs. */
> +	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
>   	if (ret)
>   		return ret;
>   
> 

Looks good to me.

Regards,

Tvrtko

