[Intel-gfx] [PATCH 1/3] drm/i915/execlists: Switch to using STOP_RING instead of a semaphore

Mika Kuoppala mika.kuoppala at linux.intel.com
Tue Jun 25 10:25:21 UTC 2019


Chris Wilson <chris at chris-wilson.co.uk> writes:

> Not all mediated environments yet support HW semaphores, so we should
> avoid using one for our preempt-to-busy barrier and instead request that
> the CS be paused and not advance on to the next execlist.
>
> There's a natural advantage with the register writes being serialised
> with the writes to the ELSP register that should allow the barrier to be
> much more constrained. On the other hand, we can no longer track the
> extents of the barrier and so must be a lot more lenient in accepting
> early CS events.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Zhenyu Wang <zhenyuw at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_types.h |  8 +++
>  drivers/gpu/drm/i915/gt/intel_lrc.c          | 69 ++++++++------------
>  2 files changed, 35 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 7e056114344e..1353df264236 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -166,6 +166,8 @@ struct intel_engine_execlists {
>  	 */
>  	bool no_priolist;
>  
> +	u16 ring_mode;
> +
>  	/**
>  	 * @submit_reg: gen-specific execlist submission register
>  	 * set to the ExecList Submission Port (elsp) register pre-Gen11 and to
> @@ -179,6 +181,12 @@ struct intel_engine_execlists {
>  	 */
>  	u32 __iomem *ctrl_reg;
>  
> +	/**
> +	 * @ring_reg: the engine control register, used to freeze the command
> +	 * streamer around preempt-to-busy
> +	 */
> +	u32 __iomem *ring_reg;
> +
>  #define EXECLIST_MAX_PORTS 2
>  	/**
>  	 * @active: the currently known context executing on HW
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 28685ba91a2c..c2aaab4b743e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -224,24 +224,18 @@ static void execlists_init_reg_state(u32 *reg_state,
>  				     struct intel_engine_cs *engine,
>  				     struct intel_ring *ring);
>  
> -static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
> +static inline void ring_set_paused(struct intel_engine_cs *engine, u32 state)
>  {
> -	return (i915_ggtt_offset(engine->status_page.vma) +
> -		I915_GEM_HWS_PREEMPT_ADDR);
> -}
> +	u16 x;
>  
> -static inline void
> -ring_set_paused(const struct intel_engine_cs *engine, int state)
> -{
> -	/*
> -	 * We inspect HWS_PREEMPT with a semaphore inside
> -	 * engine->emit_fini_breadcrumb. If the dword is true,
> -	 * the ring is paused as the semaphore will busywait
> -	 * until the dword is false.
> -	 */
> -	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
> -	if (state)
> -		wmb();
> +	x = engine->execlists.ring_mode;
> +	x &= ~(state >> 16);
> +	x |= state;
> +	if (x == engine->execlists.ring_mode)
> +		return;

You want to keep a shadow of the state to elide redundant mmio writes?

I would prefer ring_freeze() and ring_thaw() helpers, with the state
parameter removed, as I don't see the value in making each callsite
construct the masked bits itself.

> +
> +	engine->execlists.ring_mode = x;
> +	writel(state, engine->execlists.ring_reg);
>  }
>  
>  static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> @@ -981,7 +975,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			 * as we unwind (and until we resubmit) so that we do
>  			 * not accidentally tell it to go backwards.
>  			 */
> -			ring_set_paused(engine, 1);
> +			ring_set_paused(engine, _MASKED_BIT_ENABLE(STOP_RING));
>  
>  			/*
>  			 * Note that we have not stopped the GPU at this point,
> @@ -1010,7 +1004,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  				  last->sched.attr.priority,
>  				  execlists->queue_priority_hint);
>  
> -			ring_set_paused(engine, 1);
> +			ring_set_paused(engine, _MASKED_BIT_ENABLE(STOP_RING));
>  			defer_active(engine);
>  
>  			/*
> @@ -1244,9 +1238,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		*port = execlists_schedule_in(last, port - execlists->pending);
>  		memset(port + 1, 0, (last_port - port) * sizeof(*port));
>  		execlists_submit_ports(engine);
> -	} else {
> -		ring_set_paused(engine, 0);
>  	}
> +
> +	if (!inject_preempt_hang(execlists))
> +		ring_set_paused(engine, _MASKED_BIT_DISABLE(STOP_RING));

So this is now just a superset of the previous unpausing?

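I.e., reading the two hunks together: previously we only unpaused in
the else branch, when there was nothing to submit, and otherwise
relied on process_csb() to unpause once the preemption completed;
now dequeue unpauses unconditionally (modulo the hang injection),
covering both cases:

	if (submit) {
		*port = execlists_schedule_in(last, port - execlists->pending);
		memset(port + 1, 0, (last_port - port) * sizeof(*port));
		execlists_submit_ports(engine);
	}

	/* was: ring_set_paused(engine, 0) only when !submit */
	if (!inject_preempt_hang(execlists))
		ring_set_paused(engine, _MASKED_BIT_DISABLE(STOP_RING));
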
>  }
>  
>  void
> @@ -1355,9 +1350,6 @@ static void process_csb(struct intel_engine_cs *engine)
>  
>  			if (enable_timeslice(engine))
>  				mod_timer(&execlists->timer, jiffies + 1);
> -
> -			if (!inject_preempt_hang(execlists))
> -				ring_set_paused(engine, 0);
>  		} else if (status & GEN8_CTX_STATUS_PREEMPTED) {
>  			struct i915_request * const *port = execlists->active;
>  
> @@ -1378,8 +1370,15 @@ static void process_csb(struct intel_engine_cs *engine)
>  			 * ordered, that the breadcrumb write is
>  			 * coherent (visible from the CPU) before the
>  			 * user interrupt and CSB is processed.
> +			 *
> +			 * The caveat here applies when we are injecting
> +			 * a completion event via manipulation of the
> +			 * RING_MI_MODE; this may occur before the
> +			 * request is completed and appears as a
> +			 * normal context-switch (0x14).

As in we unpause the ring and then see a context-switch event (0x14)
before the request has actually completed?

-Mika


>  			 */
> -			GEM_BUG_ON(!i915_request_completed(rq));
> +			GEM_BUG_ON(!i915_request_completed(rq) &&
> +				   !execlists->pending[0]);
>  			execlists_schedule_out(rq);
>  
>  			GEM_BUG_ON(execlists->active - execlists->inflight >
> @@ -2133,7 +2132,7 @@ static void reset_csb_pointers(struct intel_engine_cs *engine)
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
>  	const unsigned int reset_value = execlists->csb_size - 1;
>  
> -	ring_set_paused(engine, 0);
> +	ring_set_paused(engine, _MASKED_BIT_DISABLE(STOP_RING));
>  
>  	/*
>  	 * After a reset, the HW starts writing into CSB entry [0]. We
> @@ -2581,19 +2580,6 @@ static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
>  	return cs;
>  }
>  
> -static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
> -{
> -	*cs++ = MI_SEMAPHORE_WAIT |
> -		MI_SEMAPHORE_GLOBAL_GTT |
> -		MI_SEMAPHORE_POLL |
> -		MI_SEMAPHORE_SAD_EQ_SDD;
> -	*cs++ = 0;
> -	*cs++ = intel_hws_preempt_address(request->engine);
> -	*cs++ = 0;
> -
> -	return cs;
> -}
> -
>  static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
>  {
>  	cs = gen8_emit_ggtt_write(cs,
> @@ -2601,9 +2587,7 @@ static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
>  				  request->timeline->hwsp_offset,
>  				  0);
>  	*cs++ = MI_USER_INTERRUPT;
> -
>  	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> -	cs = emit_preempt_busywait(request, cs);
>  
>  	request->tail = intel_ring_offset(request, cs);
>  	assert_ring_tail_valid(request->ring, request->tail);
> @@ -2625,9 +2609,7 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>  				    PIPE_CONTROL_CS_STALL,
>  				    0);
>  	*cs++ = MI_USER_INTERRUPT;
> -
>  	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> -	cs = emit_preempt_busywait(request, cs);
>  
>  	request->tail = intel_ring_offset(request, cs);
>  	assert_ring_tail_valid(request->ring, request->tail);
> @@ -2807,6 +2789,9 @@ int intel_execlists_submission_init(struct intel_engine_cs *engine)
>  	execlists->csb_write =
>  		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
>  
> +	execlists->ring_reg =
> +		uncore->regs + i915_mmio_reg_offset(RING_MI_MODE(base));
> +
>  	if (INTEL_GEN(i915) < 11)
>  		execlists->csb_size = GEN8_CSB_ENTRIES;
>  	else
> -- 
> 2.20.1

