[Intel-gfx] [PATCH v6] drm/i915/icl: Preempt-to-idle support in execlists.

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Mon Dec 10 15:40:34 UTC 2018


On 09/11/2018 17:18, Tomasz Lis wrote:
> The patch adds support of preempt-to-idle requesting by setting a proper
> bit within Execlist Control Register, and receiving preemption result from
> Context Status Buffer.
> 
> Preemption in previous gens required a special batch buffer to be executed,
> so the Command Streamer never preempted to idle directly. In Icelake it is
> possible, as there is a hardware mechanism to inform the kernel about
> status of the preemption request.
> 
> This patch does not cover using the new preemption mechanism when GuC is
> active.
> 
> The advantage of this new preemption path is that one less context switch is
> needed, and returning information about preemption being complete is received
> earlier. This leads to significant improvement in our IGT latency test.
> 
> Test performed: `gem_exec_latency --run-subtest render-preemption`, executed
> 100 times, on the same platform, same kernel, without and with this patch.
> Then taken average of the execution latency times:
> 
> subcase		old preempt.	icl preempt.
> render-render	853.2036	840.1176
> render-bsd	2328.8708	2083.2576
> render-blt	2080.1501	1852.0792
> render-vebox	1553.5134	1428.762
> 
> Improvement observed:
> 
> subcase		improvement
> render-render	 1.53%
> render-bsd	10.55%
> render-blt	10.96%
> render-vebox	 8.03%

Can someone explain what the parts other than render-render mean? At 
least I can make sense of render-render - measure how long it takes for 
one context to preempt another, but render-$other draws a blank for me. 
How are engines pre-empting one another?

But anyway, even if only the 1.53% improvement is the real one, FWIW 
that's I think good enough to justify the patch. It is sufficiently 
small and contained that I don't see a problem. So:

Acked-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko

> 
> v2: Added needs_preempt_context() change so that it is not created when
>      preempt-to-idle is supported. (Chris)
>      Updated setting HWACK flag so that it is cleared after
>      preempt-to-idle. (Chris, Daniele)
>      Updated to use I915_ENGINE_HAS_PREEMPTION flag. (Chris)
> 
> v3: Fixed needs_preempt_context() change. (Chris)
>      Merged preemption trigger functions to one. (Chris)
>      Fixed context state to not assume COMPLETED_MASK after preemption,
>      since idle-to-idle case will not have it set.
> 
> v4: Simplified needs_preempt_context() change. (Daniele)
>      Removed clearing HWACK flag in idle-to-idle preempt. (Daniele)
> 
> v5: Renamed inject_preempt_context(). (Daniele)
>      Removed duplicated GEM_BUG_ON() on HWACK (Daniele)
> 
> v6: Added performance test results.
> 
> Bspec: 18922
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Michal Winiarski <michal.winiarski at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Signed-off-by: Tomasz Lis <tomasz.lis at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h          |   2 +
>   drivers/gpu/drm/i915/i915_gem_context.c  |   3 +-
>   drivers/gpu/drm/i915/i915_pci.c          |   3 +-
>   drivers/gpu/drm/i915/intel_device_info.h |   1 +
>   drivers/gpu/drm/i915/intel_lrc.c         | 109 +++++++++++++++++++++----------
>   drivers/gpu/drm/i915/intel_lrc.h         |   1 +
>   6 files changed, 84 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 08d25aa..d2cc9f1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2579,6 +2579,8 @@ intel_info(const struct drm_i915_private *dev_priv)
>   		((dev_priv)->info.has_logical_ring_elsq)
>   #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
>   		((dev_priv)->info.has_logical_ring_preemption)
> +#define HAS_HW_PREEMPT_TO_IDLE(dev_priv) \
> +		((dev_priv)->info.has_hw_preempt_to_idle)
>   
>   #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index b97963d..10b1d61 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -529,7 +529,8 @@ static void init_contexts(struct drm_i915_private *i915)
>   
>   static bool needs_preempt_context(struct drm_i915_private *i915)
>   {
> -	return HAS_LOGICAL_RING_PREEMPTION(i915);
> +	return HAS_LOGICAL_RING_PREEMPTION(i915) &&
> +	       !HAS_HW_PREEMPT_TO_IDLE(i915);
>   }
>   
>   int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index 4ccab83..82125cf 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -600,7 +600,8 @@ static const struct intel_device_info intel_cannonlake_info = {
>   			   TRANSCODER_DSI0_OFFSET, TRANSCODER_DSI1_OFFSET}, \
>   	GEN(11), \
>   	.ddb_size = 2048, \
> -	.has_logical_ring_elsq = 1
> +	.has_logical_ring_elsq = 1, \
> +	.has_hw_preempt_to_idle = 1
>   
>   static const struct intel_device_info intel_icelake_11_info = {
>   	GEN11_FEATURES,
> diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h
> index 86ce1db..a2ee278 100644
> --- a/drivers/gpu/drm/i915/intel_device_info.h
> +++ b/drivers/gpu/drm/i915/intel_device_info.h
> @@ -104,6 +104,7 @@ enum intel_ppgtt {
>   	func(has_logical_ring_contexts); \
>   	func(has_logical_ring_elsq); \
>   	func(has_logical_ring_preemption); \
> +	func(has_hw_preempt_to_idle); \
>   	func(has_overlay); \
>   	func(has_pooled_eu); \
>   	func(has_psr); \
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 08fd9b1..26b7062 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -155,6 +155,7 @@
>   #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
>   #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
>   #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
> +#define GEN11_CTX_STATUS_PREEMPT_IDLE	(1 << 29)
>   
>   #define GEN8_CTX_STATUS_COMPLETED_MASK \
>   	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
> @@ -500,29 +501,49 @@ static void port_assign(struct execlist_port *port, struct i915_request *rq)
>   	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
>   }
>   
> -static void inject_preempt_context(struct intel_engine_cs *engine)
> +static void execlist_send_preempt_to_idle(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists *execlists = &engine->execlists;
> -	struct intel_context *ce =
> -		to_intel_context(engine->i915->preempt_context, engine);
> -	unsigned int n;
> +	GEM_TRACE("%s\n", engine->name);
>   
> -	GEM_BUG_ON(execlists->preempt_complete_status !=
> -		   upper_32_bits(ce->lrc_desc));
> +	if (HAS_HW_PREEMPT_TO_IDLE(engine->i915)) {
> +		/*
> +		 * hardware which HAS_HW_PREEMPT_TO_IDLE(), always also
> +		 * HAS_LOGICAL_RING_ELSQ(), so we can assume ctrl_reg is set
> +		 */
> +		GEM_BUG_ON(execlists->ctrl_reg == NULL);
>   
> -	/*
> -	 * Switch to our empty preempt context so
> -	 * the state of the GPU is known (idle).
> -	 */
> -	GEM_TRACE("%s\n", engine->name);
> -	for (n = execlists_num_ports(execlists); --n; )
> -		write_desc(execlists, 0, n);
> +		/*
> +		 * If we have hardware preempt-to-idle, we do not need to
> +		 * inject any job to the hardware. We only set a flag.
> +		 */
> +		writel(EL_CTRL_PREEMPT_TO_IDLE, execlists->ctrl_reg);
> +	} else {
> +		struct intel_context *ce =
> +			to_intel_context(engine->i915->preempt_context, engine);
> +		unsigned int n;
>   
> -	write_desc(execlists, ce->lrc_desc, n);
> +		GEM_BUG_ON(execlists->preempt_complete_status !=
> +			   upper_32_bits(ce->lrc_desc));
> +		GEM_BUG_ON((ce->lrc_reg_state[CTX_CONTEXT_CONTROL + 1] &
> +			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
> +					   CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT)) !=
> +			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
> +					   CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
>   
> -	/* we need to manually load the submit queue */
> -	if (execlists->ctrl_reg)
> -		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
> +		/*
> +		 * Switch to our empty preempt context so
> +		 * the state of the GPU is known (idle).
> +		 */
> +		for (n = execlists_num_ports(execlists); --n; )
> +			write_desc(execlists, 0, n);
> +
> +		write_desc(execlists, ce->lrc_desc, n);
> +
> +		/* we need to manually load the submit queue */
> +		if (execlists->ctrl_reg)
> +			writel(EL_CTRL_LOAD, execlists->ctrl_reg);
> +	}
>   
>   	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
>   	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
> @@ -595,7 +616,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			return;
>   
>   		if (need_preempt(engine, last, execlists->queue_priority)) {
> -			inject_preempt_context(engine);
> +			execlist_send_preempt_to_idle(engine);
>   			return;
>   		}
>   
> @@ -922,22 +943,43 @@ static void process_csb(struct intel_engine_cs *engine)
>   			  execlists->active);
>   
>   		status = buf[2 * head];
> -		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> -			      GEN8_CTX_STATUS_PREEMPTED))
> -			execlists_set_active(execlists,
> -					     EXECLISTS_ACTIVE_HWACK);
> -		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> -			execlists_clear_active(execlists,
> -					       EXECLISTS_ACTIVE_HWACK);
> -
> -		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> -			continue;
> +		/*
> +		 * Check if preempted from idle to idle directly.
> +		 * The STATUS_IDLE_ACTIVE flag is used to mark
> +		 * such transition.
> +		 */
> +		if ((status & GEN8_CTX_STATUS_IDLE_ACTIVE) &&
> +		     (status & GEN11_CTX_STATUS_PREEMPT_IDLE)) {
>   
> -		/* We should never get a COMPLETED | IDLE_ACTIVE! */
> -		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
> +			/*
> +			 * We could not have COMPLETED anything
> +			 * if we were idle before preemption.
> +			 */
> +			GEM_BUG_ON(status & GEN8_CTX_STATUS_COMPLETED_MASK);
> +		} else {
> +			if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> +				      GEN8_CTX_STATUS_PREEMPTED))
> +				execlists_set_active(execlists,
> +						     EXECLISTS_ACTIVE_HWACK);
> +
> +			if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
> +				execlists_clear_active(execlists,
> +						       EXECLISTS_ACTIVE_HWACK);
> +
> +			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
> +				continue;
>   
> -		if (status & GEN8_CTX_STATUS_COMPLETE &&
> -		    buf[2*head + 1] == execlists->preempt_complete_status) {
> +			/* We should never get a COMPLETED | IDLE_ACTIVE! */
> +			GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
> +		}
> +
> +		/*
> +		 * Check if preempted to real idle, either directly or
> +		 * the preemptive context already finished executing
> +		 */
> +		if ((status & GEN11_CTX_STATUS_PREEMPT_IDLE) ||
> +		    (status & GEN8_CTX_STATUS_COMPLETE &&
> +		    buf[2*head + 1] == execlists->preempt_complete_status)) {
>   			GEM_TRACE("%s preempt-idle\n", engine->name);
>   			complete_preempt_context(execlists);
>   			continue;
> @@ -2150,7 +2192,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
>   	engine->unpark = NULL;
>   
>   	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
> -	if (engine->i915->preempt_context)
> +	if (engine->i915->preempt_context ||
> +	    HAS_HW_PREEMPT_TO_IDLE(engine->i915))
>   		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>   
>   	engine->i915->caps.scheduler =
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index f5a5502..871901a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -43,6 +43,7 @@
>   #define RING_EXECLIST_SQ_CONTENTS(engine)	_MMIO((engine)->mmio_base + 0x510)
>   #define RING_EXECLIST_CONTROL(engine)		_MMIO((engine)->mmio_base + 0x550)
>   #define	  EL_CTRL_LOAD				(1 << 0)
> +#define	  EL_CTRL_PREEMPT_TO_IDLE		(1 << 1)
>   
>   /* The docs specify that the write pointer wraps around after 5h, "After status
>    * is written out to the last available status QW at offset 5h, this pointer
> 


More information about the Intel-gfx mailing list