[Intel-gfx] [PATCH] drm/i915/execlists: Enable coarse preemption boundaries for gen8
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Apr 5 09:46:19 UTC 2019
On 29/03/2019 13:40, Chris Wilson wrote:
> When we introduced preemption, we chose to keep it disabled for gen8 as
> supporting preemption inside GPGPU user batches required various w/a in
> userspace. Since then, the desire to preempt long queues of requests
> between batches (e.g. within busywaiting semaphores) has grown. So allow
> arbitration within the busywaits and between requests, but disable
> arbitration within user batches so that we can preempt between requests
> and not risk breaking GPGPU.
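To visualise the coarse boundary this creates, the per-request command
stream on gen8 then looks roughly like this (a sketch of the ordering
only, not the exact dwords emitted):

    MI_ARB_ON_OFF | MI_ARB_DISABLE    /* emit_bb_start: close arbitration */
    MI_BATCH_BUFFER_START_GEN8 ...    /* user batch runs without preemption */
    MI_ARB_ON_OFF | MI_ARB_ENABLE     /* emit_fini_breadcrumb: reopen it */
    <breadcrumb, semaphore busywaits> /* preemptible between requests */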
>
> However, since this preemption is much coarser and doesn't interfere
> with userspace, we decline to include it amongst the scheduler
> capabilities. (This is also required for us to skip over the preemption
> selftests that expect to be able to preempt user batches.)
>
> Testcase: igt/gem_exec_schedule/semaphore-user
> References: beecec901790 ("drm/i915/execlists: Preemption!")
> Fixes: e88619646971 ("drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michal Winiarski <michal.winiarski at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem_context.c | 2 +-
> drivers/gpu/drm/i915/intel_lrc.c | 49 ++++--
> drivers/gpu/drm/i915/selftests/intel_lrc.c | 180 +++++++++++++++++++++
> 3 files changed, 217 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 662da485e15f..92993db38aad 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -562,7 +562,7 @@ static void init_contexts(struct drm_i915_private *i915)
>
> static bool needs_preempt_context(struct drm_i915_private *i915)
> {
> - return HAS_LOGICAL_RING_PREEMPTION(i915);
> + return HAS_EXECLISTS(i915);
> }
>
> int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index bec232acc8d7..b380688a0c1c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -233,7 +233,7 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
> {
> int last_prio;
>
> - if (!intel_engine_has_preemption(engine))
> + if (!engine->preempt_context)
> return false;
>
> if (i915_request_completed(rq))
> @@ -2035,7 +2035,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
> {
> u32 *cs;
>
> - cs = intel_ring_begin(rq, 6);
> + cs = intel_ring_begin(rq, 4);
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> @@ -2046,16 +2046,35 @@ static int gen8_emit_bb_start(struct i915_request *rq,
> * particular all the gen that do not need the w/a at all!), if we
> * took care to make sure that on every switch into this context
> * (both ordinary and for preemption) that arbitration was enabled
> - * we would be fine. However, there doesn't seem to be a downside to
> - * being paranoid and making sure it is set before each batch and
> - * every context-switch.
> - *
> - * Note that if we fail to enable arbitration before the request
> - * is complete, then we do not see the context-switch interrupt and
> - * the engine hangs (with RING_HEAD == RING_TAIL).
> - *
> - * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
> + * we would be fine. However, for gen8 there is another w/a that
> + * requires us to not preempt inside GPGPU execution, so we keep
> + * arbitration disabled for gen8 batches. Arbitration will be
> + * re-enabled before we close the request
> + * (engine->emit_fini_breadcrumb).
> */
> + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
> +
> + /* FIXME(BDW): Address space and security selectors. */
> + *cs++ = MI_BATCH_BUFFER_START_GEN8 |
> + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
> + *cs++ = lower_32_bits(offset);
> + *cs++ = upper_32_bits(offset);
> +
> + intel_ring_advance(rq, cs);
> +
> + return 0;
> +}
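For completeness, the matching re-enable mentioned in the comment is not
in the quoted hunks; presumably emit_fini_breadcrumb gains (or already
carries) a single dword along the lines of:

    *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; /* reopen arbitration before the breadcrumb */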
> +
> +static int gen9_emit_bb_start(struct i915_request *rq,
> + u64 offset, u32 len,
> + const unsigned int flags)
> +{
> + u32 *cs;
> +
> + cs = intel_ring_begin(rq, 6);
> + if (IS_ERR(cs))
> + return PTR_ERR(cs);
> +
> *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>
> /* FIXME(BDW): Address space and security selectors. */
This comment can now be removed.
> @@ -2316,7 +2335,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
> engine->flags |= I915_ENGINE_SUPPORTS_STATS;
> if (!intel_vgpu_active(engine->i915))
> engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
> - if (engine->preempt_context)
> + if (engine->preempt_context &&
> + HAS_LOGICAL_RING_PREEMPTION(engine->i915))
> engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> }
>
> @@ -2350,7 +2370,10 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
> * until a more refined solution exists.
> */
> }
> - engine->emit_bb_start = gen8_emit_bb_start;
> + if (IS_GEN(engine->i915, 8))
> + engine->emit_bb_start = gen8_emit_bb_start;
> + else
> + engine->emit_bb_start = gen9_emit_bb_start;
> }
>
> static inline void
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index 0d3cae564db8..8bcd4e1d58ee 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -76,6 +76,185 @@ static int live_sanitycheck(void *arg)
> return err;
> }
>
> +static int live_busywait_preempt(void *arg)
> +{
> + struct drm_i915_private *i915 = arg;
> + struct i915_gem_context *ctx_hi, *ctx_lo;
> + struct intel_engine_cs *engine;
> + struct drm_i915_gem_object *obj;
> + struct i915_vma *vma;
> + enum intel_engine_id id;
> + intel_wakeref_t wakeref;
> + int err = -ENOMEM;
> + u32 *map;
> +
> + /*
> + * Verify that even without HAS_LOGICAL_RING_PREEMPTION, we can
> + * preempt the busywaits used to synchronise between rings.
> + */
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + wakeref = intel_runtime_pm_get(i915);
> +
> + ctx_hi = kernel_context(i915);
> + if (!ctx_hi)
> + goto err_unlock;
> + ctx_hi->sched.priority = INT_MAX;
> +
> + ctx_lo = kernel_context(i915);
> + if (!ctx_lo)
> + goto err_ctx_hi;
> + ctx_lo->sched.priority = INT_MIN;
> +
> + obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
> + if (IS_ERR(obj)) {
> + err = PTR_ERR(obj);
> + goto err_ctx_lo;
> + }
> +
> + map = i915_gem_object_pin_map(obj, I915_MAP_WC);
> + if (IS_ERR(map)) {
> + err = PTR_ERR(map);
> + goto err_obj;
> + }
> +
> + vma = i915_vma_instance(obj, &i915->ggtt.vm, 0);
> + if (IS_ERR(vma)) {
> + err = PTR_ERR(vma);
> + goto err_map;
> + }
> +
> + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
> + if (err)
> + goto err_map;
> +
> + for_each_engine(engine, i915, id) {
> + struct i915_request *lo, *hi;
> + struct igt_live_test t;
> + u32 *cs;
> +
> + if (!intel_engine_can_store_dword(engine))
> + continue;
> +
> + if (igt_live_test_begin(&t, i915, __func__, engine->name)) {
> + err = -EIO;
> + goto err_vma;
> + }
> +
> + /*
> + * We create two requests. The low priority request
> + * busywaits on a semaphore (inside the ringbuffer where
> + * it should be preemptible) and the high priority request
> + * uses a MI_STORE_DWORD_IMM to update the semaphore value
> + * allowing the first request to complete. If preemption
> + * fails, we hang instead.
> + */
> +
> + lo = i915_request_alloc(engine, ctx_lo);
> + if (IS_ERR(lo)) {
> + err = PTR_ERR(lo);
> + goto err_vma;
> + }
> +
> + cs = intel_ring_begin(lo, 8);
> + if (IS_ERR(cs)) {
> + err = PTR_ERR(cs);
> + i915_request_add(lo);
> + goto err_vma;
> + }
> +
> + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> + *cs++ = i915_ggtt_offset(vma);
> + *cs++ = 0;
> + *cs++ = 1;
> +
> + /* XXX Do we need a flush + invalidate here? */
> +
> + *cs++ = MI_SEMAPHORE_WAIT |
> + MI_SEMAPHORE_GLOBAL_GTT |
> + MI_SEMAPHORE_POLL |
> + MI_SEMAPHORE_SAD_EQ_SDD;
> + *cs++ = 0;
> + *cs++ = i915_ggtt_offset(vma);
> + *cs++ = 0;
> +
> + intel_ring_advance(lo, cs);
> + i915_request_add(lo);
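Decoded, the low priority ring thus ends up containing:

    store *(ggtt(vma) + 0) := 1      /* tell the CPU the busywait has started */
    poll until *(ggtt(vma) + 0) == 0 /* MI_SEMAPHORE_SAD_EQ_SDD, data dword 0 */

so the wait_for() below observes the store, and only a store of 0 (from
the high priority request, or the CPU) releases the poll.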
> +
> + if (wait_for(READ_ONCE(*map), 10)) {
> + err = -ETIMEDOUT;
> + goto err_vma;
> + }
> +
> + /* Low priority request should be busywaiting now */
> + if (i915_request_wait(lo, I915_WAIT_LOCKED, 1) != -ETIME) {
> + pr_err("%s: Busywaiting request did not!\n",
> + engine->name);
> + err = -EIO;
> + goto err_vma;
> + }
> +
> + hi = i915_request_alloc(engine, ctx_hi);
> + if (IS_ERR(hi)) {
> + err = PTR_ERR(hi);
> + goto err_vma;
We could release the spinner with a CPU write from here onwards, but I
guess it doesn't matter.
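For reference, that release is just a plain store of the value the
SAD_EQ_SDD poll is waiting for, through the WC map pinned above
(untested sketch):

    WRITE_ONCE(*map, 0); /* releases the semaphore busywait */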
> + }
> +
> + cs = intel_ring_begin(hi, 4);
> + if (IS_ERR(cs)) {
> + err = PTR_ERR(cs);
> + i915_request_add(hi);
> + goto err_vma;
> + }
> +
> + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> + *cs++ = i915_ggtt_offset(vma);
> + *cs++ = 0;
> + *cs++ = 0;
> +
> + intel_ring_advance(hi, cs);
> + i915_request_add(hi);
> +
> + if (i915_request_wait(lo, I915_WAIT_LOCKED, HZ / 5) < 0) {
> + struct drm_printer p = drm_info_printer(i915->drm.dev);
> +
> + pr_err("%s: Failed to preempt semaphore busywait!\n",
> + engine->name);
> +
> + intel_engine_dump(engine, &p, "%s\n", engine->name);
> + GEM_TRACE_DUMP();
> +
> + i915_gem_set_wedged(i915);
> + err = -EIO;
> + goto err_vma;
> + }
> + GEM_BUG_ON(READ_ONCE(*map));
> +
> + if (igt_live_test_end(&t)) {
> + err = -EIO;
> + goto err_vma;
> + }
> + }
> +
> + err = 0;
> +err_vma:
> + i915_vma_unpin(vma);
> +err_map:
> + i915_gem_object_unpin_map(obj);
> +err_obj:
> + i915_gem_object_put(obj);
> +err_ctx_lo:
> + kernel_context_close(ctx_lo);
> +err_ctx_hi:
> + kernel_context_close(ctx_hi);
> +err_unlock:
> + if (igt_flush_test(i915, I915_WAIT_LOCKED))
> + err = -EIO;
> + intel_runtime_pm_put(i915, wakeref);
> + mutex_unlock(&i915->drm.struct_mutex);
> + return err;
> +}
> +
> static int live_preempt(void *arg)
> {
> struct drm_i915_private *i915 = arg;
> @@ -1127,6 +1306,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
> {
> static const struct i915_subtest tests[] = {
> SUBTEST(live_sanitycheck),
> + SUBTEST(live_busywait_preempt),
> SUBTEST(live_preempt),
> SUBTEST(live_late_preempt),
> SUBTEST(live_suppress_self_preempt),
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko