[Intel-gfx] [PATCH] drm/i915/execlists: Don't merely skip submission if maybe timeslicing

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Oct 18 09:50:55 UTC 2019


On 17/10/2019 10:52, Chris Wilson wrote:
> Normally, we try and skip submission if ELSP[1] is filled. However, we
> may desire to enable timeslicing due to the queue priority, even if
> ELSP[1] itself does not require timeslicing. That is the queue is equal
> priority to ELSP[0] and higher priority then ELSP[1]. Previously, we
> would wait until the context switch to preempt the current ELSP[1], but
> with timeslicing, we want to preempt ELSP[0] and replace it with the
> queue.

Why do we want to preempt ELSP[0] if the statement is that the queue is 
not higher priority than it?

> 
> In writing the test case, it become quickly apparent that we were also
> suppressing the tasklet during promotion and so failing to notice when
> the queue started requiring timeslicing.
> 
> Fixes: 2229adc81380 ("drm/i915/execlist: Trim immediate timeslice expiry")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> Kick execlists_submission_tasklet harder.
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c    |  22 +++-
>   drivers/gpu/drm/i915/gt/selftest_lrc.c | 164 ++++++++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_scheduler.c  |  17 ++-
>   drivers/gpu/drm/i915/i915_scheduler.h  |  18 ---
>   4 files changed, 192 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index e9fe9f79cedd..d0088d020220 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -352,10 +352,15 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
>   	 * However, the priority hint is a mere hint that we may need to
>   	 * preempt. If that hint is stale or we may be trying to preempt
>   	 * ourselves, ignore the request.
> +	 *
> +	 * More naturally we would write
> +	 *      prio >= max(0, last);
> +	 * except that we wish to prevent triggering preemption at the same
> +	 * priority level: the task that is running should remain running
> +	 * to preserve FIFO ordering of dependencies.
>   	 */
> -	last_prio = effective_prio(rq);
> -	if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
> -					 last_prio))
> +	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
> +	if (engine->execlists.queue_priority_hint <= last_prio)

Okay, this eventually resolves to the exact same condition as before, so 
it seems to only be churn as a consequence of removing 
i915_scheduler_need_preempt. Then let's see..

>   		return false;
>   
>   	/*
> @@ -1509,8 +1514,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			 * submission.
>   			 */
>   			if (!list_is_last(&last->sched.link,
> -					  &engine->active.requests))
> +					  &engine->active.requests)) {
> +				/*
> +				 * Even if ELSP[1] is occupied and not worthy
> +				 * of timeslices, our queue might be.
> +				 */
> +				if (!execlists->timer.expires &&
> +				    need_timeslice(engine, last))
> +					mod_timer(&execlists->timer,
> +						  jiffies + 1);
>   				return;
> +			}
>   
>   			/*
>   			 * WaIdleLiteRestore:bdw,skl
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 2868371c609e..5bcfe4a2466f 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -325,7 +325,13 @@ semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx)
>   	if (IS_ERR(rq))
>   		goto out_ctx;
>   
> -	err = emit_semaphore_chain(rq, vma, idx);
> +	err = 0;
> +	if (rq->engine->emit_init_breadcrumb)
> +		err = rq->engine->emit_init_breadcrumb(rq);
> +	if (err == 0)
> +		err = emit_semaphore_chain(rq, vma, idx);
> +	if (err == 0)
> +		i915_request_get(rq);
>   	i915_request_add(rq);
>   	if (err)
>   		rq = ERR_PTR(err);
> @@ -338,10 +344,10 @@ semaphore_queue(struct intel_engine_cs *engine, struct i915_vma *vma, int idx)
>   static int
>   release_queue(struct intel_engine_cs *engine,
>   	      struct i915_vma *vma,
> -	      int idx)
> +	      int idx, int prio)
>   {
>   	struct i915_sched_attr attr = {
> -		.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
> +		.priority = prio,
>   	};
>   	struct i915_request *rq;
>   	u32 *cs;
> @@ -362,9 +368,15 @@ release_queue(struct intel_engine_cs *engine,
>   	*cs++ = 1;
>   
>   	intel_ring_advance(rq, cs);
> +
> +	i915_request_get(rq);
>   	i915_request_add(rq);
>   
> +	local_bh_disable();
>   	engine->schedule(rq, &attr);
> +	local_bh_enable(); /* kick tasklet */
> +
> +	i915_request_put(rq);
>   
>   	return 0;
>   }
> @@ -383,7 +395,6 @@ slice_semaphore_queue(struct intel_engine_cs *outer,
>   	if (IS_ERR(head))
>   		return PTR_ERR(head);
>   
> -	i915_request_get(head);
>   	for_each_engine(engine, outer->i915, id) {
>   		for (i = 0; i < count; i++) {
>   			struct i915_request *rq;
> @@ -393,10 +404,12 @@ slice_semaphore_queue(struct intel_engine_cs *outer,
>   				err = PTR_ERR(rq);
>   				goto out;
>   			}
> +
> +			i915_request_put(rq);
>   		}
>   	}
>   
> -	err = release_queue(outer, vma, n);
> +	err = release_queue(outer, vma, n, INT_MAX);
>   	if (err)
>   		goto out;
>   
> @@ -482,6 +495,146 @@ static int live_timeslice_preempt(void *arg)
>   	return err;
>   }
>   
> +static int nop_request(struct intel_engine_cs *engine)
> +{
> +	struct i915_request *rq;
> +
> +	rq = i915_request_create(engine->kernel_context);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	i915_request_add(rq);
> +	return 0;
> +}
> +
> +static void wait_for_submit(struct intel_engine_cs *engine)
> +{
> +	do {
> +		cond_resched();
> +		intel_engine_flush_submission(engine);
> +	} while (READ_ONCE(engine->execlists.pending[0]));
> +}
> +
> +static int live_timeslice_queue(void *arg)
> +{
> +	struct intel_gt *gt = arg;
> +	struct drm_i915_gem_object *obj;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	struct i915_vma *vma;
> +	void *vaddr;
> +	int err = 0;
> +
> +	/*
> +	 * Make sure that even if ELSP[0] and ELSP[1] are filled with
> +	 * timeslicing between them disabled, we *do* enable timeslicing
> +	 * if the queue demands it. (Normally, we do not submit if
> +	 * ELSP[1] is already occupied, so must rely on timeslicing to
> +	 * eject ELSP[0] in favour of the queue.)
> +	 */
> +
> +	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
> +	if (IS_ERR(obj))
> +		return PTR_ERR(obj);
> +
> +	vma = i915_vma_instance(obj, &gt->ggtt->vm, NULL);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto err_obj;
> +	}
> +
> +	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WC);
> +	if (IS_ERR(vaddr)) {
> +		err = PTR_ERR(vaddr);
> +		goto err_obj;
> +	}
> +
> +	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
> +	if (err)
> +		goto err_map;
> +
> +	for_each_engine(engine, gt->i915, id) {

Make it gt now.

> +		struct i915_sched_attr attr = {
> +			.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
> +		};
> +		struct i915_request *rq;
> +
> +		if (!intel_engine_has_preemption(engine))
> +			continue;
> +
> +		memset(vaddr, 0, PAGE_SIZE);
> +
> +		/* ELSP[0]: semaphore wait */
> +		rq = semaphore_queue(engine, vma, 0);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			goto err_pin;
> +		}
> +		engine->schedule(rq, &attr);
> +		wait_for_submit(engine);
> +
> +		/* ELSP[1]: nop request */
> +		err = nop_request(engine);
> +		if (err) {
> +			i915_request_put(rq);
> +			goto err_pin;
> +		}
> +		wait_for_submit(engine);
> +
> +		GEM_BUG_ON(i915_request_completed(rq));
> +		GEM_BUG_ON(execlists_active(&engine->execlists) != rq);
> +
> +		/* Queue: semaphore signal, matching priority as semaphore */
> +		err = release_queue(engine, vma, 1, effective_prio(rq));
> +		if (err) {
> +			i915_request_put(rq);
> +			goto err_pin;
> +		}
> +
> +		intel_engine_flush_submission(engine);
> +		if (!READ_ONCE(engine->execlists.timer.expires) &&
> +		    !i915_request_completed(rq)) {
> +			struct drm_printer p =
> +				drm_info_printer(gt->i915->drm.dev);
> +
> +			GEM_TRACE_ERR("%s: Failed to enable timeslicing!\n",
> +				      engine->name);
> +			intel_engine_dump(engine, &p,
> +					  "%s\n", engine->name);
> +			GEM_TRACE_DUMP();
> +
> +			memset(vaddr, 0xff, PAGE_SIZE);
> +			err = -EINVAL;
> +		}
> +
> +		/* Timeslice every jiffie, so within 2 we should signal */
> +		if (i915_request_wait(rq, 0, 3) < 0) {
> +			struct drm_printer p =
> +				drm_info_printer(gt->i915->drm.dev);
> +
> +			pr_err("%s: Failed to timeslice into queue\n",
> +			       engine->name);
> +			intel_engine_dump(engine, &p,
> +					  "%s\n", engine->name);
> +
> +			memset(vaddr, 0xff, PAGE_SIZE);
> +			err = -EIO;
> +		}
> +		i915_request_put(rq);
> +		if (err)
> +			break;
> +	}
> +
> +
> +err_pin:
> +	i915_vma_unpin(vma);
> +err_map:
> +	i915_gem_object_unpin_map(obj);
> +err_obj:
> +	i915_gem_object_put(obj);
> +	return err;
> +}
> +
>   static int live_busywait_preempt(void *arg)
>   {
>   	struct intel_gt *gt = arg;
> @@ -2437,6 +2590,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   		SUBTEST(live_unlite_switch),
>   		SUBTEST(live_unlite_preempt),
>   		SUBTEST(live_timeslice_preempt),
> +		SUBTEST(live_timeslice_queue),
>   		SUBTEST(live_busywait_preempt),
>   		SUBTEST(live_preempt),
>   		SUBTEST(live_late_preempt),
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 7b84ebca2901..0ca40f6bf08c 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -177,9 +177,22 @@ static inline int rq_prio(const struct i915_request *rq)
>   	return rq->sched.attr.priority | __NO_PREEMPTION;
>   }
>   
> +static inline bool need_preempt(int prio, int active)
> +{
> +	/*
> +	 * Allow preemption of low -> normal -> high, but we do
> +	 * not allow low priority tasks to preempt other low priority
> +	 * tasks under the impression that latency for low priority
> +	 * tasks does not matter (as much as background throughput),
> +	 * so kiss.
> +	 */
> +	return prio >= max(I915_PRIORITY_NORMAL, active);

Is this the same as the current:

   return prio > max(I915_PRIORITY_NORMAL - 1, active);

Hm no, it now allows for high prio tasks to preempt equal high prio.

So it will kick the tasklet and dequeue will then decide not to 
need_preempt after all. Where is the catch? Okay, the catch is in the 
other execlists_dequeue hunk - that it will now enable the timeslice timer.

I have to say it is getting very difficult to tie everything together. I 
wish for a simpler design with more centralized "magic" for handling 
priorities etc.

> +}
> +
>   static void kick_submission(struct intel_engine_cs *engine, int prio)
>   {
> -	const struct i915_request *inflight = *engine->execlists.active;
> +	const struct i915_request *inflight =
> +		execlists_active(&engine->execlists);
>   
>   	/*
>   	 * If we are already the currently executing context, don't
> @@ -188,7 +201,7 @@ static void kick_submission(struct intel_engine_cs *engine, int prio)
>   	 * tasklet, i.e. we have not change the priority queue
>   	 * sufficiently to oust the running context.
>   	 */
> -	if (!inflight || !i915_scheduler_need_preempt(prio, rq_prio(inflight)))
> +	if (!inflight || !need_preempt(prio, rq_prio(inflight)))
>   		return;
>   
>   	tasklet_hi_schedule(&engine->execlists.tasklet);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 7eefccff39bf..07d243acf553 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -52,22 +52,4 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   		__i915_priolist_free(p);
>   }
>   
> -static inline bool i915_scheduler_need_preempt(int prio, int active)
> -{
> -	/*
> -	 * Allow preemption of low -> normal -> high, but we do
> -	 * not allow low priority tasks to preempt other low priority
> -	 * tasks under the impression that latency for low priority
> -	 * tasks does not matter (as much as background throughput),
> -	 * so kiss.
> -	 *
> -	 * More naturally we would write
> -	 *	prio >= max(0, last);
> -	 * except that we wish to prevent triggering preemption at the same
> -	 * priority level: the task that is running should remain running
> -	 * to preserve FIFO ordering of dependencies.
> -	 */
> -	return prio > max(I915_PRIORITY_NORMAL - 1, active);
> -}
> -
>   #endif /* _I915_SCHEDULER_H_ */
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko


More information about the Intel-gfx mailing list