[Intel-gfx] [PATCH v2] drm/i915: Bump ready tasks ahead of busywaits
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Apr 9 15:38:37 UTC 2019
On 09/04/2019 16:29, Chris Wilson wrote:
> Consider two tasks that are running in parallel on a pair of engines
> (vcs0, vcs1), but then must complete on a shared engine (rcs0). To
> maximise throughput, we want to run the first ready task on rcs0 (i.e.
> the first task that completes on either of vcs0 or vcs1). When using
> semaphores, however, we will instead queue onto rcs0 in submission order.
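
As an aside, for anyone following the thread, here is a toy model of the
ordering problem described above (not i915 code, all numbers made up), just
to show why readiness order beats submission order on the shared engine:

    /* Toy model: two rcs0 consumers whose vcs producers finish at
     * different times.  Running them in readiness order lets the quick
     * producer's consumer start sooner, lowering total latency. */
    #include <stdio.h>

    struct task {
            int producer_done;      /* when its vcs producer completes */
            int runtime;            /* time it then needs on rcs0 */
    };

    static int total_latency(const struct task *order[], int n)
    {
            int clock = 0, sum = 0;

            for (int i = 0; i < n; i++) {
                    if (clock < order[i]->producer_done)
                            clock = order[i]->producer_done; /* busywait/idle */
                    clock += order[i]->runtime;
                    sum += clock;
            }

            return sum;
    }

    int main(void)
    {
            struct task a = { 10, 5 };      /* queued first, producer is slow */
            struct task b = {  2, 5 };      /* queued second, producer is quick */
            const struct task *submission[] = { &a, &b };
            const struct task *readiness[]  = { &b, &a };

            printf("submission order latency: %d\n", total_latency(submission, 2));
            printf("readiness order latency:  %d\n", total_latency(readiness, 2));
            return 0;
    }
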
>
> To resolve this incorrect ordering, we want to re-evaluate the priority
> queue when each of the requests is ready. Normally this happens because
> we only insert into the priority queue requests that are ready, but with
> semaphores we are inserting ahead of their readiness and to compensate
> we penalize those tasks with reduced priority (so that tasks that do not
> need to busywait should naturally be run first). However, given a series
> of tasks that each use semaphores, the queue degrades into submission
> fifo rather than readiness fifo. To counter this, we give a small boost
> to semaphore users as the tasks they depend upon complete (at which
> point no busywait is needed before running the user task, as it is then
> ready itself).
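
And, for my own notes, a rough sketch of why a NEWCLIENT-sized bump looks
safe here (macro names and values below are made up, the real definitions
live in the i915 scheduler headers): the boost bits sit below the user
priority, so they only break ties within a user level and never promote a
request over higher-priority work:

    /* Illustrative only: made-up shift and bit values, not the driver's. */
    #include <stdio.h>

    #define EX_USER_SHIFT   3
    #define EX_NEWCLIENT    (1 << 2)        /* nothing else in flight for this client */
    #define EX_WAIT         (1 << 1)        /* a CPU waiter is blocked on it */
    #define EX_NOSEMAPHORE  (1 << 0)        /* needs no busywait to run */

    static int ex_effective_prio(int user_prio, unsigned int boost)
    {
            return (user_prio << EX_USER_SHIFT) | boost;
    }

    int main(void)
    {
            /* same user priority: the NEWCLIENT boost wins the tie... */
            printf("%d vs %d\n",
                   ex_effective_prio(0, EX_NEWCLIENT), ex_effective_prio(0, 0));
            /* ...but it can never beat a higher user priority */
            printf("%d vs %d\n",
                   ex_effective_prio(0, EX_NEWCLIENT), ex_effective_prio(1, 0));
            return 0;
    }
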
>
> v2: Fixup irqsave for schedule_lock (Tvrtko)
>
> Testcase: igt/gem_exec_schedule/semaphore-codependency
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin at intel.com>
> Cc: Dmitry Ermilov <dmitry.ermilov at intel.com>
> ---
> drivers/gpu/drm/i915/i915_request.c | 40 +++++++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_request.h | 1 +
> drivers/gpu/drm/i915/i915_scheduler.c | 21 +++++++-------
> 3 files changed, 52 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 96a9e8bcd805..a7d87cfaabcb 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -552,6 +552,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> return NOTIFY_DONE;
> }
>
> +static int __i915_sw_fence_call
> +semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> +        struct i915_request *request =
> +                container_of(fence, typeof(*request), semaphore);
> +
> +        switch (state) {
> +        case FENCE_COMPLETE:
> +                /*
> +                 * We only check a small portion of our dependencies
> +                 * and so cannot guarantee that there remains no
> +                 * semaphore chain across all. Instead of opting
> +                 * for the full NOSEMAPHORE boost, we go for the
> +                 * smaller (but still preempting) boost of
> +                 * NEWCLIENT. This will be enough to boost over
> +                 * a busywaiting request (as that cannot be
> +                 * NEWCLIENT) without accidentally boosting
> +                 * a busywait over real work elsewhere.
> +                 */
> +                i915_schedule_bump_priority(request, I915_PRIORITY_NEWCLIENT);
> +                break;
> +
> +        case FENCE_FREE:
> +                i915_request_put(request);
> +                break;
> +        }
> +
> +        return NOTIFY_DONE;
> +}
> +
> static void ring_retire_requests(struct intel_ring *ring)
> {
> struct i915_request *rq, *rn;
> @@ -702,6 +732,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>
> /* We bump the ref for the fence chain */
> i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
> + i915_sw_fence_init(&i915_request_get(rq)->semaphore, semaphore_notify);
>
> i915_sched_node_init(&rq->sched);
>
> @@ -784,6 +815,12 @@ emit_semaphore_wait(struct i915_request *to,
> &from->fence, 0,
> I915_FENCE_GFP);
>
> + err = i915_sw_fence_await_dma_fence(&to->semaphore,
> + &from->fence, 0,
> + I915_FENCE_GFP);
> + if (err < 0)
> + return err;
> +
> /* We need to pin the signaler's HWSP until we are finished reading. */
> err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
> if (err)
> @@ -1114,6 +1151,7 @@ void i915_request_add(struct i915_request *request)
> * run at the earliest possible convenience.
> */
> local_bh_disable();
> + i915_sw_fence_commit(&request->semaphore);
> rcu_read_lock(); /* RCU serialisation for set-wedged protection */
> if (engine->schedule) {
> struct i915_sched_attr attr = request->gem_context->sched;
> @@ -1320,7 +1358,9 @@ long i915_request_wait(struct i915_request *rq,
> if (flags & I915_WAIT_PRIORITY) {
> if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
> gen6_rps_boost(rq);
> + local_bh_disable(); /* suspend tasklets for reprioritisation */
> i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
> + local_bh_enable(); /* kick tasklets en masse */
> }
>
> wait.tsk = current;
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 875be6f71412..a982664618c2 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -143,6 +143,7 @@ struct i915_request {
> struct i915_sw_dma_fence_cb dmaq;
> };
> struct list_head execute_cb;
> + struct i915_sw_fence semaphore;
>
> /*
> * A list of everyone we wait upon, and everyone who waits upon us.
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 77dbf7d74e12..39bc4f54e272 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -64,7 +64,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
> {
> bool ret = false;
>
> - spin_lock(&schedule_lock);
> + spin_lock_irq(&schedule_lock);
>
> if (!node_signaled(signal)) {
> INIT_LIST_HEAD(&dep->dfs_link);
> @@ -81,7 +81,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
> ret = true;
> }
>
> - spin_unlock(&schedule_lock);
> + spin_unlock_irq(&schedule_lock);
>
> return ret;
> }
> @@ -108,7 +108,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
>
> GEM_BUG_ON(!list_empty(&node->link));
>
> - spin_lock(&schedule_lock);
> + spin_lock_irq(&schedule_lock);
>
> /*
> * Everyone we depended upon (the fences we wait to be signaled)
> @@ -135,7 +135,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
> i915_dependency_free(dep);
> }
>
> - spin_unlock(&schedule_lock);
> + spin_unlock_irq(&schedule_lock);
> }
>
> static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> @@ -356,7 +356,7 @@ static void __i915_schedule(struct i915_request *rq,
>
> memset(&cache, 0, sizeof(cache));
> engine = rq->engine;
> - spin_lock_irq(&engine->timeline.lock);
> + spin_lock(&engine->timeline.lock);
>
> /* Fifo and depth-first replacement ensure our deps execute before us */
> list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
> @@ -407,32 +407,33 @@ static void __i915_schedule(struct i915_request *rq,
> tasklet_hi_schedule(&engine->execlists.tasklet);
> }
>
> - spin_unlock_irq(&engine->timeline.lock);
> + spin_unlock(&engine->timeline.lock);
> }
>
> void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
> {
> - spin_lock(&schedule_lock);
> + spin_lock_irq(&schedule_lock);
> __i915_schedule(rq, attr);
> - spin_unlock(&schedule_lock);
> + spin_unlock_irq(&schedule_lock);
> }
>
> void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump)
> {
> struct i915_sched_attr attr;
> + unsigned long flags;
>
> GEM_BUG_ON(bump & ~I915_PRIORITY_MASK);
>
> if (READ_ONCE(rq->sched.attr.priority) == I915_PRIORITY_INVALID)
> return;
>
> - spin_lock_bh(&schedule_lock);
> + spin_lock_irqsave(&schedule_lock, flags);
>
> attr = rq->sched.attr;
> attr.priority |= bump;
> __i915_schedule(rq, &attr);
>
> - spin_unlock_bh(&schedule_lock);
> + spin_unlock_irqrestore(&schedule_lock, flags);
> }
>
> void __i915_priolist_free(struct i915_priolist *p)
>
Looks fine to me. Provisional r-b:
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
But let's wait for a media benchmarking run to see if you have nailed
the regression.
Regards,
Tvrtko