[Intel-gfx] [PATCH v2] drm/i915: Bump ready tasks ahead of busywaits
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Apr 9 15:38:37 UTC 2019
On 09/04/2019 16:29, Chris Wilson wrote:
> Consider two tasks that are running in parallel on a pair of engines
> (vcs0, vcs1), but then must complete on a shared engine (rcs0). To
> maximise throughput, we want to run the first ready task on rcs0 (i.e.
> the first task that completes on either of vcs0 or vcs1). When using
> semaphores, however, we will instead queue onto rcs0 in submission order.
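
As an aside, for anyone following the thread, here is a toy model of the
ordering problem described above (not i915 code, all numbers made up), just
to show why readiness order beats submission order on the shared engine:

    /* Toy model: two rcs0 consumers whose vcs producers finish at
     * different times.  Running them in readiness order lets the quick
     * producer's consumer start sooner, lowering total latency. */
    #include <stdio.h>

    struct task {
            int producer_done;      /* when its vcs producer completes */
            int runtime;            /* time it then needs on rcs0 */
    };

    static int total_latency(const struct task *order[], int n)
    {
            int clock = 0, sum = 0;

            for (int i = 0; i < n; i++) {
                    if (clock < order[i]->producer_done)
                            clock = order[i]->producer_done; /* busywait/idle */
                    clock += order[i]->runtime;
                    sum += clock;
            }

            return sum;
    }

    int main(void)
    {
            struct task a = { 10, 5 };      /* queued first, producer is slow */
            struct task b = {  2, 5 };      /* queued second, producer is quick */
            const struct task *submission[] = { &a, &b };
            const struct task *readiness[]  = { &b, &a };

            printf("submission order latency: %d\n", total_latency(submission, 2));
            printf("readiness order latency:  %d\n", total_latency(readiness, 2));
            return 0;
    }
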
>
> To resolve this incorrect ordering, we want to re-evaluate the priority
> queue when each of the requests is ready. Normally this happens because
> we only insert into the priority queue requests that are ready, but with
> semaphores we are inserting ahead of their readiness and to compensate
> we penalize those tasks with reduced priority (so that tasks that do not
> need to busywait should naturally be run first). However, given a series
> of tasks that each use semaphores, the queue degrades into submission
> fifo rather than readiness fifo. To counter this, we give a small boost
> to semaphore users as the tasks they depend upon complete (at which
> point no busywait is needed before running the user task, as it is then
> ready itself).
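
And, for my own notes, a rough sketch of why a NEWCLIENT-sized bump looks
safe here (macro names and values below are made up, the real definitions
live in the i915 scheduler headers): the boost bits sit below the user
priority, so they only break ties within a user level and never promote a
request over higher-priority work:

    /* Illustrative only: made-up shift and bit values, not the driver's. */
    #include <stdio.h>

    #define EX_USER_SHIFT   3
    #define EX_NEWCLIENT    (1 << 2)        /* nothing else in flight for this client */
    #define EX_WAIT         (1 << 1)        /* a CPU waiter is blocked on it */
    #define EX_NOSEMAPHORE  (1 << 0)        /* needs no busywait to run */

    static int ex_effective_prio(int user_prio, unsigned int boost)
    {
            return (user_prio << EX_USER_SHIFT) | boost;
    }

    int main(void)
    {
            /* same user priority: the NEWCLIENT boost wins the tie... */
            printf("%d vs %d\n",
                   ex_effective_prio(0, EX_NEWCLIENT), ex_effective_prio(0, 0));
            /* ...but it can never beat a higher user priority */
            printf("%d vs %d\n",
                   ex_effective_prio(0, EX_NEWCLIENT), ex_effective_prio(1, 0));
            return 0;
    }
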
>
> v2: Fixup irqsave for schedule_lock (Tvrtko)
>
> Testcase: igt/gem_exec_schedule/semaphore-codependency
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin at intel.com>
> Cc: Dmitry Ermilov <dmitry.ermilov at intel.com>
> ---
> drivers/gpu/drm/i915/i915_request.c | 40 +++++++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_request.h | 1 +
> drivers/gpu/drm/i915/i915_scheduler.c | 21 +++++++-------
> 3 files changed, 52 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 96a9e8bcd805..a7d87cfaabcb 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -552,6 +552,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> return NOTIFY_DONE;
> }
>
> +static int __i915_sw_fence_call
> +semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> +        struct i915_request *request =
> +                container_of(fence, typeof(*request), semaphore);
> +
> +        switch (state) {
> +        case FENCE_COMPLETE:
> +                /*
> +                 * We only check a small portion of our dependencies
> +                 * and so cannot guarantee that there remains no
> +                 * semaphore chain across all. Instead of opting
> +                 * for the full NOSEMAPHORE boost, we go for the
> +                 * smaller (but still preempting) boost of
> +                 * NEWCLIENT. This will be enough to boost over
> +                 * a busywaiting request (as that cannot be
> +                 * NEWCLIENT) without accidentally boosting
> +                 * a busywait over real work elsewhere.
> +                 */
> +                i915_schedule_bump_priority(request, I915_PRIORITY_NEWCLIENT);
> +                break;
> +
> +        case FENCE_FREE:
> +                i915_request_put(request);
> +                break;
> +        }
> +
> +        return NOTIFY_DONE;
> +}
> +
> static void ring_retire_requests(struct intel_ring *ring)
> {
> struct i915_request *rq, *rn;
> @@ -702,6 +732,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>
> /* We bump the ref for the fence chain */
> i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
> + i915_sw_fence_init(&i915_request_get(rq)->semaphore, semaphore_notify);
>
> i915_sched_node_init(&rq->sched);
>
> @@ -784,6 +815,12 @@ emit_semaphore_wait(struct i915_request *to,
> &from->fence, 0,
> I915_FENCE_GFP);
>
> + err = i915_sw_fence_await_dma_fence(&to->semaphore,
> + &from->fence, 0,
> + I915_FENCE_GFP);
> + if (err < 0)
> + return err;
> +
> /* We need to pin the signaler's HWSP until we are finished reading. */
> err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
> if (err)
> @@ -1114,6 +1151,7 @@ void i915_request_add(struct i915_request *request)
> * run at the earliest possible convenience.
> */
> local_bh_disable();
> + i915_sw_fence_commit(&request->semaphore);
> rcu_read_lock(); /* RCU serialisation for set-wedged protection */
> if (engine->schedule) {
> struct i915_sched_attr attr = request->gem_context->sched;
> @@ -1320,7 +1358,9 @@ long i915_request_wait(struct i915_request *rq,
> if (flags & I915_WAIT_PRIORITY) {
> if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
> gen6_rps_boost(rq);
> + local_bh_disable(); /* suspend tasklets for reprioritisation */
> i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
> + local_bh_enable(); /* kick tasklets en masse */
> }
>
> wait.tsk = current;
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 875be6f71412..a982664618c2 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -143,6 +143,7 @@ struct i915_request {
> struct i915_sw_dma_fence_cb dmaq;
> };
> struct list_head execute_cb;
> + struct i915_sw_fence semaphore;
>
> /*
> * A list of everyone we wait upon, and everyone who waits upon us.
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 77dbf7d74e12..39bc4f54e272 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -64,7 +64,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
> {
> bool ret = false;
>
> - spin_lock(&schedule_lock);
> + spin_lock_irq(&schedule_lock);
>
> if (!node_signaled(signal)) {
> INIT_LIST_HEAD(&dep->dfs_link);
> @@ -81,7 +81,7 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
> ret = true;
> }
>
> - spin_unlock(&schedule_lock);
> + spin_unlock_irq(&schedule_lock);
>
> return ret;
> }
> @@ -108,7 +108,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
>
> GEM_BUG_ON(!list_empty(&node->link));
>
> - spin_lock(&schedule_lock);
> + spin_lock_irq(&schedule_lock);
>
> /*
> * Everyone we depended upon (the fences we wait to be signaled)
> @@ -135,7 +135,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
> i915_dependency_free(dep);
> }
>
> - spin_unlock(&schedule_lock);
> + spin_unlock_irq(&schedule_lock);
> }
>
> static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> @@ -356,7 +356,7 @@ static void __i915_schedule(struct i915_request *rq,
>
> memset(&cache, 0, sizeof(cache));
> engine = rq->engine;
> - spin_lock_irq(&engine->timeline.lock);
> + spin_lock(&engine->timeline.lock);
>
> /* Fifo and depth-first replacement ensure our deps execute before us */
> list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
> @@ -407,32 +407,33 @@ static void __i915_schedule(struct i915_request *rq,
> tasklet_hi_schedule(&engine->execlists.tasklet);
> }
>
> - spin_unlock_irq(&engine->timeline.lock);
> + spin_unlock(&engine->timeline.lock);
> }
>
> void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
> {
> - spin_lock(&schedule_lock);
> + spin_lock_irq(&schedule_lock);
> __i915_schedule(rq, attr);
> - spin_unlock(&schedule_lock);
> + spin_unlock_irq(&schedule_lock);
> }
>
> void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump)
> {
> struct i915_sched_attr attr;
> + unsigned long flags;
>
> GEM_BUG_ON(bump & ~I915_PRIORITY_MASK);
>
> if (READ_ONCE(rq->sched.attr.priority) == I915_PRIORITY_INVALID)
> return;
>
> - spin_lock_bh(&schedule_lock);
> + spin_lock_irqsave(&schedule_lock, flags);
>
> attr = rq->sched.attr;
> attr.priority |= bump;
> __i915_schedule(rq, &attr);
>
> - spin_unlock_bh(&schedule_lock);
> + spin_unlock_irqrestore(&schedule_lock, flags);
> }
>
> void __i915_priolist_free(struct i915_priolist *p)
>
Looks fine to me. Provisional r-b:
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
But let's wait for a media benchmarking run to see if you have nailed
the regression.
Regards,
Tvrtko