[Intel-gfx] [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Thu Nov 3 11:03:47 UTC 2016
On 02/11/2016 17:50, Chris Wilson wrote:
> The scheduler needs to know the dependencies of each request for the
> lifetime of the request, as it may choose to reschedule the requests at
> any time and must ensure the dependency tree is not broken. This is in
> addition to using the fence to only allow execution after all
> dependencies have been completed.
>
> One option was to extend the fence to support the bidirectional
> dependency tracking required by the scheduler. However, the mismatch in
> lifetimes between the submit fence and the request essentially meant
> that we had to build a completely separate struct (and we could not
> simply reuse the existing waitqueue in the fence for one half of the
> dependency tracking). The extra dependency tracking simply did not mesh
> well with the fence, and keeping it separate both keeps the fence
> implementation simpler and allows us to extend the dependency tracking
> into a priority tree (whilst maintaining support for reordering the
> tree).
>
> To avoid the additional allocations and list manipulations, the use of
> the priotree is disabled when there are no schedulers to use it.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_request.c | 72 ++++++++++++++++++++++++++++++++-
> drivers/gpu/drm/i915/i915_gem_request.h | 23 +++++++++++
> 2 files changed, 94 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 9c8605c834f9..13090f226203 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -113,6 +113,59 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
> 	spin_unlock(&file_priv->mm.lock);
> }
>
> +static int
> +i915_priotree_add_dependency(struct i915_priotree *pt,
> +			     struct i915_priotree *signal,
> +			     struct i915_dependency *dep)
> +{
> +	unsigned long flags = 0;
> +
> +	if (!dep) {
> +		dep = kmalloc(sizeof(*dep), GFP_KERNEL);
I will mention a dedicated slab cache again, since this could easily be our
hottest allocation path. With a dedicated slab I've seen it grow to 5-7k
objects in some benchmarks, while the request slab stayed at around 1k over
the same period.
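Roughly what I have in mind, as an untested sketch only - the cache pointer
name and where it lives (it would have to be reachable from here, e.g. via
dev_priv) are made up purely for illustration:

	/* hypothetical cache pointer, e.g. in struct drm_i915_private */
	struct kmem_cache *dependencies;

	/* at driver init */
	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
					    SLAB_HWCACHE_ALIGN);
	if (!dev_priv->dependencies)
		return -ENOMEM;

	/* in i915_priotree_add_dependency(), instead of kmalloc() */
	dep = kmem_cache_alloc(dev_priv->dependencies, GFP_KERNEL);

	/* in i915_priotree_fini(), instead of kfree() */
	kmem_cache_free(dev_priv->dependencies, dep);

	/* at driver fini */
	kmem_cache_destroy(dev_priv->dependencies);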
> +		if (!dep)
> +			return -ENOMEM;
> +
> +		flags |= I915_DEPENDENCY_ALLOC;
> +	}
Not sure if it would be any nicer to just set the flags to
I915_DEPENDENCY_ALLOC after allocating and add an else path setting them to
zero here.
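Something like this, just to illustrate what I mean (no strong feelings
either way):

	if (!dep) {
		dep = kmalloc(sizeof(*dep), GFP_KERNEL);
		if (!dep)
			return -ENOMEM;

		dep->flags = I915_DEPENDENCY_ALLOC;
	} else {
		dep->flags = 0;
	}

which would also let you drop the local flags variable.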
> +
> +	dep->signal = signal;
> +	dep->flags = flags;
> +
> +	list_add(&dep->post_link, &signal->post_list);
> +	list_add(&dep->pre_link, &pt->pre_list);
> +	return 0;
> +}
> +
> +static void
> +i915_priotree_fini(struct i915_priotree *pt)
> +{
> +	struct i915_dependency *dep, *next;
> +
> +	/* Everyone we depended upon should retire before us and remove
> +	 * themselves from our list. However, retirement is run independently
> +	 * on each timeline and so we may be called out-of-order.
> +	 */
> +	list_for_each_entry_safe(dep, next, &pt->pre_list, pre_link) {
> +		list_del(&dep->post_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			kfree(dep);
> +	}
> +
> +	/* Remove ourselves from everyone who depends upon us */
> +	list_for_each_entry_safe(dep, next, &pt->post_list, post_link) {
> +		list_del(&dep->pre_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			kfree(dep);
> +	}
> +}
> +
> +static void
> +i915_priotree_init(struct i915_priotree *pt)
> +{
> +	INIT_LIST_HEAD(&pt->pre_list);
> +	INIT_LIST_HEAD(&pt->post_list);
> +}
> +
> void i915_gem_retire_noop(struct i915_gem_active *active,
> 			  struct drm_i915_gem_request *request)
> {
> @@ -182,6 +235,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> 	i915_gem_context_put(request->ctx);
>
> 	dma_fence_signal(&request->fence);
> +
> +	i915_priotree_fini(&request->priotree);
> 	i915_gem_request_put(request);
> }
>
> @@ -464,6 +519,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
> 	 */
> 	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
>
> +	i915_priotree_init(&req->priotree);
> +
> 	INIT_LIST_HEAD(&req->active_list);
> 	req->i915 = dev_priv;
> 	req->engine = engine;
> @@ -517,6 +574,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
>
> 	GEM_BUG_ON(to == from);
>
> +	if (to->engine->schedule) {
> +		ret = i915_priotree_add_dependency(&to->priotree,
> +						   &from->priotree,
> +						   NULL);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> 	if (to->timeline == from->timeline)
> 		return 0;
>
> @@ -740,9 +805,14 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>
> 	prev = i915_gem_active_raw(&timeline->last_request,
> 				   &request->i915->drm.struct_mutex);
> -	if (prev)
> +	if (prev) {
> 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
> 					     &request->submitq);
> +		if (engine->schedule)
> +			i915_priotree_add_dependency(&request->priotree,
> +						     &prev->priotree,
> +						     &request->depq);
> +	}
>
> 	spin_lock_irq(&timeline->lock);
> 	list_add_tail(&request->link, &timeline->requests);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 725d04128eaf..0cb2ba546062 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -41,6 +41,18 @@ struct intel_signal_node {
> 	struct intel_wait wait;
> };
>
> +struct i915_dependency {
> +	struct i915_priotree *signal;
> +	struct list_head pre_link, post_link;
> +	unsigned long flags;
> +#define I915_DEPENDENCY_ALLOC BIT(0)
> +};
> +
> +struct i915_priotree {
> +	struct list_head pre_list; /* who is before us, we depend upon */
> +	struct list_head post_list; /* who is after us, they depend upon us */
> +};
I need a picture to imagine this data structure. :(
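If I am reading the code correctly, for a request B depending on a request A
the picture is roughly:

	request A (signaler)                    request B (waiter)
	priotree.post_list                      priotree.pre_list
	       |                                       |
	       '------> struct i915_dependency <------'
	                  .post_link (on A's post_list)
	                  .pre_link  (on B's pre_list)
	                  .signal -> &A->priotree

so each i915_dependency node sits on two lists at once, one per direction,
which is what lets i915_priotree_fini() unlink it from either end.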
> +
> /**
> * Request queue structure.
> *
> @@ -89,6 +101,17 @@ struct drm_i915_gem_request {
> 	wait_queue_t submitq;
> 	wait_queue_t execq;
>
> +	/* A list of everyone we wait upon, and everyone who waits upon us.
> +	 * Even though we will not be submitted to the hardware before the
> +	 * submit fence is signaled (it waits for all external events as well
> +	 * as our own requests), the scheduler still needs to know the
> +	 * dependency tree for the lifetime of the request (from execbuf
> +	 * to retirement), i.e. bidirectional dependency information for the
> +	 * request not tied to individual fences.
> +	 */
> +	struct i915_priotree priotree;
> +	struct i915_dependency depq;
Why depq and not dep_node or something? Makes me wonder what I am missing
about the "q" - it is not a queue, is it?
> +
> 	u32 global_seqno;
>
> 	/** GEM sequence number associated with the previous request,
>
Regards,
Tvrtko