[Intel-gfx] [PATCH 10/39] drm/i915: Allow contexts to share a single timeline across all engines

Thu Mar 14 16:09:39 UTC 2019

On 13/03/2019 14:43, Chris Wilson wrote:
> Previously, our view has been always to run the engines independently
> within a context. (Multiple engines happened before we had contexts and
> timelines, so they always operated independently and that behaviour
> persisted into contexts.) However, at the user level the context often
> represents a single timeline (e.g. GL contexts) and userspace must
> ensure that the individual engines are serialised to present that
> ordering to the client (or forgot about this detail entirely and hope no
> one notices - a fair ploy if the client can only directly control one
> engine themselves ;)
> 
> In the next patch, we will want to construct a set of engines that
> operate as one, that have a single timeline interwoven between them, to
> present a single virtual engine to the user. (They submit to the virtual
> engine, then we decide which engine to execute on based.)
> 
> To that end, we want to be able to create contexts which have a single
> timeline (fence context) shared between all engines, rather than multiple
> timelines.
> 
> v2: Move the specialised timeline ordering to its own function.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.c       | 32 ++++++--
>   drivers/gpu/drm/i915/i915_gem_context_types.h |  2 +
>   drivers/gpu/drm/i915/i915_request.c           | 80 +++++++++++++------
>   drivers/gpu/drm/i915/i915_request.h           |  5 +-
>   drivers/gpu/drm/i915/i915_sw_fence.c          | 39 +++++++--
>   drivers/gpu/drm/i915/i915_sw_fence.h          | 13 ++-
>   drivers/gpu/drm/i915/intel_lrc.c              |  5 +-
>   .../gpu/drm/i915/selftests/i915_gem_context.c | 18 +++--
>   drivers/gpu/drm/i915/selftests/mock_context.c |  2 +-
>   include/uapi/drm/i915_drm.h                   |  3 +-
>   10 files changed, 149 insertions(+), 50 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 07c097ad83ee..4e4b0b5c4be0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -238,6 +238,9 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
>   	rbtree_postorder_for_each_entry_safe(it, n, &ctx->hw_contexts, node)
>   		intel_context_put(it);
>   
> +	if (ctx->timeline)
> +		i915_timeline_put(ctx->timeline);
> +
>   	kfree(ctx->name);
>   	put_pid(ctx->pid);
>   
> @@ -449,12 +452,17 @@ static void __assign_ppgtt(struct i915_gem_context *ctx,
>   
>   static struct i915_gem_context *
>   i915_gem_create_context(struct drm_i915_private *dev_priv,
> -			struct drm_i915_file_private *file_priv)
> +			struct drm_i915_file_private *file_priv,
> +			unsigned int flags)
>   {
>   	struct i915_gem_context *ctx;
>   
>   	lockdep_assert_held(&dev_priv->drm.struct_mutex);
>   
> +	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
> +	    !HAS_EXECLISTS(dev_priv))
> +		return ERR_PTR(-EINVAL);
> +
>   	/* Reap the most stale context */
>   	contexts_free_first(dev_priv);
>   
> @@ -477,6 +485,18 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
>   		i915_ppgtt_put(ppgtt);
>   	}
>   
> +	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) {
> +		struct i915_timeline *timeline;
> +
> +		timeline = i915_timeline_create(dev_priv, ctx->name, NULL);
> +		if (IS_ERR(timeline)) {
> +			__destroy_hw_context(ctx, file_priv);
> +			return ERR_CAST(timeline);
> +		}
> +
> +		ctx->timeline = timeline;
> +	}
> +
>   	trace_i915_context_create(ctx);
>   
>   	return ctx;
> @@ -505,7 +525,7 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>   	if (ret)
>   		return ERR_PTR(ret);
>   
> -	ctx = i915_gem_create_context(to_i915(dev), NULL);
> +	ctx = i915_gem_create_context(to_i915(dev), NULL, 0);
>   	if (IS_ERR(ctx))
>   		goto out;
>   
> @@ -541,7 +561,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
>   	struct i915_gem_context *ctx;
>   	int err;
>   
> -	ctx = i915_gem_create_context(i915, NULL);
> +	ctx = i915_gem_create_context(i915, NULL, 0);
>   	if (IS_ERR(ctx))
>   		return ctx;
>   
> @@ -673,7 +693,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
>   	idr_init_base(&file_priv->vm_idr, 1);
>   
>   	mutex_lock(&i915->drm.struct_mutex);
> -	ctx = i915_gem_create_context(i915, file_priv);
> +	ctx = i915_gem_create_context(i915, file_priv, 0);
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	if (IS_ERR(ctx)) {
>   		idr_destroy(&file_priv->context_idr);
> @@ -789,7 +809,7 @@ last_request_on_engine(struct i915_timeline *timeline,
>   
>   	rq = i915_active_request_raw(&timeline->last_request,
>   				     &engine->i915->drm.struct_mutex);
> -	if (rq && rq->engine == engine) {
> +	if (rq && rq->engine->mask & engine->mask) {
>   		GEM_TRACE("last request for %s on engine %s: %llx:%llu\n",
>   			  timeline->name, engine->name,
>   			  rq->fence.context, rq->fence.seqno);
> @@ -1508,7 +1528,7 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>   	if (ret)
>   		return ret;
>   
> -	ctx = i915_gem_create_context(i915, file_priv);
> +	ctx = i915_gem_create_context(i915, file_priv, args->flags);
>   	mutex_unlock(&dev->struct_mutex);
>   	if (IS_ERR(ctx))
>   		return PTR_ERR(ctx);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context_types.h b/drivers/gpu/drm/i915/i915_gem_context_types.h
> index 2bf19730eaa9..f8f6e6c960a7 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context_types.h
> @@ -41,6 +41,8 @@ struct i915_gem_context {
>   	/** file_priv: owning file descriptor */
>   	struct drm_i915_file_private *file_priv;
>   
> +	struct i915_timeline *timeline;
> +
>   	/**
>   	 * @ppgtt: unique address space (GTT)
>   	 *
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 0a3d94517d0a..2382339172b4 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -993,6 +993,60 @@ void i915_request_skip(struct i915_request *rq, int error)
>   	memset(vaddr + head, 0, rq->postfix - head);
>   }
>   
> +static struct i915_request *
> +__i915_request_add_to_timeline(struct i915_request *rq)
> +{
> +	struct i915_timeline *timeline = rq->timeline;
> +	struct i915_request *prev;
> +
> +	/*
> +	 * Dependency tracking and request ordering along the timeline
> +	 * is special cased so that we can eliminate redundant ordering
> +	 * operations while building the request (we know that the timeline
> +	 * itself is ordered, and here we guarantee it).
> +	 *
> +	 * As we know we will need to emit tracking along the timeline,
> +	 * we embed the hooks into our request struct -- at the cost of
> +	 * having to have specialised no-allocation interfaces (which will
> +	 * be beneficial elsewhere).
> +	 *
> +	 * A second benefit to open-coding i915_request_await_request is
> +	 * that we can apply a slight variant of the rules specialised
> +	 * for timelines that jump between engines (such as virtual engines).
> +	 * If we consider the case of virtual engine, we must emit a dma-fence
> +	 * to prevent scheduling of the second request until the first is
> +	 * complete (to maximise our greedy late load balancing) and this
> +	 * precludes optimising to use semaphores serialisation of a single
> +	 * timeline across engines.
> +	 */
> +	prev = i915_active_request_raw(&timeline->last_request,
> +				       &rq->i915->drm.struct_mutex);
> +	if (prev && !i915_request_completed(prev)) {
> +		if (is_power_of_2(prev->engine->mask | rq->engine->mask))
> +			i915_sw_fence_await_sw_fence(&rq->submit,
> +						     &prev->submit,
> +						     &rq->submitq);
> +		else
> +			__i915_sw_fence_await_dma_fence(&rq->submit,
> +							&prev->fence,
> +							&rq->dmaq);
> +		if (rq->engine->schedule)
> +			__i915_sched_node_add_dependency(&rq->sched,
> +							 &prev->sched,
> +							 &rq->dep,
> +							 0);
> +	}
> +
> +	spin_lock_irq(&timeline->lock);
> +	list_add_tail(&rq->link, &timeline->requests);
> +	spin_unlock_irq(&timeline->lock);
> +
> +	GEM_BUG_ON(timeline->seqno != rq->fence.seqno);
> +	__i915_active_request_set(&timeline->last_request, rq);
> +
> +	return prev;
> +}
> +
>   /*
>    * NB: This function is not allowed to fail. Doing so would mean the the
>    * request is not being tracked for completion but the work itself is
> @@ -1037,31 +1091,7 @@ void i915_request_add(struct i915_request *request)
>   	GEM_BUG_ON(IS_ERR(cs));
>   	request->postfix = intel_ring_offset(request, cs);
>   
> -	/*
> -	 * Seal the request and mark it as pending execution. Note that
> -	 * we may inspect this state, without holding any locks, during
> -	 * hangcheck. Hence we apply the barrier to ensure that we do not
> -	 * see a more recent value in the hws than we are tracking.
> -	 */
> -
> -	prev = i915_active_request_raw(&timeline->last_request,
> -				       &request->i915->drm.struct_mutex);
> -	if (prev && !i915_request_completed(prev)) {
> -		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
> -					     &request->submitq);
> -		if (engine->schedule)
> -			__i915_sched_node_add_dependency(&request->sched,
> -							 &prev->sched,
> -							 &request->dep,
> -							 0);
> -	}
> -
> -	spin_lock_irq(&timeline->lock);
> -	list_add_tail(&request->link, &timeline->requests);
> -	spin_unlock_irq(&timeline->lock);
> -
> -	GEM_BUG_ON(timeline->seqno != request->fence.seqno);
> -	__i915_active_request_set(&timeline->last_request, request);
> +	prev = __i915_request_add_to_timeline(request);
>   
>   	list_add_tail(&request->ring_link, &ring->request_list);
>   	if (list_is_first(&request->ring_link, &ring->request_list)) {
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 8c8fa5010644..cd6c130964cd 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -128,7 +128,10 @@ struct i915_request {
>   	 * It is used by the driver to then queue the request for execution.
>   	 */
>   	struct i915_sw_fence submit;
> -	wait_queue_entry_t submitq;
> +	union {
> +		wait_queue_entry_t submitq;
> +		struct i915_sw_dma_fence_cb dmaq;
> +	};
>   	struct list_head execute_cb;
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 8d1400d378d7..5387aafd3424 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -359,11 +359,6 @@ int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
>   	return __i915_sw_fence_await_sw_fence(fence, signaler, NULL, gfp);
>   }
>   
> -struct i915_sw_dma_fence_cb {
> -	struct dma_fence_cb base;
> -	struct i915_sw_fence *fence;
> -};
> -
>   struct i915_sw_dma_fence_cb_timer {
>   	struct i915_sw_dma_fence_cb base;
>   	struct dma_fence *dma;
> @@ -480,6 +475,40 @@ int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
>   	return ret;
>   }
>   
> +static void __dma_i915_sw_fence_wake(struct dma_fence *dma,
> +				     struct dma_fence_cb *data)
> +{
> +	struct i915_sw_dma_fence_cb *cb = container_of(data, typeof(*cb), base);
> +
> +	i915_sw_fence_complete(cb->fence);
> +}
> +
> +int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				    struct dma_fence *dma,
> +				    struct i915_sw_dma_fence_cb *cb)
> +{
> +	int ret;
> +
> +	debug_fence_assert(fence);
> +
> +	if (dma_fence_is_signaled(dma))
> +		return 0;
> +
> +	cb->fence = fence;
> +	i915_sw_fence_await(fence);
> +
> +	ret = dma_fence_add_callback(dma, &cb->base, __dma_i915_sw_fence_wake);
> +	if (ret == 0) {
> +		ret = 1;
> +	} else {
> +		i915_sw_fence_complete(fence);
> +		if (ret == -ENOENT) /* fence already signaled */
> +			ret = 0;
> +	}
> +
> +	return ret;
> +}
> +
>   int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    struct reservation_object *resv,
>   				    const struct dma_fence_ops *exclude,
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 6dec9e1d1102..9cb5c3b307a6 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -9,14 +9,13 @@
>   #ifndef _I915_SW_FENCE_H_
>   #define _I915_SW_FENCE_H_
>   
> +#include <linux/dma-fence.h>
>   #include <linux/gfp.h>
>   #include <linux/kref.h>
>   #include <linux/notifier.h> /* for NOTIFY_DONE */
>   #include <linux/wait.h>
>   
>   struct completion;
> -struct dma_fence;
> -struct dma_fence_ops;
>   struct reservation_object;
>   
>   struct i915_sw_fence {
> @@ -68,10 +67,20 @@ int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
>   int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
>   				     struct i915_sw_fence *after,
>   				     gfp_t gfp);
> +
> +struct i915_sw_dma_fence_cb {
> +	struct dma_fence_cb base;
> +	struct i915_sw_fence *fence;
> +};
> +
> +int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				    struct dma_fence *dma,
> +				    struct i915_sw_dma_fence_cb *cb);
>   int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
>   				  struct dma_fence *dma,
>   				  unsigned long timeout,
>   				  gfp_t gfp);
> +
>   int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    struct reservation_object *resv,
>   				    const struct dma_fence_ops *exclude,
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 5669823f6901..6e9bd70f5e92 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2804,7 +2804,10 @@ populate_lr_context(struct intel_context *ce,
>   
>   static struct i915_timeline *get_timeline(struct i915_gem_context *ctx)
>   {
> -	return i915_timeline_create(ctx->i915, ctx->name, NULL);
> +	if (ctx->timeline)
> +		return i915_timeline_get(ctx->timeline);
> +	else
> +		return i915_timeline_create(ctx->i915, ctx->name, NULL);
>   }
>   
>   static int execlists_context_deferred_alloc(struct intel_context *ce,
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 1ba354a916c0..86b86569ee21 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -76,7 +76,7 @@ static int live_nop_switch(void *arg)
>   	}
>   
>   	for (n = 0; n < nctx; n++) {
> -		ctx[n] = i915_gem_create_context(i915, file->driver_priv);
> +		ctx[n] = i915_gem_create_context(i915, file->driver_priv, 0);
>   		if (IS_ERR(ctx[n])) {
>   			err = PTR_ERR(ctx[n]);
>   			goto out_unlock;
> @@ -526,7 +526,8 @@ static int igt_ctx_exec(void *arg)
>   			struct i915_gem_context *ctx;
>   			intel_wakeref_t wakeref;
>   
> -			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			ctx = i915_gem_create_context(i915,
> +						      file->driver_priv, 0);
>   			if (IS_ERR(ctx)) {
>   				err = PTR_ERR(ctx);
>   				goto out_unlock;
> @@ -611,7 +612,7 @@ static int igt_shared_ctx_exec(void *arg)
>   
>   	mutex_lock(&i915->drm.struct_mutex);
>   
> -	parent = i915_gem_create_context(i915, file->driver_priv);
> +	parent = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(parent)) {
>   		err = PTR_ERR(parent);
>   		goto out_unlock;
> @@ -645,7 +646,8 @@ static int igt_shared_ctx_exec(void *arg)
>   			if (ctx)
>   				__destroy_hw_context(ctx, file->driver_priv);
>   
> -			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			ctx = i915_gem_create_context(i915,
> +						      file->driver_priv, 0);
>   			if (IS_ERR(ctx)) {
>   				err = PTR_ERR(ctx);
>   				goto out_test;
> @@ -1088,7 +1090,7 @@ __igt_ctx_sseu(struct drm_i915_private *i915,
>   
>   	mutex_lock(&i915->drm.struct_mutex);
>   
> -	ctx = i915_gem_create_context(i915, file->driver_priv);
> +	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx)) {
>   		ret = PTR_ERR(ctx);
>   		goto out_unlock;
> @@ -1198,7 +1200,7 @@ static int igt_ctx_readonly(void *arg)
>   	if (err)
>   		goto out_unlock;
>   
> -	ctx = i915_gem_create_context(i915, file->driver_priv);
> +	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx)) {
>   		err = PTR_ERR(ctx);
>   		goto out_unlock;
> @@ -1524,13 +1526,13 @@ static int igt_vm_isolation(void *arg)
>   	if (err)
>   		goto out_unlock;
>   
> -	ctx_a = i915_gem_create_context(i915, file->driver_priv);
> +	ctx_a = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx_a)) {
>   		err = PTR_ERR(ctx_a);
>   		goto out_unlock;
>   	}
>   
> -	ctx_b = i915_gem_create_context(i915, file->driver_priv);
> +	ctx_b = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx_b)) {
>   		err = PTR_ERR(ctx_b);
>   		goto out_unlock;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
> index f90328b21763..1d6dc2fe36ab 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_context.c
> @@ -94,7 +94,7 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
>   {
>   	lockdep_assert_held(&i915->drm.struct_mutex);
>   
> -	return i915_gem_create_context(i915, file->driver_priv);
> +	return i915_gem_create_context(i915, file->driver_priv, 0);
>   }
>   
>   struct i915_gem_context *
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 46189a8eaecb..5206d0006043 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1456,8 +1456,9 @@ struct drm_i915_gem_context_create_ext {
>   	__u32 ctx_id; /* output: id of new context*/
>   	__u32 flags;
>   #define I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS	(1u << 0)
> +#define I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE	(1u << 1)
>   #define I915_CONTEXT_CREATE_FLAGS_UNKNOWN \
> -	(-(I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS << 1))
> +	(-(I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE << 1))
>   	__u64 extensions;
>   };
>   
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko