[Intel-gfx] [RFC 5/5] drm/i915: Per batch buffer VCS balancing

Oscar Mateo oscar.mateo at intel.com
Thu Nov 16 00:21:40 UTC 2017



On 11/13/2017 05:09 AM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Building on top of the three previously introduced new pieces of uAPI,
> class/instance based engine selection, engine capabilities and concurrent
> contexts, we can extend the execbuf2 uAPI to allow per batch buffer load
> balancing on the VCS engines.
>
> Legacy execbuf, with contexts mark as concurrent, can now load balance
> between individual batches, instead of only statically per client.
>
> This also means two batches submitted one after another, but both to
> I915_EXEC_BSD, can potentially be running in parallel on VCS0 and VCS1
> respectively.
>
> For legacy (non-concurrent) contexts, behaviour is unchanged. VCS engine
> is assigned to each context at the time of first submission and it
> persists for the context lifetime.
>
> In both cases trivial round-robin approach is used to load balance.
>
> If execbuf requires a  a particular engine feature, like for example HEVC
> which is only available on vcs0, it needs to mark it's execbuf calls
> appropriately using the engine capabilities uAPI.
>
> For the class/instance based execbuf we add I915_EXEC_INSTANCE_ANY to
> accomplish the same behaviour. This is only allowed to be used on
> concurrent contexts or an error will be returned.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h            |  7 +--
>   drivers/gpu/drm/i915/i915_gem.c            |  2 +-
>   drivers/gpu/drm/i915/i915_gem_context.c    |  2 +
>   drivers/gpu/drm/i915/i915_gem_context.h    |  2 +
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c | 80 +++++++++++++++++++++++-------
>   include/uapi/drm/i915_drm.h                |  6 +++
>   6 files changed, 78 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index bf7216742452..9137127fad22 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1562,9 +1562,6 @@ struct i915_gem_mm {
>   
>   	u64 unordered_timeline;
>   
> -	/* the indicator for dispatch video commands on two BSD rings */
> -	atomic_t bsd_engine_dispatch_index;
> -
>   	/** Bit 6 swizzling required for X tiling */
>   	uint32_t bit_6_swizzle_x;
>   	/** Bit 6 swizzling required for Y tiling */
> @@ -2292,6 +2289,10 @@ struct drm_i915_private {
>   	struct i915_gem_context *preempt_context;
>   	struct intel_engine_cs *engine_class[MAX_ENGINE_CLASS + 1]
>   					    [MAX_ENGINE_INSTANCE + 1];
> +
> +	/* the indicator for dispatch video commands on two BSD rings */
> +	atomic_t vcs_dispatch_index;
> +
>   	struct i915_vma *semaphore;
>   
>   	struct drm_dma_handle *status_page_dmah;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c5ddae2cc4ef..f39881bbd4ee 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5300,7 +5300,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
>   	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
>   	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
>   
> -	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
> +	atomic_set(&dev_priv->vcs_dispatch_index, 0);
>   
>   	spin_lock_init(&dev_priv->fb_tracking.lock);
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 3ea1251924ea..fb7021ee35d5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -275,6 +275,8 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>   	list_add_tail(&ctx->link, &dev_priv->contexts.list);
>   	ctx->i915 = dev_priv;
>   	ctx->priority = I915_PRIORITY_NORMAL;
> +	atomic_set(&ctx->vcs_dispatch_index,
> +		   atomic_fetch_xor(1, &dev_priv->vcs_dispatch_index));
>   
>   	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
>   	INIT_LIST_HEAD(&ctx->handles_list);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 0531c5eedb2a..638c716afbef 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -192,6 +192,8 @@ struct i915_gem_context {
>   	 * context close.
>   	 */
>   	struct list_head handles_list;
> +
> +	atomic_t vcs_dispatch_index;
>   };
>   
>   static inline bool i915_gem_context_is_closed(const struct i915_gem_context *ctx)
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index ecb6290ab6ee..a7bf4fedbdbc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -235,6 +235,7 @@ struct i915_execbuffer {
>   
>   	u64 invalid_flags; /** Set of execobj.flags that are invalid */
>   	u32 context_flags; /** Set of execobj.flags to insert from the ctx */
> +	bool ctx_concurrent;
>   
>   	u32 batch_start_offset; /** Location within object of batch */
>   	u32 batch_len; /** Length of batch within object */
> @@ -675,6 +676,8 @@ static int eb_select_context(struct i915_execbuffer *eb)
>   	eb->ctx = ctx;
>   	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
>   
> +	eb->ctx_concurrent = i915_gem_context_is_concurrent(ctx);
> +
>   	eb->context_flags = 0;
>   	if (ctx->flags & CONTEXT_NO_ZEROMAP)
>   		eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
> @@ -1987,26 +1990,63 @@ static int eb_submit(struct i915_execbuffer *eb)
>   	return 0;
>   }
>   
> +static unsigned int select_vcs_engine(struct i915_execbuffer *eb, u64 eb_flags)
> +{
> +	struct drm_i915_private *i915 = eb->i915;
> +	u8 eb_caps = (eb_flags & I915_EXEC_ENGINE_CAP_MASK) >>
> +		     I915_EXEC_ENGINE_CAP_SHIFT;
> +	unsigned int instance;
> +
> +	if (!HAS_BSD2(i915))
> +		return 0;
> +
> +	instance = atomic_fetch_xor(1, &eb->ctx->vcs_dispatch_index);
> +
> +	if (instance == 1 &&
> +	    (eb_caps & i915->engine[_VCS(instance)]->caps) != eb_caps)
> +		instance = 0;
> +
> +	return instance;
> +}
> +
> +static unsigned int
> +client_static_vcs_choice(struct drm_i915_private *i915,
> +			 struct drm_i915_file_private *file_priv)
> +{
> +	/*
> +	 * Check whether the file_priv has already selected one engine
> +	 * and if not select one.
> +	 */
> +	if ((int)file_priv->bsd_engine < 0)
> +		file_priv->bsd_engine =
> +				atomic_fetch_xor(1, &i915->vcs_dispatch_index);
> +
> +	return file_priv->bsd_engine;
> +}
> +
>   /**
>    * Find one BSD ring to dispatch the corresponding BSD command.
>    * The engine index is returned.
>    */
>   static unsigned int
> -gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv,
> -			 struct drm_file *file)
> +eb_select_vcs_engine(struct i915_execbuffer *eb,
> +		     struct drm_i915_file_private *file_priv,
> +		     u64 eb_flags)
>   {
> -	struct drm_i915_file_private *file_priv = file->driver_priv;
> -
> -	/* Check whether the file_priv has already selected one ring. */
> -	if ((int)file_priv->bsd_engine < 0)
> -		file_priv->bsd_engine = atomic_fetch_xor(1,
> -			 &dev_priv->mm.bsd_engine_dispatch_index);
> -
> -	return file_priv->bsd_engine;
> +	/*
> +	 * For concurrent contexts do a round-robin engine assignment
> +	 * for each batch buffer.
> +	 */
> +	if (eb->ctx_concurrent)
> +		return select_vcs_engine(eb, eb_flags);
> +	else
> +		return client_static_vcs_choice(eb->i915, file_priv);
>   }
>   
>   static struct intel_engine_cs *
> -eb_select_engine_class_instance(struct drm_i915_private *i915, u64 eb_flags)
> +eb_select_engine_class_instance(struct i915_execbuffer *eb,
> +				struct drm_i915_file_private *file_priv,
> +				u64 eb_flags)
>   {
>   	u8 class = eb_flags & I915_EXEC_RING_MASK;
>   	u8 instance = (eb_flags & I915_EXEC_INSTANCE_MASK) >>
> @@ -2015,7 +2055,10 @@ eb_select_engine_class_instance(struct drm_i915_private *i915, u64 eb_flags)
>   		  I915_EXEC_ENGINE_CAP_SHIFT;
>   	struct intel_engine_cs *engine;
>   
> -	engine = intel_engine_lookup_user(i915, class, instance);
> +	if (instance == I915_EXEC_INSTANCE_ANY)
> +		instance = eb_select_vcs_engine(eb, file_priv, eb_flags);

At this moment we don't know that this is for the video engine:

if (instance == I915_EXEC_INSTANCE_ANY) {
     if (class == I915_EXEC_BSD)
         instance = eb_select_vcs_engine(eb, file_priv, eb_flags);
     else
         instance = 0;
}

> +
> +	engine = intel_engine_lookup_user(eb->i915, class, instance);
>   
>   	if (engine && ((caps & engine->caps) != caps))
>   		return NULL;
> @@ -2034,15 +2077,17 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
>   };
>   
>   static struct intel_engine_cs *
> -eb_select_engine(struct drm_i915_private *dev_priv,
> +eb_select_engine(struct i915_execbuffer *eb,
>   		 struct drm_file *file,
>   		 struct drm_i915_gem_execbuffer2 *args)
>   {
> +	struct drm_i915_private *dev_priv = eb->i915;
>   	unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
>   	struct intel_engine_cs *engine;
>   
>   	if (args->flags & I915_EXEC_CLASS_INSTANCE)
> -		return eb_select_engine_class_instance(dev_priv, args->flags);
> +		return eb_select_engine_class_instance(eb, file->driver_priv,
> +						       args->flags);
>   
>   	if (user_ring_id > I915_USER_RINGS) {
>   		DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
> @@ -2060,7 +2105,8 @@ eb_select_engine(struct drm_i915_private *dev_priv,
>   		unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK;
>   
>   		if (bsd_idx == I915_EXEC_BSD_DEFAULT) {
> -			bsd_idx = gen8_dispatch_bsd_engine(dev_priv, file);
> +			bsd_idx = eb_select_vcs_engine(eb, file->driver_priv,
> +						       args->flags);
>   		} else if (bsd_idx >= I915_EXEC_BSD_RING1 &&
>   			   bsd_idx <= I915_EXEC_BSD_RING2) {
>   			bsd_idx >>= I915_EXEC_BSD_SHIFT;
> @@ -2283,8 +2329,8 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>   		goto err_destroy;
>   
>   	err = -EINVAL;
> -	eb.engine = eb_select_engine(eb.i915, file, args);
> -	if (!eb.engine)
> +	eb.engine = eb_select_engine(&eb, file, args);
> +	if (unlikely(!eb.engine))
>   		goto err_engine;
>   
>   	if (args->flags & I915_EXEC_RESOURCE_STREAMER) {
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index bef377c4b4fc..35255d7802db 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1056,6 +1056,12 @@ struct drm_i915_gem_execbuffer2 {
>   #define I915_EXEC_INSTANCE_SHIFT	(21)
>   #define I915_EXEC_INSTANCE_MASK		(0xff << I915_EXEC_INSTANCE_SHIFT)
>   
> +/*
> + * Batches sent with instance set to any can be load balanced by the driver
> + * is concurrent context flag is also enabled.
> + */

"if" concurrent context flag

> +#define I915_EXEC_INSTANCE_ANY		(0xff)
> +
>   /*
>    * Inform the kernel of what engine capabilities this batch buffer
>    * requires. For example only the first VCS engine has the HEVC block.



More information about the Intel-gfx mailing list