[Intel-gfx] [RFC 5/5] drm/i915: Per batch buffer VCS balancing
Oscar Mateo
oscar.mateo at intel.com
Thu Nov 16 00:21:40 UTC 2017
On 11/13/2017 05:09 AM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Building on top of the three previously introduced new pieces of uAPI,
> class/instance based engine selection, engine capabilities and concurrent
> contexts, we can extend the execbuf2 uAPI to allow per batch buffer load
> balancing on the VCS engines.
>
> Legacy execbuf, with contexts mark as concurrent, can now load balance
> between individual batches, instead of only statically per client.
>
> This also means two batches submitted one after another, but both to
> I915_EXEC_BSD, can potentially be running in parallel on VCS0 and VCS1
> respectively.
>
> For legacy (non-concurrent) contexts, behaviour is unchanged. VCS engine
> is assigned to each context at the time of first submission and it
> persists for the context lifetime.
>
> In both cases trivial round-robin approach is used to load balance.
>
> If execbuf requires a a particular engine feature, like for example HEVC
> which is only available on vcs0, it needs to mark it's execbuf calls
> appropriately using the engine capabilities uAPI.
>
> For the class/instance based execbuf we add I915_EXEC_INSTANCE_ANY to
> accomplish the same behaviour. This is only allowed to be used on
> concurrent contexts or an error will be returned.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.h | 7 +--
> drivers/gpu/drm/i915/i915_gem.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_context.c | 2 +
> drivers/gpu/drm/i915/i915_gem_context.h | 2 +
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 80 +++++++++++++++++++++++-------
> include/uapi/drm/i915_drm.h | 6 +++
> 6 files changed, 78 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index bf7216742452..9137127fad22 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1562,9 +1562,6 @@ struct i915_gem_mm {
>
> u64 unordered_timeline;
>
> - /* the indicator for dispatch video commands on two BSD rings */
> - atomic_t bsd_engine_dispatch_index;
> -
> /** Bit 6 swizzling required for X tiling */
> uint32_t bit_6_swizzle_x;
> /** Bit 6 swizzling required for Y tiling */
> @@ -2292,6 +2289,10 @@ struct drm_i915_private {
> struct i915_gem_context *preempt_context;
> struct intel_engine_cs *engine_class[MAX_ENGINE_CLASS + 1]
> [MAX_ENGINE_INSTANCE + 1];
> +
> + /* the indicator for dispatch video commands on two BSD rings */
> + atomic_t vcs_dispatch_index;
> +
> struct i915_vma *semaphore;
>
> struct drm_dma_handle *status_page_dmah;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c5ddae2cc4ef..f39881bbd4ee 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5300,7 +5300,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
> init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
> init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
>
> - atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
> + atomic_set(&dev_priv->vcs_dispatch_index, 0);
>
> spin_lock_init(&dev_priv->fb_tracking.lock);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 3ea1251924ea..fb7021ee35d5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -275,6 +275,8 @@ __create_hw_context(struct drm_i915_private *dev_priv,
> list_add_tail(&ctx->link, &dev_priv->contexts.list);
> ctx->i915 = dev_priv;
> ctx->priority = I915_PRIORITY_NORMAL;
> + atomic_set(&ctx->vcs_dispatch_index,
> + atomic_fetch_xor(1, &dev_priv->vcs_dispatch_index));
>
> INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
> INIT_LIST_HEAD(&ctx->handles_list);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 0531c5eedb2a..638c716afbef 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -192,6 +192,8 @@ struct i915_gem_context {
> * context close.
> */
> struct list_head handles_list;
> +
> + atomic_t vcs_dispatch_index;
> };
>
> static inline bool i915_gem_context_is_closed(const struct i915_gem_context *ctx)
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index ecb6290ab6ee..a7bf4fedbdbc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -235,6 +235,7 @@ struct i915_execbuffer {
>
> u64 invalid_flags; /** Set of execobj.flags that are invalid */
> u32 context_flags; /** Set of execobj.flags to insert from the ctx */
> + bool ctx_concurrent;
>
> u32 batch_start_offset; /** Location within object of batch */
> u32 batch_len; /** Length of batch within object */
> @@ -675,6 +676,8 @@ static int eb_select_context(struct i915_execbuffer *eb)
> eb->ctx = ctx;
> eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
>
> + eb->ctx_concurrent = i915_gem_context_is_concurrent(ctx);
> +
> eb->context_flags = 0;
> if (ctx->flags & CONTEXT_NO_ZEROMAP)
> eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
> @@ -1987,26 +1990,63 @@ static int eb_submit(struct i915_execbuffer *eb)
> return 0;
> }
>
> +static unsigned int select_vcs_engine(struct i915_execbuffer *eb, u64 eb_flags)
> +{
> + struct drm_i915_private *i915 = eb->i915;
> + u8 eb_caps = (eb_flags & I915_EXEC_ENGINE_CAP_MASK) >>
> + I915_EXEC_ENGINE_CAP_SHIFT;
> + unsigned int instance;
> +
> + if (!HAS_BSD2(i915))
> + return 0;
> +
> + instance = atomic_fetch_xor(1, &eb->ctx->vcs_dispatch_index);
> +
> + if (instance == 1 &&
> + (eb_caps & i915->engine[_VCS(instance)]->caps) != eb_caps)
> + instance = 0;
> +
> + return instance;
> +}
> +
> +static unsigned int
> +client_static_vcs_choice(struct drm_i915_private *i915,
> + struct drm_i915_file_private *file_priv)
> +{
> + /*
> + * Check whether the file_priv has already selected one engine
> + * and if not select one.
> + */
> + if ((int)file_priv->bsd_engine < 0)
> + file_priv->bsd_engine =
> + atomic_fetch_xor(1, &i915->vcs_dispatch_index);
> +
> + return file_priv->bsd_engine;
> +}
> +
> /**
> * Find one BSD ring to dispatch the corresponding BSD command.
> * The engine index is returned.
> */
> static unsigned int
> -gen8_dispatch_bsd_engine(struct drm_i915_private *dev_priv,
> - struct drm_file *file)
> +eb_select_vcs_engine(struct i915_execbuffer *eb,
> + struct drm_i915_file_private *file_priv,
> + u64 eb_flags)
> {
> - struct drm_i915_file_private *file_priv = file->driver_priv;
> -
> - /* Check whether the file_priv has already selected one ring. */
> - if ((int)file_priv->bsd_engine < 0)
> - file_priv->bsd_engine = atomic_fetch_xor(1,
> - &dev_priv->mm.bsd_engine_dispatch_index);
> -
> - return file_priv->bsd_engine;
> + /*
> + * For concurrent contexts do a round-robin engine assignment
> + * for each batch buffer.
> + */
> + if (eb->ctx_concurrent)
> + return select_vcs_engine(eb, eb_flags);
> + else
> + return client_static_vcs_choice(eb->i915, file_priv);
> }
>
> static struct intel_engine_cs *
> -eb_select_engine_class_instance(struct drm_i915_private *i915, u64 eb_flags)
> +eb_select_engine_class_instance(struct i915_execbuffer *eb,
> + struct drm_i915_file_private *file_priv,
> + u64 eb_flags)
> {
> u8 class = eb_flags & I915_EXEC_RING_MASK;
> u8 instance = (eb_flags & I915_EXEC_INSTANCE_MASK) >>
> @@ -2015,7 +2055,10 @@ eb_select_engine_class_instance(struct drm_i915_private *i915, u64 eb_flags)
> I915_EXEC_ENGINE_CAP_SHIFT;
> struct intel_engine_cs *engine;
>
> - engine = intel_engine_lookup_user(i915, class, instance);
> + if (instance == I915_EXEC_INSTANCE_ANY)
> + instance = eb_select_vcs_engine(eb, file_priv, eb_flags);
At this moment we don't know that this is for the video engine:
if (instance == I915_EXEC_INSTANCE_ANY) {
if (class == I915_EXEC_BSD)
instance = eb_select_vcs_engine(eb, file_priv, eb_flags);
else
instance = 0;
}
> +
> + engine = intel_engine_lookup_user(eb->i915, class, instance);
>
> if (engine && ((caps & engine->caps) != caps))
> return NULL;
> @@ -2034,15 +2077,17 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
> };
>
> static struct intel_engine_cs *
> -eb_select_engine(struct drm_i915_private *dev_priv,
> +eb_select_engine(struct i915_execbuffer *eb,
> struct drm_file *file,
> struct drm_i915_gem_execbuffer2 *args)
> {
> + struct drm_i915_private *dev_priv = eb->i915;
> unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
> struct intel_engine_cs *engine;
>
> if (args->flags & I915_EXEC_CLASS_INSTANCE)
> - return eb_select_engine_class_instance(dev_priv, args->flags);
> + return eb_select_engine_class_instance(eb, file->driver_priv,
> + args->flags);
>
> if (user_ring_id > I915_USER_RINGS) {
> DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
> @@ -2060,7 +2105,8 @@ eb_select_engine(struct drm_i915_private *dev_priv,
> unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK;
>
> if (bsd_idx == I915_EXEC_BSD_DEFAULT) {
> - bsd_idx = gen8_dispatch_bsd_engine(dev_priv, file);
> + bsd_idx = eb_select_vcs_engine(eb, file->driver_priv,
> + args->flags);
> } else if (bsd_idx >= I915_EXEC_BSD_RING1 &&
> bsd_idx <= I915_EXEC_BSD_RING2) {
> bsd_idx >>= I915_EXEC_BSD_SHIFT;
> @@ -2283,8 +2329,8 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> goto err_destroy;
>
> err = -EINVAL;
> - eb.engine = eb_select_engine(eb.i915, file, args);
> - if (!eb.engine)
> + eb.engine = eb_select_engine(&eb, file, args);
> + if (unlikely(!eb.engine))
> goto err_engine;
>
> if (args->flags & I915_EXEC_RESOURCE_STREAMER) {
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index bef377c4b4fc..35255d7802db 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1056,6 +1056,12 @@ struct drm_i915_gem_execbuffer2 {
> #define I915_EXEC_INSTANCE_SHIFT (21)
> #define I915_EXEC_INSTANCE_MASK (0xff << I915_EXEC_INSTANCE_SHIFT)
>
> +/*
> + * Batches sent with instance set to any can be load balanced by the driver
> + * is concurrent context flag is also enabled.
> + */
"if" concurrent context flag
> +#define I915_EXEC_INSTANCE_ANY (0xff)
> +
> /*
> * Inform the kernel of what engine capabilities this batch buffer
> * requires. For example only the first VCS engine has the HEVC block.
More information about the Intel-gfx
mailing list