[Intel-gfx] [PATCH 08/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Feb 28 10:49:28 UTC 2019


On 26/02/2019 10:24, Chris Wilson wrote:
> Having introduced per-context seqno, we now have a means to identify
> progress across the system without fear of rollback as befell the
> global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> advance of submission safe in the knowledge that our target seqno and
> address is stable.
> 
> However, since we are telling the GPU to busy-spin on the target address
> until it matches the signaling seqno, we only want to do so when we are
> sure that busy-spin will be completed quickly. To achieve this we only
> submit the request to HW once the signaler is itself executing (modulo
> preemption causing us to wait longer), and we only do so for default and
> above priority requests (so that idle priority tasks never themselves
> hog the GPU waiting for others).
> 
> As might be reasonably expected, HW semaphores excel in inter-engine
> synchronisation microbenchmarks (where the reduced latency / increased
> throughput more than offset the power cost of spinning on a second ring)
> and show a significant improvement for single clients that utilize multiple
> engines (typically media players), without regressing multiple clients
> that can saturate the system.
> 
> v3: Drop the older NEQ branch, now we pin the signaler's HWSP anyway.
> v4: Tell the world and include it as part of scheduler caps.

I think I asked for a table of results showing power, and power/perf to 
be put in the commit message.

Regards,

Tvrtko

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.c           |   2 +-
>   drivers/gpu/drm/i915/i915_request.c       | 138 +++++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_request.h       |   1 +
>   drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
>   drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
>   drivers/gpu/drm/i915/intel_engine_cs.c    |   1 +
>   drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
>   drivers/gpu/drm/i915/intel_lrc.c          |   1 +
>   drivers/gpu/drm/i915/intel_ringbuffer.h   |   7 ++
>   include/uapi/drm/i915_drm.h               |   1 +
>   10 files changed, 158 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index c6354f6cdbdb..c08abdef5eb6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -351,7 +351,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
>   		value = min_t(int, INTEL_PPGTT(dev_priv), I915_GEM_PPGTT_FULL);
>   		break;
>   	case I915_PARAM_HAS_SEMAPHORES:
> -		value = 0;
> +		value = !!(dev_priv->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
>   		break;
>   	case I915_PARAM_HAS_SECURE_BATCHES:
>   		value = capable(CAP_SYS_ADMIN);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index d354967d6ae8..59e30b8c4ee9 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -22,8 +22,9 @@
>    *
>    */
>   
> -#include <linux/prefetch.h>
>   #include <linux/dma-fence-array.h>
> +#include <linux/irq_work.h>
> +#include <linux/prefetch.h>
>   #include <linux/sched.h>
>   #include <linux/sched/clock.h>
>   #include <linux/sched/signal.h>
> @@ -32,9 +33,16 @@
>   #include "i915_active.h"
>   #include "i915_reset.h"
>   
> +struct execute_cb {
> +	struct list_head link;
> +	struct irq_work work;
> +	struct i915_sw_fence *fence;
> +};
> +
>   static struct i915_global_request {
>   	struct kmem_cache *slab_requests;
>   	struct kmem_cache *slab_dependencies;
> +	struct kmem_cache *slab_execute_cbs;
>   } global;
>   
>   static const char *i915_fence_get_driver_name(struct dma_fence *fence)
> @@ -325,6 +333,69 @@ void i915_request_retire_upto(struct i915_request *rq)
>   	} while (tmp != rq);
>   }
>   
> +static void irq_execute_cb(struct irq_work *wrk)
> +{
> +	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
> +
> +	i915_sw_fence_complete(cb->fence);
> +	kmem_cache_free(global.slab_execute_cbs, cb);
> +}
> +
> +static void __notify_execute_cb(struct i915_request *rq)
> +{
> +	struct execute_cb *cb;
> +
> +	lockdep_assert_held(&rq->lock);
> +
> +	if (list_empty(&rq->execute_cb))
> +		return;
> +
> +	list_for_each_entry(cb, &rq->execute_cb, link)
> +		irq_work_queue(&cb->work);
> +
> +	/*
> +	 * XXX Rollback on __i915_request_unsubmit()
> +	 *
> +	 * In the future, perhaps when we have an active time-slicing scheduler,
> +	 * it will be interesting to unsubmit parallel execution and remove
> +	 * busywaits from the GPU until their master is restarted. This is
> +	 * quite hairy, we have to carefully rollback the fence and do a
> +	 * preempt-to-idle cycle on the target engine, all the while the
> +	 * master execute_cb may refire.
> +	 */
> +	INIT_LIST_HEAD(&rq->execute_cb);
> +}
> +
> +static int
> +i915_request_await_execution(struct i915_request *rq,
> +			     struct i915_request *signal,
> +			     gfp_t gfp)
> +{
> +	struct execute_cb *cb;
> +
> +	if (i915_request_is_active(signal))
> +		return 0;
> +
> +	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
> +	if (!cb)
> +		return -ENOMEM;
> +
> +	cb->fence = &rq->submit;
> +	i915_sw_fence_await(cb->fence);
> +	init_irq_work(&cb->work, irq_execute_cb);
> +
> +	spin_lock_irq(&signal->lock);
> +	if (i915_request_is_active(signal)) {
> +		i915_sw_fence_complete(cb->fence);
> +		kmem_cache_free(global.slab_execute_cbs, cb);
> +	} else {
> +		list_add_tail(&cb->link, &signal->execute_cb);
> +	}
> +	spin_unlock_irq(&signal->lock);
> +
> +	return 0;
> +}
> +
>   static void move_to_timeline(struct i915_request *request,
>   			     struct i915_timeline *timeline)
>   {
> @@ -361,6 +432,8 @@ void __i915_request_submit(struct i915_request *request)
>   	    !i915_request_enable_breadcrumb(request))
>   		intel_engine_queue_breadcrumbs(engine);
>   
> +	__notify_execute_cb(request);
> +
>   	spin_unlock(&request->lock);
>   
>   	engine->emit_fini_breadcrumb(request,
> @@ -608,6 +681,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	}
>   
>   	INIT_LIST_HEAD(&rq->active_list);
> +	INIT_LIST_HEAD(&rq->execute_cb);
>   
>   	tl = ce->ring->timeline;
>   	ret = i915_timeline_get_seqno(tl, rq, &seqno);
> @@ -696,6 +770,52 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	return ERR_PTR(ret);
>   }
>   
> +static int
> +emit_semaphore_wait(struct i915_request *to,
> +		    struct i915_request *from,
> +		    gfp_t gfp)
> +{
> +	u32 hwsp_offset;
> +	u32 *cs;
> +	int err;
> +
> +	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
> +	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
> +
> +	/* We need to pin the signaler's HWSP until we are finished reading. */
> +	err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
> +	if (err)
> +		return err;
> +
> +	/* Only submit our spinner after the signaler is running! */
> +	err = i915_request_await_execution(to, from, gfp);
> +	if (err)
> +		return err;
> +
> +	cs = intel_ring_begin(to, 4);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	/*
> +	 * Using greater-than-or-equal here means we have to worry
> +	 * about seqno wraparound. To side step that issue, we swap
> +	 * the timeline HWSP upon wrapping, so that everyone listening
> +	 * for the old (pre-wrap) values do not see the much smaller
> +	 * (post-wrap) values than they were expecting (and so wait
> +	 * forever).
> +	 */
> +	*cs++ = MI_SEMAPHORE_WAIT |
> +		MI_SEMAPHORE_GLOBAL_GTT |
> +		MI_SEMAPHORE_POLL |
> +		MI_SEMAPHORE_SAD_GTE_SDD;
> +	*cs++ = from->fence.seqno;
> +	*cs++ = hwsp_offset;
> +	*cs++ = 0;
> +
> +	intel_ring_advance(to, cs);
> +	return 0;
> +}
> +
>   static int
>   i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   {
> @@ -717,6 +837,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
>   						       &from->submit,
>   						       I915_FENCE_GFP);
> +	} else if (intel_engine_has_semaphores(to->engine) &&
> +		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
> +		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
>   	} else {
>   		ret = i915_sw_fence_await_dma_fence(&to->submit,
>   						    &from->fence, 0,
> @@ -1208,14 +1331,23 @@ int __init i915_global_request_init(void)
>   	if (!global.slab_requests)
>   		return -ENOMEM;
>   
> +	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
> +					     SLAB_HWCACHE_ALIGN |
> +					     SLAB_RECLAIM_ACCOUNT |
> +					     SLAB_TYPESAFE_BY_RCU);
> +	if (!global.slab_execute_cbs)
> +		goto err_requests;
> +
>   	global.slab_dependencies = KMEM_CACHE(i915_dependency,
>   					      SLAB_HWCACHE_ALIGN |
>   					      SLAB_RECLAIM_ACCOUNT);
>   	if (!global.slab_dependencies)
> -		goto err_requests;
> +		goto err_execute_cbs;
>   
>   	return 0;
>   
> +err_execute_cbs:
> +	kmem_cache_destroy(global.slab_execute_cbs);
>   err_requests:
>   	kmem_cache_destroy(global.slab_requests);
>   	return -ENOMEM;
> @@ -1224,11 +1356,13 @@ int __init i915_global_request_init(void)
>   void i915_global_request_shrink(void)
>   {
>   	kmem_cache_shrink(global.slab_dependencies);
> +	kmem_cache_shrink(global.slab_execute_cbs);
>   	kmem_cache_shrink(global.slab_requests);
>   }
>   
>   void i915_global_request_exit(void)
>   {
>   	kmem_cache_destroy(global.slab_dependencies);
> +	kmem_cache_destroy(global.slab_execute_cbs);
>   	kmem_cache_destroy(global.slab_requests);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index ea1e6f0ade53..3e0d62d35226 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -129,6 +129,7 @@ struct i915_request {
>   	 */
>   	struct i915_sw_fence submit;
>   	wait_queue_entry_t submitq;
> +	struct list_head execute_cb;
>   
>   	/*
>   	 * A list of everyone we wait upon, and everyone who waits upon us.
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 7c58b049ecb5..8d1400d378d7 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
>   	__i915_sw_fence_notify(fence, FENCE_FREE);
>   }
>   
> -static void i915_sw_fence_complete(struct i915_sw_fence *fence)
> +void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   
> @@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   	__i915_sw_fence_complete(fence, NULL);
>   }
>   
> -static void i915_sw_fence_await(struct i915_sw_fence *fence)
> +void i915_sw_fence_await(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 0e055ea0179f..6dec9e1d1102 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    unsigned long timeout,
>   				    gfp_t gfp);
>   
> +void i915_sw_fence_await(struct i915_sw_fence *fence);
> +void i915_sw_fence_complete(struct i915_sw_fence *fence);
> +
>   static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
>   {
>   	return atomic_read(&fence->pending) <= 0;
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index ef49b1b0537b..7e0fe433ab17 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -616,6 +616,7 @@ void intel_engines_set_scheduler_caps(struct drm_i915_private *i915)
>   	} map[] = {
>   #define MAP(x, y) { ilog2(I915_ENGINE_HAS_##x), ilog2(I915_SCHEDULER_CAP_##y) }
>   		MAP(PREEMPTION, PREEMPTION),
> +		MAP(SEMAPHORES, SEMAPHORES),
>   #undef MAP
>   	};
>   	struct intel_engine_cs *engine;
> diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
> index b96a31bc1080..0efaadd3bc32 100644
> --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> @@ -106,7 +106,12 @@
>   #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
>   #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
>   #define   MI_SEMAPHORE_POLL		(1<<15)
> +#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
>   #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
> +#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
> +#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
> +#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
> +#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
>   #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
>   #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
>   #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index f57cfe2fc078..53d6f7fdb50e 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2324,6 +2324,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
>   	engine->park = NULL;
>   	engine->unpark = NULL;
>   
> +	engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
>   	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>   	if (engine->i915->preempt_context)
>   		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index b8ec7e40a59b..2e7264119ec4 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -499,6 +499,7 @@ struct intel_engine_cs {
>   #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
>   #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
>   #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> +#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
>   	unsigned int flags;
>   
>   	/*
> @@ -576,6 +577,12 @@ intel_engine_has_preemption(const struct intel_engine_cs *engine)
>   	return engine->flags & I915_ENGINE_HAS_PREEMPTION;
>   }
>   
> +static inline bool
> +intel_engine_has_semaphores(const struct intel_engine_cs *engine)
> +{
> +	return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
> +}
> +
>   void intel_engines_set_scheduler_caps(struct drm_i915_private *i915);
>   
>   static inline bool __execlists_need_preempt(int prio, int last)
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 8304a7f1ec3f..b10eea3f6d24 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -479,6 +479,7 @@ typedef struct drm_i915_irq_wait {
>   #define   I915_SCHEDULER_CAP_ENABLED	(1ul << 0)
>   #define   I915_SCHEDULER_CAP_PRIORITY	(1ul << 1)
>   #define   I915_SCHEDULER_CAP_PREEMPTION	(1ul << 2)
> +#define   I915_SCHEDULER_CAP_SEMAPHORES	(1ul << 3)
>   
>   #define I915_PARAM_HUC_STATUS		 42
>   
> 


More information about the Intel-gfx mailing list