[Intel-gfx] [PATCH 15/24] drm/i915: Make sure execbuffer always passes ww state to i915_vma_pin.
Thomas Hellström (Intel)
thomas_os at shipmail.org
Wed Aug 12 19:32:41 UTC 2020
On 8/10/20 12:30 PM, Maarten Lankhorst wrote:
> As a preparation step for full object locking and wait/wound handling
> during pin and object mapping, ensure that we always pass the ww context
> in i915_gem_execbuffer.c to i915_vma_pin, use lockdep to ensure this
> happens.
>
> This also requires changing the order of eb_parse slightly, to ensure
> we pass ww at a point where we could still handle -EDEADLK safely.
>
> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
I'm a bit curious as to how we handle the lifetime of the contending
locks, since we often return through the call tree before doing the ww
transaction relaxation (the slow lock). Has that been a problem?
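
For reference, my reading of the retry pattern this series relies on, as
a sketch pieced together from the callers quoted below
(context_barrier_task() and igt_gpu_reloc()), using the ww helpers
introduced earlier in the series:

	struct i915_gem_ww_ctx ww;
	int err;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = intel_context_pin_ww(ce, &ww);
	/* ... more ww-aware pinning that may return -EDEADLK ... */
	if (err == -EDEADLK) {
		/*
		 * Backoff drops all locks still held on &ww's list and
		 * then slow-locks the contended object before we retry
		 * the whole transaction from the top.
		 */
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

What I'm curious about is what keeps the contended object alive while we
return up the call tree to the point where the backoff slow-lock runs.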
> ---
> drivers/gpu/drm/i915/display/intel_display.c | 2 +-
> drivers/gpu/drm/i915/gem/i915_gem_context.c | 4 +-
> .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 140 ++++++++++--------
> .../i915/gem/selftests/i915_gem_execbuffer.c | 4 +-
> drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 4 +-
> drivers/gpu/drm/i915/gt/gen6_ppgtt.h | 4 +-
> drivers/gpu/drm/i915/gt/intel_context.c | 65 +++++---
> drivers/gpu/drm/i915/gt/intel_context.h | 13 ++
> drivers/gpu/drm/i915/gt/intel_context_types.h | 3 +-
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +-
> drivers/gpu/drm/i915/gt/intel_gt.c | 2 +-
> drivers/gpu/drm/i915/gt/intel_lrc.c | 5 +-
> drivers/gpu/drm/i915/gt/intel_renderstate.c | 2 +-
> drivers/gpu/drm/i915/gt/intel_ring.c | 10 +-
> drivers/gpu/drm/i915/gt/intel_ring.h | 3 +-
> .../gpu/drm/i915/gt/intel_ring_submission.c | 15 +-
> drivers/gpu/drm/i915/gt/intel_timeline.c | 12 +-
> drivers/gpu/drm/i915/gt/intel_timeline.h | 3 +-
> drivers/gpu/drm/i915/gt/mock_engine.c | 3 +-
> drivers/gpu/drm/i915/gt/selftest_lrc.c | 2 +-
> drivers/gpu/drm/i915/gt/selftest_timeline.c | 4 +-
> drivers/gpu/drm/i915/gt/uc/intel_guc.c | 2 +-
> drivers/gpu/drm/i915/i915_drv.h | 13 +-
> drivers/gpu/drm/i915/i915_gem.c | 11 +-
> drivers/gpu/drm/i915/i915_vma.c | 13 +-
> drivers/gpu/drm/i915/i915_vma.h | 13 +-
> 26 files changed, 217 insertions(+), 137 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c
> index 5b4434289117..aa5a88340d10 100644
> --- a/drivers/gpu/drm/i915/display/intel_display.c
> +++ b/drivers/gpu/drm/i915/display/intel_display.c
> @@ -3451,7 +3451,7 @@ initial_plane_vma(struct drm_i915_private *i915,
> if (IS_ERR(vma))
> goto err_obj;
>
> - if (i915_ggtt_pin(vma, 0, PIN_MAPPABLE | PIN_OFFSET_FIXED | base))
> + if (i915_ggtt_pin(vma, NULL, 0, PIN_MAPPABLE | PIN_OFFSET_FIXED | base))
> goto err_obj;
>
> if (i915_gem_object_is_tiled(obj) &&
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 34c8b0dd85e0..cf5ecbde9e06 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -1154,7 +1154,7 @@ static int context_barrier_task(struct i915_gem_context *ctx,
>
> i915_gem_ww_ctx_init(&ww, true);
> retry:
> - err = intel_context_pin(ce);
> + err = intel_context_pin_ww(ce, &ww);
> if (err)
> goto err;
>
> @@ -1247,7 +1247,7 @@ static int pin_ppgtt_update(struct intel_context *ce, struct i915_gem_ww_ctx *ww
>
> if (!HAS_LOGICAL_RING_CONTEXTS(vm->i915))
> /* ppGTT is not part of the legacy context image */
> - return gen6_ppgtt_pin(i915_vm_to_ppgtt(vm));
> + return gen6_ppgtt_pin(i915_vm_to_ppgtt(vm), ww);
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 604e26adea23..94bfdc54f035 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -437,16 +437,17 @@ eb_pin_vma(struct i915_execbuffer *eb,
> pin_flags |= PIN_GLOBAL;
>
> /* Attempt to reuse the current location if available */
> - if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) {
> + /* TODO: Add -EDEADLK handling here */
> + if (unlikely(i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags))) {
> if (entry->flags & EXEC_OBJECT_PINNED)
> return false;
>
> /* Failing that pick any _free_ space if suitable */
> - if (unlikely(i915_vma_pin(vma,
> - entry->pad_to_size,
> - entry->alignment,
> - eb_pin_flags(entry, ev->flags) |
> - PIN_USER | PIN_NOEVICT)))
> + if (unlikely(i915_vma_pin_ww(vma, &eb->ww,
> + entry->pad_to_size,
> + entry->alignment,
> + eb_pin_flags(entry, ev->flags) |
> + PIN_USER | PIN_NOEVICT)))
> return false;
> }
>
> @@ -587,7 +588,7 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
> obj->cache_level != I915_CACHE_NONE);
> }
>
> -static int eb_reserve_vma(const struct i915_execbuffer *eb,
> +static int eb_reserve_vma(struct i915_execbuffer *eb,
> struct eb_vma *ev,
> u64 pin_flags)
> {
> @@ -602,7 +603,7 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
> return err;
> }
>
> - err = i915_vma_pin(vma,
> + err = i915_vma_pin_ww(vma, &eb->ww,
> entry->pad_to_size, entry->alignment,
> eb_pin_flags(entry, ev->flags) | pin_flags);
> if (err)
> @@ -1133,9 +1134,10 @@ static void *reloc_kmap(struct drm_i915_gem_object *obj,
> }
>
> static void *reloc_iomap(struct drm_i915_gem_object *obj,
> - struct reloc_cache *cache,
> + struct i915_execbuffer *eb,
> unsigned long page)
> {
> + struct reloc_cache *cache = &eb->reloc_cache;
> struct i915_ggtt *ggtt = cache_to_ggtt(cache);
> unsigned long offset;
> void *vaddr;
> @@ -1157,10 +1159,13 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
> if (err)
> return ERR_PTR(err);
>
> - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
> - PIN_MAPPABLE |
> - PIN_NONBLOCK /* NOWARN */ |
> - PIN_NOEVICT);
> + vma = i915_gem_object_ggtt_pin_ww(obj, &eb->ww, NULL, 0, 0,
> + PIN_MAPPABLE |
> + PIN_NONBLOCK /* NOWARN */ |
> + PIN_NOEVICT);
> + if (vma == ERR_PTR(-EDEADLK))
> + return vma;
> +
> if (IS_ERR(vma)) {
> memset(&cache->node, 0, sizeof(cache->node));
> mutex_lock(&ggtt->vm.mutex);
> @@ -1196,9 +1201,10 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
> }
>
> static void *reloc_vaddr(struct drm_i915_gem_object *obj,
> - struct reloc_cache *cache,
> + struct i915_execbuffer *eb,
> unsigned long page)
> {
> + struct reloc_cache *cache = &eb->reloc_cache;
> void *vaddr;
>
> if (cache->page == page) {
> @@ -1206,7 +1212,7 @@ static void *reloc_vaddr(struct drm_i915_gem_object *obj,
> } else {
> vaddr = NULL;
> if ((cache->vaddr & KMAP) == 0)
> - vaddr = reloc_iomap(obj, cache, page);
> + vaddr = reloc_iomap(obj, eb, page);
> if (!vaddr)
> vaddr = reloc_kmap(obj, cache, page);
> }
> @@ -1293,7 +1299,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
> goto err_unmap;
> }
>
> - err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
> + err = i915_vma_pin_ww(batch, &eb->ww, 0, 0, PIN_USER | PIN_NONBLOCK);
> if (err)
> goto err_unmap;
>
> @@ -1314,7 +1320,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
> eb->reloc_context = ce;
> }
>
> - err = intel_context_pin(ce);
> + err = intel_context_pin_ww(ce, &eb->ww);
> if (err)
> goto err_unpin;
>
> @@ -1537,8 +1543,7 @@ relocate_entry(struct i915_vma *vma,
> void *vaddr;
>
> repeat:
> - vaddr = reloc_vaddr(vma->obj,
> - &eb->reloc_cache,
> + vaddr = reloc_vaddr(vma->obj, eb,
> offset >> PAGE_SHIFT);
> if (IS_ERR(vaddr))
> return PTR_ERR(vaddr);
> @@ -1954,6 +1959,7 @@ static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb,
> rq = eb_pin_engine(eb, false);
> if (IS_ERR(rq)) {
> err = PTR_ERR(rq);
> + rq = NULL;
> goto err;
> }
>
> @@ -2238,7 +2244,8 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq)
> }
>
> static struct i915_vma *
> -shadow_batch_pin(struct drm_i915_gem_object *obj,
> +shadow_batch_pin(struct i915_execbuffer *eb,
> + struct drm_i915_gem_object *obj,
> struct i915_address_space *vm,
> unsigned int flags)
> {
> @@ -2249,7 +2256,7 @@ shadow_batch_pin(struct drm_i915_gem_object *obj,
> if (IS_ERR(vma))
> return vma;
>
> - err = i915_vma_pin(vma, 0, 0, flags);
> + err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, flags);
> if (err)
> return ERR_PTR(err);
>
> @@ -2403,16 +2410,33 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
> return err;
> }
>
> +static struct i915_vma *eb_dispatch_secure(struct i915_execbuffer *eb, struct i915_vma *vma)
> +{
> + /*
> + * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
> + * batch" bit. Hence we need to pin secure batches into the global gtt.
> + * hsw should have this fixed, but bdw mucks it up again. */
> + if (eb->batch_flags & I915_DISPATCH_SECURE)
> + return i915_gem_object_ggtt_pin_ww(vma->obj, &eb->ww, NULL, 0, 0, 0);
> +
> + return NULL;
> +}
> +
> static int eb_parse(struct i915_execbuffer *eb)
> {
> struct drm_i915_private *i915 = eb->i915;
> struct intel_gt_buffer_pool_node *pool = eb->batch_pool;
> - struct i915_vma *shadow, *trampoline;
> + struct i915_vma *shadow, *trampoline, *batch;
> unsigned int len;
> int err;
>
> - if (!eb_use_cmdparser(eb))
> - return 0;
> + if (!eb_use_cmdparser(eb)) {
> + batch = eb_dispatch_secure(eb, eb->batch->vma);
> + if (IS_ERR(batch))
> + return PTR_ERR(batch);
> +
> + goto secure_batch;
> + }
>
> len = eb->batch_len;
> if (!CMDPARSER_USES_GGTT(eb->i915)) {
> @@ -2440,7 +2464,7 @@ static int eb_parse(struct i915_execbuffer *eb)
> if (err)
> goto err;
>
> - shadow = shadow_batch_pin(pool->obj, eb->context->vm, PIN_USER);
> + shadow = shadow_batch_pin(eb, pool->obj, eb->context->vm, PIN_USER);
> if (IS_ERR(shadow)) {
> err = PTR_ERR(shadow);
> goto err;
> @@ -2452,7 +2476,7 @@ static int eb_parse(struct i915_execbuffer *eb)
> if (CMDPARSER_USES_GGTT(eb->i915)) {
> trampoline = shadow;
>
> - shadow = shadow_batch_pin(pool->obj,
> + shadow = shadow_batch_pin(eb, pool->obj,
> &eb->engine->gt->ggtt->vm,
> PIN_GLOBAL);
> if (IS_ERR(shadow)) {
> @@ -2465,19 +2489,34 @@ static int eb_parse(struct i915_execbuffer *eb)
> eb->batch_flags |= I915_DISPATCH_SECURE;
> }
>
> + batch = eb_dispatch_secure(eb, shadow);
> + if (IS_ERR(batch)) {
> + err = PTR_ERR(batch);
> + goto err_trampoline;
> + }
> +
> err = eb_parse_pipeline(eb, shadow, trampoline);
> if (err)
> - goto err_trampoline;
> + goto err_unpin_batch;
>
> - eb->vma[eb->buffer_count].vma = i915_vma_get(shadow);
> - eb->vma[eb->buffer_count].flags = __EXEC_OBJECT_HAS_PIN;
> eb->batch = &eb->vma[eb->buffer_count++];
> + eb->batch->vma = i915_vma_get(shadow);
> + eb->batch->flags = __EXEC_OBJECT_HAS_PIN;
>
> eb->trampoline = trampoline;
> eb->batch_start_offset = 0;
>
> +secure_batch:
> + if (batch) {
> + eb->batch = &eb->vma[eb->buffer_count++];
> + eb->batch->flags = __EXEC_OBJECT_HAS_PIN;
> + eb->batch->vma = i915_vma_get(batch);
> + }
> return 0;
>
> +err_unpin_batch:
> + if (batch)
> + i915_vma_unpin(batch);
> err_trampoline:
> if (trampoline)
> i915_vma_unpin(trampoline);
> @@ -2619,7 +2658,7 @@ static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throt
> * GGTT space, so do this first before we reserve a seqno for
> * ourselves.
> */
> - err = intel_context_pin(ce);
> + err = intel_context_pin_ww(ce, &eb->ww);
> if (err)
> return ERR_PTR(err);
>
> @@ -3237,33 +3276,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>
> ww_acquire_done(&eb.ww.ctx);
>
> - /*
> - * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
> - * batch" bit. Hence we need to pin secure batches into the global gtt.
> - * hsw should have this fixed, but bdw mucks it up again. */
> - if (eb.batch_flags & I915_DISPATCH_SECURE) {
> - struct i915_vma *vma;
> -
> - /*
> - * So on first glance it looks freaky that we pin the batch here
> - * outside of the reservation loop. But:
> - * - The batch is already pinned into the relevant ppgtt, so we
> - * already have the backing storage fully allocated.
> - * - No other BO uses the global gtt (well contexts, but meh),
> - * so we don't really have issues with multiple objects not
> - * fitting due to fragmentation.
> - * So this is actually safe.
> - */
> - vma = i915_gem_object_ggtt_pin(eb.batch->vma->obj, NULL, 0, 0, 0);
> - if (IS_ERR(vma)) {
> - err = PTR_ERR(vma);
> - goto err_vma;
> - }
> -
> - batch = vma;
> - } else {
> - batch = eb.batch->vma;
> - }
> + batch = eb.batch->vma;
>
> /* All GPU relocation batches must be submitted prior to the user rq */
> GEM_BUG_ON(eb.reloc_cache.rq);
> @@ -3272,7 +3285,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> eb.request = i915_request_create(eb.context);
> if (IS_ERR(eb.request)) {
> err = PTR_ERR(eb.request);
> - goto err_batch_unpin;
> + goto err_vma;
> }
>
> if (in_fence) {
> @@ -3333,9 +3346,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
> }
> i915_request_put(eb.request);
>
> -err_batch_unpin:
> - if (eb.batch_flags & I915_DISPATCH_SECURE)
> - i915_vma_unpin(batch);
> err_vma:
> eb_release_vmas(&eb, true);
> if (eb.trampoline)
> @@ -3423,7 +3433,9 @@ i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data,
> /* Copy in the exec list from userland */
> exec_list = kvmalloc_array(count, sizeof(*exec_list),
> __GFP_NOWARN | GFP_KERNEL);
> - exec2_list = kvmalloc_array(count + 1, eb_element_size(),
> +
> + /* Allocate extra slots for use by the command parser */
> + exec2_list = kvmalloc_array(count + 2, eb_element_size(),
> __GFP_NOWARN | GFP_KERNEL);
> if (exec_list == NULL || exec2_list == NULL) {
> drm_dbg(&i915->drm,
> @@ -3500,8 +3512,8 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data,
> if (err)
> return err;
>
> - /* Allocate an extra slot for use by the command parser */
> - exec2_list = kvmalloc_array(count + 1, eb_element_size(),
> + /* Allocate extra slots for use by the command parser */
> + exec2_list = kvmalloc_array(count + 2, eb_element_size(),
> __GFP_NOWARN | GFP_KERNEL);
> if (exec2_list == NULL) {
> drm_dbg(&i915->drm, "Failed to allocate exec list for %zd buffers\n",
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> index 563839cbaf1c..e1d50a5a1477 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> @@ -36,7 +36,7 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> if (err)
> return err;
>
> - err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
> + err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, PIN_USER | PIN_HIGH);
> if (err)
> return err;
>
> @@ -139,7 +139,7 @@ static int igt_gpu_reloc(void *arg)
>
> i915_gem_ww_ctx_init(&eb.ww, false);
> retry:
> - err = intel_context_pin(eb.context);
> + err = intel_context_pin_ww(eb.context, &eb.ww);
> if (!err) {
> err = __igt_gpu_reloc(&eb, scratch);
>
> diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
> index 7e5a86b774a7..fd0d24d28763 100644
> --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
> +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c
> @@ -368,7 +368,7 @@ static struct i915_vma *pd_vma_create(struct gen6_ppgtt *ppgtt, int size)
> return vma;
> }
>
> -int gen6_ppgtt_pin(struct i915_ppgtt *base)
> +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww)
> {
> struct gen6_ppgtt *ppgtt = to_gen6_ppgtt(base);
> int err;
> @@ -394,7 +394,7 @@ int gen6_ppgtt_pin(struct i915_ppgtt *base)
> */
> err = 0;
> if (!atomic_read(&ppgtt->pin_count))
> - err = i915_ggtt_pin(ppgtt->vma, GEN6_PD_ALIGN, PIN_HIGH);
> + err = i915_ggtt_pin(ppgtt->vma, ww, GEN6_PD_ALIGN, PIN_HIGH);
> if (!err)
> atomic_inc(&ppgtt->pin_count);
> mutex_unlock(&ppgtt->pin_mutex);
> diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.h b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h
> index 7249672e5802..3357228f3304 100644
> --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.h
> +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.h
> @@ -8,6 +8,8 @@
>
> #include "intel_gtt.h"
>
> +struct i915_gem_ww_ctx;
> +
> struct gen6_ppgtt {
> struct i915_ppgtt base;
>
> @@ -67,7 +69,7 @@ static inline struct gen6_ppgtt *to_gen6_ppgtt(struct i915_ppgtt *base)
> (pt = i915_pt_entry(pd, iter), true); \
> ++iter)
>
> -int gen6_ppgtt_pin(struct i915_ppgtt *base);
> +int gen6_ppgtt_pin(struct i915_ppgtt *base, struct i915_gem_ww_ctx *ww);
> void gen6_ppgtt_unpin(struct i915_ppgtt *base);
> void gen6_ppgtt_unpin_all(struct i915_ppgtt *base);
> void gen6_ppgtt_enable(struct intel_gt *gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
> index efe9a7a89ede..c05ef213bdc2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -93,12 +93,12 @@ static void intel_context_active_release(struct intel_context *ce)
> i915_active_release(&ce->active);
> }
>
> -static int __context_pin_state(struct i915_vma *vma)
> +static int __context_pin_state(struct i915_vma *vma, struct i915_gem_ww_ctx *ww)
> {
> unsigned int bias = i915_ggtt_pin_bias(vma) | PIN_OFFSET_BIAS;
> int err;
>
> - err = i915_ggtt_pin(vma, 0, bias | PIN_HIGH);
> + err = i915_ggtt_pin(vma, ww, 0, bias | PIN_HIGH);
> if (err)
> return err;
>
> @@ -127,11 +127,12 @@ static void __context_unpin_state(struct i915_vma *vma)
> __i915_vma_unpin(vma);
> }
>
> -static int __ring_active(struct intel_ring *ring)
> +static int __ring_active(struct intel_ring *ring,
> + struct i915_gem_ww_ctx *ww)
> {
> int err;
>
> - err = intel_ring_pin(ring);
> + err = intel_ring_pin(ring, ww);
> if (err)
> return err;
>
> @@ -152,24 +153,25 @@ static void __ring_retire(struct intel_ring *ring)
> intel_ring_unpin(ring);
> }
>
> -static int intel_context_pre_pin(struct intel_context *ce)
> +static int intel_context_pre_pin(struct intel_context *ce,
> + struct i915_gem_ww_ctx *ww)
> {
> int err;
>
> CE_TRACE(ce, "active\n");
>
> - err = __ring_active(ce->ring);
> + err = __ring_active(ce->ring, ww);
> if (err)
> return err;
>
> - err = intel_timeline_pin(ce->timeline);
> + err = intel_timeline_pin(ce->timeline, ww);
> if (err)
> goto err_ring;
>
> if (!ce->state)
> return 0;
>
> - err = __context_pin_state(ce->state);
> + err = __context_pin_state(ce->state, ww);
> if (err)
> goto err_timeline;
>
> @@ -192,7 +194,8 @@ static void intel_context_post_unpin(struct intel_context *ce)
> __ring_retire(ce->ring);
> }
>
> -int __intel_context_do_pin(struct intel_context *ce)
> +int __intel_context_do_pin_ww(struct intel_context *ce,
> + struct i915_gem_ww_ctx *ww)
> {
> bool handoff = false;
> void *vaddr;
> @@ -209,7 +212,14 @@ int __intel_context_do_pin(struct intel_context *ce)
> * refcount for __intel_context_active(), which prevent a lock
> * inversion of ce->pin_mutex vs dma_resv_lock().
> */
> - err = intel_context_pre_pin(ce);
> +
> + err = i915_gem_object_lock(ce->timeline->hwsp_ggtt->obj, ww);
Since hwsp_ggtt->obj is a gem object shared through sub-allocation,
holding this lock across execbuf needlessly stalls submission from other
clients that share the same suballocation slab. Because it's pinned with
a pin-count rather than a dma-fence, it should be completely safe to
drop this lock before returning zero from this function. However, if we
later move to protecting residency with the request dma-fence, we can no
longer drop the lock here, since we don't have that dma-fence yet.

An alternative brought up by Daniel would be to revert the commit that
introduces the hwsp cacheline suballocation.
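
Roughly what I have in mind, as a sketch only; i915_gem_ww_unlock_single()
is a hypothetical early-unlock helper, not something this patch provides:

	err = i915_gem_object_lock(ce->timeline->hwsp_ggtt->obj, ww);
	...
	/*
	 * Hypothetical: once pinning has succeeded, the pin-count keeps
	 * the hwsp backing storage resident, so the shared lock could be
	 * released before returning. Such a helper would also need to
	 * take the object off the ww ctx list so i915_gem_ww_ctx_fini()
	 * doesn't unlock it a second time.
	 */
	if (!err)
		i915_gem_ww_unlock_single(ce->timeline->hwsp_ggtt->obj);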
> + if (!err && ce->ring->vma->obj)
> + err = i915_gem_object_lock(ce->ring->vma->obj, ww);
> + if (!err && ce->state)
> + err = i915_gem_object_lock(ce->state->obj, ww);
Could these three locks be made interruptible?
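I.e. something like the below sketch, assuming the _interruptible
variant takes the same ww argument as i915_gem_object_lock() does after
this series:

	err = i915_gem_object_lock_interruptible(ce->timeline->hwsp_ggtt->obj, ww);
	if (!err && ce->ring->vma->obj)
		err = i915_gem_object_lock_interruptible(ce->ring->vma->obj, ww);
	if (!err && ce->state)
		err = i915_gem_object_lock_interruptible(ce->state->obj, ww);

That would at least let a client stuck waiting on a contended object be
killed by a signal.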
> +int i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww,
> + u32 align, unsigned int flags);
>
> static inline int i915_vma_pin_count(const struct i915_vma *vma)
> {
/Thomas