[Intel-gfx] [PATCH 08/13] drm/i915: Eliminate lots of iterations over the execobjects array
Joonas Lahtinen
joonas.lahtinen at linux.intel.com
Tue Apr 4 14:57:34 UTC 2017
On Wed, 2017-03-29 at 16:56 +0100, Chris Wilson wrote:
> The major scaling bottleneck in execbuffer is the processing of the
> execobjects. Creating an auxiliary list is inefficient when compared to
> using the execobject array we already have allocated.
>
> Reservation is then split into phases. As we look up the VMA, we
> try to bind it back into its active location. Only if that fails do we add
> it to the unbound list for phase 2. In phase 2, we try to add all those
> objects that could not fit into their previous location, with fallback
> to retrying all objects and evicting the VM in case of severe
> fragmentation. (This is the same as before, except that phase 1 is now
> done inline with looking up the VMA to avoid an iteration over the
> execobject array. In the ideal case, we eliminate the separate reservation
> phase). During the reservation phase, we only evict from the VM between
> passes (rather than, as currently, every time we try to fit a new VMA).
> In testing with Unreal Engine's Atlantis demo, which stresses the
> eviction logic on gen7 class hardware, this speeds up the framerate by
> a factor of 2.
>
> The second loop amalgamation is between move_to_gpu and move_to_active.
> As we always submit the request, even if incomplete, we can use the
> current request to track active VMA as we perform the flushes and
> synchronisation required.
>
> The next big advancement is to avoid copying back to the user any
> execobjects and relocations that are unchanged.
>
> v2: Add a Theory of Operation spiel.
> v3: Fall back to slow relocations in preparation for flushing userptrs.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
<SNIP>
> struct i915_execbuffer {
> struct drm_i915_private *i915;
> struct drm_file *file;
> @@ -63,19 +180,24 @@ struct i915_execbuffer {
> struct i915_address_space *vm;
> struct i915_vma *batch;
> struct drm_i915_gem_request *request;
> - u32 batch_start_offset;
> - u32 batch_len;
> - unsigned int dispatch_flags;
> - struct drm_i915_gem_exec_object2 shadow_exec_entry;
> - bool need_relocs;
> - struct list_head vmas;
> + unsigned int buffer_count;
> + struct list_head unbound;
> + struct list_head relocs;
> struct reloc_cache {
> struct drm_mm_node node;
> unsigned long vaddr;
> unsigned int page;
> bool use_64bit_reloc : 1;
> + bool has_llc : 1;
> + bool has_fence : 1;
> + bool needs_unfenced : 1;
> } reloc_cache;
> - int lut_mask;
> + u64 invalid_flags;
> + u32 context_flags;
> + u32 dispatch_flags;
> + u32 batch_start_offset;
> + u32 batch_len;
> + int lut_size;
> struct hlist_head *buckets;
Please document (new) members.
> +static inline u64 gen8_noncanonical_addr(u64 address)
> +{
> + return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
GENMASK_ULL
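Something like this should work, I'd think (untested):

	return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0);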
> @@ -106,71 +256,302 @@ eb_create(struct i915_execbuffer *eb)
> return -ENOMEM;
> }
>
> - eb->lut_mask = size;
> + eb->lut_size = size;
> } else {
> - eb->lut_mask = -eb->args->buffer_count;
> + eb->lut_size = -eb->buffer_count;
Document the negative meaning in the doc mentioned above.
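My understanding of the semantics, as a starting point (do correct me
if I got it wrong):

	/*
	 * lut_size >= 0: handle lookup uses a hashtable with
	 *		  1 << lut_size buckets.
	 * lut_size < 0: I915_EXEC_HANDLE_LUT is in use, handles are
	 *		 direct indices into the exec array and
	 *		 -lut_size is its length.
	 */
	int lut_size;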
> +static bool
> +eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
> + const struct i915_vma *vma)
> +{
> + if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)
Let's try to stick to one convention; we gave up == NULL too, so
!(a & FOO).
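I.e.:

	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))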
<SNIP>
> + if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
> + (vma->node.start + vma->node.size - 1) >> 32)
upper_32_bits for clarity?
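Combined with the convention above, that'd be:

	if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
	    upper_32_bits(vma->node.start + vma->node.size - 1))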
> +static void
> +eb_pin_vma(struct i915_execbuffer *eb,
> + struct drm_i915_gem_exec_object2 *entry,
> + struct i915_vma *vma)
> +{
> + u64 flags;
> +
> + flags = vma->node.start;
I'd be more comfortable if some mask was applied here.
Or at least GEM_BUG_ON(flags & BAD_FLAGS);
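Assuming PIN_OFFSET_MASK covers exactly the non-flag bits, something
like (untested):

	flags = vma->node.start;
	GEM_BUG_ON(flags & ~PIN_OFFSET_MASK);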
> +static inline void
> +eb_unreserve_vma(struct i915_vma *vma,
> + struct drm_i915_gem_exec_object2 *entry)
> {
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> - __eb_unreserve_vma(vma, entry);
> - entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
> + if (entry->flags & __EXEC_OBJECT_HAS_PIN) {
I'd treat the if more as an early return, but I guess you have more
code coming.
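For reference, the early return variant would be roughly:

	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
		return;

	__eb_unreserve_vma(vma, entry);
	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);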
> +static int
> +eb_add_vma(struct i915_execbuffer *eb,
> + struct drm_i915_gem_exec_object2 *entry,
> + struct i915_vma *vma)
> {
> - struct i915_vma *vma;
> + int ret;
>
> - list_for_each_entry(vma, &eb->vmas, exec_link) {
> - eb_unreserve_vma(vma);
> - i915_vma_put(vma);
> - vma->exec_entry = NULL;
> + GEM_BUG_ON(i915_vma_is_closed(vma));
> +
> + if ((eb->args->flags & __EXEC_VALIDATED) == 0) {
smells like a function here.
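E.g. an eb_validate_vma() helper (name made up by me) for the
one-time checks, leaving just this at the call site:

	if (!(eb->args->flags & __EXEC_VALIDATED)) {
		ret = eb_validate_vma(eb, entry);
		if (ret)
			return ret;
	}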
<SNIP>
> }
>
> - if (eb->lut_mask >= 0)
> - memset(eb->buckets, 0,
> - sizeof(struct hlist_head) << eb->lut_mask);
> + vma->exec_entry = entry;
> + entry->rsvd2 = (uintptr_t)vma;
Umm, there was a helper introduced in some patch.
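__exec_to_vma(), so this could read:

	vma->exec_entry = entry;
	__exec_to_vma(entry) = (uintptr_t)vma;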
<SNIP>
Could you add a comment as to why this is done?
> + if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
> + entry->flags |= eb->context_flags;
> +
<SNIP>
> +
> +static int
> +eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
> +{
<SNIP>
> - i915_vma_get(vma);
> - __exec_to_vma(&eb->exec[i]) = (uintptr_t)vma;
> - return true;
> + if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
> + ret = i915_vma_get_fence(vma);
> + if (ret)
> + return ret;
> +
> + if (i915_vma_pin_fence(vma))
> + entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> + }
Smells like duplicate code with eb_pin_vma.
<SNIP>
> static int
> eb_lookup_vmas(struct i915_execbuffer *eb)
> {
<SNIP>
> - if (ht_needs_resize(eb->ctx)) {
> - eb->ctx->vma_lut.ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
> - queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
> - }
> + if (eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
> + struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;
You could hoist the lut variable, it's used quite a few times.
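I.e. declare

	struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;

once at the top and use lut-> throughout.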
> @@ -616,16 +1048,15 @@ relocate_entry(struct drm_i915_gem_object *obj,
> goto repeat;
> }
>
> - return 0;
> + return gen8_canonical_addr(target->node.start) | 1;
Magic bit.
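A name would help, e.g. (invented by me):

	#define RELOC_UPDATED BIT(0)

	return gen8_canonical_addr(target->node.start) | RELOC_UPDATED;

with the matching ~RELOC_UPDATED at the __put_user() below.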
> +static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
> {
> #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
> - struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
> - struct drm_i915_gem_relocation_entry __user *user_relocs;
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> - int remain, ret = 0;
> -
> - user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> + struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
> + struct drm_i915_gem_relocation_entry __user *urelocs;
> + const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> + unsigned int remain;
>
> + urelocs = u64_to_user_ptr(entry->relocs_ptr);
> remain = entry->relocation_count;
> - while (remain) {
> - struct drm_i915_gem_relocation_entry *r = stack_reloc;
> - unsigned long unwritten;
> - unsigned int count;
> + if (unlikely(remain > ULONG_MAX / sizeof(*urelocs)))
How about N_RELOC(ULONG_MAX)?
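I.e.:

	if (unlikely(remain > N_RELOC(ULONG_MAX)))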
> @@ -732,66 +1164,66 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
> * this is bad and so lockdep complains vehemently.
> */
> pagefault_disable();
> - unwritten = __copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]));
> - pagefault_enable();
> - if (unlikely(unwritten)) {
> - ret = -EFAULT;
> + if (__copy_from_user_inatomic(r, urelocs, count*sizeof(r[0]))) {
> + pagefault_enable();
> + remain = -EFAULT;
> goto out;
> }
> + pagefault_enable();
Why dupe the pagefault_enable?
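Keeping the unwritten local would allow a single call (untested):

	pagefault_disable();
	unwritten = __copy_from_user_inatomic(r, urelocs,
					      count * sizeof(r[0]));
	pagefault_enable();
	if (unlikely(unwritten)) {
		remain = -EFAULT;
		goto out;
	}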
>
> + remain -= count;
> do {
> - u64 offset = r->presumed_offset;
> + u64 offset = eb_relocate_entry(eb, vma, r);
>
> - ret = eb_relocate_entry(vma, eb, r);
> - if (ret)
> + if (likely(offset == 0)) {
Sparse not complaining?
> + } else if ((s64)offset < 0) {
> + remain = (s64)offset;
> goto out;
> -
> - if (r->presumed_offset != offset) {
> + } else {
> + /* Note that reporting an error now
> + * leaves everything in an inconsistent
> + * state as we have *already* changed
> + * the relocation value inside the
> + * object. As we have not changed the
> + * reloc.presumed_offset and will not
> + * change the execobject.offset, on the
> + * next call we may not rewrite the value
> + * inside the object, leaving it
> + * dangling and causing a GPU hang.
> + */
> pagefault_disable();
> - unwritten = __put_user(r->presumed_offset,
> - &user_relocs->presumed_offset);
> + __put_user(offset & ~1,
Magic.
> + &urelocs[r-stack].presumed_offset);
There's the comment, so maybe worth if (__put_user()) DRM_DEBUG?
> pagefault_enable();
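I.e. something like:

	pagefault_disable();
	if (__put_user(offset & ~1,
		       &urelocs[r - stack].presumed_offset))
		DRM_DEBUG("Failed to write back presumed_offset\n");
	pagefault_enable();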
<SNIP>
> +static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
> {
<SNIP>
> + const unsigned long relocs_max =
> + ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry);
Could be a define, this is used above too.
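This is N_RELOC(ULONG_MAX) from eb_relocate_vma() again, so e.g.
(name made up):

	#define RELOCS_MAX \
		(ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry))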
<SNIP>
> + return __get_user(c, end - 1);
What's the point in this final check?
> }
>
> -static bool
> -need_reloc_mappable(struct i915_vma *vma)
> +static int
> +eb_copy_relocations(const struct i915_execbuffer *eb)
> {
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> - if (entry->relocation_count == 0)
> - return false;
> -
> - if (!i915_vma_is_ggtt(vma))
> - return false;
> + const unsigned int count = eb->buffer_count;
> + unsigned int i;
> + int ret;
>
> - /* See also use_cpu_reloc() */
> - if (HAS_LLC(to_i915(vma->obj->base.dev)))
> - return false;
> + for (i = 0; i < count; i++) {
> + struct drm_i915_gem_relocation_entry __user *urelocs;
> + struct drm_i915_gem_relocation_entry *relocs;
> + unsigned int nreloc = eb->exec[i].relocation_count, j;
No hidden variables in assignment lines.
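I.e.:

	unsigned int nreloc = eb->exec[i].relocation_count;
	unsigned int j;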
> -static bool
> -eb_vma_misplaced(struct i915_vma *vma)
> -{
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> + urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
> + size = nreloc * sizeof(*relocs);
>
> - WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
> - !i915_vma_is_ggtt(vma));
> + relocs = drm_malloc_gfp(size, 1, GFP_TEMPORARY);
> + if (!relocs) {
> + drm_free_large(relocs);
> + ret = -ENOMEM;
> + goto err;
> + }
>
> - if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
> - return true;
> + /* copy_from_user is limited to 4GiB */
> + j = 0;
> + do {
> + u32 len = min_t(u64, 1ull<<31, size);
BIT_ULL(31)
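I.e.:

	u32 len = min_t(u64, BIT_ULL(31), size);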
> +static void eb_export_fence(struct drm_i915_gem_object *obj,
> + struct drm_i915_gem_request *req,
> + unsigned int flags)
> +{
> + struct reservation_object *resv = obj->resv;
> +
> + /* Ignore errors from failing to allocate the new fence, we can't
> + * handle an error right now. Worst case should be missed
> + * synchronisation leading to rendering corruption.
> + */
Worth a DRM_DEBUG?
> @@ -1155,10 +1524,33 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
> }
>
> ret = i915_gem_request_await_object
> - (eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
> + (eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
> if (ret)
> return ret;
> +
> +skip_flushes:
> + obj->base.write_domain = 0;
> + if (entry->flags & EXEC_OBJECT_WRITE) {
> + obj->base.read_domains = 0;
> + if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
> + obj->cache_dirty = true;
> + intel_fb_obj_invalidate(obj, ORIGIN_CS);
> + }
> + obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
> +
> + i915_vma_move_to_active(vma, eb->request, entry->flags);
> + __eb_unreserve_vma(vma, entry);
> + vma->exec_entry = NULL;
This seems like a bugfix hunk that got lost in a refactoring patch.
> @@ -1377,16 +1629,16 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
> return -EINVAL;
> }
>
> - cs = intel_ring_begin(req, 4 * 3);
> + cs = intel_ring_begin(req, 4 * 2 + 2);
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> + *cs++ = MI_LOAD_REGISTER_IMM(4);
> for (i = 0; i < 4; i++) {
> - *cs++ = MI_LOAD_REGISTER_IMM(1);
> *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
> *cs++ = 0;
> }
> -
> + *cs++ = MI_NOOP;
> intel_ring_advance(req, cs);
Again a lost hunk.
>
> return 0;
> @@ -1422,10 +1674,11 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
> goto out;
>
> vma->exec_entry =
> - memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
> + memset(&eb->exec[eb->buffer_count++],
> + 0, sizeof(*vma->exec_entry));
> vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
> - i915_gem_object_get(shadow_batch_obj);
> - list_add_tail(&vma->exec_link, &eb->vmas);
> + vma->exec_entry->rsvd2 = (uintptr_t)vma;
Use the helper.
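I.e.:

	__exec_to_vma(vma->exec_entry) = (uintptr_t)vma;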
> @@ -1901,56 +2135,63 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
> struct drm_file *file)
> {
> struct drm_i915_gem_execbuffer2 *args = data;
> - struct drm_i915_gem_exec_object2 *exec2_list = NULL;
> + struct drm_i915_gem_exec_object2 *exec2_list;
> int ret;
>
> if (args->buffer_count < 1 ||
> - args->buffer_count > UINT_MAX / sizeof(*exec2_list)) {
> + args->buffer_count >= UINT_MAX / sizeof(*exec2_list) - 1) {
> DRM_DEBUG("execbuf2 with %d buffers\n", args->buffer_count);
> return -EINVAL;
> }
>
> - exec2_list = drm_malloc_gfp(args->buffer_count,
> + if (!i915_gem_check_execbuffer(args))
> + return -EINVAL;
> +
> + exec2_list = drm_malloc_gfp(args->buffer_count + 1,
The "+ 1" is very unintuitive without a comment.
With the comments added, this is (90%):
Reviewed-by: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
I'm pretty darn sure I ain't reviewing this again without some very
specific changelog and inter-diff.
Regards, Joonas
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation