[Intel-gfx] [PATCH 13/17] drm/i915: Eliminate lots of iterations over the execobjects array

Mon Aug 22 08:03:46 UTC 2016

The major scaling bottleneck in execbuffer is the processing of the
execobjects. Creating an auxiliary list is inefficient when compared to
using the execobject array we already have allocated.

Reservation is then split into phases. As we look up the VMA, we
try and bind it back into its active location. Only if that fails, do we add
it to the unbound list for phase 2. In phase 2, we try and add all those
objects that could not fit into their previous location, with fallback
to retrying all objects and evicting the VM in case of severe
fragmentation. (This is the same as before, except that phase 1 is now
done inline with looking up the VMA to avoid an iteration over the
execobject array. In the ideal case, we eliminate the separate reservation
phase). During the reservation phase, we only evict from the VM between
passes (rather than, as at present, each time we try to fit a new VMA). In
testing with Unreal Engine's Atlantis demo, which stresses the eviction
logic on gen7 class hardware, this sped up the framerate by a factor of
2.

The second loop amalgamation is between move_to_gpu and move_to_active.
As we always submit the request, even if incomplete, we can use the
current request to track active VMA as we perform the flushes and
synchronisation required.

The next big advancement is to avoid copying back to the user any
execobjects and relocations that are not changed.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h            |    3 +-
 drivers/gpu/drm/i915/i915_gem.c            |    2 +-
 drivers/gpu/drm/i915/i915_gem_evict.c      |   64 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 1555 +++++++++++++++-------------
 drivers/gpu/drm/i915/i915_gem_gtt.h        |    4 +-
 5 files changed, 874 insertions(+), 754 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 899fe983e623..a92c14ad30eb 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3111,6 +3111,7 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv);
 
 void *i915_gem_object_alloc(struct drm_device *dev);
 void i915_gem_object_free(struct drm_i915_gem_object *obj);
+bool i915_gem_object_flush_active(struct drm_i915_gem_object *obj);
 void i915_gem_object_init(struct drm_i915_gem_object *obj,
 			 const struct drm_i915_gem_object_ops *ops);
 struct drm_i915_gem_object *i915_gem_object_create(struct drm_device *dev,
@@ -3502,7 +3503,7 @@ int __must_check i915_gem_evict_something(struct i915_address_space *vm,
 					  unsigned flags);
 int __must_check i915_gem_evict_for_vma(struct i915_vma *vma,
 					unsigned int flags);
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
+int i915_gem_evict_vm(struct i915_address_space *vm);
 
 /* belongs in i915_gem_gtt.h */
 static inline void i915_gem_chipset_flush(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 78faac2b780c..1f35dd6219cb 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3034,7 +3034,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 			  size, obj->base.size,
 			  flags & PIN_MAPPABLE ? "mappable" : "total",
 			  end);
-		return -E2BIG;
+		return -ENOSPC;
 	}
 
 	ret = i915_gem_object_get_pages(obj);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 79ce873b4891..d59b085de48e 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -55,7 +55,7 @@ mark_free(struct i915_vma *vma, unsigned int flags, struct list_head *unwind)
 	if (flags & PIN_NONFAULT && vma->obj->fault_mappable)
 		return false;
 
-	list_add(&vma->exec_list, unwind);
+	list_add(&vma->evict_link, unwind);
 	return drm_mm_scan_add_block(&vma->node);
 }
 
@@ -134,7 +134,7 @@ search_again:
 	} while (*++phase);
 
 	/* Nothing found, clean up and bail out! */
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		ret = drm_mm_scan_remove_block(&vma->node);
 		BUG_ON(ret);
 	}
@@ -179,16 +179,16 @@ found:
 	 * calling unbind (which may remove the active reference
 	 * of any of our objects, thus corrupting the list).
 	 */
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		if (drm_mm_scan_remove_block(&vma->node))
 			__i915_vma_pin(vma);
 		else
-			list_del(&vma->exec_list);
+			list_del(&vma->evict_link);
 	}
 
 	/* Unbinding will emit any required flushes */
 	ret = 0;
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
@@ -260,10 +260,10 @@ int i915_gem_evict_for_vma(struct i915_vma *target, unsigned int flags)
 		}
 
 		__i915_vma_pin(vma);
-		list_add(&vma->exec_list, &eviction_list);
+		list_add(&vma->evict_link, &eviction_list);
 	}
 
-	list_for_each_entry_safe(vma, next, &eviction_list, exec_list) {
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
 		__i915_vma_unpin(vma);
 		if (ret == 0)
 			ret = i915_vma_unbind(vma);
@@ -286,34 +286,48 @@ int i915_gem_evict_for_vma(struct i915_vma *target, unsigned int flags)
  * To clarify: This is for freeing up virtual address space, not for freeing
  * memory in e.g. the shrinker.
  */
-int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle)
+int i915_gem_evict_vm(struct i915_address_space *vm)
 {
+	struct list_head *phases[] = {
+		&vm->inactive_list,
+		&vm->active_list,
+		NULL
+	}, **phase;
+	struct list_head eviction_list;
 	struct i915_vma *vma, *next;
 	int ret;
 
 	WARN_ON(!mutex_is_locked(&vm->dev->struct_mutex));
 	trace_i915_gem_evict_vm(vm);
 
-	if (do_idle) {
-		struct drm_i915_private *dev_priv = to_i915(vm->dev);
-
-		if (i915_is_ggtt(vm)) {
-			ret = i915_gem_switch_to_kernel_context(dev_priv);
-			if (ret)
-				return ret;
-		}
-
-		ret = i915_gem_wait_for_idle(dev_priv, true);
+	/* Switch back to the default context in order to unpin
+	 * the existing context objects. However, such objects only
+	 * pin themselves inside the global GTT and performing the
+	 * switch otherwise is ineffective.
+	 */
+	if (i915_is_ggtt(vm)) {
+		ret = i915_gem_switch_to_kernel_context(to_i915(vm->dev));
 		if (ret)
 			return ret;
-
-		i915_gem_retire_requests(dev_priv);
-		WARN_ON(!list_empty(&vm->active_list));
 	}
 
-	list_for_each_entry_safe(vma, next, &vm->inactive_list, vm_link)
-		if (!i915_vma_is_pinned(vma))
-			WARN_ON(i915_vma_unbind(vma));
+	INIT_LIST_HEAD(&eviction_list);
+	phase = phases;
+	do {
+		list_for_each_entry(vma, *phase, vm_link) {
+			if (i915_vma_is_pinned(vma))
+				continue;
+
+			__i915_vma_pin(vma);
+			list_add(&vma->evict_link, &eviction_list);
+		}
+	} while (*++phase);
 
-	return 0;
+	ret = 0;
+	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
+		__i915_vma_unpin(vma);
+		if (ret == 0)
+			ret = i915_vma_unbind(vma);
+	}
+	return ret;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e9ac591f3a79..71b18fcbd8a7 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -41,11 +41,16 @@
 
 #define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
 
-#define  __EXEC_OBJECT_HAS_PIN		(1<<31)
-#define  __EXEC_OBJECT_HAS_FENCE	(1<<30)
-#define  __EXEC_OBJECT_NEEDS_MAP	(1<<29)
-#define  __EXEC_OBJECT_NEEDS_BIAS	(1<<28)
+#define  __EXEC_OBJECT_HAS_PIN		BIT(31)
+#define  __EXEC_OBJECT_HAS_FENCE	BIT(30)
+#define  __EXEC_OBJECT_NEEDS_MAP	BIT(29)
+#define  __EXEC_OBJECT_NEEDS_BIAS	BIT(28)
 #define  __EXEC_OBJECT_INTERNAL_FLAGS (0xf<<28) /* all of the above */
+#define __EB_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
+
+#define __EXEC_HAS_RELOC	BIT(31)
+#define __EXEC_VALIDATED	BIT(30)
+#define UPDATE			PIN_OFFSET_FIXED
 
 #define BATCH_OFFSET_BIAS (256*1024)
 
@@ -59,21 +64,43 @@ struct i915_execbuffer {
 	struct i915_address_space *vm;
 	struct i915_vma *batch;
 	struct drm_i915_gem_request *request;
-	u32 batch_start_offset;
-	unsigned int dispatch_flags;
-	struct drm_i915_gem_exec_object2 shadow_exec_entry;
-	bool need_relocs;
-	struct list_head vmas;
+	struct list_head unbound;
+	struct list_head relocs;
 	struct reloc_cache {
 		struct drm_mm_node node;
 		unsigned long vaddr;
 		unsigned int page;
 		bool use_64bit_reloc;
+		bool has_llc;
+		bool has_fence;
 	} reloc_cache;
+	u64 invalid_flags;
+	u32 context_flags;
+	u32 dispatch_flags;
 	int lut_mask;
 	struct hlist_head *buckets;
 };
 
+#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+
+/* Used to convert any address to canonical form.
+ * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
+ * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
+ * addresses to be in a canonical form:
+ * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
+ * canonical form [63:48] == [47]."
+ */
+#define GEN8_HIGH_ADDRESS_BIT 47
+static inline u64 gen8_canonical_addr(u64 address)
+{
+	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+}
+
+static inline u64 gen8_noncanonical_addr(u64 address)
+{
+	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+}
+
 static int
 eb_create(struct i915_execbuffer *eb)
 {
@@ -101,80 +128,340 @@ eb_create(struct i915_execbuffer *eb)
 	return 0;
 }
 
+static bool
+eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
+		 const struct i915_vma *vma)
+{
+	if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)
+		return true;
+
+	if (vma->node.size < entry->pad_to_size)
+		return true;
+
+	if (entry->alignment && vma->node.start & (entry->alignment - 1))
+		return true;
+
+	if (entry->flags & EXEC_OBJECT_PINNED &&
+	    vma->node.start != entry->offset)
+		return true;
+
+	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+	    vma->node.start < BATCH_OFFSET_BIAS)
+		return true;
+
+	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
+	    (vma->node.start + vma->node.size - 1) >> 32)
+		return true;
+
+	return false;
+}
+
+static void
+eb_pin_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
+{
+	u64 flags;
+
+	flags = vma->node.start;
+	flags |= PIN_USER | PIN_NONBLOCK | PIN_OFFSET_FIXED;
+	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
+		flags |= PIN_GLOBAL;
+	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
+		return;
+
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		if (unlikely(i915_vma_get_fence(vma))) {
+			i915_vma_unpin(vma);
+			return;
+		}
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+}
+
 static inline void
 __eb_unreserve_vma(struct i915_vma *vma,
 		   const struct drm_i915_gem_exec_object2 *entry)
 {
+	GEM_BUG_ON((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0);
+
 	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
 		i915_vma_unpin_fence(vma);
 
-	if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-		__i915_vma_unpin(vma);
+	__i915_vma_unpin(vma);
 }
 
-static void
-eb_unreserve_vma(struct i915_vma *vma)
+static inline void
+eb_unreserve_vma(struct i915_vma *vma,
+		 struct drm_i915_gem_exec_object2 *entry)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	__eb_unreserve_vma(vma, entry);
-	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
+	if (entry->flags & __EXEC_OBJECT_HAS_PIN) {
+		__eb_unreserve_vma(vma, entry);
+		entry->flags &= ~__EB_RESERVED;
+	}
 }
 
-static void
-eb_reset(struct i915_execbuffer *eb)
+static int
+eb_add_vma(struct i915_execbuffer *eb,
+	   struct drm_i915_gem_exec_object2 *entry,
+	   struct i915_vma *vma)
 {
-	struct i915_vma *vma;
+	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		eb_unreserve_vma(vma);
-		vma->exec_entry = NULL;
-	}
+	GEM_BUG_ON(i915_vma_is_closed(vma));
 
-	if (eb->lut_mask >= 0)
-		memset(eb->buckets, 0,
-		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
-}
+	if ((eb->args->flags & __EXEC_VALIDATED) == 0) {
+		if (unlikely(entry->flags & eb->invalid_flags))
+			return -EINVAL;
 
-#define to_ptr(T, x) ((T *)(uintptr_t)(x))
+		if (unlikely(entry->alignment && !is_power_of_2(entry->alignment)))
+			return -EINVAL;
 
-static bool
-eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
-{
-	if (unlikely(vma->exec_entry)) {
-		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
-			  eb->exec[i].handle, i);
-		return false;
+		/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
+		 * any non-page-aligned or non-canonical addresses.
+		 */
+		if (entry->flags & EXEC_OBJECT_PINNED) {
+			if (unlikely(entry->offset !=
+				     gen8_canonical_addr(entry->offset & PAGE_MASK)))
+				return -EINVAL;
+		}
+
+		/* From drm_mm perspective address space is continuous,
+		 * so from this point we're always using non-canonical
+		 * form internally.
+		 */
+		entry->offset = gen8_noncanonical_addr(entry->offset);
+
+		/* pad_to_size was once a reserved field, so sanitize it */
+		if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) {
+			if (unlikely(offset_in_page(entry->pad_to_size)))
+				return -EINVAL;
+		} else {
+			entry->pad_to_size = 0;
+		}
+
+		if (unlikely(vma->exec_entry)) {
+			DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+				  entry->handle, (int)(entry - eb->exec));
+			return -EINVAL;
+		}
 	}
-	list_add_tail(&vma->exec_list, &eb->vmas);
 
-	vma->exec_entry = &eb->exec[i];
+	vma->exec_entry = entry;
+	entry->rsvd2 = (uintptr_t)vma;
+
 	if (eb->lut_mask >= 0) {
-		vma->exec_handle = eb->exec[i].handle;
+		vma->exec_handle = entry->handle;
 		hlist_add_head(&vma->exec_node,
-			       &eb->buckets[hash_32(vma->exec_handle,
+			       &eb->buckets[hash_32(entry->handle,
 						    eb->lut_mask)]);
 	}
 
-	eb->exec[i].rsvd2 = (uintptr_t)vma;
-	return true;
+	if (entry->relocation_count)
+		list_add_tail(&vma->reloc_link, &eb->relocs);
+
+	if (!eb->reloc_cache.has_fence) {
+		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+	} else {
+		if (entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
+		    i915_gem_object_is_tiled(vma->obj))
+			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+	}
+
+	if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
+		entry->flags |= eb->context_flags;
+
+	ret = 0;
+	if (vma->node.size)
+		eb_pin_vma(eb, entry, vma);
+	if (eb_vma_misplaced(entry, vma)) {
+		eb_unreserve_vma(vma, entry);
+
+		list_add_tail(&vma->exec_link, &eb->unbound);
+		if (drm_mm_node_allocated(&vma->node))
+			ret = i915_vma_unbind(vma);
+	} else {
+		if (entry->offset != vma->node.start) {
+			entry->offset = vma->node.start | UPDATE;
+			eb->args->flags |= __EXEC_HAS_RELOC;
+		}
+	}
+	return ret;
 }
 
-static inline struct hlist_head *ht_head(struct i915_gem_context *ctx,
-					 u32 handle)
+static inline int use_cpu_reloc(const struct reloc_cache *cache,
+				const struct drm_i915_gem_object *obj)
+{
+	if (!i915_gem_object_has_struct_page(obj))
+		return false;
+
+	if (DBG_USE_CPU_RELOC)
+		return DBG_USE_CPU_RELOC > 0;
+
+	return (cache->has_llc ||
+		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
+static int
+eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
+{
+	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	u64 flags;
+	int ret;
+
+	flags = PIN_USER | PIN_NONBLOCK;
+	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
+		flags |= PIN_GLOBAL;
+
+	if (!drm_mm_node_allocated(&vma->node)) {
+		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+		 * limit address to the first 4GBs for unflagged objects.
+		 */
+		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
+			flags |= PIN_ZONE_4G;
+
+		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+			flags |= PIN_MAPPABLE;
+
+		if (entry->flags & EXEC_OBJECT_PINNED) {
+			flags |= entry->offset | PIN_OFFSET_FIXED;
+			/* force overlapping PINNED checks */
+			flags &= ~PIN_NONBLOCK;
+		} else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
+			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
+	}
+
+	ret = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+	if (ret)
+		return ret;
+
+	if (entry->offset != vma->node.start) {
+		entry->offset = vma->node.start | UPDATE;
+		eb->args->flags |= __EXEC_HAS_RELOC;
+	}
+	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+
+	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
+		ret = i915_vma_get_fence(vma);
+		if (ret)
+			return ret;
+
+		if (i915_vma_pin_fence(vma))
+			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+	}
+
+	GEM_BUG_ON(eb_vma_misplaced(entry, vma));
+	return 0;
+}
+
+static int eb_reserve(struct i915_execbuffer *eb)
+{
+	const unsigned int count = eb->args->buffer_count;
+	struct list_head last;
+	struct i915_vma *vma;
+	unsigned int i, pass;
+	int ret;
+
+	/* Attempt to pin all of the buffers into the GTT.
+	 * This is done in 3 phases:
+	 *
+	 * 1a. Unbind all objects that do not match the GTT constraints for
+	 *     the execbuffer (fenceable, mappable, alignment etc).
+	 * 1b. Increment pin count for already bound objects.
+	 * 2.  Bind new objects.
+	 * 3.  Decrement pin count.
+	 *
+	 * This avoid unnecessary unbinding of later objects in order to make
+	 * room for the earlier objects *unless* we need to defragment.
+	 */
+
+	pass = 0;
+	ret = 0;
+	do {
+		list_for_each_entry(vma, &eb->unbound, exec_link) {
+			ret = eb_reserve_vma(eb, vma);
+			if (ret)
+				break;
+		}
+		if (ret != -ENOSPC || pass++)
+			return ret;
+
+		/* Resort *all* the objects into priority order */
+		INIT_LIST_HEAD(&eb->unbound);
+		INIT_LIST_HEAD(&last);
+		for (i = 0; i < count; i++) {
+			struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+
+			vma = to_ptr(struct i915_vma, entry->rsvd2);
+			eb_unreserve_vma(vma, entry);
+
+			if (entry->flags & EXEC_OBJECT_PINNED)
+				list_add(&vma->exec_link, &eb->unbound);
+			else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+				list_add_tail(&vma->exec_link, &eb->unbound);
+			else
+				list_add_tail(&vma->exec_link, &last);
+		}
+		list_splice_tail(&last, &eb->unbound);
+
+		/* Too fragmented, unbind everything and retry */
+		ret = i915_gem_evict_vm(eb->vm);
+		if (ret)
+			return ret;
+	} while (1);
+}
+
+static inline struct hlist_head *
+ht_head(const struct i915_gem_context *ctx, u32 handle)
 {
 	return &ctx->vma.ht[hash_32(handle, ctx->vma.ht_bits)];
 }
 
+static int eb_batch_index(const struct i915_execbuffer *eb)
+{
+	return eb->args->buffer_count - 1;
+}
+
+static int eb_select_context(struct i915_execbuffer *eb)
+{
+	struct i915_gem_context *ctx;
+
+	ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1);
+	if (unlikely(IS_ERR(ctx)))
+		return PTR_ERR(ctx);
+
+	if (unlikely(ctx->hang_stats.banned)) {
+		DRM_DEBUG("Context %u tried to submit while banned\n",
+			  ctx->user_handle);
+		return -EIO;
+	}
+
+	eb->ctx = i915_gem_context_get(ctx);
+	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
+
+	eb->context_flags = 0;
+	if (ctx->flags & CONTEXT_NO_ZEROMAP)
+		eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
+
+	return 0;
+}
+
 static int
 eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	const int count = eb->args->buffer_count;
 	struct i915_vma *vma;
+	struct idr *idr;
 	int slow_pass = -1;
-	int i;
+	int i, ret;
 
-	INIT_LIST_HEAD(&eb->vmas);
+	INIT_LIST_HEAD(&eb->relocs);
+	INIT_LIST_HEAD(&eb->unbound);
 
 	if (unlikely(eb->ctx->vma.ht_size & 1))
 		flush_work(&eb->ctx->vma.resize);
@@ -187,8 +474,9 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
 			if (vma->ctx_handle != eb->exec[i].handle)
 				continue;
 
-			if (!eb_add_vma(eb, vma, i))
-				return -EINVAL;
+			ret = eb_add_vma(eb, &eb->exec[i], vma);
+			if (unlikely(ret))
+				return ret;
 
 			goto next_vma;
 		}
@@ -199,24 +487,25 @@ next_vma: ;
 	}
 
 	if (slow_pass < 0)
-		return 0;
+		goto out;
 
 	spin_lock(&eb->file->table_lock);
 	/* Grab a reference to the object and release the lock so we can lookup
 	 * or create the VMA without using GFP_ATOMIC */
+	idr = &eb->file->object_idr;
 	for (i = slow_pass; i < count; i++) {
 		struct drm_i915_gem_object *obj;
 
 		if (eb->exec[i].rsvd2)
 			continue;
 
-		obj = to_intel_bo(idr_find(&eb->file->object_idr,
-					   eb->exec[i].handle));
+		obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
 		if (unlikely(!obj)) {
 			spin_unlock(&eb->file->table_lock);
 			DRM_DEBUG("Invalid object handle %d at index %d\n",
 				  eb->exec[i].handle, i);
-			return -ENOENT;
+			ret = -ENOENT;
+			goto err;
 		}
 
 		eb->exec[i].rsvd2 = 1 | (uintptr_t)obj;
@@ -237,11 +526,12 @@ next_vma: ;
 		 * from the (obj, vm) we don't run the risk of creating
 		 * duplicated vmas for the same vm.
 		 */
-		obj = to_ptr(struct drm_i915_gem_object, eb->exec[i].rsvd2 & ~1);
+		obj = to_ptr(typeof(*obj), eb->exec[i].rsvd2 & ~1);
 		vma = i915_gem_obj_lookup_or_create_vma(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
 			DRM_DEBUG("Failed to lookup VMA\n");
-			return PTR_ERR(vma);
+			ret = PTR_ERR(vma);
+			goto err;
 		}
 
 		/* First come, first served */
@@ -257,8 +547,9 @@ next_vma: ;
 			}
 		}
 
-		if (!eb_add_vma(eb, vma, i))
-			return -EINVAL;
+		ret = eb_add_vma(eb, &eb->exec[i], vma);
+		if (unlikely(ret))
+			goto err;
 	}
 	if (4*eb->ctx->vma.ht_count > 3*eb->ctx->vma.ht_size ||
 	    4*eb->ctx->vma.ht_count < eb->ctx->vma.ht_size) {
@@ -266,15 +557,10 @@ next_vma: ;
 		queue_work(system_highpri_wq, &eb->ctx->vma.resize);
 	}
 
-	return 0;
-}
-
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	vma = to_ptr(struct i915_vma, eb->exec[eb->args->buffer_count-1].rsvd2);
+out:
+	/* take note of the batch buffer before we might reorder the lists */
+	i = eb_batch_index(eb);
+	eb->batch = to_ptr(struct i915_vma, eb->exec[i].rsvd2);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -285,14 +571,24 @@ eb_get_batch(struct i915_execbuffer *eb)
 	 * Note that actual hangs have only been observed on gen7, but for
 	 * paranoia do it everywhere.
 	 */
-	if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
-		vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if ((eb->exec[i].flags & EXEC_OBJECT_PINNED) == 0)
+		eb->exec[i].flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if (eb->reloc_cache.has_fence)
+		eb->exec[i].flags |= EXEC_OBJECT_NEEDS_FENCE;
 
-	return vma;
+	eb->args->flags |= __EXEC_VALIDATED;
+	return eb_reserve(eb);
+
+err:
+	for (i = slow_pass; i < count; i++) {
+		if (eb->exec[i].rsvd2 & 1)
+			eb->exec[i].rsvd2 = 0;
+	}
+	return ret;
 }
 
 static struct i915_vma *
-eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
 {
 	if (eb->lut_mask < 0) {
 		if (handle >= -eb->lut_mask)
@@ -311,60 +607,58 @@ eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
 	}
 }
 
-static void eb_destroy(struct i915_execbuffer *eb)
+static void
+eb_reset(const struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		if (!vma->exec_entry)
-			continue;
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 
-		__eb_unreserve_vma(vma, vma->exec_entry);
+		eb_unreserve_vma(vma, entry);
 		vma->exec_entry = NULL;
 	}
 
-	i915_gem_context_put(eb->ctx);
-
 	if (eb->lut_mask >= 0)
-		kfree(eb->buckets);
+		memset(eb->buckets, 0,
+		       (1<<eb->lut_mask)*sizeof(struct hlist_head));
 }
 
-static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+static void eb_release_vma(const struct i915_execbuffer *eb)
 {
-	if (!i915_gem_object_has_struct_page(obj))
-		return false;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (!eb->exec)
+		return;
 
-	return (HAS_LLC(obj->base.dev) ||
-		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
-		obj->cache_level != I915_CACHE_NONE);
-}
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 
-/* Used to convert any address to canonical form.
- * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
- * MI_LOAD_REGISTER_MEM and others, see Broadwell PRM Vol2a) require the
- * addresses to be in a canonical form:
- * "GraphicsAddress[63:48] are ignored by the HW and assumed to be in correct
- * canonical form [63:48] == [47]."
- */
-#define GEN8_HIGH_ADDRESS_BIT 47
-static inline uint64_t gen8_canonical_addr(uint64_t address)
-{
-	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+		if (!vma || !vma->exec_entry)
+			continue;
+
+		GEM_BUG_ON(vma->exec_entry != entry);
+		if (entry->flags & __EXEC_OBJECT_HAS_PIN)
+			__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
+	}
 }
 
-static inline uint64_t gen8_noncanonical_addr(uint64_t address)
+static void eb_destroy(const struct i915_execbuffer *eb)
 {
-	return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
+	if (eb->lut_mask >= 0)
+		kfree(eb->buckets);
 }
 
-static inline uint64_t
+static inline u64
 relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
-		  uint64_t target_offset)
+		  const struct i915_vma *target)
 {
-	return gen8_canonical_addr((int)reloc->delta + target_offset);
+	return gen8_canonical_addr((int)reloc->delta + target->node.start);
 }
 
 static void reloc_cache_init(struct reloc_cache *cache,
@@ -372,6 +666,8 @@ static void reloc_cache_init(struct reloc_cache *cache,
 {
 	cache->page = -1;
 	cache->vaddr = 0;
+	cache->has_llc = HAS_LLC(i915);
+	cache->has_fence = INTEL_GEN(i915) < 4;
 	cache->use_64bit_reloc = INTEL_GEN(i915) >= 8;
 	cache->node.allocated = false;
 }
@@ -484,7 +780,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj,
 		struct i915_vma *vma;
 		int ret;
 
-		if (use_cpu_reloc(obj))
+		if (use_cpu_reloc(cache, obj))
 			return NULL;
 
 		ret = i915_gem_object_set_to_gtt_domain(obj, true);
@@ -572,17 +868,17 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		*addr = value;
 }
 
-static int
+static u64
 relocate_entry(struct drm_i915_gem_object *obj,
 	       const struct drm_i915_gem_relocation_entry *reloc,
 	       struct reloc_cache *cache,
-	       u64 target_offset)
+	       const struct i915_vma *target)
 {
 	u64 offset = reloc->offset;
+	u64 target_offset = relocation_target(reloc, target);
 	bool wide = cache->use_64bit_reloc;
 	void *vaddr;
 
-	target_offset = relocation_target(reloc, target_offset);
 repeat:
 	vaddr = reloc_vaddr(obj, cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -599,7 +895,7 @@ repeat:
 		goto repeat;
 	}
 
-	return 0;
+	return gen8_canonical_addr(target->node.start) | 1;
 }
 
 static bool object_is_idle(struct drm_i915_gem_object *obj)
@@ -616,13 +912,12 @@ static bool object_is_idle(struct drm_i915_gem_object *obj)
 	return true;
 }
 
-static int
-eb_relocate_entry(struct i915_vma *vma,
-		  struct i915_execbuffer *eb,
-		  struct drm_i915_gem_relocation_entry *reloc)
+static u64
+eb_relocate_entry(struct i915_execbuffer *eb,
+		  const struct i915_vma *vma,
+		  const struct drm_i915_gem_relocation_entry *reloc)
 {
 	struct i915_vma *target;
-	u64 target_offset;
 	int ret;
 
 	/* we've already hold a reference to all valid objects */
@@ -653,26 +948,28 @@ eb_relocate_entry(struct i915_vma *vma,
 		return -EINVAL;
 	}
 
-	if (reloc->write_domain)
+	if (reloc->write_domain) {
 		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
 
-	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
-	 * pipe_control writes because the gpu doesn't properly redirect them
-	 * through the ppgtt for non_secure batchbuffers.
-	 */
-	if (unlikely(IS_GEN6(eb->i915) &&
-		     reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION)) {
-		ret = i915_vma_bind(target, target->obj->cache_level,
-				    PIN_GLOBAL);
-		if (WARN_ONCE(ret, "Unexpected failure to bind target VMA!"))
-			return ret;
+		/* Sandybridge PPGTT errata: We need a global gtt mapping
+		 * for MI and pipe_control writes because the gpu doesn't
+		 * properly redirect them through the ppgtt for non_secure
+		 * batchbuffers.
+		 */
+		if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
+		    IS_GEN6(eb->i915)) {
+			ret = i915_vma_bind(target, target->obj->cache_level,
+					    PIN_GLOBAL);
+			if (WARN_ONCE(ret,
+				      "Unexpected failure to bind target VMA!"))
+				return ret;
+		}
 	}
 
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	target_offset = gen8_canonical_addr(target->node.start);
-	if (target_offset == reloc->presumed_offset)
+	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
@@ -695,85 +992,90 @@ eb_relocate_entry(struct i915_vma *vma,
 
 	/* We can't wait for rendering with pagefaults disabled */
 	if (pagefault_disabled() && !object_is_idle(vma->obj))
-		return -EFAULT;
-
-	ret = relocate_entry(vma->obj, reloc, &eb->reloc_cache, target_offset);
-	if (ret)
-		return ret;
+		return -EBUSY;
 
 	/* and update the user's relocation entry */
-	reloc->presumed_offset = target_offset;
-	return 0;
+	return relocate_entry(vma->obj, reloc, &eb->reloc_cache, target);
 }
 
-static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
+static int eb_relocate_vma(struct i915_execbuffer *eb,
+			   const struct i915_vma *vma)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
-	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
-	struct drm_i915_gem_relocation_entry __user *user_relocs;
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int remain, ret = 0;
-
-	user_relocs = u64_to_user_ptr(entry->relocs_ptr);
+	struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
+	struct drm_i915_gem_relocation_entry __user *urelocs;
+	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	unsigned int remain;
 
+	urelocs = u64_to_user_ptr(entry->relocs_ptr);
 	remain = entry->relocation_count;
-	while (remain) {
-		struct drm_i915_gem_relocation_entry *r = stack_reloc;
-		int count = remain;
-		if (count > ARRAY_SIZE(stack_reloc))
-			count = ARRAY_SIZE(stack_reloc);
-		remain -= count;
+	if (unlikely(remain > ULONG_MAX / sizeof(*urelocs)))
+		return -EINVAL;
+
+	/*
+	 * We must check that the entire relocation array is safe
+	 * to read. However, if the array is not writable the user loses
+	 * the updated relocation values.
+	 */
 
-		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) {
-			ret = -EFAULT;
+	do {
+		struct drm_i915_gem_relocation_entry *r = stack;
+		unsigned int count =
+			min_t(unsigned int, remain, ARRAY_SIZE(stack));
+
+		if (__copy_from_user_inatomic(r, urelocs, count*sizeof(r[0]))) {
+			remain = -EFAULT;
 			goto out;
 		}
 
+		remain -= count;
 		do {
-			u64 offset = r->presumed_offset;
+			u64 offset = eb_relocate_entry(eb, vma, r);
 
-			ret = eb_relocate_entry(vma, eb, r);
-			if (ret)
-				goto out;
-
-			if (r->presumed_offset != offset &&
-			    __put_user(r->presumed_offset,
-				       &user_relocs->presumed_offset)) {
-				ret = -EFAULT;
+			if (offset == 0) {
+			} else if ((s64)offset < 0) {
+				remain = (s64)offset;
 				goto out;
+			} else {
+				__put_user(offset & ~1,
+					   &urelocs[r-stack].presumed_offset);
 			}
-
-			user_relocs++;
-			r++;
-		} while (--count);
-	}
-
+		} while (r++, --count);
+		urelocs += ARRAY_SIZE(stack);
+	} while (remain);
 out:
 	reloc_cache_reset(&eb->reloc_cache);
-	return ret;
+	return remain;
 #undef N_RELOC
 }
 
 static int
-eb_relocate_vma_slow(struct i915_vma *vma,
-		     struct i915_execbuffer *eb,
-		     struct drm_i915_gem_relocation_entry *relocs)
+eb_relocate_vma_slow(struct i915_execbuffer *eb,
+		     const struct i915_vma *vma)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	int i, ret = 0;
+	struct drm_i915_gem_relocation_entry *relocs =
+		to_ptr(typeof(*relocs), entry->relocs_ptr);
+	unsigned int i;
+	int ret;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = eb_relocate_entry(vma, eb, &relocs[i]);
-		if (ret)
-			break;
+		u64 offset = eb_relocate_entry(eb, vma, &relocs[i]);
+
+		if ((s64)offset < 0) {
+			ret = (s64)offset;
+			goto err;
+		}
 	}
+	ret = 0;
+err:
 	reloc_cache_reset(&eb->reloc_cache);
 	return ret;
 }
 
 static int eb_relocate(struct i915_execbuffer *eb)
 {
-	struct i915_vma *vma;
+	const struct i915_vma *vma;
 	int ret = 0;
 
 	/* This is the fast path and we cannot handle a pagefault whilst
@@ -784,293 +1086,175 @@ static int eb_relocate(struct i915_execbuffer *eb)
 	 * lockdep complains vehemently.
 	 */
 	pagefault_disable();
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		ret = eb_relocate_vma(vma, eb);
-		if (ret)
-			break;
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+retry:
+		ret = eb_relocate_vma(eb, vma);
+		if (ret == 0)
+			continue;
+
+		if (ret == -EBUSY) {
+			pagefault_enable();
+			ret = i915_gem_object_wait_rendering(vma->obj, false);
+			pagefault_disable();
+			if (ret == 0)
+				goto retry;
+		}
+		break;
 	}
 	pagefault_enable();
 
 	return ret;
 }
 
-static bool only_mappable_for_reloc(unsigned int flags)
+static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
 {
-	return (flags & (EXEC_OBJECT_NEEDS_FENCE | __EXEC_OBJECT_NEEDS_MAP)) ==
-		__EXEC_OBJECT_NEEDS_MAP;
-}
-
-static int
-eb_reserve_vma(struct i915_vma *vma,
-	       struct intel_engine_cs *engine,
-	       bool *need_reloc)
-{
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	uint64_t flags;
-	int ret;
-
-	flags = PIN_USER;
-	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
-		flags |= PIN_GLOBAL;
+	const unsigned long relocs_max =
+		ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry);
+	const char __user *addr, *end;
+	unsigned long size;
+	unsigned int nreloc;
+	char c;
+
+	nreloc = entry->relocation_count;
+	if (nreloc == 0)
+		return 0;
 
-	if (!drm_mm_node_allocated(&vma->node)) {
-		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
-		 * limit address to the first 4GBs for unflagged objects.
-		 */
-		if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0)
-			flags |= PIN_ZONE_4G;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
-			flags |= PIN_GLOBAL | PIN_MAPPABLE;
-		if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
-			flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			flags |= entry->offset | PIN_OFFSET_FIXED;
-		if ((flags & PIN_MAPPABLE) == 0)
-			flags |= PIN_HIGH;
-	}
-
-	ret = i915_vma_pin(vma,
-			   entry->pad_to_size,
-			   entry->alignment,
-			   flags);
-	if ((ret == -ENOSPC || ret == -E2BIG) &&
-	    only_mappable_for_reloc(entry->flags))
-		ret = i915_vma_pin(vma,
-				   entry->pad_to_size,
-				   entry->alignment,
-				   flags & ~PIN_MAPPABLE);
-	if (ret)
-		return ret;
+	if (nreloc > relocs_max)
+		return -EINVAL;
 
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+	addr = u64_to_user_ptr(entry->relocs_ptr);
+	size = nreloc * sizeof(struct drm_i915_gem_relocation_entry);
+	if (!access_ok(VERIFY_WRITE, addr, size))
+		return -EFAULT;
 
-	if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
-		ret = i915_vma_get_fence(vma);
+	end = addr + size;
+	for (; addr < end; addr += PAGE_SIZE) {
+		int ret = __get_user(c, addr);
 		if (ret)
 			return ret;
-
-		if (i915_vma_pin_fence(vma))
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
-
-	if (entry->offset != vma->node.start) {
-		entry->offset = vma->node.start;
-		*need_reloc = true;
-	}
-
-	return 0;
+	return __get_user(c, end - 1);
 }
 
-static bool
-need_reloc_mappable(struct i915_vma *vma)
+static int
+eb_copy_relocations(const struct i915_execbuffer *eb)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-
-	if (entry->relocation_count == 0)
-		return false;
-
-	if (!i915_vma_is_ggtt(vma))
-		return false;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
+	int ret;
 
-	/* See also use_cpu_reloc() */
-	if (HAS_LLC(vma->obj->base.dev))
-		return false;
+	for (i = 0; i < count; i++) {
+		struct drm_i915_gem_relocation_entry __user *urelocs;
+		struct drm_i915_gem_relocation_entry *relocs;
+		unsigned int nreloc = eb->exec[i].relocation_count, j;
+		unsigned long size;
 
-	if (vma->obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-		return false;
+		if (nreloc == 0)
+			continue;
 
-	return true;
-}
+		ret = check_relocations(&eb->exec[i]);
+		if (ret)
+			goto err;
 
-static bool
-eb_vma_misplaced(struct i915_vma *vma)
-{
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+		urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
+		size = nreloc * sizeof(*relocs);
 
-	WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
-		!i915_vma_is_ggtt(vma));
+		relocs = drm_malloc_gfp(size, 1, GFP_TEMPORARY);
+		if (!relocs) {
+			ret = -ENOMEM;
+			goto err;
+		}
 
-	if (entry->alignment &&
-	    vma->node.start & (entry->alignment - 1))
-		return true;
+		/* copy_from_user is limited to < 4GiB, use 2GiB chunks */
+		j = 0;
+		do {
+			u32 len = min_t(u64, 1ull<<31, size);
 
-	if (vma->node.size < entry->pad_to_size)
-		return true;
+			if (__copy_from_user(relocs + j, urelocs + j, len)) {
+				ret = -EFAULT;
+				goto err;
+			}
 
-	if (entry->flags & EXEC_OBJECT_PINNED &&
-	    vma->node.start != entry->offset)
-		return true;
+			size -= len;
+			BUILD_BUG_ON_NOT_POWER_OF_2(sizeof(*relocs));
+			j += len / sizeof(*relocs);
+		} while (size);
 
-	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
-	    vma->node.start < BATCH_OFFSET_BIAS)
-		return true;
+		/* As we do not update the known relocation offsets after
+		 * relocating (due to the complexities in lock handling),
+		 * we need to mark them as invalid now so that we force the
+		 * relocation processing next time. Just in case the target
+		 * object is evicted and then rebound into its old
+		 * presumed_offset before the next execbuffer - if that
+		 * happened we would make the mistake of assuming that the
+		 * relocations were valid.
+		 */
+		user_access_begin();
+		for (j = 0; j < nreloc; j++)
+			unsafe_put_user(-1,
+					&urelocs[j].presumed_offset,
+					end_user);
+end_user:
+		user_access_end();
 
-	/* avoid costly ping-pong once a batch bo ended up non-mappable */
-	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
-	    !i915_vma_is_map_and_fenceable(vma))
-		return !only_mappable_for_reloc(entry->flags);
+		eb->exec[i].relocs_ptr = (uintptr_t)relocs;
+	}
 
-	if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
-	    (vma->node.start + vma->node.size - 1) >> 32)
-		return true;
+	return 0;
 
-	return false;
+err:
+	while (i--) {
+		struct drm_i915_gem_relocation_entry *relocs =
+			to_ptr(typeof(*relocs), eb->exec[i].relocs_ptr);
+		if (eb->exec[i].relocation_count)
+			drm_free_large(relocs);
+	}
+	return ret;
 }
 
-static int eb_reserve(struct i915_execbuffer *eb)
+static int eb_prefault_relocations(const struct i915_execbuffer *eb)
 {
-	const bool has_fenced_gpu_access = INTEL_GEN(eb->i915) < 4;
-	struct i915_vma *vma;
-	struct list_head ordered_vmas;
-	struct list_head pinned_vmas;
-	int retry;
-
-	INIT_LIST_HEAD(&ordered_vmas);
-	INIT_LIST_HEAD(&pinned_vmas);
-	while (!list_empty(&eb->vmas)) {
-		struct drm_i915_gem_exec_object2 *entry;
-		bool need_fence, need_mappable;
-
-		vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list);
-		entry = vma->exec_entry;
-
-		if (eb->ctx->flags & CONTEXT_NO_ZEROMAP)
-			entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
-
-		if (!has_fenced_gpu_access)
-			entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
-		need_fence =
-			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-			i915_gem_object_is_tiled(vma->obj);
-		need_mappable = need_fence || need_reloc_mappable(vma);
-
-		if (entry->flags & EXEC_OBJECT_PINNED)
-			list_move_tail(&vma->exec_list, &pinned_vmas);
-		else if (need_mappable) {
-			entry->flags |= __EXEC_OBJECT_NEEDS_MAP;
-			list_move(&vma->exec_list, &ordered_vmas);
-		} else
-			list_move_tail(&vma->exec_list, &ordered_vmas);
-	}
-	list_splice(&ordered_vmas, &eb->vmas);
-	list_splice(&pinned_vmas, &eb->vmas);
-
-	/* Attempt to pin all of the buffers into the GTT.
-	 * This is done in 3 phases:
-	 *
-	 * 1a. Unbind all objects that do not match the GTT constraints for
-	 *     the execbuffer (fenceable, mappable, alignment etc).
-	 * 1b. Increment pin count for already bound objects.
-	 * 2.  Bind new objects.
-	 * 3.  Decrement pin count.
-	 *
-	 * This avoid unnecessary unbinding of later objects in order to make
-	 * room for the earlier objects *unless* we need to defragment.
-	 */
-	retry = 0;
-	do {
-		int ret = 0;
-
-		/* Unbind any ill-fitting objects or pin. */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
-			if (!drm_mm_node_allocated(&vma->node))
-				continue;
-
-			if (eb_vma_misplaced(vma))
-				ret = i915_vma_unbind(vma);
-			else
-				ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
-		}
-
-		/* Bind fresh objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list) {
-			if (drm_mm_node_allocated(&vma->node))
-				continue;
-
-			ret = eb_reserve_vma(vma, eb->engine, &eb->need_relocs);
-			if (ret)
-				goto err;
-		}
-
-err:
-		if (ret != -ENOSPC || retry++)
-			return ret;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 
-		/* Decrement pin count for bound objects */
-		list_for_each_entry(vma, &eb->vmas, exec_list)
-			eb_unreserve_vma(vma);
+	for (i = 0; i < count; i++) {
+		int ret;
 
-		ret = i915_gem_evict_vm(eb->vm, true);
+		ret = check_relocations(&eb->exec[i]);
 		if (ret)
 			return ret;
-	} while (1);
+	}
+
+	return 0;
 }
 
-static int
-eb_relocate_slow(struct i915_execbuffer *eb)
+static int eb_relocate_slow(struct i915_execbuffer *eb)
 {
-	const unsigned int count = eb->args->buffer_count;
 	struct drm_device *dev = &eb->i915->drm;
-	struct drm_i915_gem_relocation_entry *reloc;
-	struct i915_vma *vma;
-	int *reloc_offset;
-	int i, total, ret;
-
-	/* We may process another execbuffer during the unlock... */
-	eb_reset(eb);
-	mutex_unlock(&dev->struct_mutex);
-
-	total = 0;
-	for (i = 0; i < count; i++)
-		total += eb->exec[i].relocation_count;
-
-	reloc_offset = drm_malloc_ab(count, sizeof(*reloc_offset));
-	reloc = drm_malloc_ab(total, sizeof(*reloc));
-	if (reloc == NULL || reloc_offset == NULL) {
-		drm_free_large(reloc);
-		drm_free_large(reloc_offset);
-		mutex_lock(&dev->struct_mutex);
-		return -ENOMEM;
-	}
-
-	total = 0;
-	for (i = 0; i < count; i++) {
-		struct drm_i915_gem_relocation_entry __user *user_relocs;
-		u64 invalid_offset = (u64)-1;
-		int j;
-
-		user_relocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
-
-		if (copy_from_user(reloc+total, user_relocs,
-				   eb->exec[i].relocation_count * sizeof(*reloc))) {
-			ret = -EFAULT;
-			mutex_lock(&dev->struct_mutex);
-			goto err;
-		}
+	bool have_copy = false;
+	const struct i915_vma *vma;
+	int ret = 0;
 
-		/* As we do not update the known relocation offsets after
-		 * relocating (due to the complexities in lock handling),
-		 * we need to mark them as invalid now so that we force the
-		 * relocation processing next time. Just in case the target
-		 * object is evicted and then rebound into its old
-		 * presumed_offset before the next execbuffer - if that
-		 * happened we would make the mistake of assuming that the
-		 * relocations were valid.
-		 */
-		for (j = 0; j < eb->exec[i].relocation_count; j++) {
-			if (__copy_to_user(&user_relocs[j].presumed_offset,
-					   &invalid_offset,
-					   sizeof(invalid_offset))) {
-				ret = -EFAULT;
-				mutex_lock(&dev->struct_mutex);
-				goto err;
-			}
-		}
+repeat:
+	if (signal_pending(current))
+		return -ERESTARTSYS;
 
-		reloc_offset[i] = total;
-		total += eb->exec[i].relocation_count;
+	/* We may process another execbuffer during the unlock... */
+	eb_reset(eb);
+	mutex_unlock(&dev->struct_mutex);
+
+	if (ret == 0 && likely(!i915.prefault_disable)) {
+		ret = eb_prefault_relocations(eb);
+	} else if (!have_copy) {
+		ret = eb_copy_relocations(eb);
+		have_copy = true;
+	} else {
+		cond_resched();
+		ret = 0;
+	}
+	if (ret) {
+		mutex_lock(&dev->struct_mutex);
+		return ret;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1084,16 +1268,18 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	if (ret)
 		goto err;
 
-	ret = eb_reserve(eb);
-	if (ret)
-		goto err;
-
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		int idx = vma->exec_entry - eb->exec;
-
-		ret = eb_relocate_vma_slow(vma, eb, reloc + reloc_offset[idx]);
-		if (ret)
-			goto err;
+	list_for_each_entry(vma, &eb->relocs, reloc_link) {
+		if (!have_copy) {
+			pagefault_disable();
+			ret = eb_relocate_vma(eb, vma);
+			pagefault_enable();
+			if (ret)
+				goto repeat;
+		} else {
+			ret = eb_relocate_vma_slow(eb, vma);
+			if (ret)
+				goto err;
+		}
 	}
 
 	/* Leave the user relocations as are, this is the painfully slow path,
@@ -1103,11 +1289,51 @@ eb_relocate_slow(struct i915_execbuffer *eb)
 	 */
 
 err:
-	drm_free_large(reloc);
-	drm_free_large(reloc_offset);
+	if (ret == -EAGAIN)
+		goto repeat;
+
+	if (have_copy) {
+		const unsigned int count = eb->args->buffer_count;
+		unsigned int i;
+
+		for (i = 0; i < count; i++) {
+			const struct drm_i915_gem_exec_object2 *entry =
+				&eb->exec[i];
+			struct drm_i915_gem_relocation_entry *relocs;
+
+			if (entry->relocation_count == 0)
+				continue;
+
+			relocs = to_ptr(typeof(*relocs), entry->relocs_ptr);
+			drm_free_large(relocs);
+		}
+	}
+
 	return ret;
 }
 
+static void eb_export_fence(struct drm_i915_gem_object *obj,
+			    struct drm_i915_gem_request *req,
+			    unsigned int flags)
+{
+	struct reservation_object *resv;
+
+	resv = i915_gem_object_get_dmabuf_resv(obj);
+	if (!resv)
+		return;
+
+	/* Ignore errors from failing to allocate the new fence, we can't
+	 * handle an error right now. Worst case should be missed
+	 * synchronisation leading to rendering corruption.
+	 */
+	ww_mutex_lock(&resv->lock, NULL);
+	if (flags & EXEC_OBJECT_WRITE)
+		reservation_object_add_excl_fence(resv, &req->fence);
+	else if (reservation_object_reserve_shared(resv) == 0)
+		reservation_object_add_shared_fence(resv, &req->fence);
+	ww_mutex_unlock(&resv->lock);
+}
+
 static unsigned int eb_other_engines(struct i915_execbuffer *eb)
 {
 	unsigned int mask;
@@ -1122,23 +1348,40 @@ static int
 eb_move_to_gpu(struct i915_execbuffer *eb)
 {
 	const unsigned int other_rings = eb_other_engines(eb);
-	struct i915_vma *vma;
+	const unsigned int count = eb->args->buffer_count;
+	unsigned int i;
 	int ret;
 
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
+	for (i = 0; i < count; i++) {
+		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+		struct i915_vma *vma = to_ptr(struct i915_vma, entry->rsvd2);
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (obj->flags & other_rings) {
-			ret = i915_gem_object_sync(obj,
-						   eb->request,
-						   vma->exec_entry->flags & EXEC_OBJECT_WRITE);
+			ret = i915_gem_object_sync(obj, eb->request,
+						   entry->flags & EXEC_OBJECT_WRITE);
 			if (ret)
 				return ret;
 		}
 
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
-			i915_gem_clflush_object(obj, false);
+		if (obj->base.write_domain) {
+			if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
+				i915_gem_clflush_object(obj, false);
+
+			obj->base.write_domain = 0;
+		}
+
+		if (entry->flags & EXEC_OBJECT_WRITE)
+			obj->base.read_domains = 0;
+		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
+
+		i915_vma_move_to_active(vma, eb->request, entry->flags);
+		eb_export_fence(obj, eb->request, entry->flags);
+
+		__eb_unreserve_vma(vma, entry);
+		vma->exec_entry = NULL;
 	}
+	eb->exec = NULL;
 
 	/* Unconditionally flush any chipset caches (for streaming writes). */
 	i915_gem_chipset_flush(eb->i915);
@@ -1170,104 +1413,6 @@ i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 	return true;
 }
 
-static int
-validate_exec_list(struct drm_device *dev,
-		   struct drm_i915_gem_exec_object2 *exec,
-		   int count)
-{
-	unsigned relocs_total = 0;
-	unsigned relocs_max = UINT_MAX / sizeof(struct drm_i915_gem_relocation_entry);
-	unsigned invalid_flags;
-	int i;
-
-	/* INTERNAL flags must not overlap with external ones */
-	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS);
-
-	invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
-	if (USES_FULL_PPGTT(dev))
-		invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
-
-	for (i = 0; i < count; i++) {
-		char __user *ptr = u64_to_user_ptr(exec[i].relocs_ptr);
-		int length; /* limited by fault_in_pages_readable() */
-
-		if (exec[i].flags & invalid_flags)
-			return -EINVAL;
-
-		/* Offset can be used as input (EXEC_OBJECT_PINNED), reject
-		 * any non-page-aligned or non-canonical addresses.
-		 */
-		if (exec[i].flags & EXEC_OBJECT_PINNED) {
-			if (exec[i].offset !=
-			    gen8_canonical_addr(exec[i].offset & PAGE_MASK))
-				return -EINVAL;
-
-			/* From drm_mm perspective address space is continuous,
-			 * so from this point we're always using non-canonical
-			 * form internally.
-			 */
-			exec[i].offset = gen8_noncanonical_addr(exec[i].offset);
-		}
-
-		if (exec[i].alignment && !is_power_of_2(exec[i].alignment))
-			return -EINVAL;
-
-		/* pad_to_size was once a reserved field, so sanitize it */
-		if (exec[i].flags & EXEC_OBJECT_PAD_TO_SIZE) {
-			if (offset_in_page(exec[i].pad_to_size))
-				return -EINVAL;
-		} else {
-			exec[i].pad_to_size = 0;
-		}
-
-		/* First check for malicious input causing overflow in
-		 * the worst case where we need to allocate the entire
-		 * relocation tree as a single array.
-		 */
-		if (exec[i].relocation_count > relocs_max - relocs_total)
-			return -EINVAL;
-		relocs_total += exec[i].relocation_count;
-
-		length = exec[i].relocation_count *
-			sizeof(struct drm_i915_gem_relocation_entry);
-		/*
-		 * We must check that the entire relocation array is safe
-		 * to read, but since we may need to update the presumed
-		 * offsets during execution, check for full write access.
-		 */
-		if (!access_ok(VERIFY_WRITE, ptr, length))
-			return -EFAULT;
-
-		if (likely(!i915.prefault_disable)) {
-			if (fault_in_multipages_readable(ptr, length))
-				return -EFAULT;
-		}
-	}
-
-	return 0;
-}
-
-static int eb_select_context(struct i915_execbuffer *eb)
-{
-	struct i915_gem_context *ctx;
-	unsigned int ctx_id;
-
-	ctx_id = i915_execbuffer2_get_context_id(*eb->args);
-	ctx = i915_gem_context_lookup(eb->file->driver_priv, ctx_id);
-	if (unlikely(IS_ERR(ctx)))
-		return PTR_ERR(ctx);
-
-	if (unlikely(ctx->hang_stats.banned)) {
-		DRM_DEBUG("Context %u tried to submit while banned\n", ctx_id);
-		return -EIO;
-	}
-
-	eb->ctx = i915_gem_context_get(ctx);
-	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
-
-	return 0;
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
 			     struct drm_i915_gem_request *req,
 			     unsigned int flags)
@@ -1289,11 +1434,7 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 
 	if (flags & EXEC_OBJECT_WRITE) {
 		i915_gem_active_set(&obj->last_write, req);
-
 		intel_fb_obj_invalidate(obj, ORIGIN_CS);
-
-		/* update for the implicit flush after a batch */
-		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
 	}
 
 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
@@ -1304,49 +1445,6 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 	list_move_tail(&vma->vm_link, &vma->vm->active_list);
 }
 
-static void eb_export_fence(struct drm_i915_gem_object *obj,
-			    struct drm_i915_gem_request *req,
-			    unsigned int flags)
-{
-	struct reservation_object *resv;
-
-	resv = i915_gem_object_get_dmabuf_resv(obj);
-	if (!resv)
-		return;
-
-	/* Ignore errors from failing to allocate the new fence, we can't
-	 * handle an error right now. Worst case should be missed
-	 * synchronisation leading to rendering corruption.
-	 */
-	ww_mutex_lock(&resv->lock, NULL);
-	if (flags & EXEC_OBJECT_WRITE)
-		reservation_object_add_excl_fence(resv, &req->fence);
-	else if (reservation_object_reserve_shared(resv) == 0)
-		reservation_object_add_shared_fence(resv, &req->fence);
-	ww_mutex_unlock(&resv->lock);
-}
-
-static void
-eb_move_to_active(struct i915_execbuffer *eb)
-{
-	struct i915_vma *vma;
-
-	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		struct drm_i915_gem_object *obj = vma->obj;
-		u32 old_read = obj->base.read_domains;
-		u32 old_write = obj->base.write_domain;
-
-		obj->base.write_domain = 0;
-		if (vma->exec_entry->flags & EXEC_OBJECT_WRITE)
-			obj->base.read_domains = 0;
-		obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
-
-		i915_vma_move_to_active(vma, eb->request, vma->exec_entry->flags);
-		eb_export_fence(obj, eb->request, vma->exec_entry->flags);
-		trace_i915_gem_object_change_domain(obj, old_read, old_write);
-	}
-}
-
 static int
 i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 {
@@ -1358,16 +1456,16 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
 		return -EINVAL;
 	}
 
-	ret = intel_ring_begin(req, 4 * 3);
+	ret = intel_ring_begin(req, 4 * 2 + 2);
 	if (ret)
 		return ret;
 
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(4));
 	for (i = 0; i < 4; i++) {
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
 		intel_ring_emit_reg(ring, GEN7_SO_WRITE_OFFSET(i));
 		intel_ring_emit(ring, 0);
 	}
-
+	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_advance(ring);
 
 	return 0;
@@ -1403,9 +1501,10 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 		goto out;
 
 	vma->exec_entry =
-		memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
+		memset(&eb->exec[eb->args->buffer_count++],
+		       0, sizeof(*vma->exec_entry));
 	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
-	list_add_tail(&vma->exec_list, &eb->vmas);
+	vma->exec_entry->rsvd2 = (uintptr_t)vma;
 
 out:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
@@ -1421,70 +1520,81 @@ add_to_client(struct drm_i915_gem_request *req,
 }
 
 static int
-execbuf_submit(struct i915_execbuffer *eb)
+eb_set_constants_offset(struct i915_execbuffer *eb)
 {
-	int instp_mode;
-	u32 instp_mask;
+	struct drm_i915_private *dev_priv = eb->i915;
+	struct intel_ring *ring;
+	u32 mode, mask;
 	int ret;
 
-	ret = eb_move_to_gpu(eb);
-	if (ret)
-		return ret;
-
-	ret = i915_switch_context(eb->request);
-	if (ret)
-		return ret;
-
-	instp_mode = eb->args->flags & I915_EXEC_CONSTANTS_MASK;
-	instp_mask = I915_EXEC_CONSTANTS_MASK;
-	switch (instp_mode) {
+	mode = eb->args->flags & I915_EXEC_CONSTANTS_MASK;
+	switch (mode) {
 	case I915_EXEC_CONSTANTS_REL_GENERAL:
 	case I915_EXEC_CONSTANTS_ABSOLUTE:
 	case I915_EXEC_CONSTANTS_REL_SURFACE:
-		if (instp_mode != 0 && eb->engine->id != RCS) {
-			DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
-			return -EINVAL;
-		}
-
-		if (instp_mode != eb->i915->relative_constants_mode) {
-			if (INTEL_INFO(eb->i915)->gen < 4) {
-				DRM_DEBUG("no rel constants on pre-gen4\n");
-				return -EINVAL;
-			}
-
-			if (INTEL_INFO(eb->i915)->gen > 5 &&
-			    instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
-				DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
-				return -EINVAL;
-			}
-
-			/* The HW changed the meaning on this bit on gen6 */
-			if (INTEL_INFO(eb->i915)->gen >= 6)
-				instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
-		}
 		break;
 	default:
-		DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode);
+		DRM_DEBUG("execbuf with unknown constants: %d\n", mode);
 		return -EINVAL;
 	}
 
-	if (eb->engine->id == RCS &&
-	    instp_mode != eb->i915->relative_constants_mode) {
-		struct intel_ring *ring = eb->request->ring;
+	if (mode == dev_priv->relative_constants_mode)
+		return 0;
 
-		ret = intel_ring_begin(eb->request, 4);
-		if (ret)
-			return ret;
+	if (eb->engine->id != RCS) {
+		DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
+		return -EINVAL;
+	}
 
-		intel_ring_emit(ring, MI_NOOP);
-		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-		intel_ring_emit_reg(ring, INSTPM);
-		intel_ring_emit(ring, instp_mask << 16 | instp_mode);
-		intel_ring_advance(ring);
+	if (INTEL_GEN(dev_priv) < 4) {
+		DRM_DEBUG("no rel constants on pre-gen4\n");
+		return -EINVAL;
+	}
 
-		eb->i915->relative_constants_mode = instp_mode;
+	if (INTEL_GEN(dev_priv) > 5 &&
+	    mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
+		DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
+		return -EINVAL;
 	}
 
+	/* The HW changed the meaning on this bit on gen6 */
+	mask = I915_EXEC_CONSTANTS_MASK;
+	if (INTEL_GEN(dev_priv) >= 6)
+		mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
+
+	ret = intel_ring_begin(eb->request, 4);
+	if (ret)
+		return ret;
+
+	ring = eb->request->ring;
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit_reg(ring, INSTPM);
+	intel_ring_emit(ring, mask << 16 | mode);
+	intel_ring_advance(ring);
+
+	dev_priv->relative_constants_mode = mode;
+
+	return 0;
+}
+
+static int
+eb_submit(struct i915_execbuffer *eb)
+{
+	int ret;
+
+	ret = eb_move_to_gpu(eb);
+	if (ret)
+		return ret;
+
+	ret = i915_switch_context(eb->request);
+	if (ret)
+		return ret;
+
+	ret = eb_set_constants_offset(eb);
+	if (ret)
+		return ret;
+
 	if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
 		ret = i915_reset_gen7_sol_offsets(eb->request);
 		if (ret)
@@ -1493,15 +1603,13 @@ execbuf_submit(struct i915_execbuffer *eb)
 
 	ret = eb->engine->emit_bb_start(eb->request,
 					eb->batch->node.start +
-					eb->batch_start_offset,
+					eb->args->batch_start_offset,
 					eb->args->batch_len,
 					eb->dispatch_flags);
 	if (ret)
 		return ret;
 
 	trace_i915_gem_ring_dispatch(eb->request, eb->dispatch_flags);
-
-	eb_move_to_active(eb);
 	add_to_client(eb->request, eb->file);
 
 	return 0;
@@ -1596,18 +1704,21 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	struct i915_execbuffer eb;
 	int ret;
 
+	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & ~__EXEC_OBJECT_UNKNOWN_FLAGS);
+
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
 
-	ret = validate_exec_list(dev, exec, args->buffer_count);
-	if (ret)
-		return ret;
-
 	eb.i915 = to_i915(dev);
 	eb.file = file;
 	eb.args = args;
-	eb.exec = exec;
-	eb.need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+	if ((args->flags & I915_EXEC_NO_RELOC) == 0)
+		args->flags |= __EXEC_HAS_RELOC;
+	eb.exec = NULL;
+	eb.ctx = NULL;
+	eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
+	if (USES_FULL_PPGTT(eb.i915))
+		eb.invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
 	reloc_cache_init(&eb.reloc_cache, eb.i915);
 
 	eb.dispatch_flags = 0;
@@ -1638,6 +1749,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		eb.dispatch_flags |= I915_DISPATCH_RS;
 	}
 
+	if (eb_create(&eb))
+		return -ENOMEM;
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1645,70 +1759,57 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 * 100ms.
 	 */
 	intel_runtime_pm_get(eb.i915);
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
-		goto pre_mutex_err;
+		goto err_rpm;
 
 	ret = eb_select_context(&eb);
-	if (ret) {
-		mutex_unlock(&dev->struct_mutex);
-		goto pre_mutex_err;
-	}
+	if (unlikely(ret))
+		goto err_unlock;
 
-	if (eb_create(&eb)) {
-		i915_gem_context_put(eb.ctx);
-		mutex_unlock(&dev->struct_mutex);
-		ret = -ENOMEM;
-		goto pre_mutex_err;
-	}
-
-	/* Look up object handles */
+	eb.exec = exec;
 	ret = eb_lookup_vmas(&eb);
-	if (ret)
-		goto err;
-
-	/* take note of the batch buffer before we might reorder the lists */
-	eb.batch = eb_get_batch(&eb);
-
-	/* Move the objects en-masse into the GTT, evicting if necessary. */
-	ret = eb_reserve(&eb);
-	if (ret)
-		goto err;
+	if (unlikely(ret))
+		goto err_context;
 
 	/* The objects are in their final locations, apply the relocations. */
-	if (eb.need_relocs)
+	if (args->flags & __EXEC_HAS_RELOC && !list_empty(&eb.relocs)) {
 		ret = eb_relocate(&eb);
-	if (ret) {
-		if (ret == -EFAULT) {
+		if (ret == -EAGAIN || ret == -EFAULT)
 			ret = eb_relocate_slow(&eb);
-			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
+		if (ret) {
+			/* If the user expects the execobject.offset and
+			 * reloc.presumed_offset to be an exact match,
+			 * as for using NO_RELOC, then we cannot update
+			 * the execobject.offset until we have completed
+			 * relocation.
+			 */
+			if (args->flags & I915_EXEC_NO_RELOC)
+				args->flags &= ~__EXEC_HAS_RELOC;
+			goto err_vma;
 		}
-		if (ret)
-			goto err;
 	}
 
 	/* Set the pending read domains for the batch buffer to COMMAND */
-	if (eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE) {
+	if (unlikely(eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE)) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		ret = -EINVAL;
-		goto err;
+		goto err_vma;
 	}
 	if (args->batch_start_offset > eb.batch->size ||
 	    args->batch_len > eb.batch->size - args->batch_start_offset) {
 		DRM_DEBUG("Attempting to use out-of-bounds batch\n");
 		ret = -EINVAL;
-		goto err;
+		goto err_vma;
 	}
 
-	eb.batch_start_offset = args->batch_start_offset;
 	if (intel_engine_needs_cmd_parser(eb.engine) && args->batch_len) {
 		struct i915_vma *vma;
 
 		vma = eb_parse(&eb, drm_is_current_master(file));
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
-			goto err;
+			goto err_vma;
 		}
 
 		if (vma) {
@@ -1722,7 +1823,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			 * command parser has accepted.
 			 */
 			eb.dispatch_flags |= I915_DISPATCH_SECURE;
-			eb.batch_start_offset = 0;
+			eb.args->batch_start_offset = 0;
 			eb.batch = vma;
 		}
 	}
@@ -1731,7 +1832,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 * batch" bit. Hence we need to pin secure batches into the global gtt.
 	 * hsw should have this fixed, but bdw mucks it up again. */
 	if (eb.dispatch_flags & I915_DISPATCH_SECURE) {
-		struct drm_i915_gem_object *obj = eb.batch->obj;
 		struct i915_vma *vma;
 
 		/*
@@ -1744,17 +1844,18 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		 *   fitting due to fragmentation.
 		 * So this is actually safe.
 		 */
-		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
+		vma = i915_gem_object_ggtt_pin(eb.batch->obj, NULL,
+					       0, 0, 0);
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
-			goto err;
+			goto err_vma;
 		}
 
 		eb.batch = vma;
 	}
 
 	if (args->batch_len == 0)
-		args->batch_len = eb.batch->size - eb.batch_start_offset;
+		args->batch_len = eb.batch->size - eb.args->batch_start_offset;
 
 	/* Allocate a request for this batch buffer nice and early. */
 	eb.request = i915_gem_request_alloc(eb.engine, eb.ctx);
@@ -1771,7 +1872,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 */
 	eb.request->batch = eb.batch;
 
-	ret = execbuf_submit(&eb);
+	ret = eb_submit(&eb);
 	__i915_add_request(eb.request, ret == 0);
 
 err_batch_unpin:
@@ -1783,15 +1884,18 @@ err_batch_unpin:
 	 */
 	if (eb.dispatch_flags & I915_DISPATCH_SECURE)
 		i915_vma_unpin(eb.batch);
-err:
 	/* the request owns the ref now */
-	eb_destroy(&eb);
+err_vma:
+	eb_release_vma(&eb);
+err_context:
+	i915_gem_context_put(eb.ctx);
+err_unlock:
 	mutex_unlock(&dev->struct_mutex);
-
-pre_mutex_err:
+err_rpm:
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(eb.i915);
+	eb_destroy(&eb);
 	return ret;
 }
 
@@ -1815,8 +1919,12 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 	}
 
 	/* Copy in the exec list from userland */
-	exec_list = drm_malloc_ab(sizeof(*exec_list), args->buffer_count);
-	exec2_list = drm_malloc_ab(sizeof(*exec2_list), args->buffer_count);
+	exec_list = drm_malloc_gfp(args->buffer_count,
+				   sizeof(*exec_list),
+				   __GFP_NOWARN | GFP_TEMPORARY);
+	exec2_list = drm_malloc_gfp(args->buffer_count + 1,
+				    sizeof(*exec2_list),
+				    __GFP_NOWARN | GFP_TEMPORARY);
 	if (exec_list == NULL || exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
@@ -1841,7 +1949,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 		exec2_list[i].relocs_ptr = exec_list[i].relocs_ptr;
 		exec2_list[i].alignment = exec_list[i].alignment;
 		exec2_list[i].offset = exec_list[i].offset;
-		if (INTEL_INFO(dev)->gen < 4)
+		if (INTEL_GEN(dev) < 4)
 			exec2_list[i].flags = EXEC_OBJECT_NEEDS_FENCE;
 		else
 			exec2_list[i].flags = 0;
@@ -1859,24 +1967,22 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 	i915_execbuffer2_set_context_id(exec2, 0);
 
 	ret = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list);
-	if (!ret) {
+	if (exec2.flags & __EXEC_HAS_RELOC) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
 			u64_to_user_ptr(args->buffers_ptr);
 
 		/* Copy the new buffer offsets back to the user's exec list. */
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user (%d)\n",
-					  args->buffer_count, ret);
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			exec2_list[i].offset &= PIN_OFFSET_MASK;
+			if (__copy_to_user(&user_exec_list[i].offset,
+					   &exec2_list[i].offset,
+					   sizeof(user_exec_list[i].offset)))
 				break;
-			}
 		}
 	}
 
@@ -1890,11 +1996,11 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		     struct drm_file *file)
 {
 	struct drm_i915_gem_execbuffer2 *args = data;
-	struct drm_i915_gem_exec_object2 *exec2_list = NULL;
+	struct drm_i915_gem_exec_object2 *exec2_list;
 	int ret;
 
 	if (args->buffer_count < 1 ||
-	    args->buffer_count > UINT_MAX / sizeof(*exec2_list)) {
+	    args->buffer_count >= UINT_MAX / sizeof(*exec2_list)) {
 		DRM_DEBUG("execbuf2 with %d buffers\n", args->buffer_count);
 		return -EINVAL;
 	}
@@ -1904,45 +2010,42 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	exec2_list = drm_malloc_gfp(args->buffer_count,
+	exec2_list = drm_malloc_gfp(args->buffer_count + 1,
 				    sizeof(*exec2_list),
-				    GFP_TEMPORARY);
+				    __GFP_NOWARN | GFP_TEMPORARY);
 	if (exec2_list == NULL) {
 		DRM_DEBUG("Failed to allocate exec list for %d buffers\n",
 			  args->buffer_count);
 		return -ENOMEM;
 	}
-	ret = copy_from_user(exec2_list,
-			     u64_to_user_ptr(args->buffers_ptr),
-			     sizeof(*exec2_list) * args->buffer_count);
-	if (ret != 0) {
-		DRM_DEBUG("copy %d exec entries failed %d\n",
-			  args->buffer_count, ret);
+	if (copy_from_user(exec2_list,
+			   u64_to_user_ptr(args->buffers_ptr),
+			   sizeof(*exec2_list) * args->buffer_count)) {
+		DRM_DEBUG("copy %d exec entries failed\n", args->buffer_count);
 		drm_free_large(exec2_list);
 		return -EFAULT;
 	}
 
 	ret = i915_gem_do_execbuffer(dev, file, args, exec2_list);
-	if (!ret) {
+	if (args->flags & __EXEC_HAS_RELOC) {
 		/* Copy the new buffer offsets back to the user's exec list. */
 		struct drm_i915_gem_exec_object2 __user *user_exec_list =
-				   u64_to_user_ptr(args->buffers_ptr);
+			u64_to_user_ptr(args->buffers_ptr);
 		int i;
 
+		user_access_begin();
 		for (i = 0; i < args->buffer_count; i++) {
+			if ((exec2_list[i].offset & UPDATE) == 0)
+				continue;
+
 			exec2_list[i].offset =
-				gen8_canonical_addr(exec2_list[i].offset);
-			ret = __copy_to_user(&user_exec_list[i].offset,
-					     &exec2_list[i].offset,
-					     sizeof(user_exec_list[i].offset));
-			if (ret) {
-				ret = -EFAULT;
-				DRM_DEBUG("failed to copy %d exec entries "
-					  "back to user\n",
-					  args->buffer_count);
-				break;
-			}
+				gen8_canonical_addr(exec2_list[i].offset & PIN_OFFSET_MASK);
+			unsafe_put_user(exec2_list[i].offset,
+					&user_exec_list[i].offset,
+					end_user);
 		}
+end_user:
+		user_access_end();
 	}
 
 	drm_free_large(exec2_list);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index f9540683d2c0..ba04b0bf7fe0 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -230,7 +230,9 @@ struct i915_vma {
 	struct hlist_node obj_node;
 
 	/** This vma's place in the batchbuffer or on the eviction list */
-	struct list_head exec_list;
+	struct list_head exec_link;
+	struct list_head reloc_link;
+	struct list_head evict_link;
 
 	/**
 	 * Used for performing relocations during execbuffer insertion.
-- 
2.9.3