[PATCH 12/14] drm/i915: Lockless object unreference and free

Chris Wilson chris at chris-wilson.co.uk
Sat Aug 6 13:34:06 UTC 2016


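The aim, as the diffstat suggests, is to drop the struct_mutex
requirement around the final unreference and free of a GEM object:
the driver moves to .gem_free_object_unlocked, the separate
i915_gem_object_put_unlocked() disappears in favour of a single
i915_gem_object_put(), and framebuffer_references becomes an atomic_t
so the tiling checks need no mutex. In support, a new kernel fence
(kfence) primitive is introduced for composing waits on DMA fences
and reservation objects, and execlists submission is rewritten around
a fixed pair of ELSP ports tracked on the engine rather than a
reference-juggling request queue.
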
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/dma-buf/fence.c                    |  58 +++
 drivers/dma-buf/reservation.c              |  48 +++
 drivers/gpu/drm/i915/i915_debugfs.c        |   2 +-
 drivers/gpu/drm/i915/i915_drv.c            |   2 +-
 drivers/gpu/drm/i915/i915_drv.h            |  12 +-
 drivers/gpu/drm/i915/i915_gem.c            |  32 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  66 +++-
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  54 +--
 drivers/gpu/drm/i915/i915_gem_gtt.h        |   1 -
 drivers/gpu/drm/i915/i915_gem_request.c    |  12 +-
 drivers/gpu/drm/i915/i915_gem_request.h    |  24 +-
 drivers/gpu/drm/i915/i915_gem_tiling.c     |   6 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c    |   4 +-
 drivers/gpu/drm/i915/intel_display.c       | 161 ++++----
 drivers/gpu/drm/i915/intel_drv.h           |   7 +-
 drivers/gpu/drm/i915/intel_fbdev.c         |  21 +-
 drivers/gpu/drm/i915/intel_guc_loader.c    |   3 -
 drivers/gpu/drm/i915/intel_lrc.c           | 419 +++++++--------------
 drivers/gpu/drm/i915/intel_lrc.h           |   2 -
 drivers/gpu/drm/i915/intel_overlay.c       |   4 +-
 drivers/gpu/drm/i915/intel_pm.c            |   2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   7 +-
 include/linux/fence.h                      |   6 +
 include/linux/kfence.h                     |  86 +++++
 include/linux/reservation.h                |   7 +
 kernel/Makefile                            |   2 +-
 kernel/kfence.c                            | 497 ++++++++++++++++++++++++
 lib/Kconfig.debug                          |  23 ++
 lib/Makefile                               |   1 +
 lib/test-kfence.c                          | 580 +++++++++++++++++++++++++++++
 tools/testing/selftests/lib/kfence.sh      |  10 +
 31 files changed, 1683 insertions(+), 476 deletions(-)
 create mode 100644 include/linux/kfence.h
 create mode 100644 kernel/kfence.c
 create mode 100644 lib/test-kfence.c
 create mode 100755 tools/testing/selftests/lib/kfence.sh

diff --git a/drivers/dma-buf/fence.c b/drivers/dma-buf/fence.c
index 4d51f9e83fa8..97091894ff16 100644
--- a/drivers/dma-buf/fence.c
+++ b/drivers/dma-buf/fence.c
@@ -22,6 +22,7 @@
 #include <linux/export.h>
 #include <linux/atomic.h>
 #include <linux/fence.h>
+#include <linux/kfence.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/fence.h>
@@ -530,3 +531,60 @@ fence_init(struct fence *fence, const struct fence_ops *ops,
 	trace_fence_init(fence);
 }
 EXPORT_SYMBOL(fence_init);
+
+struct dma_fence_cb {
+	struct fence_cb base;
+	struct kfence *fence;
+};
+
+static void dma_kfence_wake(struct fence *dma, struct fence_cb *data)
+{
+	struct dma_fence_cb *cb = container_of(data, typeof(*cb), base);
+
+	kfence_complete(cb->fence);
+	kfence_put(cb->fence);
+	kfree(cb);
+}
+
+/**
+ * kfence_await_dma_fence - set the fence to wait upon a DMA fence
+ * @fence: this kfence
+ * @dma: target DMA fence to wait upon
+ * @gfp: the allowed allocation type
+ *
+ * kfence_await_dma_fence() causes @fence to wait upon the completion of @dma.
+ *
+ * Returns 1 if @fence was successfully added to the waitqueue of @dma, 0
+ * if @dma was already signaled (and so not added), or a negative error code.
+ */
+int kfence_await_dma_fence(struct kfence *fence, struct fence *dma, gfp_t gfp)
+{
+	struct dma_fence_cb *cb;
+	int ret;
+
+	if (fence_is_signaled(dma))
+		return 0;
+
+	cb = kmalloc(sizeof(*cb), gfp);
+	if (!cb) {
+		if (!gfpflags_allow_blocking(gfp))
+			return -ENOMEM;
+
+		return fence_wait(dma, false);
+	}
+
+	cb->fence = kfence_get(fence);
+	kfence_await(fence);
+
+	ret = fence_add_callback(dma, &cb->base, dma_kfence_wake);
+	if (ret == 0) {
+		ret = 1;
+	} else {
+		dma_kfence_wake(dma, &cb->base);
+		if (ret == -ENOENT) /* fence already signaled */
+			ret = 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kfence_await_dma_fence);
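
A minimal usage sketch for the helper above. This assumes kfence
follows the usual completion-count pattern (init holds one count,
each await adds one, kfence_complete() drops one, and the fence
signals at zero); kfence_init() and kfence_wait() are assumed names
from the new include/linux/kfence.h, shown only for illustration:

	struct kfence fence;
	int err;

	kfence_init(&fence);			/* assumed constructor */
	err = kfence_await_dma_fence(&fence, dma, GFP_KERNEL);
	if (err < 0)
		return err;			/* e.g. -ENOMEM */
	kfence_complete(&fence);	/* drop the init count, seal fence */
	kfence_wait(&fence);		/* assumed: sleep until @dma signals */
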
diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 9566a62ad8e3..138b792af0c3 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -543,3 +543,51 @@ unlock_retry:
 	goto retry;
 }
 EXPORT_SYMBOL_GPL(reservation_object_test_signaled_rcu);
+
+/**
+ * kfence_await_reservation - set the fence to wait upon a reservation_object
+ * @fence: this kfence
+ * @resv: target reservation_object (collection of DMA fences) to wait upon
+ * @write: true if waiting for read/write access, false for read access only
+ * @gfp: the allowed allocation type
+ *
+ * kfence_await_reservation() causes @fence to wait upon completion of the
+ * reservation object (a collection of DMA fences), either for read access
+ * or for read/write access.
+ *
+ * Returns 1 if @fence was successfully added to the waitqueues of @resv, 0
+ * if @resv was already signaled (and so not added), or a negative error code.
+ */
+int kfence_await_reservation(struct kfence *fence,
+			     struct reservation_object *resv,
+			     bool write,
+			     gfp_t gfp)
+{
+	struct fence *excl, **shared;
+	unsigned int count, i;
+	int ret;
+
+	ret = reservation_object_get_fences_rcu(resv, &excl, &count, &shared);
+	if (ret)
+		return ret;
+
+	if (write) {
+		for (i = 0; i < count; i++) {
+			ret |= kfence_await_dma_fence(fence, shared[i], gfp);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+	if (excl)
+		ret |= kfence_await_dma_fence(fence, excl, gfp);
+
+out:
+	fence_put(excl);
+	for (i = 0; i < count; i++)
+		fence_put(shared[i]);
+	kfree(shared);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kfence_await_reservation);
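
The reservation helper simply fans out over the fences returned by
reservation_object_get_fences_rcu(); a hypothetical caller wanting
write access to a buffer (resv being the reservation object attached
to its dma-buf) would look roughly like:

	/* Arm callbacks on the exclusive fence and every shared fence. */
	ret = kfence_await_reservation(fence, resv, true, GFP_KERNEL);
	if (ret < 0)
		return ret;	/* failed to attach a callback */
	/* ret == 0: already idle; ret > 0: waits queued, signalled later. */
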
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 60bcdb228207..34f44050af25 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2190,7 +2190,7 @@ static int i915_execlists(struct seq_file *m, void *data)
 		status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
 		seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
 
-		read_pointer = engine->next_context_status_buffer;
+		read_pointer = GEN8_CSB_READ_PTR(status_pointer);
 		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
 		if (read_pointer > write_pointer)
 			write_pointer += GEN8_CSB_ENTRIES;
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ccae2ec4c78e..4036dbe1334a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2568,7 +2568,7 @@ static struct drm_driver driver = {
 	.set_busid = drm_pci_set_busid,
 
 	.gem_close_object = i915_gem_close_object,
-	.gem_free_object = i915_gem_free_object,
+	.gem_free_object_unlocked = i915_gem_free_object,
 	.gem_vm_ops = &i915_gem_vm_ops,
 
 	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e8778c97eaa5..568ff1b35f8b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2275,7 +2275,7 @@ struct drm_i915_gem_object {
 	struct i915_gem_active last_write;
 
 	/** References from framebuffers, locks out tiling changes. */
-	unsigned long framebuffer_references;
+	atomic_t framebuffer_references;
 
 	/** Record of address bit 17 of each page at last unbind. */
 	unsigned long *bit_17;
@@ -2331,19 +2331,11 @@ __attribute__((nonnull))
 static inline void
 i915_gem_object_put(struct drm_i915_gem_object *obj)
 {
-	drm_gem_object_unreference(&obj->base);
+	__drm_gem_object_unreference(&obj->base);
 }
 
 __deprecated
 extern void drm_gem_object_unreference(struct drm_gem_object *);
-
-__attribute__((nonnull))
-static inline void
-i915_gem_object_put_unlocked(struct drm_i915_gem_object *obj)
-{
-	drm_gem_object_unreference_unlocked(&obj->base);
-}
-
 __deprecated
 extern void drm_gem_object_unreference_unlocked(struct drm_gem_object *);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1cb71630d61a..c03fea6b0677 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -501,7 +501,7 @@ i915_gem_create(struct drm_file *file,
 
 	ret = drm_gem_handle_create(file, &obj->base, &handle);
 	/* drop reference from allocate - handle holds it now */
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	if (ret)
 		return ret;
 
@@ -981,7 +981,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 
 	i915_gem_object_unpin_pages(obj);
 out:
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return ret;
 }
 
@@ -1310,7 +1310,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 
 	i915_gem_object_unpin_pages(obj);
 err:
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return ret;
 }
 
@@ -1387,7 +1387,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 err_pages:
 	i915_gem_object_unpin_pages(obj);
 err_unlocked:
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return ret;
 }
 
@@ -1418,7 +1418,7 @@ i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
 		}
 	}
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return err;
 }
 
@@ -1464,7 +1464,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 	 * pages from.
 	 */
 	if (!obj->base.filp) {
-		i915_gem_object_put_unlocked(obj);
+		i915_gem_object_put(obj);
 		return -EINVAL;
 	}
 
@@ -1476,7 +1476,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 		struct vm_area_struct *vma;
 
 		if (down_write_killable(&mm->mmap_sem)) {
-			i915_gem_object_put_unlocked(obj);
+			i915_gem_object_put(obj);
 			return -EINTR;
 		}
 		vma = find_vma(mm, addr);
@@ -1490,7 +1490,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
 		/* This may race, but that's ok, it only gets set */
 		WRITE_ONCE(obj->has_wc_mmap, true);
 	}
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	if (IS_ERR((void *)addr))
 		return addr;
 
@@ -1841,7 +1841,7 @@ i915_gem_mmap_gtt(struct drm_file *file,
 	if (ret == 0)
 		*offset = drm_vma_node_offset_addr(&obj->base.vma_node);
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return ret;
 }
 
@@ -2409,7 +2409,11 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
 		/* Ensure irq handler finishes or is cancelled. */
 		tasklet_kill(&engine->irq_tasklet);
 
-		intel_execlists_cancel_requests(engine);
+		INIT_LIST_HEAD(&engine->execlist_queue);
+		i915_gem_request_assign(&engine->execlist_port[0].request,
+					NULL);
+		i915_gem_request_assign(&engine->execlist_port[1].request,
+					NULL);
 	}
 
 	/*
@@ -2610,7 +2614,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 			break;
 	}
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return ret;
 }
 
@@ -3311,7 +3315,7 @@ int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
 		break;
 	}
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return 0;
 }
 
@@ -3828,7 +3832,7 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 		rcu_read_unlock();
 	}
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return 0;
 }
 
@@ -3884,7 +3888,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 	mutex_unlock(&obj->mm.lock);
 
 err:
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 520145cb2580..150a6cff6f2f 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1352,6 +1352,68 @@ static unsigned int eb_other_engines(struct i915_execbuffer *eb)
 }
 
 static int
+__eb_sync(struct drm_i915_gem_request *to,
+	  struct drm_i915_gem_request *from)
+{
+	int idx, ret;
+
+	if (to->engine == from->engine)
+		return 0;
+
+	idx = intel_engine_sync_index(from->engine, to->engine);
+	if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx])
+		return 0;
+
+	trace_i915_gem_ring_sync_to(to, from);
+	if (!i915.semaphores) {
+		ret = i915_wait_request(from, true, NULL, NO_WAITBOOST);
+		if (ret)
+			return ret;
+	} else {
+		ret = to->engine->semaphore.sync_to(to, from);
+		if (ret)
+			return ret;
+	}
+
+	from->engine->semaphore.sync_seqno[idx] = from->fence.seqno;
+	return 0;
+}
+
+static int
+eb_sync(struct drm_i915_gem_object *obj,
+	struct drm_i915_gem_request *to,
+	bool write)
+{
+	struct i915_gem_active *active;
+	unsigned long active_mask;
+	int idx;
+
+	if (write) {
+		active_mask = i915_gem_object_get_active(obj);
+		active = obj->last_read;
+	} else {
+		active_mask = 1;
+		active = &obj->last_write;
+	}
+
+	for_each_active(active_mask, idx) {
+		struct drm_i915_gem_request *request;
+		int ret;
+
+		request = i915_gem_active_peek(&active[idx],
+					       &obj->base.dev->struct_mutex);
+		if (!request)
+			continue;
+
+		ret = __eb_sync(to, request);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int
 eb_move_to_gpu(struct i915_execbuffer *eb)
 {
 	const unsigned int other_rings = eb_other_engines(eb);
@@ -1366,8 +1428,8 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (obj->flags & other_rings) {
-			ret = i915_gem_object_sync(obj, eb->request,
-						   entry->flags & EXEC_OBJECT_WRITE);
+			ret = eb_sync(obj, eb->request,
+				      entry->flags & EXEC_OBJECT_WRITE);
 			if (ret)
 				return ret;
 		}
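
The sync_seqno[] cache in __eb_sync() above is what keeps redundant
inter-engine waits off the ring; a worked example (seqno values
invented):

	/*
	 * sync_seqno[idx] == 8: engine 'to' already waits upon seqno 8
	 * from engine 'from'. A later sync against seqno 6 returns
	 * early (6 <= 8); seqno 10 emits a new semaphore wait (or a
	 * CPU wait when i915.semaphores is off) and caches 10.
	 */
	if (from->fence.seqno <= from->engine->semaphore.sync_seqno[idx])
		return 0;	/* already ordered by an earlier wait */
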
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 3aeced17d369..0b7e7a6bb65c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2515,14 +2515,16 @@ static int ggtt_bind_vma(struct i915_vma *vma,
 {
 	struct drm_i915_private *i915 = to_i915(vma->vm->dev);
 	struct drm_i915_gem_object *obj = vma->obj;
-	u32 pte_flags = 0;
-	int ret;
+	u32 pte_flags;
 
-	ret = i915_get_ggtt_vma_pages(vma);
-	if (ret)
-		return ret;
+	if (!vma->ggtt_view.pages) {
+		int ret = i915_get_ggtt_vma_pages(vma);
+		if (ret)
+			return ret;
+	}
 
 	/* Currently applicable only to VLV */
+	pte_flags = 0;
 	if (obj->gt_ro)
 		pte_flags |= PTE_READ_ONLY;
 
@@ -2548,18 +2550,18 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma,
 {
 	struct drm_i915_private *i915 = to_i915(vma->vm->dev);
 	u32 pte_flags;
-	int ret;
 
-	ret = i915_get_ggtt_vma_pages(vma);
-	if (ret)
-		return ret;
+	if (!vma->ggtt_view.pages) {
+		int ret = i915_get_ggtt_vma_pages(vma);
+		if (ret)
+			return ret;
+	}
 
 	/* Currently applicable only to VLV */
 	pte_flags = 0;
 	if (vma->obj->gt_ro)
 		pte_flags |= PTE_READ_ONLY;
 
-
 	if (flags & I915_VMA_GLOBAL_BIND) {
 		intel_runtime_pm_get(i915);
 		vma->vm->insert_entries(vma->vm,
@@ -3465,7 +3467,7 @@ rotate_pages(const dma_addr_t *in, unsigned int offset,
 	return sg;
 }
 
-static struct sg_table *
+static noinline struct sg_table *
 intel_rotate_fb_obj_pages(struct intel_rotation_info *rot_info,
 			  struct drm_i915_gem_object *obj)
 {
@@ -3555,7 +3557,7 @@ err_st_alloc:
 	return ERR_PTR(ret);
 }
 
-static struct sg_table *
+static noinline struct sg_table *
 intel_partial_pages(const struct i915_ggtt_view *view,
 		    struct drm_i915_gem_object *obj)
 {
@@ -3599,34 +3601,36 @@ err_st_alloc:
 static int
 i915_get_ggtt_vma_pages(struct i915_vma *vma)
 {
-	int ret = 0;
+	int ret;
 
-	if (vma->ggtt_view.pages)
+	switch (vma->ggtt_view.type) {
+	case I915_GGTT_VIEW_NORMAL:
+		vma->ggtt_view.pages = vma->obj->mm.pages;
 		return 0;
 
-	if (vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL)
-		vma->ggtt_view.pages = vma->obj->mm.pages;
-	else if (vma->ggtt_view.type == I915_GGTT_VIEW_ROTATED)
+	case I915_GGTT_VIEW_ROTATED:
 		vma->ggtt_view.pages =
 			intel_rotate_fb_obj_pages(&vma->ggtt_view.params.rotated, vma->obj);
-	else if (vma->ggtt_view.type == I915_GGTT_VIEW_PARTIAL)
+		break;
+
+	case I915_GGTT_VIEW_PARTIAL:
 		vma->ggtt_view.pages =
 			intel_partial_pages(&vma->ggtt_view, vma->obj);
-	else
+		break;
+
+	default:
 		WARN_ONCE(1, "GGTT view %u not implemented!\n",
 			  vma->ggtt_view.type);
+		return -EINVAL;
+	}
 
-	if (!vma->ggtt_view.pages) {
-		DRM_ERROR("Failed to get pages for GGTT view type %u!\n",
-			  vma->ggtt_view.type);
-		ret = -EINVAL;
-	} else if (IS_ERR(vma->ggtt_view.pages)) {
+	ret = 0;
+	if (IS_ERR(vma->ggtt_view.pages)) {
 		ret = PTR_ERR(vma->ggtt_view.pages);
 		vma->ggtt_view.pages = NULL;
 		DRM_ERROR("Failed to get pages for VMA view type %u (%d)!\n",
 			  vma->ggtt_view.type, ret);
 	}
-
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 05b5eda5c4a3..da71fdfd97ce 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -465,7 +465,6 @@ struct i915_hw_ppgtt {
 
 	gen6_pte_t __iomem *pd_addr;
 
-	int (*enable)(struct i915_hw_ppgtt *ppgtt);
 	int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
 			 struct drm_i915_gem_request *req);
 	void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 8f98a7e34212..c23008c63470 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -380,7 +380,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	req->previous_context = NULL;
 	req->file_priv = NULL;
 	req->batch = NULL;
-	req->elsp_submitted = 0;
 
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
@@ -725,16 +724,18 @@ complete:
 	return ret;
 }
 
-static void engine_retire_requests(struct intel_engine_cs *engine)
+static bool engine_retire_requests(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *request, *next;
 
 	list_for_each_entry_safe(request, next, &engine->request_list, link) {
 		if (!i915_gem_request_completed(request))
-			break;
+			return false;
 
 		i915_gem_request_retire(request);
 	}
+
+	return true;
 }
 
 void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
@@ -748,9 +749,8 @@ void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
 
 	GEM_BUG_ON(!dev_priv->gt.awake);
 
-	for_each_engine(engine, dev_priv) {
-		engine_retire_requests(engine);
-		if (!intel_engine_is_active(engine))
+	for_each_engine_masked(engine, dev_priv, dev_priv->gt.active_engines) {
+		if (engine_retire_requests(engine))
 			dev_priv->gt.active_engines &= ~intel_engine_flag(engine);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 1f83a38e69b4..edca7f74d471 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -101,6 +101,9 @@ struct drm_i915_gem_request {
 	/** Position in the ringbuffer of the end of the whole request */
 	u32 tail;
 
+	/** Position in the ringbuffer after the end of the whole request */
+	u32 wa_tail;
+
 	/** Preallocate space in the ringbuffer for emitting the request */
 	u32 reserved_space;
 
@@ -134,27 +137,8 @@ struct drm_i915_gem_request {
 	/** file_priv list entry for this request */
 	struct list_head client_link;
 
-	/**
-	 * The ELSP only accepts two elements at a time, so we queue
-	 * context/tail pairs on a given queue (ring->execlist_queue) until the
-	 * hardware is available. The queue serves a double purpose: we also use
-	 * it to keep track of the up to 2 contexts currently in the hardware
-	 * (usually one in execution and the other queued up by the GPU): We
-	 * only remove elements from the head of the queue when the hardware
-	 * informs us that an element has been completed.
-	 *
-	 * All accesses to the queue are mediated by a spinlock
-	 * (ring->execlist_lock).
-	 */
-
-	/** Execlist link in the submission queue.*/
+	/** Link in the execlist submission queue, guarded by execlist_lock. */
 	struct list_head execlist_link;
-
-	/** Execlists no. of times this request has been sent to the ELSP */
-	int elsp_submitted;
-
-	/** Execlists context hardware id. */
-	unsigned int ctx_hw_id;
 };
 
 extern const struct fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
index a4a5cb6f1781..b3512cf9c921 100644
--- a/drivers/gpu/drm/i915/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
@@ -200,12 +200,12 @@ i915_gem_set_tiling(struct drm_device *dev, void *data,
 
 	if (!i915_tiling_ok(dev,
 			    args->stride, obj->base.size, args->tiling_mode)) {
-		i915_gem_object_put_unlocked(obj);
+		i915_gem_object_put(obj);
 		return -EINVAL;
 	}
 
 	mutex_lock(&dev->struct_mutex);
-	if (obj->pin_display || obj->framebuffer_references) {
+	if (obj->pin_display || atomic_read(&obj->framebuffer_references)) {
 		ret = -EBUSY;
 		goto err;
 	}
@@ -353,6 +353,6 @@ i915_gem_get_tiling(struct drm_device *dev, void *data,
 	if (args->swizzle_mode == I915_BIT_6_SWIZZLE_9_10_17)
 		args->swizzle_mode = I915_BIT_6_SWIZZLE_9_10;
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 257cc3cc8468..4a00c7673bed 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -550,7 +550,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 	release_pages(pvec, pinned, 0);
 	drm_free_large(pvec);
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	put_task_struct(work->task);
 	kfree(work);
 }
@@ -810,7 +810,7 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
 		ret = drm_gem_handle_create(file, &obj->base, &handle);
 
 	/* drop reference from allocate - handle holds it now */
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 71ad09ac374b..9979fce88ea7 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -97,10 +97,9 @@ static void i9xx_crtc_clock_get(struct intel_crtc *crtc,
 static void ironlake_pch_clock_get(struct intel_crtc *crtc,
 				   struct intel_crtc_state *pipe_config);
 
-static int intel_framebuffer_init(struct drm_device *dev,
-				  struct intel_framebuffer *ifb,
-				  struct drm_mode_fb_cmd2 *mode_cmd,
-				  struct drm_i915_gem_object *obj);
+static int intel_framebuffer_init(struct intel_framebuffer *ifb,
+				  struct drm_i915_gem_object *obj,
+				  struct drm_mode_fb_cmd2 *mode_cmd);
 static void i9xx_set_pipeconf(struct intel_crtc *intel_crtc);
 static void intel_set_pipe_timings(struct intel_crtc *intel_crtc);
 static void intel_set_pipe_src_size(struct intel_crtc *intel_crtc);
@@ -2114,11 +2113,13 @@ static void intel_tile_dims(const struct drm_i915_private *dev_priv,
 }
 
 unsigned int
-intel_fb_align_height(struct drm_device *dev, unsigned int height,
-		      uint32_t pixel_format, uint64_t fb_modifier)
+intel_fb_align_height(struct drm_i915_private *dev_priv,
+		      unsigned int height,
+		      uint32_t pixel_format,
+		      uint64_t fb_modifier)
 {
 	unsigned int cpp = drm_format_plane_cpp(pixel_format, 0);
-	unsigned int tile_height = intel_tile_height(to_i915(dev), fb_modifier, cpp);
+	unsigned int tile_height = intel_tile_height(dev_priv, fb_modifier, cpp);
 
 	return ALIGN(height, tile_height);
 }
@@ -2446,15 +2447,13 @@ intel_alloc_initial_plane_obj(struct intel_crtc *crtc,
 		return false;
 
 	mutex_lock(&dev->struct_mutex);
-
 	obj = i915_gem_object_create_stolen_for_preallocated(dev,
 							     base_aligned,
 							     base_aligned,
 							     size_aligned);
-	if (!obj) {
-		mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&dev->struct_mutex);
+	if (!obj)
 		return false;
-	}
 
 	if (plane_config->tiling == I915_TILING_X)
 		obj->tiling_and_stride = fb->pitches[0] | I915_TILING_X;
@@ -2466,13 +2465,11 @@ intel_alloc_initial_plane_obj(struct intel_crtc *crtc,
 	mode_cmd.modifier[0] = fb->modifier[0];
 	mode_cmd.flags = DRM_MODE_FB_MODIFIERS;
 
-	if (intel_framebuffer_init(dev, to_intel_framebuffer(fb),
-				   &mode_cmd, obj)) {
+	if (intel_framebuffer_init(to_intel_framebuffer(fb), obj, &mode_cmd)) {
 		DRM_DEBUG_KMS("intel fb init failed\n");
 		goto out_unref_obj;
 	}
 
-	mutex_unlock(&dev->struct_mutex);
 
 	DRM_DEBUG_KMS("initial plane fb obj %p\n", obj);
 	return true;
@@ -8156,7 +8153,8 @@ i9xx_get_initial_plane_config(struct intel_crtc *crtc,
 	val = I915_READ(DSPSTRIDE(pipe));
 	fb->pitches[0] = val & 0xffffffc0;
 
-	aligned_height = intel_fb_align_height(dev, fb->height,
+	aligned_height = intel_fb_align_height(dev_priv,
+					       fb->height,
 					       fb->pixel_format,
 					       fb->modifier[0]);
 
@@ -9180,7 +9178,8 @@ skylake_get_initial_plane_config(struct intel_crtc *crtc,
 						fb->pixel_format);
 	fb->pitches[0] = (val & 0x3ff) * stride_mult;
 
-	aligned_height = intel_fb_align_height(dev, fb->height,
+	aligned_height = intel_fb_align_height(dev_priv,
+					       fb->height,
 					       fb->pixel_format,
 					       fb->modifier[0]);
 
@@ -9277,7 +9276,8 @@ ironlake_get_initial_plane_config(struct intel_crtc *crtc,
 	val = I915_READ(DSPSTRIDE(pipe));
 	fb->pitches[0] = val & 0xffffffc0;
 
-	aligned_height = intel_fb_align_height(dev, fb->height,
+	aligned_height = intel_fb_align_height(dev_priv,
+					       fb->height,
 					       fb->pixel_format,
 					       fb->modifier[0]);
 
@@ -10348,9 +10348,8 @@ static struct drm_display_mode load_detect_mode = {
 };
 
 struct drm_framebuffer *
-__intel_framebuffer_create(struct drm_device *dev,
-			   struct drm_mode_fb_cmd2 *mode_cmd,
-			   struct drm_i915_gem_object *obj)
+intel_framebuffer_create(struct drm_i915_gem_object *obj,
+			 struct drm_mode_fb_cmd2 *mode_cmd)
 {
 	struct intel_framebuffer *intel_fb;
 	int ret;
@@ -10359,7 +10358,7 @@ __intel_framebuffer_create(struct drm_device *dev,
 	if (!intel_fb)
 		return ERR_PTR(-ENOMEM);
 
-	ret = intel_framebuffer_init(dev, intel_fb, mode_cmd, obj);
+	ret = intel_framebuffer_init(intel_fb, obj, mode_cmd);
 	if (ret)
 		goto err;
 
@@ -10370,23 +10369,6 @@ err:
 	return ERR_PTR(ret);
 }
 
-static struct drm_framebuffer *
-intel_framebuffer_create(struct drm_device *dev,
-			 struct drm_mode_fb_cmd2 *mode_cmd,
-			 struct drm_i915_gem_object *obj)
-{
-	struct drm_framebuffer *fb;
-	int ret;
-
-	ret = i915_mutex_lock_interruptible(dev);
-	if (ret)
-		return ERR_PTR(ret);
-	fb = __intel_framebuffer_create(dev, mode_cmd, obj);
-	mutex_unlock(&dev->struct_mutex);
-
-	return fb;
-}
-
 static u32
 intel_framebuffer_pitch_for_width(int width, int bpp)
 {
@@ -10421,9 +10403,9 @@ intel_framebuffer_create_for_mode(struct drm_device *dev,
 								bpp);
 	mode_cmd.pixel_format = drm_mode_legacy_fb_format(bpp, depth);
 
-	fb = intel_framebuffer_create(dev, &mode_cmd, obj);
+	fb = intel_framebuffer_create(obj, &mode_cmd);
 	if (IS_ERR(fb))
-		i915_gem_object_put_unlocked(obj);
+		i915_gem_object_put(obj);
 
 	return fb;
 }
@@ -11737,7 +11719,7 @@ cleanup:
 	crtc->primary->fb = old_fb;
 	update_state_fb(crtc->primary);
 
-	i915_gem_object_put_unlocked(obj);
+	i915_gem_object_put(obj);
 	drm_framebuffer_unreference(work->old_fb);
 
 	spin_lock_irq(&dev->event_lock);
@@ -14833,14 +14815,14 @@ static void intel_setup_outputs(struct drm_device *dev)
 
 static void intel_user_framebuffer_destroy(struct drm_framebuffer *fb)
 {
-	struct drm_device *dev = fb->dev;
 	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
 
 	drm_framebuffer_cleanup(fb);
-	mutex_lock(&dev->struct_mutex);
-	WARN_ON(!intel_fb->obj->framebuffer_references--);
+
+	WARN_ON(atomic_read(&intel_fb->obj->framebuffer_references) == 0);
+	atomic_dec(&intel_fb->obj->framebuffer_references);
 	i915_gem_object_put(intel_fb->obj);
-	mutex_unlock(&dev->struct_mutex);
+
 	kfree(intel_fb);
 }
 
@@ -14883,10 +14865,11 @@ static const struct drm_framebuffer_funcs intel_fb_funcs = {
 };
 
 static
-u32 intel_fb_pitch_limit(struct drm_device *dev, uint64_t fb_modifier,
+u32 intel_fb_pitch_limit(struct drm_i915_private *dev_priv,
+			 uint64_t fb_modifier,
 			 uint32_t pixel_format)
 {
-	u32 gen = INTEL_INFO(dev)->gen;
+	u32 gen = INTEL_GEN(dev_priv);
 
 	if (gen >= 9) {
 		int cpp = drm_format_plane_cpp(pixel_format, 0);
@@ -14895,7 +14878,7 @@ u32 intel_fb_pitch_limit(struct drm_device *dev, uint64_t fb_modifier,
 		 *  pixels and 32K bytes."
 		 */
 		return min(8192 * cpp, 32768);
-	} else if (gen >= 5 && !IS_VALLEYVIEW(dev) && !IS_CHERRYVIEW(dev)) {
+	} else if (gen >= 5 && !IS_VALLEYVIEW(dev_priv) && !IS_CHERRYVIEW(dev_priv)) {
 		return 32*1024;
 	} else if (gen >= 4) {
 		if (fb_modifier == I915_FORMAT_MOD_X_TILED)
@@ -14913,17 +14896,16 @@ u32 intel_fb_pitch_limit(struct drm_device *dev, uint64_t fb_modifier,
 	}
 }
 
-static int intel_framebuffer_init(struct drm_device *dev,
-				  struct intel_framebuffer *intel_fb,
-				  struct drm_mode_fb_cmd2 *mode_cmd,
-				  struct drm_i915_gem_object *obj)
+static int intel_framebuffer_init(struct intel_framebuffer *intel_fb,
+				  struct drm_i915_gem_object *obj,
+				  struct drm_mode_fb_cmd2 *mode_cmd)
 {
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	unsigned int aligned_height;
-	int ret;
+	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 	u32 pitch_limit, stride_alignment;
+	unsigned int aligned_height;
+	int ret = -EINVAL;
 
-	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+	atomic_inc(&obj->framebuffer_references);
 
 	if (mode_cmd->flags & DRM_MODE_FB_MODIFIERS) {
 		/* Enforce that fb modifier and tiling mode match, but only for
@@ -14931,14 +14913,14 @@ static int intel_framebuffer_init(struct drm_device *dev,
 		if (!!(i915_gem_object_get_tiling(obj) == I915_TILING_X) !=
 		    !!(mode_cmd->modifier[0] == I915_FORMAT_MOD_X_TILED)) {
 			DRM_DEBUG("tiling_mode doesn't match fb modifier\n");
-			return -EINVAL;
+			goto err;
 		}
 	} else {
 		if (i915_gem_object_get_tiling(obj) == I915_TILING_X)
 			mode_cmd->modifier[0] = I915_FORMAT_MOD_X_TILED;
 		else if (i915_gem_object_get_tiling(obj) == I915_TILING_Y) {
 			DRM_DEBUG("No Y tiling for legacy addfb\n");
-			return -EINVAL;
+			goto err;
 		}
 	}
 
@@ -14946,10 +14928,10 @@ static int intel_framebuffer_init(struct drm_device *dev,
 	switch (mode_cmd->modifier[0]) {
 	case I915_FORMAT_MOD_Y_TILED:
 	case I915_FORMAT_MOD_Yf_TILED:
-		if (INTEL_INFO(dev)->gen < 9) {
+		if (INTEL_GEN(dev_priv) < 9) {
 			DRM_DEBUG("Unsupported tiling 0x%llx!\n",
 				  mode_cmd->modifier[0]);
-			return -EINVAL;
+			goto err;
 		}
 	case DRM_FORMAT_MOD_NONE:
 	case I915_FORMAT_MOD_X_TILED:
@@ -14957,7 +14939,7 @@ static int intel_framebuffer_init(struct drm_device *dev,
 	default:
 		DRM_DEBUG("Unsupported fb modifier 0x%llx!\n",
 			  mode_cmd->modifier[0]);
-		return -EINVAL;
+		goto err;
 	}
 
 	stride_alignment = intel_fb_stride_alignment(dev_priv,
@@ -14966,17 +14948,18 @@ static int intel_framebuffer_init(struct drm_device *dev,
 	if (mode_cmd->pitches[0] & (stride_alignment - 1)) {
 		DRM_DEBUG("pitch (%d) must be at least %u byte aligned\n",
 			  mode_cmd->pitches[0], stride_alignment);
-		return -EINVAL;
+		goto err;
 	}
 
-	pitch_limit = intel_fb_pitch_limit(dev, mode_cmd->modifier[0],
+	pitch_limit = intel_fb_pitch_limit(dev_priv,
+					   mode_cmd->modifier[0],
 					   mode_cmd->pixel_format);
 	if (mode_cmd->pitches[0] > pitch_limit) {
 		DRM_DEBUG("%s pitch (%u) must be at less than %d\n",
 			  mode_cmd->modifier[0] != DRM_FORMAT_MOD_NONE ?
 			  "tiled" : "linear",
 			  mode_cmd->pitches[0], pitch_limit);
-		return -EINVAL;
+		goto err;
 	}
 
 	if (mode_cmd->modifier[0] == I915_FORMAT_MOD_X_TILED &&
@@ -14984,7 +14967,7 @@ static int intel_framebuffer_init(struct drm_device *dev,
 		DRM_DEBUG("pitch (%d) must match tiling stride (%d)\n",
 			  mode_cmd->pitches[0],
 			  i915_gem_object_get_stride(obj));
-		return -EINVAL;
+		goto err;
 	}
 
 	/* Reject formats not supported by any plane early. */
@@ -14995,77 +14978,83 @@ static int intel_framebuffer_init(struct drm_device *dev,
 	case DRM_FORMAT_ARGB8888:
 		break;
 	case DRM_FORMAT_XRGB1555:
-		if (INTEL_INFO(dev)->gen > 3) {
+		if (INTEL_GEN(dev_priv) > 3) {
 			DRM_DEBUG("unsupported pixel format: %s\n",
 				  drm_get_format_name(mode_cmd->pixel_format));
-			return -EINVAL;
+			goto err;
 		}
 		break;
 	case DRM_FORMAT_ABGR8888:
-		if (!IS_VALLEYVIEW(dev) && !IS_CHERRYVIEW(dev) &&
-		    INTEL_INFO(dev)->gen < 9) {
+		if (!IS_VALLEYVIEW(dev_priv) &&
+		    !IS_CHERRYVIEW(dev_priv) &&
+		    INTEL_GEN(dev_priv) < 9) {
 			DRM_DEBUG("unsupported pixel format: %s\n",
 				  drm_get_format_name(mode_cmd->pixel_format));
-			return -EINVAL;
+			goto err;
 		}
 		break;
 	case DRM_FORMAT_XBGR8888:
 	case DRM_FORMAT_XRGB2101010:
 	case DRM_FORMAT_XBGR2101010:
-		if (INTEL_INFO(dev)->gen < 4) {
+		if (INTEL_GEN(dev_priv) < 4) {
 			DRM_DEBUG("unsupported pixel format: %s\n",
 				  drm_get_format_name(mode_cmd->pixel_format));
-			return -EINVAL;
+			goto err;
 		}
 		break;
 	case DRM_FORMAT_ABGR2101010:
-		if (!IS_VALLEYVIEW(dev) && !IS_CHERRYVIEW(dev)) {
+		if (!IS_VALLEYVIEW(dev_priv) && !IS_CHERRYVIEW(dev_priv)) {
 			DRM_DEBUG("unsupported pixel format: %s\n",
 				  drm_get_format_name(mode_cmd->pixel_format));
-			return -EINVAL;
+			goto err;
 		}
 		break;
 	case DRM_FORMAT_YUYV:
 	case DRM_FORMAT_UYVY:
 	case DRM_FORMAT_YVYU:
 	case DRM_FORMAT_VYUY:
-		if (INTEL_INFO(dev)->gen < 5) {
+		if (INTEL_GEN(dev_priv) < 5) {
 			DRM_DEBUG("unsupported pixel format: %s\n",
 				  drm_get_format_name(mode_cmd->pixel_format));
-			return -EINVAL;
+			goto err;
 		}
 		break;
 	default:
 		DRM_DEBUG("unsupported pixel format: %s\n",
 			  drm_get_format_name(mode_cmd->pixel_format));
-		return -EINVAL;
+		goto err;
 	}
 
 	/* FIXME need to adjust LINOFF/TILEOFF accordingly. */
 	if (mode_cmd->offsets[0] != 0)
-		return -EINVAL;
+		goto err;
 
-	aligned_height = intel_fb_align_height(dev, mode_cmd->height,
+	aligned_height = intel_fb_align_height(dev_priv,
+					       mode_cmd->height,
 					       mode_cmd->pixel_format,
 					       mode_cmd->modifier[0]);
 	/* FIXME drm helper for size checks (especially planar formats)? */
 	if (obj->base.size < aligned_height * mode_cmd->pitches[0])
-		return -EINVAL;
+		goto err;
 
 	drm_helper_mode_fill_fb_struct(&intel_fb->base, mode_cmd);
 	intel_fb->obj = obj;
 
 	intel_fill_fb_info(dev_priv, &intel_fb->base);
 
-	ret = drm_framebuffer_init(dev, &intel_fb->base, &intel_fb_funcs);
+	ret = drm_framebuffer_init(obj->base.dev,
+				   &intel_fb->base,
+				   &intel_fb_funcs);
 	if (ret) {
 		DRM_ERROR("framebuffer init failed %d\n", ret);
-		return ret;
+		goto err;
 	}
 
-	intel_fb->obj->framebuffer_references++;
-
 	return 0;
+
+err:
+	atomic_dec(&obj->framebuffer_references);
+	return ret;
 }
 
 static struct drm_framebuffer *
@@ -15081,9 +15070,9 @@ intel_user_framebuffer_create(struct drm_device *dev,
 	if (!obj)
 		return ERR_PTR(-ENOENT);
 
-	fb = intel_framebuffer_create(dev, &mode_cmd, obj);
+	fb = intel_framebuffer_create(obj, &mode_cmd);
 	if (IS_ERR(fb))
-		i915_gem_object_put_unlocked(obj);
+		i915_gem_object_put(obj);
 
 	return fb;
 }
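
With framebuffer_references now atomic, intel_framebuffer_init()
takes the reference up front and unwinds it on any failure, which is
what lets i915_gem_set_tiling() test it without struct_mutex. The
shape of the pattern (fb_checks_pass() is a stand-in for the
validation above):

	atomic_inc(&obj->framebuffer_references);	/* optimistic */
	if (!fb_checks_pass(obj, mode_cmd)) {
		atomic_dec(&obj->framebuffer_references);
		return -EINVAL;
	}
	return 0;	/* the reference now locks out tiling changes */
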
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index a4870eac84c1..69b2a735deea 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -1136,7 +1136,7 @@ void intel_ddi_clock_get(struct intel_encoder *encoder,
 void intel_ddi_set_vc_payload_alloc(struct drm_crtc *crtc, bool state);
 uint32_t ddi_signal_levels(struct intel_dp *intel_dp);
 
-unsigned int intel_fb_align_height(struct drm_device *dev,
+unsigned int intel_fb_align_height(struct drm_i915_private *dev_priv,
 				   unsigned int height,
 				   uint32_t pixel_format,
 				   uint64_t fb_format_modifier);
@@ -1222,9 +1222,8 @@ struct i915_vma *
 intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation);
 void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation);
 struct drm_framebuffer *
-__intel_framebuffer_create(struct drm_device *dev,
-			   struct drm_mode_fb_cmd2 *mode_cmd,
-			   struct drm_i915_gem_object *obj);
+intel_framebuffer_create(struct drm_i915_gem_object *obj,
+			 struct drm_mode_fb_cmd2 *mode_cmd);
 void intel_finish_page_flip_cs(struct drm_i915_private *dev_priv, int pipe);
 void intel_finish_page_flip_mmio(struct drm_i915_private *dev_priv, int pipe);
 void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe);
diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c
index 692bf75db3bd..2893a2d67403 100644
--- a/drivers/gpu/drm/i915/intel_fbdev.c
+++ b/drivers/gpu/drm/i915/intel_fbdev.c
@@ -125,7 +125,7 @@ static int intelfb_alloc(struct drm_fb_helper *helper,
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct i915_ggtt *ggtt = &dev_priv->ggtt;
 	struct drm_mode_fb_cmd2 mode_cmd = {};
-	struct drm_i915_gem_object *obj = NULL;
+	struct drm_i915_gem_object *obj;
 	int size, ret;
 
 	/* we don't do packed 24bpp */
@@ -140,14 +140,13 @@ static int intelfb_alloc(struct drm_fb_helper *helper,
 	mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp,
 							  sizes->surface_depth);
 
-	mutex_lock(&dev->struct_mutex);
-
 	size = mode_cmd.pitches[0] * mode_cmd.height;
 	size = PAGE_ALIGN(size);
 
 	/* If the FB is too big, just don't use it since fbdev is not very
 	 * important and we should probably use that space with FBC or other
 	 * features. */
+	obj = NULL;
 	if (size * 2 < ggtt->stolen_usable_size)
 		obj = i915_gem_object_create_stolen(dev, size);
 	if (obj == NULL)
@@ -155,24 +154,22 @@ static int intelfb_alloc(struct drm_fb_helper *helper,
 	if (IS_ERR(obj)) {
 		DRM_ERROR("failed to allocate framebuffer\n");
 		ret = PTR_ERR(obj);
-		goto out;
+		goto err;
 	}
 
-	fb = __intel_framebuffer_create(dev, &mode_cmd, obj);
+	fb = intel_framebuffer_create(obj, &mode_cmd);
 	if (IS_ERR(fb)) {
-		i915_gem_object_put(obj);
 		ret = PTR_ERR(fb);
-		goto out;
+		goto err_obj;
 	}
 
-	mutex_unlock(&dev->struct_mutex);
-
 	ifbdev->fb = to_intel_framebuffer(fb);
 
 	return 0;
 
-out:
-	mutex_unlock(&dev->struct_mutex);
+err_obj:
+	i915_gem_object_put(obj);
+err:
 	return ret;
 }
 
@@ -634,7 +631,7 @@ static bool intel_fbdev_init_bios(struct drm_device *dev,
 		}
 
 		cur_size = intel_crtc->config->base.adjusted_mode.crtc_vdisplay;
-		cur_size = intel_fb_align_height(dev, cur_size,
+		cur_size = intel_fb_align_height(to_i915(dev), cur_size,
 						 fb->base.pixel_format,
 						 fb->base.modifier[0]);
 		cur_size *= fb->base.pitches[0];
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c
index 7f17edfe5cb5..78288f0af669 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -661,9 +661,6 @@ fail:
 	DRM_ERROR("Failed to fetch GuC firmware from %s (error %d)\n",
 		  guc_fw->guc_fw_path, err);
 
-	i915_gem_object_put_unlocked(guc_fw->guc_fw_obj);
-	guc_fw->guc_fw_obj = NULL;
-
 	release_firmware(fw);		/* OK even if fw is NULL */
 	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_FAIL;
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 75ddaf372045..c1189962258e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -156,6 +156,11 @@
 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
 
+#define GEN8_CTX_STATUS_COMPLETED_MASK \
+	 (GEN8_CTX_STATUS_ACTIVE_IDLE | \
+	  GEN8_CTX_STATUS_PREEMPTED | \
+	  GEN8_CTX_STATUS_ELEMENT_SWITCH)
+
 #define CTX_LRI_HEADER_0		0x01
 #define CTX_CONTEXT_CONTROL		0x02
 #define CTX_RING_HEAD			0x04
@@ -263,12 +268,10 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
 
-	if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv))
-		engine->idle_lite_restore_wa = ~0;
-
-	engine->disable_lite_restore_wa = (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
-					IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
-					(engine->id == VCS || engine->id == VCS2);
+	engine->disable_lite_restore_wa =
+		(IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
+		 IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
+		(engine->id == VCS || engine->id == VCS2);
 
 	engine->ctx_desc_template = GEN8_CTX_VALID;
 	if (IS_GEN8(dev_priv))
@@ -328,36 +331,6 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
 	return ctx->engine[engine->id].lrc_desc;
 }
 
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
-				 struct drm_i915_gem_request *rq1)
-{
-
-	struct intel_engine_cs *engine = rq0->engine;
-	struct drm_i915_private *dev_priv = rq0->i915;
-	uint64_t desc[2];
-
-	if (rq1) {
-		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
-		rq1->elsp_submitted++;
-	} else {
-		desc[1] = 0;
-	}
-
-	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-	rq0->elsp_submitted++;
-
-	/* You must always write both descriptors in the order below. */
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
-
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
-	/* The context is automatically loaded after the following */
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
-
-	/* ELSP is a wo register, use another nearby reg for posting */
-	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
-}
-
 static void
 execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
 {
@@ -367,13 +340,13 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
 	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
 }
 
-static void execlists_update_context(struct drm_i915_gem_request *rq)
+static u64 execlists_update_context(struct drm_i915_gem_request *rq)
 {
-	struct intel_engine_cs *engine = rq->engine;
+	struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
 	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
-	uint32_t *reg_state = rq->ctx->engine[engine->id].lrc_reg_state;
 
-	reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
+	ce->lrc_reg_state[CTX_RING_TAIL+1] =
+		intel_ring_offset(rq->ring, rq->tail);
 
 	/* True 32b PPGTT with dynamic page allocation: update PDP
 	 * registers and point the unallocated PDPs to scratch page.
@@ -381,32 +354,14 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
 	 * in 48-bit mode.
 	 */
 	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
-		execlists_update_context_pdps(ppgtt, reg_state);
-}
-
-static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
-					   struct drm_i915_gem_request *rq1)
-{
-	struct drm_i915_private *dev_priv = rq0->i915;
-	unsigned int fw_domains = rq0->engine->fw_domains;
-
-	execlists_update_context(rq0);
-
-	if (rq1)
-		execlists_update_context(rq1);
+		execlists_update_context_pdps(ppgtt, ce->lrc_reg_state);
 
-	spin_lock_irq(&dev_priv->uncore.lock);
-	intel_uncore_forcewake_get__locked(dev_priv, fw_domains);
-
-	execlists_elsp_write(rq0, rq1);
-
-	intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
-	spin_unlock_irq(&dev_priv->uncore.lock);
+	return ce->lrc_desc;
 }
 
-static inline void execlists_context_status_change(
-		struct drm_i915_gem_request *rq,
-		unsigned long status)
+static inline void
+execlists_context_status_change(struct drm_i915_gem_request *rq,
+				unsigned long status)
 {
 	/*
 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
@@ -418,122 +373,106 @@ static inline void execlists_context_status_change(
 	atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
 }
 
-static void execlists_unqueue(struct intel_engine_cs *engine)
+static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
-	struct drm_i915_gem_request *cursor, *tmp;
-
-	assert_spin_locked(&engine->execlist_lock);
-
-	/*
-	 * If irqs are not active generate a warning as batches that finish
-	 * without the irqs may get lost and a GPU Hang may occur.
-	 */
-	WARN_ON(!intel_irqs_enabled(engine->i915));
-
-	/* Try to read in pairs */
-	list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
-				 execlist_link) {
-		if (!req0) {
-			req0 = cursor;
-		} else if (req0->ctx == cursor->ctx) {
-			/* Same ctx: ignore first request, as second request
-			 * will update tail past first request's workload */
-			cursor->elsp_submitted = req0->elsp_submitted;
-			list_del(&req0->execlist_link);
-			i915_gem_request_put(req0);
-			req0 = cursor;
-		} else {
-			if (IS_ENABLED(CONFIG_DRM_I915_GVT)) {
-				/*
-				 * req0 (after merged) ctx requires single
-				 * submission, stop picking
-				 */
-				if (req0->ctx->execlists_force_single_submission)
-					break;
-				/*
-				 * req0 ctx doesn't require single submission,
-				 * but next req ctx requires, stop picking
-				 */
-				if (cursor->ctx->execlists_force_single_submission)
-					break;
-			}
-			req1 = cursor;
-			WARN_ON(req1->elsp_submitted);
-			break;
-		}
-	}
-
-	if (unlikely(!req0))
-		return;
+	struct drm_i915_private *dev_priv = engine->i915;
+	u32 *elsp = dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u64 desc[2];
 
-	execlists_context_status_change(req0, INTEL_CONTEXT_SCHEDULE_IN);
+	if (!engine->execlist_port[0].count)
+		execlists_context_status_change(engine->execlist_port[0].request,
+						INTEL_CONTEXT_SCHEDULE_IN);
+	desc[0] = execlists_update_context(engine->execlist_port[0].request);
+	engine->preempt_wa = engine->execlist_port[0].count++;
 
-	if (req1)
-		execlists_context_status_change(req1,
+	if (engine->execlist_port[1].request) {
+		GEM_BUG_ON(engine->execlist_port[1].count);
+		execlists_context_status_change(engine->execlist_port[1].request,
 						INTEL_CONTEXT_SCHEDULE_IN);
+		desc[1] = execlists_update_context(engine->execlist_port[1].request);
+		engine->execlist_port[1].count = 1;
+	} else {
+		desc[1] = 0;
+	}
 
-	if (req0->elsp_submitted & engine->idle_lite_restore_wa) {
-		/*
-		 * WaIdleLiteRestore: make sure we never cause a lite restore
-		 * with HEAD==TAIL.
-		 *
-		 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
-		 * resubmit the request. See gen8_emit_request() for where we
-		 * prepare the padding after the end of the request.
-		 */
-		req0->tail += 8;
-		req0->tail &= req0->ring->size - 1;
-	}
+	/* You must always write both descriptors in the order below. */
+	writel(upper_32_bits(desc[1]), elsp);
+	writel(lower_32_bits(desc[1]), elsp);
 
-	execlists_elsp_submit_contexts(req0, req1);
+	writel(upper_32_bits(desc[0]), elsp);
+	/* The context is automatically loaded after the following */
+	writel(lower_32_bits(desc[0]), elsp);
 }
 
-static unsigned int
-execlists_check_remove_request(struct intel_engine_cs *engine, u32 ctx_id)
+static bool merge_ctx(struct i915_gem_context *prev,
+		      struct i915_gem_context *next)
 {
-	struct drm_i915_gem_request *head_req;
+	if (prev != next)
+		return false;
 
-	assert_spin_locked(&engine->execlist_lock);
+	if (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
+	    prev->execlists_force_single_submission)
+		return false;
 
-	head_req = list_first_entry_or_null(&engine->execlist_queue,
-					    struct drm_i915_gem_request,
-					    execlist_link);
+	return true;
+}
 
-	if (WARN_ON(!head_req || (head_req->ctx_hw_id != ctx_id)))
-               return 0;
+static void execlists_context_unqueue(struct intel_engine_cs *engine)
+{
+	struct drm_i915_gem_request *cursor, *last;
+	struct execlist_port *port = engine->execlist_port;
+	bool submit = false;
+
+	last = port->request;
+	if (last != NULL) {
+		/* WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
+		 * as we resubmit the request. See gen8_emit_request()
+		 * for where we prepare the padding after the end of the
+		 * request.
+		 */
+		last->tail = last->wa_tail;
+	}
 
-	WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
+	/* Try to read in pairs and fill both submission ports */
+	spin_lock(&engine->execlist_lock);
+	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
+		if (last && !merge_ctx(cursor->ctx, last->ctx)) {
+			if (port != engine->execlist_port)
+				break;
 
-	if (--head_req->elsp_submitted > 0)
-		return 0;
+			if (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
+			    cursor->ctx->execlists_force_single_submission)
+				break;
 
-	execlists_context_status_change(head_req, INTEL_CONTEXT_SCHEDULE_OUT);
+			i915_gem_request_assign(&port->request, last);
+			port++;
 
-	list_del(&head_req->execlist_link);
-	i915_gem_request_put(head_req);
+		}
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		engine->execlist_queue.next = &cursor->execlist_link;
+		cursor->execlist_link.prev = &engine->execlist_queue;
+	}
+	spin_unlock(&engine->execlist_lock);
 
-	return 1;
+	if (submit)
+		execlists_submit_ports(engine);
 }
 
-static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
-		   u32 *context_id)
+static bool execlists_elsp_idle(struct intel_engine_cs *engine)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
-	u32 status;
-
-	read_pointer %= GEN8_CSB_ENTRIES;
-
-	status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
-
-	if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
-		return 0;
-
-	*context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
-							      read_pointer));
+	return engine->execlist_port[0].request == NULL;
+}
 
-	return status;
+static bool execlists_elsp_ready(struct intel_engine_cs *engine)
+{
+	if (engine->disable_lite_restore_wa || engine->preempt_wa)
+		return engine->execlist_port[0].request == NULL;
+	else
+		return engine->execlist_port[1].request == NULL;
 }
 
 /*
@@ -544,100 +483,62 @@ static void intel_lrc_irq_handler(unsigned long data)
 {
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
 	struct drm_i915_private *dev_priv = engine->i915;
-	u32 status_pointer;
-	unsigned int read_pointer, write_pointer;
-	u32 csb[GEN8_CSB_ENTRIES][2];
-	unsigned int csb_read = 0, i;
-	unsigned int submit_contexts = 0;
 
 	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
 
-	status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(engine));
-
-	read_pointer = engine->next_context_status_buffer;
-	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
-	if (read_pointer > write_pointer)
-		write_pointer += GEN8_CSB_ENTRIES;
-
-	while (read_pointer < write_pointer) {
-		if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
-			break;
-		csb[csb_read][0] = get_context_status(engine, ++read_pointer,
-						      &csb[csb_read][1]);
-		csb_read++;
-	}
-
-	engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
-
-	/* Update the read pointer to the old write pointer. Manual ringbuffer
-	 * management ftw </sarcasm> */
-	I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
-		      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-				    engine->next_context_status_buffer << 8));
-
-	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
-
-	spin_lock(&engine->execlist_lock);
-
-	for (i = 0; i < csb_read; i++) {
-		if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
-			if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
-				if (execlists_check_remove_request(engine, csb[i][1]))
-					WARN(1, "Lite Restored request removed from queue\n");
-			} else
-				WARN(1, "Preemption without Lite Restore\n");
+	if (!execlists_elsp_idle(engine)) {
+		u32 *ring_mmio =
+			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
+		u32 *csb_mmio =
+			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
+		unsigned ring, head, tail;
+
+		ring = readl(ring_mmio);
+		head = GEN8_CSB_READ_PTR(ring);
+		tail = GEN8_CSB_WRITE_PTR(ring);
+		if (tail < head)
+			tail += GEN8_CSB_ENTRIES;
+		while (head < tail) {
+			unsigned idx = ++head % GEN8_CSB_ENTRIES;
+			unsigned status = readl(&csb_mmio[2*idx]);
+
+			if (status & GEN8_CTX_STATUS_COMPLETED_MASK) {
+				GEM_BUG_ON(engine->execlist_port[0].count == 0);
+				if (--engine->execlist_port[0].count == 0) {
+					GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
+					execlists_context_status_change(engine->execlist_port[0].request,
+									INTEL_CONTEXT_SCHEDULE_OUT);
+					i915_gem_request_put(engine->execlist_port[0].request);
+					engine->execlist_port[0] = engine->execlist_port[1];
+					memset(&engine->execlist_port[1], 0,
+					       sizeof(engine->execlist_port[1]));
+					engine->preempt_wa = false;
+				}
+			}
+			GEM_BUG_ON(engine->execlist_port[0].count == 0 &&
+				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
 		}
 
-		if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
-		    GEN8_CTX_STATUS_ELEMENT_SWITCH))
-			submit_contexts +=
-				execlists_check_remove_request(engine, csb[i][1]);
+		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+				     GEN8_CSB_WRITE_PTR(ring) << 8),
+		       ring_mmio);
 	}
 
-	if (submit_contexts) {
-		if (!engine->disable_lite_restore_wa ||
-		    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
-			execlists_unqueue(engine);
-	}
+	if (execlists_elsp_ready(engine))
+		execlists_context_unqueue(engine);
 
-	spin_unlock(&engine->execlist_lock);
-
-	if (unlikely(submit_contexts > 2))
-		DRM_ERROR("More than two context complete events?\n");
+	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }
 
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	struct drm_i915_gem_request *cursor;
-	int num_elements = 0;
 
 	spin_lock_bh(&engine->execlist_lock);
 
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
-		if (++num_elements > 2)
-			break;
-
-	if (num_elements > 2) {
-		struct drm_i915_gem_request *tail_req;
-
-		tail_req = list_last_entry(&engine->execlist_queue,
-					   struct drm_i915_gem_request,
-					   execlist_link);
-
-		if (request->ctx == tail_req->ctx) {
-			WARN(tail_req->elsp_submitted != 0,
-				"More than 2 already-submitted reqs queued\n");
-			list_del(&tail_req->execlist_link);
-			i915_gem_request_put(tail_req);
-		}
-	}
-
-	i915_gem_request_get(request);
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
-	request->ctx_hw_id = request->ctx->hw_id;
-	if (num_elements == 0)
-		execlists_unqueue(engine);
+	if (execlists_elsp_idle(engine))
+		tasklet_hi_schedule(&engine->irq_tasklet);
 
 	spin_unlock_bh(&engine->execlist_lock);
 }
@@ -731,6 +632,7 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_emit(ring, MI_NOOP);
 	intel_ring_advance(ring);
+	request->wa_tail = request->ring->tail;
 
 	/* We keep the previous context alive until we retire the following
 	 * request. This ensures that the context object is still pinned
@@ -743,23 +645,6 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	return 0;
 }
 
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
-{
-	struct drm_i915_gem_request *req, *tmp;
-	LIST_HEAD(cancel_list);
-
-	WARN_ON(!mutex_is_locked(&engine->i915->drm.struct_mutex));
-
-	spin_lock_bh(&engine->execlist_lock);
-	list_replace_init(&engine->execlist_queue, &cancel_list);
-	spin_unlock_bh(&engine->execlist_lock);
-
-	list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
-		list_del(&req->execlist_link);
-		i915_gem_request_put(req);
-	}
-}
-
 static int intel_lr_context_pin(struct i915_gem_context *ctx,
 				struct intel_engine_cs *engine)
 {
@@ -1276,7 +1161,6 @@ static void lrc_init_hws(struct intel_engine_cs *engine)
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
-	unsigned int next_context_status_buffer_hw;
 
 	lrc_init_hws(engine);
 
@@ -1287,32 +1171,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	I915_WRITE(RING_MODE_GEN7(engine),
 		   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
 		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
-	POSTING_READ(RING_MODE_GEN7(engine));
 
-	/*
-	 * Instead of resetting the Context Status Buffer (CSB) read pointer to
-	 * zero, we need to read the write pointer from hardware and use its
-	 * value because "this register is power context save restored".
-	 * Effectively, these states have been observed:
-	 *
-	 *      | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
-	 * BDW  | CSB regs not reset       | CSB regs reset       |
-	 * CHT  | CSB regs not reset       | CSB regs not reset   |
-	 * SKL  |         ?                |         ?            |
-	 * BXT  |         ?                |         ?            |
-	 */
-	next_context_status_buffer_hw =
-		GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(engine)));
-
-	/*
-	 * When the CSB registers are reset (also after power-up / gpu reset),
-	 * CSB write pointer is set to all 1's, which is not valid, use '5' in
-	 * this special case, so the first element read is CSB[0].
-	 */
-	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
-		next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
+	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine),
+		   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK |
+				 GEN8_CSB_WRITE_PTR_MASK,
+				 0));
 
-	engine->next_context_status_buffer = next_context_status_buffer_hw;
 	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
 
 	intel_engine_init_hangcheck(engine);
@@ -1697,7 +1561,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	}
 	intel_lr_context_unpin(dev_priv->kernel_context, engine);
 
-	engine->idle_lite_restore_wa = 0;
 	engine->disable_lite_restore_wa = false;
 	engine->ctx_desc_template = 0;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index a52cf57dbd40..4d70346500c2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -97,6 +97,4 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
 				    int enable_execlists);
 void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
 
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine);
-
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index ede09f000af5..aa8ee7b870bf 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -1215,7 +1215,7 @@ int intel_overlay_put_image_ioctl(struct drm_device *dev, void *data,
 out_unlock:
 	mutex_unlock(&dev->struct_mutex);
 	drm_modeset_unlock_all(dev);
-	i915_gem_object_put_unlocked(new_bo);
+	i915_gem_object_put(new_bo);
 out_free:
 	kfree(params);
 
@@ -1459,7 +1459,7 @@ void intel_cleanup_overlay(struct drm_i915_private *dev_priv)
 	 * hardware should be off already */
 	WARN_ON(dev_priv->overlay->active);
 
-	i915_gem_object_put_unlocked(dev_priv->overlay->reg_bo);
+	i915_gem_object_put(dev_priv->overlay->reg_bo);
 	kfree(dev_priv->overlay);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index df2efa4c713e..073d12502736 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -5720,7 +5720,7 @@ static void valleyview_cleanup_pctx(struct drm_i915_private *dev_priv)
 	if (WARN_ON(!dev_priv->vlv_pctx))
 		return;
 
-	i915_gem_object_put_unlocked(dev_priv->vlv_pctx);
+	i915_gem_object_put(dev_priv->vlv_pctx);
 	dev_priv->vlv_pctx = NULL;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 7da768a8d06e..5cdd3c960903 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -292,11 +292,14 @@ struct intel_engine_cs {
 	/* Execlists */
 	struct tasklet_struct irq_tasklet;
 	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
+	struct execlist_port {
+		struct drm_i915_gem_request *request;
+		unsigned count;
+	} execlist_port[2];
 	struct list_head execlist_queue;
 	unsigned int fw_domains;
-	unsigned int next_context_status_buffer;
-	unsigned int idle_lite_restore_wa;
 	bool disable_lite_restore_wa;
+	bool preempt_wa;
 	u32 ctx_desc_template;
 
 	/**
diff --git a/include/linux/fence.h b/include/linux/fence.h
index 523ea3fbbddd..798bde6fde74 100644
--- a/include/linux/fence.h
+++ b/include/linux/fence.h
@@ -34,6 +34,8 @@ struct fence;
 struct fence_ops;
 struct fence_cb;
 
+struct kfence;
+
 /**
  * struct fence - software synchronization primitive
  * @refcount: refcount for this fence
@@ -377,4 +379,8 @@ u64 fence_context_alloc(unsigned num);
 			##args);					\
 	} while (0)
 
+int kfence_await_dma_fence(struct kfence *fence,
+			   struct fence *dma,
+			   gfp_t gfp);
+
 #endif /* __LINUX_FENCE_H */
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
new file mode 100644
index 000000000000..9b7dbd892218
--- /dev/null
+++ b/include/linux/kfence.h
@@ -0,0 +1,86 @@
+/*
+ * kfence.h - library routines for N:M synchronisation points
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#ifndef _KFENCE_H_
+#define _KFENCE_H_
+
+#include <linux/gfp.h>
+#include <linux/kref.h>
+#include <linux/notifier.h> /* for NOTIFY_DONE */
+#include <linux/wait.h>
+
+struct completion;
+struct fence;
+struct reservation_object;
+enum hrtimer_mode;
+
+/**
+ * struct kfence - used for tracking pending events and a set of listeners.
+ * @wait: a waitqueue of listeners (includes both kfences and tasks)
+ * @flags: a bitmask of interesting bits, mixed in with a notification function
+ * @kref: a reference counter
+ * @pending: an atomic counter of pending events
+ */
+struct kfence {
+	wait_queue_head_t wait;
+	unsigned long flags;
+	struct kref kref;
+	atomic_t pending;
+};
+
+#define KFENCE_CHECKED_BIT	0 /* used internally for DAG checking */
+#define KFENCE_PRIVATE_BIT	1 /* available for use by owner */
+#define KFENCE_MASK		(~3)
+
+/**
+ * typedef kfence_notify_t - callback function type
+ *
+ * An optional callback to a fence may be provided that is called first
+ * when the fence is complete, and later when the fence is released. The
+ * callback must be of the &kfence_notify_t function type, and be aligned using
+ * __kfence_call function attribute.
+ */
+typedef int (*kfence_notify_t)(struct kfence *);
+#define __kfence_call __aligned(4)
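+
+/*
+ * An illustrative sketch only (struct my_task and my_task_notify are
+ * hypothetical, not part of this interface): the notify hook runs twice,
+ * first when the fence completes, while kfence_done() still reports false,
+ * and again on release, when kfence_done() reports true, so the two phases
+ * can be told apart:
+ *
+ *	struct my_task {
+ *		struct kfence fence;
+ *		struct completion done;
+ *	};
+ *
+ *	static int __kfence_call my_task_notify(struct kfence *fence)
+ *	{
+ *		struct my_task *t = container_of(fence, typeof(*t), fence);
+ *
+ *		if (kfence_done(fence))
+ *			kfree(t);
+ *		else
+ *			complete(&t->done);
+ *
+ *		return NOTIFY_DONE;
+ *	}
+ */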
+
+void kfence_init(struct kfence *fence, kfence_notify_t fn);
+
+struct kfence *kfence_get(struct kfence *fence);
+void kfence_put(struct kfence *fence);
+
+void kfence_await(struct kfence *fence);
+int kfence_await_kfence(struct kfence *fence,
+			struct kfence *after,
+			gfp_t gfp);
+int kfence_await_completion(struct kfence *fence,
+			    struct completion *x,
+			    gfp_t gfp);
+int kfence_await_hrtimer(struct kfence *fence,
+			 clockid_t clock, enum hrtimer_mode mode,
+			 ktime_t delay, u64 slack,
+			 gfp_t gfp);
+void kfence_complete(struct kfence *fence);
+void kfence_wake_up_all(struct kfence *fence);
+void kfence_wait(struct kfence *fence);
+
+/**
+ * kfence_done - report when the fence has been passed
+ * @fence: the kfence to query
+ *
+ * kfence_done() reports true when the fence is no longer waiting for any
+ * events and has completed its fence-complete notification.
+ *
+ * Returns true when the fence has been passed, false otherwise.
+ */
+static inline bool kfence_done(const struct kfence *fence)
+{
+	return atomic_read(&fence->pending) < 0;
+}
+
+#endif /* _KFENCE_H_ */
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index b0f305e77b7f..1954bab95db9 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -49,6 +49,8 @@ extern struct ww_class reservation_ww_class;
 extern struct lock_class_key reservation_seqcount_class;
 extern const char reservation_seqcount_string[];
 
+struct kfence;
+
 /**
  * struct reservation_object_list - a list of shared fences
  * @rcu: for internal use
@@ -210,4 +212,9 @@ long reservation_object_wait_timeout_rcu(struct reservation_object *obj,
 bool reservation_object_test_signaled_rcu(struct reservation_object *obj,
 					  bool test_all);
 
+int kfence_await_reservation(struct kfence *fence,
+			     struct reservation_object *resv,
+			     bool write,
+			     gfp_t gfp);
+
 #endif /* _LINUX_RESERVATION_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..ff11f31b7ec9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o
+	    async.o kfence.o range.o smpboot.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/kfence.c b/kernel/kfence.c
new file mode 100644
index 000000000000..4605eabc2c1b
--- /dev/null
+++ b/kernel/kfence.c
@@ -0,0 +1,497 @@
+/*
+ * (C) Copyright 2016 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/fence.h>
+#include <linux/kfence.h>
+#include <linux/reservation.h>
+#include <linux/slab.h>
+
+/**
+ * DOC: kfence overview
+ *
+ * kfences provide synchronisation barriers between multiple tasks. They are
+ * very similar to completions, or the OpenGL fence synchronisation object.
+ * Where kfences differ from completions is their ability to track multiple
+ * event sources rather than being a singular "completion event". Similar to
+ * completions, multiple processes can wait upon a kfence. However, unlike
+ * completions, a kfence can wait upon other kfences allowing for a graph
+ * of interdependent events.
+ *
+ * Each kfence is a one-shot flag, signaling that work has progressed past
+ * a certain point (as measured by completion of all events the kfence is
+ * listening for) and the waiters upon that kfence may proceed.
+ *
+ * kfences provide both signaling and waiting routines:
+ *
+ * - kfence_await(): indicates that the kfence is asynchronously waiting for
+ *   another event.
+ *
+ * - kfence_complete(): undoes the earlier await and marks the fence as done
+ *   if all of its pending events have been completed.
+ *
+ * - kfence_done(): reports whether or not the kfence has been passed.
+ *
+ * - kfence_wait(): allows the caller to sleep (uninterruptibly) until the
+ *   fence is passed.
+ *
+ * This interface is very similar to completions, with the exception of
+ * allowing the fence to await multiple events. kfences can wait upon other
+ * fences or other hardware events, building an ordered dependency graph:
+ *
+ * - kfence_await_kfence(): the kfence asynchronously waits upon completion
+ *   of another kfence
+ *
+ * - kfence_await_completion(): the kfence asynchronously waits upon a
+ *   completion
+ *
+ * - kfence_await_hrtimer(): the kfence asynchronously waits for the
+ *   expiration of a timer
+ *
+ * - kfence_await_dma_fence(): the kfence asynchronously waits for a DMA
+ *   (hardware signaled) fence
+ *
+ * - kfence_await_reservation(): the kfence asynchronously waits for a DMA
+ *   reservation object
+ *
+ * A kfence is initialised using kfence_init(), and starts off awaiting an
+ * event. Once you have finished setting up the fence, including adding
+ * all of its asynchronous waits, call kfence_complete().
+ *
+ * Unlike completions, kfences are expected to live inside more complex graphs
+ * and form the basis for parallel execution of interdependent tasks and so are
+ * reference counted. Use kfence_get() and kfence_put() to acquire or release
+ * a reference to the kfence respectively.
+ *
+ * The kfence can be embedded inside a larger structure and be used as part
+ * of its event-driven mechanism. As such, kfence_init() can be passed a
+ * callback function that will be called first when the kfence is completed,
+ * and again when the kfence is to be freed. If no callback is provided, the
+ * kfence will be freed using kfree() when its reference count hits zero -
+ * if it is embedded inside another structure and no callback is provided,
+ * it must be the first member of its parent struct.
+ *
+ * The fence-completed notification is called before any listeners upon the
+ * fence are signaled, or any waiters woken. You can defer their wake-up by
+ * returning NOTIFY_OK from the fence-completed notification and calling
+ * kfence_wake_up_all() later when ready.
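+ *
+ * As an illustrative sketch (error handling elided, and the two completions
+ * are placeholders for any event sources), a fan-in of two events and the
+ * final synchronous wait looks like:
+ *
+ *	struct kfence *fence = kmalloc(sizeof(*fence), GFP_KERNEL);
+ *
+ *	kfence_init(fence, NULL);
+ *	kfence_await_completion(fence, &first_stage_done, GFP_KERNEL);
+ *	kfence_await_completion(fence, &second_stage_done, GFP_KERNEL);
+ *	kfence_complete(fence);
+ *
+ *	kfence_wait(fence);
+ *	kfence_put(fence);
+ *
+ * The kfence_complete() here closes the setup phase, undoing the implicit
+ * await taken by kfence_init(); only then can the pending count reach zero
+ * and the waiter be woken.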
+ */
+
+static DEFINE_SPINLOCK(kfence_lock);
+
+static int __kfence_notify(struct kfence *fence)
+{
+	kfence_notify_t fn;
+
+	fn = (kfence_notify_t)(fence->flags & KFENCE_MASK);
+	return fn(fence);
+}
+
+static void kfence_free(struct kref *kref)
+{
+	struct kfence *fence = container_of(kref, typeof(*fence), kref);
+
+	WARN_ON(atomic_read(&fence->pending) > 0);
+
+	if (fence->flags & KFENCE_MASK)
+		WARN_ON(__kfence_notify(fence) != NOTIFY_DONE);
+	else
+		kfree(fence);
+}
+
+/**
+ * kfence_put - release a reference to a kfence
+ * @fence: the kfence being disposed of
+ *
+ * kfence_put() decrements the reference count on the @fence, and when
+ * it hits zero the fence will be freed.
+ */
+void kfence_put(struct kfence *fence)
+{
+	kref_put(&fence->kref, kfence_free);
+}
+EXPORT_SYMBOL_GPL(kfence_put);
+
+/**
+ * kfence_get - acquire a reference to a kfence
+ * @fence: the kfence being used
+ *
+ * Returns the pointer to the kfence, with its reference count incremented.
+ */
+struct kfence *kfence_get(struct kfence *fence)
+{
+	kref_get(&fence->kref);
+	return fence;
+}
+EXPORT_SYMBOL_GPL(kfence_get);
+
+static void __kfence_wake_up_all(struct kfence *fence,
+				 struct list_head *continuation)
+{
+	wait_queue_head_t *x = &fence->wait;
+	unsigned long flags;
+
+	atomic_dec(&fence->pending);
+
+	/*
+	 * To prevent unbounded recursion as we traverse the graph of kfences,
+	 * we move the task_list from this, the next ready fence, to the tail of
+	 * the original fence's task_list (and so added to the list to be
+	 * woken).
+	 */
+	smp_mb__before_spinlock();
+	spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation);
+	if (continuation) {
+		list_splice_tail_init(&x->task_list, continuation);
+	} else {
+		while (!list_empty(&x->task_list))
+			__wake_up_locked_key(x, TASK_NORMAL, &x->task_list);
+	}
+	spin_unlock_irqrestore(&x->lock, flags);
+}
+
+/**
+ * kfence_wake_up_all - wake all waiters upon a fence
+ * @fence: the kfence to signal
+ *
+ * If the fence-complete notification is deferred, when the callback is
+ * complete it should call kfence_wake_up_all() to wake up all waiters
+ * upon the fence.
+ *
+ * It is invalid to call kfence_wake_up_all() at any time other than from
+ * inside a deferred fence-complete notification.
+ */
+void kfence_wake_up_all(struct kfence *fence)
+{
+	WARN_ON(atomic_read(&fence->pending) != 0);
+	__kfence_wake_up_all(fence, NULL);
+}
+EXPORT_SYMBOL_GPL(kfence_wake_up_all);
+
+static void __kfence_complete(struct kfence *fence,
+			      struct list_head *continuation)
+{
+	if (!atomic_dec_and_test(&fence->pending))
+		return;
+
+	if (fence->flags & KFENCE_MASK && __kfence_notify(fence) != NOTIFY_DONE)
+		return;
+
+	__kfence_wake_up_all(fence, continuation);
+}
+
+/**
+ * kfence_await - increment the count of events being asynchronously waited upon
+ * @fence: the kfence
+ *
+ * kfence_await() indicates that the @fence is waiting upon the completion
+ * of an event. The @fence may wait upon multiple events concurrently.
+ * When that event is complete, a corresponding call to kfence_complete()
+ * should be made.
+ */
+void kfence_await(struct kfence *fence)
+{
+	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
+}
+EXPORT_SYMBOL_GPL(kfence_await);
+
+/**
+ * kfence_complete - decrement the count of events waited upon
+ * @fence: the kfence
+ *
+ * When all event sources for the @fence are completed, i.e. the event count
+ * hits zero, all waiters upon the @fence are woken up.
+ */
+void kfence_complete(struct kfence *fence)
+{
+	if (WARN_ON(kfence_done(fence)))
+		return;
+
+	__kfence_complete(fence, NULL);
+}
+EXPORT_SYMBOL_GPL(kfence_complete);
+
+/**
+ * kfence_wait - wait upon a fence to be completed
+ * @fence: the kfence to wait upon
+ *
+ * Blocks (uninterruptibly waits) until the @fence event counter reaches zero
+ * and then also waits for the fence-completed notification to finish.
+ */
+void kfence_wait(struct kfence *fence)
+{
+	wait_event(fence->wait, kfence_done(fence));
+}
+EXPORT_SYMBOL_GPL(kfence_wait);
+
+/**
+ * kfence_init - initialize a fence for embedded use within a struct
+ * @fence: this kfence
+ * @fn: a callback function for when the fence is complete, and when the
+ * fence is released
+ *
+ * This function initialises the @fence for use embedded within a parent
+ * structure. The optional @fn hook is first called when the fence is completed
+ * (when its pending event count hits 0) and again when the fence is
+ * to be freed. Note that the @fn will be called from atomic context. The @fn
+ * is stored inside the fence mixed with some flags, and so the @fn must
+ * be aligned using the __kfence_call function attribute.
+ *
+ * If the @fn is not provided, the kfence must be the first member in its
+ * parent struct as it will be freed using kfree().
+ *
+ * fence-complete notification: @fn will be called when the pending event
+ * count hits 0; however, the fence is not completed unless the callback
+ * returns NOTIFY_DONE. During this notification callback kfence_done() reports
+ * false. You can suspend completion of the fence by returning
+ * NOTIFY_OK instead and then later calling kfence_wake_up_all().
+ *
+ * fence-release notification: @fn will be called when the reference count
+ * hits 0, at which point kfence_done() will report true.
+ */
+void kfence_init(struct kfence *fence, kfence_notify_t fn)
+{
+	BUG_ON((unsigned long)fn & ~KFENCE_MASK);
+
+	init_waitqueue_head(&fence->wait);
+	kref_init(&fence->kref);
+	atomic_set(&fence->pending, 1);
+	fence->flags = (unsigned long)fn;
+}
+EXPORT_SYMBOL_GPL(kfence_init);
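+
+/*
+ * An illustrative sketch of deferring the wake-up (struct deferred, which
+ * embeds a struct kfence and a struct work_struct, is hypothetical, not
+ * part of this patch): returning NOTIFY_OK from the completion notification
+ * suspends the fence, holding back its waiters until kfence_wake_up_all()
+ * is called, here from a work item; the release notification then frees
+ * the parent and returns NOTIFY_DONE:
+ *
+ *	static void deferred_work(struct work_struct *wrk)
+ *	{
+ *		struct deferred *d = container_of(wrk, typeof(*d), work);
+ *
+ *		kfence_wake_up_all(&d->fence);
+ *	}
+ *
+ *	static int __kfence_call deferred_notify(struct kfence *fence)
+ *	{
+ *		struct deferred *d = container_of(fence, typeof(*d), fence);
+ *
+ *		if (!kfence_done(fence)) {
+ *			schedule_work(&d->work);
+ *			return NOTIFY_OK;
+ *		}
+ *
+ *		kfree(d);
+ *		return NOTIFY_DONE;
+ *	}
+ */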
+
+static int kfence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key)
+{
+	list_del(&wq->task_list);
+	__kfence_complete(wq->private, key);
+	kfence_put(wq->private);
+	kfree(wq);
+	return 0;
+}
+
+static bool __kfence_check_if_after(struct kfence *fence,
+				    const struct kfence * const signaler)
+{
+	wait_queue_t *wq;
+
+	if (__test_and_set_bit(KFENCE_CHECKED_BIT, &fence->flags))
+		return false;
+
+	if (fence == signaler)
+		return true;
+
+	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+		if (wq->func != kfence_wake)
+			continue;
+
+		if (__kfence_check_if_after(wq->private, signaler))
+			return true;
+	}
+
+	return false;
+}
+
+static void __kfence_clear_checked_bit(struct kfence *fence)
+{
+	wait_queue_t *wq;
+
+	if (!__test_and_clear_bit(KFENCE_CHECKED_BIT, &fence->flags))
+		return;
+
+	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+		if (wq->func != kfence_wake)
+			continue;
+
+		__kfence_clear_checked_bit(wq->private);
+	}
+}
+
+static bool kfence_check_if_after(struct kfence *fence,
+				  const struct kfence * const signaler)
+{
+	unsigned long flags;
+	bool err;
+
+	if (!config_enabled(CONFIG_KFENCE_CHECK_DAG))
+		return false;
+
+	spin_lock_irqsave(&kfence_lock, flags);
+	err = __kfence_check_if_after(fence, signaler);
+	__kfence_clear_checked_bit(fence);
+	spin_unlock_irqrestore(&kfence_lock, flags);
+
+	return err;
+}
+
+static wait_queue_t *__kfence_create_wq(struct kfence *fence, gfp_t gfp)
+{
+	wait_queue_t *wq;
+
+	wq = kmalloc(sizeof(*wq), gfp);
+	if (unlikely(!wq))
+		return NULL;
+
+	INIT_LIST_HEAD(&wq->task_list);
+	wq->flags = 0;
+	wq->func = kfence_wake;
+	wq->private = kfence_get(fence);
+
+	kfence_await(fence);
+
+	return wq;
+}
+
+/**
+ * kfence_await_kfence - set one fence to wait upon another
+ * @fence: this kfence
+ * @signaler: target kfence to wait upon
+ * @gfp: the allowed allocation mask
+ *
+ * kfence_await_kfence() causes the @fence to asynchronously wait upon the
+ * completion of @signaler.
+ *
+ * Returns 1 if the @fence was added to the waitqueue of @signaler, 0
+ * if @signaler was already complete, or a negative error code.
+ */
+int kfence_await_kfence(struct kfence *fence,
+			struct kfence *signaler,
+			gfp_t gfp)
+{
+	wait_queue_t *wq;
+	unsigned long flags;
+	int pending;
+
+	if (kfence_done(signaler))
+		return 0;
+
+	/* The dependency graph must be acyclic. */
+	if (unlikely(kfence_check_if_after(fence, signaler)))
+		return -EINVAL;
+
+	wq = __kfence_create_wq(fence, gfp);
+	if (unlikely(!wq)) {
+		if (!gfpflags_allow_blocking(gfp))
+			return -ENOMEM;
+
+		kfence_wait(signaler);
+		return 0;
+	}
+
+	spin_lock_irqsave(&signaler->wait.lock, flags);
+	if (likely(!kfence_done(signaler))) {
+		__add_wait_queue_tail(&signaler->wait, wq);
+		pending = 1;
+	} else {
+		kfence_wake(wq, 0, 0, NULL);
+		pending = 0;
+	}
+	spin_unlock_irqrestore(&signaler->wait.lock, flags);
+
+	return pending;
+}
+EXPORT_SYMBOL_GPL(kfence_await_kfence);
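+
+/*
+ * An illustrative caller pattern for the tri-state return (sketch only):
+ *
+ *	ret = kfence_await_kfence(fence, signaler, GFP_KERNEL);
+ *	if (ret < 0)
+ *		return ret;
+ *
+ * A negative value reports failure, e.g. -EINVAL for a cycle in the
+ * dependency graph, or -ENOMEM when no wait could be allocated for a
+ * non-blocking gfp; 0 reports that the signaler had already completed so
+ * no wait was installed; 1 reports that the fence is now ordered after
+ * the signaler.
+ */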
+
+/**
+ * kfence_await_completion - set the fence to wait upon a completion
+ * @fence: this kfence
+ * @x: target completion to wait upon
+ * @gfp: the allowed allocation mask
+ *
+ * kfence_await_completion() causes the @fence to asynchronously wait upon
+ * the completion.
+ *
+ * Returns 1 if the @fence was added to the waitqueue of @x, 0
+ * if @x was already complete, or a negative error code.
+ */
+int kfence_await_completion(struct kfence *fence,
+			    struct completion *x,
+			    gfp_t gfp)
+{
+	wait_queue_t *wq;
+	unsigned long flags;
+	int pending;
+
+	if (completion_done(x))
+		return 0;
+
+	wq = __kfence_create_wq(fence, gfp);
+	if (unlikely(!wq)) {
+		if (!gfpflags_allow_blocking(gfp))
+			return -ENOMEM;
+
+		wait_for_completion(x);
+		return 0;
+	}
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	if (likely(!READ_ONCE(x->done))) {
+		__add_wait_queue_tail(&x->wait, wq);
+		pending = 1;
+	} else {
+		kfence_wake(wq, 0, 0, NULL);
+		pending = 0;
+	}
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+
+	return pending;
+}
+EXPORT_SYMBOL_GPL(kfence_await_completion);
+
+struct timer_cb {
+	struct hrtimer timer;
+	struct kfence *fence;
+};
+
+static enum hrtimer_restart
+timer_kfence_wake(struct hrtimer *timer)
+{
+	struct timer_cb *cb = container_of(timer, typeof(*cb), timer);
+
+	kfence_complete(cb->fence);
+	kfence_put(cb->fence);
+	kfree(cb);
+
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * kfence_await_hrtimer - set the fence to wait for a period of time
+ * @fence: this kfence
+ * @clock: which clock to program
+ * @mode: delay given as relative or absolute
+ * @delay: how long or until what time to wait
+ * @slack: how much slack may be applied to the delay
+ * @gfp: the allowed allocation mask
+ *
+ * kfence_await_hrtimer() causes the @fence to wait for a period of time, or
+ * until a certain point in time. It is a convenience wrapper around
+ * hrtimer_start_range_ns(). For more details on @clock, @mode, @delay and
+ * @slack please consult the hrtimer documentation.
+ *
+ * Returns 1 if the delay was successfully added to the @fence, or a negative
+ * error code on failure.
+ */
+int kfence_await_hrtimer(struct kfence *fence,
+			 clockid_t clock, enum hrtimer_mode mode,
+			 ktime_t delay, u64 slack,
+			 gfp_t gfp)
+{
+	struct timer_cb *cb;
+
+	cb = kmalloc(sizeof(*cb), gfp);
+	if (!cb)
+		return -ENOMEM;
+
+	cb->fence = kfence_get(fence);
+	kfence_await(fence);
+
+	hrtimer_init(&cb->timer, clock, mode);
+	cb->timer.function = timer_kfence_wake;
+
+	hrtimer_start_range_ns(&cb->timer, delay, slack, mode);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kfence_await_hrtimer);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index eb8917a71489..d8297d313a2c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1733,6 +1733,29 @@ config KPROBES_SANITY_TEST
 
 	  Say N if you are unsure.
 
+config KFENCE_SELFTEST
+	tristate "Kfence self tests"
+	depends on DEBUG_KERNEL
+	default n
+	help
+	  This option provides a kernel module that can be used to test
+	  the kfence handling. This option is not useful for distributions
+	  or general kernels, but only for kernel developers working on the
+	  kfence facility.
+
+	  Say N if you are unsure.
+
+config KFENCE_CHECK_DAG
+	bool "Check that kfences are only used with directed acyclic graphs"
+	depends on DEBUG_KERNEL
+	default n
+	help
+	  This option enforces that kfences are only used with directed acyclic
+	  graphs (DAG), as otherwise the cycles in the graph mean that they
+	  will never be signaled (or the corresponding task executed).
+
+	  Say N if you are unsure.
+
 config BACKTRACE_SELF_TEST
 	tristate "Self test for the backtrace code"
 	depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index cfa68eb269e4..ca9ce8e700eb 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,6 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
+obj-$(CONFIG_KFENCE_SELFTEST) += test-kfence.o
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 lib-$(CONFIG_HAS_DMA) += dma-noop.o
diff --git a/lib/test-kfence.c b/lib/test-kfence.c
new file mode 100644
index 000000000000..1b0853fda7c3
--- /dev/null
+++ b/lib/test-kfence.c
@@ -0,0 +1,580 @@
+/*
+ * Test cases for kfence facility.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/delay.h>
+#include <linux/kfence.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+static struct kfence *alloc_kfence(void)
+{
+	struct kfence *fence;
+
+	fence = kmalloc(sizeof(*fence), GFP_KERNEL);
+	if (!fence)
+		return NULL;
+
+	kfence_init(fence, NULL);
+	return fence;
+}
+
+static int __init __test_self(struct kfence *fence)
+{
+	if (kfence_done(fence))
+		return -EINVAL;
+
+	kfence_complete(fence);
+	if (!kfence_done(fence))
+		return -EINVAL;
+
+	kfence_wait(fence);
+	if (!kfence_done(fence))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int __init test_self(void)
+{
+	struct kfence *fence;
+	int ret;
+
+	/* Test kfence signaling and completion testing */
+	pr_debug("%s\n", __func__);
+
+	fence = alloc_kfence();
+	if (!fence)
+		return -ENOMEM;
+
+	ret = __test_self(fence);
+
+	kfence_put(fence);
+	return ret;
+}
+
+struct test_stack {
+	struct kfence fence;
+	bool seen;
+};
+
+static int __init __kfence_call fence_callback(struct kfence *fence)
+{
+	container_of(fence, struct test_stack, fence)->seen = true;
+	return NOTIFY_DONE;
+}
+
+static int __init test_stack(void)
+{
+	struct test_stack ts;
+	int ret;
+
+	/* Test kfence signaling and completion testing (on stack) */
+	pr_debug("%s\n", __func__);
+
+	ts.seen = false;
+	kfence_init(&ts.fence, fence_callback);
+
+	ret = __test_self(&ts.fence);
+	if (ret < 0)
+		return ret;
+
+	if (!ts.seen) {
+		pr_err("fence callback not executed\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int __init test_dag(void)
+{
+	struct kfence *A, *B, *C;
+
+	/* Test detection of cycles within the kfence graphs */
+	pr_debug("%s\n", __func__);
+
+	if (!config_enabled(CONFIG_KFENCE_CHECK_DAG))
+		return 0;
+
+	A = alloc_kfence();
+	if (!A)
+		return -ENOMEM;
+
+	if (kfence_await_kfence(A, A, GFP_KERNEL) != -EINVAL) {
+		pr_err("recursive cycle not detected (AA)\n");
+		return -EINVAL;
+	}
+
+	B = alloc_kfence();
+	if (!B)
+		return -ENOMEM;
+
+	kfence_await_kfence(A, B, GFP_KERNEL);
+	if (kfence_await_kfence(B, A, GFP_KERNEL) != -EINVAL) {
+		pr_err("single depth cycle not detected (BAB)\n");
+		return -EINVAL;
+	}
+
+	C = alloc_kfence();
+	if (!C)
+		return -ENOMEM;
+
+	kfence_await_kfence(B, C, GFP_KERNEL);
+	if (kfence_await_kfence(C, A, GFP_KERNEL) != -EINVAL) {
+		pr_err("cycle not detected (BA, CB, AC)\n");
+		return -EINVAL;
+	}
+
+	kfence_complete(A);
+	kfence_put(A);
+
+	kfence_complete(B);
+	kfence_put(B);
+
+	kfence_complete(C);
+	kfence_put(C);
+
+	return 0;
+}
+
+static int __init test_AB(void)
+{
+	struct kfence *A, *B;
+	int ret;
+
+	/* Test kfence (A) waiting on an event source (B) */
+	pr_debug("%s\n", __func__);
+
+	A = alloc_kfence();
+	B = alloc_kfence();
+	if (!A || !B)
+		return -ENOMEM;
+
+	ret = kfence_await_kfence(A, B, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	kfence_complete(A);
+	if (kfence_done(A))
+		return -EINVAL;
+
+	kfence_complete(B);
+	if (!kfence_done(B))
+		return -EINVAL;
+
+	if (!kfence_done(A))
+		return -EINVAL;
+
+	kfence_put(B);
+	kfence_put(A);
+	return 0;
+}
+
+static int __init test_ABC(void)
+{
+	struct kfence *A, *B, *C;
+	int ret;
+
+	/* Test a chain of fences, A waits on B who waits on C */
+	pr_debug("%s\n", __func__);
+
+	A = alloc_kfence();
+	B = alloc_kfence();
+	C = alloc_kfence();
+	if (!A || !B || !C)
+		return -ENOMEM;
+
+	ret = kfence_await_kfence(A, B, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	ret = kfence_await_kfence(B, C, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	kfence_complete(A);
+	if (kfence_done(A))
+		return -EINVAL;
+
+	kfence_complete(B);
+	if (kfence_done(B))
+		return -EINVAL;
+
+	if (kfence_done(A))
+		return -EINVAL;
+
+	kfence_complete(C);
+	if (!kfence_done(C))
+		return -EINVAL;
+
+	if (!kfence_done(B))
+		return -EINVAL;
+
+	if (!kfence_done(A))
+		return -EINVAL;
+
+	kfence_put(C);
+	kfence_put(B);
+	kfence_put(A);
+	return 0;
+}
+
+static int __init test_AB_C(void)
+{
+	struct kfence *A, *B, *C;
+	int ret;
+
+	/* Test multiple fences (AB) waiting on a single event (C) */
+	pr_debug("%s\n", __func__);
+
+	A = alloc_kfence();
+	B = alloc_kfence();
+	C = alloc_kfence();
+	if (!A || !B || !C)
+		return -ENOMEM;
+
+	ret = kfence_await_kfence(A, C, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	ret = kfence_await_kfence(B, C, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	kfence_complete(A);
+	kfence_complete(B);
+
+	if (kfence_done(A))
+		return -EINVAL;
+
+	if (kfence_done(B))
+		return -EINVAL;
+
+	kfence_complete(C);
+	if (!kfence_done(C))
+		return -EINVAL;
+
+	if (!kfence_done(B))
+		return -EINVAL;
+
+	if (!kfence_done(A))
+		return -EINVAL;
+
+	kfence_put(C);
+	kfence_put(B);
+	kfence_put(A);
+	return 0;
+}
+
+static int __init test_C_AB(void)
+{
+	struct kfence *A, *B, *C;
+	int ret;
+
+	/* Test multiple event sources (A,B) for a single fence (C) */
+	pr_debug("%s\n", __func__);
+
+	A = alloc_kfence();
+	B = alloc_kfence();
+	C = alloc_kfence();
+	if (!A || !B || !C)
+		return -ENOMEM;
+
+	ret = kfence_await_kfence(C, A, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	ret = kfence_await_kfence(C, B, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	kfence_complete(C);
+	if (kfence_done(C))
+		return -EINVAL;
+
+	kfence_complete(A);
+	kfence_complete(B);
+
+	if (!kfence_done(A))
+		return -EINVAL;
+
+	if (!kfence_done(B))
+		return -EINVAL;
+
+	if (!kfence_done(C))
+		return -EINVAL;
+
+	kfence_put(C);
+	kfence_put(B);
+	kfence_put(A);
+	return 0;
+}
+
+static int __init test_completion(void)
+{
+	struct kfence *fence;
+	struct completion x;
+	int ret;
+
+	/* Test use of a completion as an event source for kfences */
+	pr_debug("%s\n", __func__);
+
+	init_completion(&x);
+
+	fence = alloc_kfence();
+	if (!fence)
+		return -ENOMEM;
+
+	ret = kfence_await_completion(fence, &x, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	kfence_complete(fence);
+	if (kfence_done(fence))
+		return -EINVAL;
+
+	complete_all(&x);
+	if (!kfence_done(fence))
+		return -EINVAL;
+
+	kfence_put(fence);
+	return 0;
+}
+
+static int __init test_delay(void)
+{
+	struct kfence *fence;
+	ktime_t delay;
+	int ret;
+
+	/* Test use of a hrtimer as an event source for kfences */
+	pr_debug("%s\n", __func__);
+
+	fence = alloc_kfence();
+	if (!fence)
+		return -ENOMEM;
+
+	delay = ktime_get();
+
+	ret = kfence_await_hrtimer(fence, CLOCK_MONOTONIC, HRTIMER_MODE_REL,
+				   ms_to_ktime(1), 1 << 10,
+				   GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		return -EINVAL;
+
+	kfence_complete(fence);
+	kfence_wait(fence);
+
+	delay = ktime_sub(ktime_get(), delay);
+	kfence_put(fence);
+
+	if (!ktime_to_ms(delay)) {
+		pr_err("kfence woke too early, delay was only %lldns\n",
+		       (long long)ktime_to_ns(delay));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+struct task_ipc {
+	struct work_struct work;
+	struct completion started;
+	struct kfence *in, *out;
+	int value;
+};
+
+static void __init task_ipc(struct work_struct *work)
+{
+	struct task_ipc *ipc = container_of(work, typeof(*ipc), work);
+
+	complete(&ipc->started);
+
+	kfence_wait(ipc->in);
+	smp_store_mb(ipc->value, 1);
+	kfence_complete(ipc->out);
+}
+
+static int __init test_chain(void)
+{
+	const int nfences = 4096;
+	struct kfence **fences;
+	int ret, i;
+
+	/* Test a long chain of fences */
+	pr_debug("%s\n", __func__);
+
+	fences = kmalloc_array(nfences, sizeof(*fences), GFP_KERNEL);
+	if (!fences)
+		return -ENOMEM;
+
+	for (i = 0; i < nfences; i++) {
+		fences[i] = alloc_kfence();
+		if (!fences[i])
+			return -ENOMEM;
+
+		if (i > 0) {
+			ret = kfence_await_kfence(fences[i],
+						  fences[i - 1],
+						  GFP_KERNEL);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	for (i = nfences; --i; ) {
+		kfence_complete(fences[i]);
+		if (kfence_done(fences[i]))
+			return -EINVAL;
+	}
+
+	kfence_complete(fences[0]);
+	for (i = 0; i < nfences; i++) {
+		if (!kfence_done(fences[i]))
+			return -EINVAL;
+
+		kfence_put(fences[i]);
+	}
+	kfree(fences);
+	return 0;
+}
+
+static int __init test_ipc(void)
+{
+	struct task_ipc ipc;
+	int ret = 0;
+
+	/* Test use of kfence as an interprocess signaling mechanism */
+	pr_debug("%s\n", __func__);
+
+	ipc.in = alloc_kfence();
+	ipc.out = alloc_kfence();
+	if (!ipc.in || !ipc.out)
+		return -ENOMEM;
+
+	/* use a completion to avoid chicken-and-egg testing for kfence */
+	init_completion(&ipc.started);
+
+	ipc.value = 0;
+	INIT_WORK(&ipc.work, task_ipc);
+	schedule_work(&ipc.work);
+
+	wait_for_completion(&ipc.started);
+
+	usleep_range(1000, 2000);
+	if (READ_ONCE(ipc.value)) {
+		pr_err("worker updated value before kfence was signaled\n");
+		ret = -EINVAL;
+	}
+
+	kfence_complete(ipc.in);
+	kfence_wait(ipc.out);
+
+	if (!READ_ONCE(ipc.value)) {
+		pr_err("worker signaled kfence before value was posted\n");
+		ret = -EINVAL;
+	}
+
+	flush_work(&ipc.work);
+	kfence_put(ipc.in);
+	kfence_put(ipc.out);
+	return ret;
+}
+
+static int __init test_kfence_init(void)
+{
+	int ret;
+
+	pr_info("Testing kfences\n");
+
+	ret = test_self();
+	if (ret < 0) {
+		pr_err("self failed\n");
+		return ret;
+	}
+
+	ret = test_stack();
+	if (ret < 0) {
+		pr_err("stack failed\n");
+		return ret;
+	}
+
+	ret = test_dag();
+	if (ret < 0) {
+		pr_err("DAG checker failed\n");
+		return ret;
+	}
+
+	ret = test_AB();
+	if (ret < 0) {
+		pr_err("AB failed\n");
+		return ret;
+	}
+
+	ret = test_ABC();
+	if (ret < 0) {
+		pr_err("ABC failed\n");
+		return ret;
+	}
+
+	ret = test_AB_C();
+	if (ret < 0) {
+		pr_err("AB_C failed\n");
+		return ret;
+	}
+
+	ret = test_C_AB();
+	if (ret < 0) {
+		pr_err("C_AB failed\n");
+		return ret;
+	}
+
+	ret = test_chain();
+	if (ret < 0) {
+		pr_err("chain failed\n");
+		return ret;
+	}
+
+	ret = test_ipc();
+	if (ret < 0) {
+		pr_err("ipc failed\n");
+		return ret;
+	}
+
+	ret = test_completion();
+	if (ret < 0) {
+		pr_err("completion failed\n");
+		return ret;
+	}
+
+	ret = test_delay();
+	if (ret < 0) {
+		pr_err("delay failed\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit test_kfence_cleanup(void)
+{
+}
+
+module_init(test_kfence_init);
+module_exit(test_kfence_cleanup);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/lib/kfence.sh b/tools/testing/selftests/lib/kfence.sh
new file mode 100755
index 000000000000..487320c70ed1
--- /dev/null
+++ b/tools/testing/selftests/lib/kfence.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Runs infrastructure tests using test-kfence kernel module
+
+if /sbin/modprobe -q test-kfence; then
+	/sbin/modprobe -q -r test-kfence
+	echo "kfence: ok"
+else
+	echo "kfence: [FAIL]"
+	exit 1
+fi
-- 
2.8.1


