[PATCH 09/43] drm/i915: Unify active context tracking between legacy/execlists/guc

Sun Dec 4 21:16:47 UTC 2016

The requests conversion introduced a nasty bug where we could generate a
new request in the middle of constructing a request. The new request
would be executed (and waited upon) before the current one, creating a
minor havoc in the seqno accounting. (Prior to deferred seqno
assignment, this would mean that the seqno would be out of order, and
the current request would be deemed complete even before it was
submitted.)

We also employed two different mechanisms to track the active context
until it was switched out. The legacy method allowed for waiting upon an
active context (it could forcibly evict any vma, including context's),
but the execlists method took a step backwards by pinning the vma for
the entire active lifespan of the context (the only way to evict was to
idle the entire GPU, not individual contexts). However, to circumvent
the tricky issue of locking (i.e. we cannot take struct_mutex at the
time of i915_gem_request_submit(), where we would want to move the
previous context onto the active tracker and unpin it), we take the
execlists approach and keep the contexts pinned until retirement.
The benefit of the execlists approach, more important for execlists than
legacy, was the reduction in work in pinning the context for each
request - as the context was kept pinned until idle, it could short
circuit the pinning for all active contexts.

We introduce new engine vfuncs to pin and unpin the context
respectively. The context is pinned at the start of the request, and
only unpinned when the following request is retired (this ensures that
the context is idle and coherent in main memory before we unpin it). We
move the engine->last_context tracking into the retirement itself
(rather than during request submission) in order to allow the submission
to be reordered or unwound without undue difficultly.

And finally an ulterior motive for unifying context handling was to
prepare for mock requests.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h            |   4 --
 drivers/gpu/drm/i915/i915_gem_context.c    | 102 +++--------------------------
 drivers/gpu/drm/i915/i915_gem_request.c    |  37 +++++++----
 drivers/gpu/drm/i915/i915_gem_request.h    |  11 ----
 drivers/gpu/drm/i915/i915_guc_submission.c |  11 ----
 drivers/gpu/drm/i915/i915_perf.c           |  18 ++---
 drivers/gpu/drm/i915/intel_engine_cs.c     |  21 +++++-
 drivers/gpu/drm/i915/intel_lrc.c           |  63 ++++++------------
 drivers/gpu/drm/i915/intel_lrc.h           |   5 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c    |  57 +++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   4 ++
 11 files changed, 122 insertions(+), 211 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 035ac75dd1ed..ba4aa6aaf64c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2233,7 +2233,6 @@ struct drm_i915_private {
 			struct i915_perf_stream *exclusive_stream;
 
 			u32 specific_ctx_id;
-			struct i915_vma *pinned_rcs_vma;
 
 			struct hrtimer poll_check_timer;
 			wait_queue_head_t poll_wq;
@@ -3272,9 +3271,6 @@ int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
 void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
 int i915_switch_context(struct drm_i915_gem_request *req);
 int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv);
-struct i915_vma *
-i915_gem_context_pin_legacy(struct i915_gem_context *ctx,
-			    unsigned int flags);
 void i915_gem_context_free(struct kref *ctx_ref);
 struct i915_gem_context *
 i915_gem_context_create_gvt(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index a57c22659a3c..95812c26767c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -416,21 +416,6 @@ i915_gem_context_create_gvt(struct drm_device *dev)
 	return ctx;
 }
 
-static void i915_gem_context_unpin(struct i915_gem_context *ctx,
-				   struct intel_engine_cs *engine)
-{
-	if (i915.enable_execlists) {
-		intel_lr_context_unpin(ctx, engine);
-	} else {
-		struct intel_context *ce = &ctx->engine[engine->id];
-
-		if (ce->state)
-			i915_vma_unpin(ce->state);
-
-		i915_gem_context_put(ctx);
-	}
-}
-
 int i915_gem_context_init(struct drm_i915_private *dev_priv)
 {
 	struct i915_gem_context *ctx;
@@ -490,10 +475,11 @@ void i915_gem_context_lost(struct drm_i915_private *dev_priv)
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
 	for_each_engine(engine, dev_priv, id) {
-		if (engine->last_context) {
-			i915_gem_context_unpin(engine->last_context, engine);
-			engine->last_context = NULL;
-		}
+		if (!engine->last_context)
+			continue;
+
+		engine->context_unpin(engine, engine->last_context);
+		engine->last_context = NULL;
 	}
 
 	/* Force the GPU state to be restored on enabling */
@@ -761,57 +747,20 @@ needs_pd_load_post(struct i915_hw_ppgtt *ppgtt,
 	return false;
 }
 
-struct i915_vma *
-i915_gem_context_pin_legacy(struct i915_gem_context *ctx,
-			    unsigned int flags)
-{
-	struct i915_vma *vma = ctx->engine[RCS].state;
-	int ret;
-
-	/* Clear this page out of any CPU caches for coherent swap-in/out.
-	 * We only want to do this on the first bind so that we do not stall
-	 * on an active context (which by nature is already on the GPU).
-	 */
-	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
-		ret = i915_gem_object_set_to_gtt_domain(vma->obj, false);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	ret = i915_vma_pin(vma, 0, ctx->ggtt_alignment, PIN_GLOBAL | flags);
-	if (ret)
-		return ERR_PTR(ret);
-
-	return vma;
-}
-
 static int do_rcs_switch(struct drm_i915_gem_request *req)
 {
 	struct i915_gem_context *to = req->ctx;
 	struct intel_engine_cs *engine = req->engine;
 	struct i915_hw_ppgtt *ppgtt = to->ppgtt ?: req->i915->mm.aliasing_ppgtt;
-	struct i915_vma *vma;
-	struct i915_gem_context *from;
+	struct i915_gem_context *from = engine->last_context;
 	u32 hw_flags;
 	int ret, i;
 
+	GEM_BUG_ON(engine->id != RCS);
+
 	if (skip_rcs_switch(ppgtt, engine, to))
 		return 0;
 
-	/* Trying to pin first makes error handling easier. */
-	vma = i915_gem_context_pin_legacy(to, 0);
-	if (IS_ERR(vma))
-		return PTR_ERR(vma);
-
-	/*
-	 * Pin can switch back to the default context if we end up calling into
-	 * evict_everything - as a last ditch gtt defrag effort that also
-	 * switches to the default context. Hence we need to reload from here.
-	 *
-	 * XXX: Doing so is painfully broken!
-	 */
-	from = engine->last_context;
-
 	if (needs_pd_load_pre(ppgtt, engine, to)) {
 		/* Older GENs and non render rings still want the load first,
 		 * "PP_DCLV followed by PP_DIR_BASE register through Load
@@ -820,7 +769,7 @@ static int do_rcs_switch(struct drm_i915_gem_request *req)
 		trace_switch_mm(engine, to);
 		ret = ppgtt->switch_mm(ppgtt, req);
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	if (!to->engine[RCS].initialised || i915_gem_context_is_default(to))
@@ -837,29 +786,8 @@ static int do_rcs_switch(struct drm_i915_gem_request *req)
 	if (to != from || (hw_flags & MI_FORCE_RESTORE)) {
 		ret = mi_set_context(req, hw_flags);
 		if (ret)
-			goto err;
-	}
-
-	/* The backing object for the context is done after switching to the
-	 * *next* context. Therefore we cannot retire the previous context until
-	 * the next context has already started running. In fact, the below code
-	 * is a bit suboptimal because the retiring can occur simply after the
-	 * MI_SET_CONTEXT instead of when the next seqno has completed.
-	 */
-	if (from != NULL) {
-		/* As long as MI_SET_CONTEXT is serializing, ie. it flushes the
-		 * whole damn pipeline, we don't need to explicitly mark the
-		 * object dirty. The only exception is that the context must be
-		 * correct in case the object gets swapped out. Ideally we'd be
-		 * able to defer doing this until we know the object would be
-		 * swapped, but there is no way to do that yet.
-		 */
-		i915_vma_move_to_active(from->engine[RCS].state, req, 0);
-		/* state is kept alive until the next request */
-		i915_vma_unpin(from->engine[RCS].state);
-		i915_gem_context_put(from);
+			return ret;
 	}
-	engine->last_context = i915_gem_context_get(to);
 
 	/* GEN8 does *not* require an explicit reload if the PDPs have been
 	 * setup, and we do not wish to move them.
@@ -900,10 +828,6 @@ static int do_rcs_switch(struct drm_i915_gem_request *req)
 	}
 
 	return 0;
-
-err:
-	i915_vma_unpin(vma);
-	return ret;
 }
 
 /**
@@ -943,12 +867,6 @@ int i915_switch_context(struct drm_i915_gem_request *req)
 			ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine);
 		}
 
-		if (to != engine->last_context) {
-			if (engine->last_context)
-				i915_gem_context_put(engine->last_context);
-			engine->last_context = i915_gem_context_get(to);
-		}
-
 		return 0;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index fcf22b0e2967..f84c35b2b8c2 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -206,6 +206,7 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
 
 static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 {
+	struct intel_engine_cs *engine = request->engine;
 	struct i915_gem_active *active, *next;
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
@@ -216,9 +217,9 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 
 	trace_i915_gem_request_retire(request);
 
-	spin_lock_irq(&request->engine->timeline->lock);
+	spin_lock_irq(&engine->timeline->lock);
 	list_del_init(&request->link);
-	spin_unlock_irq(&request->engine->timeline->lock);
+	spin_unlock_irq(&engine->timeline->lock);
 
 	/* We know the GPU must have read the request to have
 	 * sent us the seqno + interrupt, so use the position
@@ -266,17 +267,20 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 
 	i915_gem_request_remove_from_client(request);
 
-	if (request->previous_context) {
-		if (i915.enable_execlists)
-			intel_lr_context_unpin(request->previous_context,
-					       request->engine);
-	}
-
 	/* Retirement decays the ban score as it is a sign of ctx progress */
 	if (request->ctx->ban_score > 0)
 		request->ctx->ban_score--;
 
-	i915_gem_context_put(request->ctx);
+	/* The backing object for the context is done after switching to the
+	 * *next* context. Therefore we cannot retire the previous context until
+	 * the next context has already started running. However, since we
+	 * cannot take the required locks at i915_gem_request_submit() we
+	 * defer the unpinning of the active context to now, retirement of
+	 * the subsequent request.
+	 */
+	if (engine->last_context)
+		engine->context_unpin(engine, engine->last_context);
+	engine->last_context = request->ctx;
 
 	dma_fence_signal(&request->fence);
 
@@ -524,10 +528,18 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = reserve_global_seqno(dev_priv);
+	/* Pinning the contexts may generate requests in order to acquire
+	 * GGTT space, so do this first before we reserve a seqno for
+	 * ourselves.
+	 */
+	ret = engine->context_pin(engine, ctx);
 	if (ret)
 		return ERR_PTR(ret);
 
+	ret = reserve_global_seqno(dev_priv);
+	if (ret)
+		goto err_unpin;
+
 	/* Move the oldest request to the slab-cache (if not in use!) */
 	req = list_first_entry_or_null(&engine->timeline->requests,
 				       typeof(*req), link);
@@ -593,11 +605,10 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
-	req->ctx = i915_gem_context_get(ctx);
+	req->ctx = ctx;
 
 	/* No zalloc, must clear what we need by hand */
 	req->global_seqno = 0;
-	req->previous_context = NULL;
 	req->file_priv = NULL;
 	req->batch = NULL;
 
@@ -637,6 +648,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	kmem_cache_free(dev_priv->requests, req);
 err_unreserve:
 	dev_priv->gt.active_requests--;
+err_unpin:
+	engine->context_unpin(engine, ctx);
 	return ERR_PTR(ret);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index e2b077df2da0..8569b35a332a 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -170,17 +170,6 @@ struct drm_i915_gem_request {
 	/** Preallocate space in the ring for the emitting the request */
 	u32 reserved_space;
 
-	/**
-	 * Context related to the previous request.
-	 * As the contexts are accessed by the hardware until the switch is
-	 * completed to a new context, the hardware may still be writing
-	 * to the context object after the breadcrumb is visible. We must
-	 * not unpin/unbind/prune that object whilst still active and so
-	 * we keep the previous context pinned until the following (this)
-	 * request is retired.
-	 */
-	struct i915_gem_context *previous_context;
-
 	/** Batch buffer related to this request if any (used for
 	 * error state dump only).
 	 */
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 7fa4e74c1dd3..3f144216e188 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -533,17 +533,6 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
-	struct intel_engine_cs *engine = rq->engine;
-
-	/* We keep the previous context alive until we retire the following
-	 * request. This ensures that any the context object is still pinned
-	 * for any residual writes the HW makes into it on the context switch
-	 * into the next object following the breadcrumb. Otherwise, we may
-	 * retire the context too early.
-	 */
-	rq->previous_context = engine->last_context;
-	engine->last_context = rq->ctx;
-
 	i915_gem_request_submit(rq);
 	__i915_guc_submit(rq);
 }
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5669f0862458..cfe4152212b9 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -641,7 +641,7 @@ static int i915_oa_read(struct i915_perf_stream *stream,
 static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	struct i915_vma *vma;
+	struct intel_engine_cs *engine = dev_priv->engine[RCS];
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
@@ -653,19 +653,16 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 	 *
 	 * NB: implied RCS engine...
 	 */
-	vma = i915_gem_context_pin_legacy(stream->ctx, 0);
-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
+	ret = engine->context_pin(engine, stream->ctx);
+	if (ret)
 		goto unlock;
-	}
-
-	dev_priv->perf.oa.pinned_rcs_vma = vma;
 
 	/* Explicitly track the ID (instead of calling i915_ggtt_offset()
 	 * on the fly) considering the difference with gen8+ and
 	 * execlists
 	 */
-	dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(vma);
+	dev_priv->perf.oa.specific_ctx_id =
+		i915_ggtt_offset(stream->ctx->engine[engine->id].state);
 
 unlock:
 	mutex_unlock(&dev_priv->drm.struct_mutex);
@@ -676,13 +673,12 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct intel_engine_cs *engine = dev_priv->engine[RCS];
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
 
-	i915_vma_unpin(dev_priv->perf.oa.pinned_rcs_vma);
-	dev_priv->perf.oa.pinned_rcs_vma = NULL;
-
 	dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+	engine->context_unpin(engine, stream->ctx);
 
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 }
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index e8afe1185831..97bbbc3d6aa8 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -304,15 +304,30 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
 {
 	int ret;
 
-	ret = intel_engine_init_breadcrumbs(engine);
+	/* We may need to do things with the shrinker which
+	 * require us to immediately switch back to the default
+	 * context. This can cause a problem as pinning the
+	 * default context also requires GTT space which may not
+	 * be available. To avoid this we always pin the default
+	 * context.
+	 */
+	ret = engine->context_pin(engine, engine->i915->kernel_context);
 	if (ret)
 		return ret;
 
+	ret = intel_engine_init_breadcrumbs(engine);
+	if (ret)
+		goto err_unpin;
+
 	ret = i915_gem_render_state_init(engine);
 	if (ret)
-		return ret;
+		goto err_unpin;
 
 	return 0;
+
+err_unpin:
+	engine->context_unpin(engine, engine->i915->kernel_context);
+	return ret;
 }
 
 /**
@@ -330,6 +345,8 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
 	i915_gem_batch_pool_fini(&engine->batch_pool);
+
+	engine->context_unpin(engine, engine->i915->kernel_context);
 }
 
 u64 intel_engine_get_active_head(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 58499cfccb88..58cea1f9ad27 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -512,15 +512,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		RB_CLEAR_NODE(&cursor->priotree.node);
 		cursor->priotree.priority = INT_MAX;
 
-		/* We keep the previous context alive until we retire the
-		 * following request. This ensures that any the context object
-		 * is still pinned for any residual writes the HW makes into it
-		 * on the context switch into the next object following the
-		 * breadcrumb. Otherwise, we may retire the context too early.
-		 */
-		cursor->previous_context = engine->last_context;
-		engine->last_context = cursor->ctx;
-
 		__i915_gem_request_submit(cursor);
 		last = cursor;
 		submit = true;
@@ -772,8 +763,8 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
 	/* XXX Do we need to preempt to make room for us and our deps? */
 }
 
-static int intel_lr_context_pin(struct i915_gem_context *ctx,
-				struct intel_engine_cs *engine)
+static int execlists_context_pin(struct intel_engine_cs *engine,
+				 struct i915_gem_context *ctx)
 {
 	struct intel_context *ce = &ctx->engine[engine->id];
 	void *vaddr;
@@ -784,6 +775,12 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
 	if (ce->pin_count++)
 		return 0;
 
+	if (!ce->state) {
+		ret = execlists_context_deferred_alloc(ctx, engine);
+		if (ret)
+			goto err;
+	}
+
 	ret = i915_vma_pin(ce->state, 0, GEN8_LR_CONTEXT_ALIGN,
 			   PIN_OFFSET_BIAS | GUC_WOPCM_TOP | PIN_GLOBAL);
 	if (ret)
@@ -825,8 +822,8 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx,
 	return ret;
 }
 
-void intel_lr_context_unpin(struct i915_gem_context *ctx,
-			    struct intel_engine_cs *engine)
+static void execlists_context_unpin(struct intel_engine_cs *engine,
+				    struct i915_gem_context *ctx)
 {
 	struct intel_context *ce = &ctx->engine[engine->id];
 
@@ -844,31 +841,23 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx,
 	i915_gem_context_put(ctx);
 }
 
-
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_context *ce = &request->ctx->engine[engine->id];
 	int ret;
 
+	GEM_BUG_ON(!ce->pin_count);
+
 	/* Flush enough space to reduce the likelihood of waiting after
 	 * we start building the request - in which case we will just
 	 * have to repeat work.
 	 */
 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
 
-	if (!ce->state) {
-		ret = execlists_context_deferred_alloc(request->ctx, engine);
-		if (ret)
-			return ret;
-	}
-
+	GEM_BUG_ON(!ce->ring);
 	request->ring = ce->ring;
 
-	ret = intel_lr_context_pin(request->ctx, engine);
-	if (ret)
-		return ret;
-
 	if (i915.enable_guc_submission) {
 		/*
 		 * Check that the GuC has space for the request before
@@ -877,7 +866,7 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 		 */
 		ret = i915_guc_wq_reserve(request);
 		if (ret)
-			goto err_unpin;
+			goto err;
 	}
 
 	ret = intel_ring_begin(request, 0);
@@ -905,8 +894,7 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 err_unreserve:
 	if (i915.enable_guc_submission)
 		i915_guc_wq_unreserve(request);
-err_unpin:
-	intel_lr_context_unpin(request->ctx, engine);
+err:
 	return ret;
 }
 
@@ -1790,13 +1778,12 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	if (engine->cleanup)
 		engine->cleanup(engine);
 
-	intel_engine_cleanup_common(engine);
-
 	if (engine->status_page.vma) {
 		i915_gem_object_unpin_map(engine->status_page.vma->obj);
 		engine->status_page.vma = NULL;
 	}
-	intel_lr_context_unpin(dev_priv->kernel_context, engine);
+
+	intel_engine_cleanup_common(engine);
 
 	lrc_destroy_wa_ctx_obj(engine);
 	engine->i915 = NULL;
@@ -1821,6 +1808,10 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	/* Default vfuncs which can be overriden by each engine. */
 	engine->init_hw = gen8_init_common_ring;
 	engine->reset_hw = reset_common_ring;
+
+	engine->context_pin = execlists_context_pin;
+	engine->context_unpin = execlists_context_unpin;
+
 	engine->emit_flush = gen8_emit_flush;
 	engine->emit_breadcrumb = gen8_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
@@ -1903,18 +1894,6 @@ logical_ring_init(struct intel_engine_cs *engine)
 	if (ret)
 		goto error;
 
-	ret = execlists_context_deferred_alloc(dctx, engine);
-	if (ret)
-		goto error;
-
-	/* As this is the default context, always pin it */
-	ret = intel_lr_context_pin(dctx, engine);
-	if (ret) {
-		DRM_ERROR("Failed to pin context for %s: %d\n",
-			  engine->name, ret);
-		goto error;
-	}
-
 	/* And setup the hardware status page. */
 	ret = lrc_setup_hws(engine, dctx->engine[engine->id].state);
 	if (ret) {
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 7c6403243394..b5630331086a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -79,13 +79,10 @@ int intel_engines_init(struct drm_i915_private *dev_priv);
 #define LRC_PPHWSP_PN	(LRC_GUCSHR_PN + 1)
 #define LRC_STATE_PN	(LRC_PPHWSP_PN + 1)
 
+struct drm_i915_private;
 struct i915_gem_context;
 
 uint32_t intel_lr_context_size(struct intel_engine_cs *engine);
-void intel_lr_context_unpin(struct i915_gem_context *ctx,
-			    struct intel_engine_cs *engine);
-
-struct drm_i915_private;
 
 void intel_lr_context_resume(struct drm_i915_private *dev_priv);
 uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index bc18a4f2643d..0bfdb1295a3b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1939,8 +1939,26 @@ intel_ring_free(struct intel_ring *ring)
 	kfree(ring);
 }
 
-static int intel_ring_context_pin(struct i915_gem_context *ctx,
-				  struct intel_engine_cs *engine)
+static int context_pin(struct i915_gem_context *ctx, unsigned int flags)
+{
+	struct i915_vma *vma = ctx->engine[RCS].state;
+	int ret;
+
+	/* Clear this page out of any CPU caches for coherent swap-in/out.
+	 * We only want to do this on the first bind so that we do not stall
+	 * on an active context (which by nature is already on the GPU).
+	 */
+	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
+		ret = i915_gem_object_set_to_gtt_domain(vma->obj, false);
+		if (ret)
+			return ret;
+	}
+
+	return i915_vma_pin(vma, 0, ctx->ggtt_alignment, PIN_GLOBAL | flags);
+}
+
+static int intel_ring_context_pin(struct intel_engine_cs *engine,
+				  struct i915_gem_context *ctx)
 {
 	struct intel_context *ce = &ctx->engine[engine->id];
 	int ret;
@@ -1951,13 +1969,15 @@ static int intel_ring_context_pin(struct i915_gem_context *ctx,
 		return 0;
 
 	if (ce->state) {
-		struct i915_vma *vma;
+		unsigned int flags;
+
+		flags = 0;
+		if (ctx == ctx->i915->kernel_context)
+			flags = PIN_HIGH;
 
-		vma = i915_gem_context_pin_legacy(ctx, PIN_HIGH);
-		if (IS_ERR(vma)) {
-			ret = PTR_ERR(vma);
+		ret = context_pin(ctx, flags);
+		if (ret)
 			goto error;
-		}
 	}
 
 	/* The kernel context is only used as a placeholder for flushing the
@@ -1978,8 +1998,8 @@ static int intel_ring_context_pin(struct i915_gem_context *ctx,
 	return ret;
 }
 
-static void intel_ring_context_unpin(struct i915_gem_context *ctx,
-				     struct intel_engine_cs *engine)
+static void intel_ring_context_unpin(struct intel_engine_cs *engine,
+				     struct i915_gem_context *ctx)
 {
 	struct intel_context *ce = &ctx->engine[engine->id];
 
@@ -2008,17 +2028,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 	if (ret)
 		goto error;
 
-	/* We may need to do things with the shrinker which
-	 * require us to immediately switch back to the default
-	 * context. This can cause a problem as pinning the
-	 * default context also requires GTT space which may not
-	 * be available. To avoid this we always pin the default
-	 * context.
-	 */
-	ret = intel_ring_context_pin(dev_priv->kernel_context, engine);
-	if (ret)
-		goto error;
-
 	ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
 	if (IS_ERR(ring)) {
 		ret = PTR_ERR(ring);
@@ -2077,8 +2086,6 @@ void intel_engine_cleanup(struct intel_engine_cs *engine)
 
 	intel_engine_cleanup_common(engine);
 
-	intel_ring_context_unpin(dev_priv->kernel_context, engine);
-
 	engine->i915 = NULL;
 	dev_priv->engine[engine->id] = NULL;
 	kfree(engine);
@@ -2099,12 +2106,15 @@ int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
 	int ret;
 
+	GEM_BUG_ON(!request->ctx->engine[request->engine->id].pin_count);
+
 	/* Flush enough space to reduce the likelihood of waiting after
 	 * we start building the request - in which case we will just
 	 * have to repeat work.
 	 */
 	request->reserved_space += LEGACY_REQUEST_SIZE;
 
+	GEM_BUG_ON(!request->engine->buffer);
 	request->ring = request->engine->buffer;
 
 	ret = intel_ring_begin(request, 0);
@@ -2584,6 +2594,9 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
 	engine->init_hw = init_ring_common;
 	engine->reset_hw = reset_ring_common;
 
+	engine->context_pin = intel_ring_context_pin;
+	engine->context_unpin = intel_ring_context_unpin;
+
 	engine->emit_breadcrumb = i9xx_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = i9xx_emit_breadcrumb_sz;
 	if (i915.semaphores) {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 308931172aa8..10d202567755 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -267,6 +267,10 @@ struct intel_engine_cs {
 	void		(*reset_hw)(struct intel_engine_cs *engine,
 				    struct drm_i915_gem_request *req);
 
+	int		(*context_pin)(struct intel_engine_cs *engine,
+				       struct i915_gem_context *ctx);
+	void		(*context_unpin)(struct intel_engine_cs *engine,
+					 struct i915_gem_context *ctx);
 	int		(*init_context)(struct drm_i915_gem_request *req);
 
 	int		(*emit_flush)(struct drm_i915_gem_request *request,
-- 
2.10.2