[Intel-gfx] [RFC 36/38] drm/i915/preempt: update (LRC) ringbuffer-filling code to create preemptive requests

Fri Dec 11 06:49:52 PST 2015

From: Dave Gordon <david.s.gordon at intel.com>

This patch refactors the ringbuffer-level code (in execlists/GuC mode
only) and enhances it so that it can emit the proper sequence of opcodes
for preemption requests.

A preemption request is similar to a batch submission, but doesn't
actually invoke a batchbuffer, the purpose being simply to get the
engine to stop what it's doing so that the scheduler can then send it a
new workload instead.

Preemption requests use different locations in the hardware status page
from those used by regular batches to hold their 'active' and 'done'
seqnos, so that information pertaining to a preempted batch is not
overwritten. Also, whereas a regular batch clears its 'active' flag when
it finishes (so that TDR knows it's no longer to blame), preemption
requests leave this set and the driver clears it once the completion of
the preemption request has been noticed. Only one preemption (per ring)
can be in progress at one time, so this handshake ensures correct
sequencing of the request between the GPU and CPU.

Actually-preemptive requests are still disabled via a module parameter
at this stage, but all the components should now be ready for us to turn
it on :)

v2: Updated to use locally cached request pointer and to fix the
location of the dispatch trace point.

For: VIZ-2021
Signed-off-by: Dave Gordon <david.s.gordon at intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 177 ++++++++++++++++++++++++++++++---------
 1 file changed, 136 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 36d63b7..31645a3 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -748,7 +748,7 @@ intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 	struct drm_i915_private *dev_priv = request->i915;
 	struct i915_guc_client *client = dev_priv->guc.execbuf_client;
 	const static bool fake = false;	/* true => only pretend to preempt */
-	bool preemptive = false;	/* for now */
+	bool preemptive;
 
 	intel_logical_ring_advance(request->ringbuf);
 
@@ -757,6 +757,7 @@ intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 	if (intel_ring_stopped(ring))
 		return;
 
+	preemptive = (request->scheduler_flags & i915_req_sf_preempt) != 0;
 	if (preemptive && dev_priv->guc.preempt_client && !fake)
 		client = dev_priv->guc.preempt_client;
 
@@ -951,6 +952,117 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
 }
 
 /*
+ * This function stores the specified constant value in the (index)th DWORD of the
+ * hardware status page (execlist mode only). See separate code for legacy mode.
+ */
+static void
+emit_store_dw_index(struct drm_i915_gem_request *req, uint32_t value, uint32_t index)
+{
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	uint64_t hwpa = req->ring->status_page.gfx_addr;
+	hwpa += index << MI_STORE_DWORD_INDEX_SHIFT;
+
+	intel_logical_ring_emit(ringbuf, MI_STORE_DWORD_IMM_GEN4 | MI_GLOBAL_GTT);
+	intel_logical_ring_emit(ringbuf, lower_32_bits(hwpa));
+	intel_logical_ring_emit(ringbuf, upper_32_bits(hwpa)); /* GEN8+ */
+	intel_logical_ring_emit(ringbuf, value);
+
+	req->ring->gpu_caches_dirty = true;
+}
+
+/*
+ * This function stores the specified register value in the (index)th DWORD
+ * of the hardware status page (execlist mode only). See separate code for
+ * legacy mode.
+ */
+static void
+emit_store_reg_index(struct drm_i915_gem_request *req, uint32_t reg, uint32_t index)
+{
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	uint64_t hwpa = req->ring->status_page.gfx_addr;
+	hwpa += index << MI_STORE_DWORD_INDEX_SHIFT;
+
+	intel_logical_ring_emit(ringbuf, (MI_STORE_REG_MEM+1) | MI_GLOBAL_GTT);
+	intel_logical_ring_emit(ringbuf, reg);
+	intel_logical_ring_emit(ringbuf, lower_32_bits(hwpa));
+	intel_logical_ring_emit(ringbuf, upper_32_bits(hwpa)); /* GEN8+ */
+
+	req->ring->gpu_caches_dirty = true;
+}
+
+/*
+ * Emit the commands to execute when preparing to start a batch
+ *
+ * The GPU will log the seqno of the batch before it starts
+ * running any of the commands to actually execute that batch
+ */
+static void
+emit_preamble(struct drm_i915_gem_request *req)
+{
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	uint32_t seqno = i915_gem_request_get_seqno(req);
+
+	BUG_ON(!seqno);
+	if (req->scheduler_flags & i915_req_sf_preempt)
+		emit_store_dw_index(req, seqno, I915_PREEMPTIVE_ACTIVE_SEQNO);
+	else
+		emit_store_dw_index(req, seqno, I915_BATCH_ACTIVE_SEQNO);
+
+	intel_logical_ring_emit(ringbuf, MI_REPORT_HEAD);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	req->ring->gpu_caches_dirty = true;
+}
+
+static void
+emit_relconsts_mode(struct i915_execbuffer_params *params)
+{
+	if (params->ctx->relative_constants_mode != params->instp_mode) {
+		struct intel_ringbuffer *ringbuf = params->request->ringbuf;
+
+		intel_logical_ring_emit(ringbuf, MI_NOOP);
+		intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+		intel_logical_ring_emit(ringbuf, INSTPM);
+		intel_logical_ring_emit(ringbuf, params->instp_mask << 16 | params->instp_mode);
+
+		params->ctx->relative_constants_mode = params->instp_mode;
+	}
+}
+
+/*
+ * Emit the commands that flag the end of execution of a batch.
+ *
+ * The GPU will:
+ * 1) log the seqno of the batch we've just completed.
+ * 2) in the case of a non-preemptive batch, clear the in-progress sequence
+ *    number; otherwise, issue a dummy register store to flush the above
+ *    write before the interrupt happens.
+ * 3) Issue a USER INTERRUPT to notify the driver that the sequence number
+ *    has been updated.
+ */
+static void
+emit_postamble(struct drm_i915_gem_request *req)
+{
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	uint32_t seqno = i915_gem_request_get_seqno(req);
+
+	BUG_ON(!seqno);
+
+	if (req->scheduler_flags & i915_req_sf_preempt) {
+		emit_store_dw_index(req, seqno, I915_PREEMPTIVE_DONE_SEQNO);
+		emit_store_reg_index(req, NOPID, I915_GEM_HWS_SCRATCH_INDEX);
+		logical_ring_invalidate_all_caches(req);
+	} else {
+		emit_store_dw_index(req, seqno, I915_BATCH_DONE_SEQNO);
+		emit_store_dw_index(req, 0, I915_BATCH_ACTIVE_SEQNO);
+		logical_ring_flush_all_caches(req);
+	}
+
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
+}
+
+/*
  * This is the main function for adding a batch to the ring.
  * It is called from the scheduler, with the struct_mutex already held.
  */
@@ -1028,6 +1140,11 @@ int intel_execlists_submission_final(struct i915_execbuffer_params *params)
 	req->head = intel_ring_get_tail(ringbuf);
 
 	/*
+	 * Log the seqno of the batch we're starting
+	 */
+	emit_preamble(req);
+
+	/*
 	 * Unconditionally invalidate gpu caches and ensure that we do flush
 	 * any residual writes from the previous batch.
 	 */
@@ -1035,25 +1152,19 @@ int intel_execlists_submission_final(struct i915_execbuffer_params *params)
 	if (ret)
 		goto err;
 
-	if (ring == &dev_priv->ring[RCS] &&
-	    params->instp_mode != params->ctx->relative_constants_mode) {
-		intel_logical_ring_emit(ringbuf, MI_NOOP);
-		intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
-		intel_logical_ring_emit(ringbuf, INSTPM);
-		intel_logical_ring_emit(ringbuf, params->instp_mask << 16 | params->instp_mode);
-		intel_logical_ring_advance(ringbuf);
-
-		params->ctx->relative_constants_mode = params->instp_mode;
-	}
+	if (!(req->scheduler_flags & i915_req_sf_preempt)) {
+		if (ring == &dev_priv->ring[RCS])
+			emit_relconsts_mode(params);
 
-	exec_start = params->batch_obj_vm_offset +
-		     params->args_batch_start_offset;
+		exec_start = params->batch_obj_vm_offset +
+			     params->args_batch_start_offset;
 
-	ret = ring->emit_bb_start(req, exec_start, params->dispatch_flags);
-	if (ret)
-		goto err;
+		ret = ring->emit_bb_start(req, exec_start, params->dispatch_flags);
+		if (ret)
+			goto err;
 
-	trace_i915_gem_ring_dispatch(req, params->dispatch_flags);
+		trace_i915_gem_ring_dispatch(req, params->dispatch_flags);
+	}
 
 	i915_gem_execbuffer_retire_commands(params);
 
@@ -1914,38 +2025,22 @@ static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno)
 static int gen8_emit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_ringbuffer *ringbuf = request->ringbuf;
-	struct intel_engine_cs *ring = ringbuf->ring;
-	u64 addr;
-	u32 cmd;
 	int ret;
 
+	emit_postamble(request);
+	intel_logical_ring_advance_and_submit(request);
+
 	/*
-	 * Reserve space for 2 NOOPs at the end of each request to be
-	 * used as a workaround for not being allowed to do lite
-	 * restore with HEAD==TAIL (WaIdleLiteRestore).
+	 * Add 4 NOOPs to the end of each request. These can
+	 * be used as a workaround for not being allowed to
+	 * do lite restore with HEAD==TAIL (WaIdleLiteRestore).
 	 */
-	ret = intel_logical_ring_begin(request, 8);
+	ret = intel_logical_ring_begin(request, 4);
 	if (ret)
 		return ret;
 
-	cmd = MI_STORE_DWORD_IMM_GEN4 | MI_GLOBAL_GTT;
-	intel_logical_ring_emit(ringbuf, cmd);
-
-	addr = I915_GEM_HWS_INDEX;
-	addr <<= MI_STORE_DWORD_INDEX_SHIFT;
-	addr += ring->status_page.gfx_addr;
-	intel_logical_ring_emit(ringbuf, lower_32_bits(addr));
-	intel_logical_ring_emit(ringbuf, upper_32_bits(addr));
-
-	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
-	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
 	intel_logical_ring_emit(ringbuf, MI_NOOP);
-	intel_logical_ring_advance_and_submit(request);
-
-	/*
-	 * Here we add two extra NOOPs as padding to avoid
-	 * lite restore of a context with HEAD==TAIL.
-	 */
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
 	intel_logical_ring_emit(ringbuf, MI_NOOP);
 	intel_logical_ring_emit(ringbuf, MI_NOOP);
 	intel_logical_ring_advance(ringbuf);
-- 
1.9.1