[FOR_CI_v3 06/15] drm/i915/tdr: Identify and retire hung request

Arun Siluvery arun.siluvery at linux.intel.com
Tue Jul 5 09:55:50 UTC 2016


The request that caused the hang is the one at the head of the execlist queue;
unless it is removed, other work cannot be processed by the GPU.

This can be handled in two ways.
(i) Save the current state of the engine, reset the GPU and resubmit the
request after adjusting the HEAD register so that it skips the batch. On
resubmission, the engine should be able to execute the bookkeeping
instructions. We also get a context completion event, after which we remove
the request from the elsp_queue and things proceed normally.

(ii) In this case we completely drop the request and remove it from the
elsp_queue ourselves. We need to advance the seqno because the batch didn't
execute.

This implementation uses the second option as it is much simpler and it
doesn't involve saving any of the engine state and resubmission.

A helper function is added to resync context, this is taken from a patch by
Dave Gordon <david.s.gordon at intel.com>.

Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |   1 +
 drivers/gpu/drm/i915/i915_gem.c  |   2 +-
 drivers/gpu/drm/i915/intel_lrc.c | 120 +++++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.h |   3 +
 4 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f5fb1db..e9da271 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3323,6 +3323,7 @@ int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno);
 struct drm_i915_gem_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
+void i915_gem_request_retire(struct drm_i915_gem_request *request);
 void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
 void i915_gem_retire_requests_ring(struct intel_engine_cs *engine);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8c4708b..ccbf889 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1618,7 +1618,7 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	request->pid = NULL;
 }
 
-static void i915_gem_request_retire(struct drm_i915_gem_request *request)
+void i915_gem_request_retire(struct drm_i915_gem_request *request)
 {
 	trace_i915_gem_request_retire(request);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2e66f31..0eb738a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1029,6 +1029,126 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx,
 	i915_gem_context_unreference(ctx);
 }
 
+/*
+ * Empty the ringbuffer associated with the specified request
+ * by updating the ringbuffer 'head' to the value of 'tail', or,
+ * if 'rezero' is true, setting both 'head' and 'tail' to zero.
+ * Then propagate the change to the associated context image.
+ */
+void intel_lr_context_resync(struct i915_gem_context *ctx,
+			     struct intel_engine_cs *engine,
+			     bool rezero)
+{
+	enum intel_engine_id engine_id = engine->id;
+	struct drm_i915_gem_object *ctx_obj;
+	struct intel_ringbuffer *ringbuf;
+	struct page *page;
+	uint32_t *reg_state;
+
+	ctx_obj = ctx->engine[engine_id].state;
+	ringbuf = ctx->engine[engine_id].ringbuf;
+
+	/*
+	 * When resetting, a hardware context might be as-yet-unused
+	 * and therefore not-yet-allocated. In other situations, the
+	 * ringbuffer and context object must already exist.
+	 */
+	if (WARN_ON(!ringbuf != !ctx_obj))
+		return;
+	if (!i915_reset_in_progress(&ctx->i915->gpu_error))
+		WARN_ON(!ringbuf || !ctx_obj);
+	if (!ringbuf || !ctx_obj)
+		return;
+
+	/* Make sure the backing pages are present before mapping them. */
+	if (WARN(i915_gem_object_get_pages(ctx_obj),
+		 "Failed get_pages for context obj\n"))
+		return;
+
+	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
+	reg_state = kmap_atomic(page);
+
+	if (rezero)
+		ringbuf->tail = 0;
+	ringbuf->head = ringbuf->tail;
+	ringbuf->last_retired_head = -1;
+	intel_ring_update_space(ringbuf);
+
+	/* Mirror the new head/tail into the saved context image. */
+	reg_state[CTX_RING_HEAD+1] = ringbuf->head;
+	reg_state[CTX_RING_TAIL+1] = ringbuf->tail;
+
+	kunmap_atomic(reg_state);
+}
+
+/**
+ * intel_execlists_reset_preprocess() - identifies the request that is
+ * hung and retires it
+ *
+ * @engine: Engine that is currently hung
+ *
+ * Returns:
+ *   0 - if engine is prepared for reset
+ *   nonzero errorcode otherwise
+ */
+int intel_execlists_reset_preprocess(struct intel_engine_cs *engine)
+{
+	struct drm_i915_gem_request *req;
+	int ret = 0;
+
+	/* Serialize against the execlists submission/completion path. */
+	spin_lock_bh(&engine->execlist_lock);
+
+	/* The hung request, if any, sits at the head of the execlist queue. */
+	req = list_first_entry_or_null(&engine->execlist_queue,
+				       struct drm_i915_gem_request,
+				       execlist_link);
+
+	/*
+	 * Only acknowledge the request in the execlist queue if it's actually
+	 * been submitted to hardware, otherwise there's the risk of
+	 * inconsistency between the (unsubmitted) request and the idle
+	 * hardware state.
+	 */
+	if (req && req->ctx && req->elsp_submitted) {
+		u32 execlist_status;
+		u32 hw_context;
+		u32 hw_active;
+		struct drm_i915_private *dev_priv = engine->i915;
+
+		/* A request that already completed cannot be the hang cause. */
+		if (i915_gem_request_completed(req)) {
+			DRM_ERROR("Request completed, no active request\n");
+			spin_unlock_bh(&engine->execlist_lock);
+			return -EINVAL;
+		}
+
+		/* Cross-check our view against the HW execlist status regs. */
+		hw_context = I915_READ(RING_EXECLIST_STATUS_CTX_ID(engine));
+		execlist_status = I915_READ(RING_EXECLIST_STATUS_LO(engine));
+		hw_active = ((execlist_status & EXECLIST_STATUS_ELEMENT0_ACTIVE) ||
+			     (execlist_status & EXECLIST_STATUS_ELEMENT1_ACTIVE));
+
+		/* If both HW and driver agrees then we found it */
+		if (hw_active && hw_context == req->ctx->hw_id) {
+			/* advance the seqno as this request gets dropped */
+			intel_write_status_page(engine, I915_GEM_HWS_INDEX, req->seqno);
+
+			/* Drop the ring contents so the batch is never re-run. */
+			intel_lr_context_resync(req->ctx, engine, false);
+
+			/* remove it from the elsp queue */
+			req->elsp_submitted--;
+			list_del(&req->execlist_link);
+		}
+	} else {
+		ret = -EINVAL;
+		WARN(1, "No active request for %s\n", engine->name);
+	}
+
+	/* Retire may sleep/take other locks, so drop the execlist lock first. */
+	spin_unlock_bh(&engine->execlist_lock);
+
+	/*
+	 * NOTE(review): req is retired here even when the HW/driver
+	 * cross-check above did NOT match, i.e. the request was left on the
+	 * execlist queue — confirm that retiring a still-queued request is
+	 * intended in that case.
+	 */
+	if (req)
+		i915_gem_request_retire(req);
+
+	return ret;
+}
+
 static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 {
 	int ret, i;
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 2b8255c..816e476 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -31,7 +31,10 @@
 /* Execlists regs */
 #define RING_ELSP(ring)				_MMIO((ring)->mmio_base + 0x230)
 #define RING_EXECLIST_STATUS_LO(ring)		_MMIO((ring)->mmio_base + 0x234)
+#define   EXECLIST_STATUS_ELEMENT0_ACTIVE       (1 << 14)
+#define   EXECLIST_STATUS_ELEMENT1_ACTIVE       (1 << 15)
 #define RING_EXECLIST_STATUS_HI(ring)		_MMIO((ring)->mmio_base + 0x234 + 4)
+#define RING_EXECLIST_STATUS_CTX_ID(ring)	RING_EXECLIST_STATUS_HI(ring)
 #define RING_CONTEXT_CONTROL(ring)		_MMIO((ring)->mmio_base + 0x244)
 #define	  CTX_CTRL_INHIBIT_SYN_CTX_SWITCH	(1 << 3)
 #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	(1 << 0)
-- 
1.9.1



More information about the Intel-gfx-trybot mailing list