[Intel-gfx] [PATCH 09/20] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8

Tomas Elf tomas.elf at intel.com
Thu Oct 22 18:32:31 PDT 2015


*** General ***

Watchdog timeout (or "media engine reset") is a feature that allows userland
applications to enable hang detection on individual batch buffers. The
detection mechanism itself is mostly handled by the hardware; the only things
the driver needs to do to support this form of hang detection are to implement
the interrupt handling and to emit the watchdog commands before and after the
batch buffer start instruction in the ring buffer.

The principle of the hang detection mechanism is as follows:

1. Once the decision has been made to enable watchdog timeout for a particular
batch buffer, the driver emits a watchdog timer start instruction immediately
before, and a watchdog timer cancellation instruction immediately after, the
batch buffer start instruction in the ring buffer (the resulting ring layout
is sketched after this list).

2. Once GPU execution reaches the watchdog timer start instruction the
hardware starts the watchdog counter. The counter keeps counting until it
either reaches a previously configured threshold value or the timer
cancellation instruction is executed.

2a. If the counter reaches the threshold value the hardware fires a watchdog
interrupt that is picked up by the watchdog interrupt handler. This means that
a hang has been detected and the driver needs to deal with it the same way it
would deal with an engine hang detected by the periodic hang checker. The only
difference between the two is that we never promote to a full GPU reset
following a watchdog timeout, even if a per-engine reset was attempted too
recently. Thus, the watchdog interrupt handler calls the error handler
directly, passing the engine mask of the hung engine in question, which
immediately results in a per-engine hang recovery being scheduled (a rough
sketch of this flow follows the list).

2b. If the batch buffer completes and execution reaches the watchdog
cancellation instruction before the watchdog counter reaches its threshold
value, the watchdog is cancelled and nothing more comes of it. No hang is
detected.
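
For reference, with watchdog timeout enabled the commands emitted around the
batch by this patch end up in the ring roughly as follows (register and value
names as used in the diff below, base == ring->mmio_base; the surrounding
MI_NOOPs are omitted for brevity). The disable value is
GEN6_RCS_WATCHDOG_DISABLE for RCS and GEN8_VCS_WATCHDOG_DISABLE for VCS/VCS2:

	MI_LOAD_REGISTER_IMM(1), RING_THRESH(base), watchdog_threshold     /* set counter period */
	MI_LOAD_REGISTER_IMM(1), RING_CNTR(base),   I915_WATCHDOG_ENABLE   /* start counter */
	MI_BATCH_BUFFER_START,   <batch buffer>
	MI_LOAD_REGISTER_IMM(1), RING_CNTR(base),   <engine disable value> /* cancel counter */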
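
The watchdog interrupt handling itself is added by a separate patch in this
series; the following is only a rough, hedged sketch of the 2a flow. The
handler name and the engine mask parameter of the error handler are
assumptions here, not something introduced by this patch:

	/*
	 * Hypothetical sketch: the real handler lives in the interrupt
	 * handling patch of this series, and the error handler signature
	 * taking an engine mask is an assumption.
	 */
	static void gen8_watchdog_irq_handler(struct intel_engine_cs *ring)
	{
		/*
		 * Pass only the hung engine's mask so that per-engine hang
		 * recovery is scheduled immediately; a watchdog timeout is
		 * never promoted to a full GPU reset.
		 */
		i915_handle_error(ring->dev, intel_ring_flag(ring),
				  "Watchdog timeout detected on %s", ring->name);
	}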

*** This patch introduces: ***

1. Command emission into the ring buffer for starting and stopping the watchdog
timer before/after batch buffer start during batch buffer submission.

2. A feature support query function for verifying that the requested engine
actually supports watchdog timeout, failing the batch buffer submission
otherwise.

NOTE: I don't know if Ben Widawsky had any part in this code from 3 years ago.
There have been so many people involved in this already that I am in no
position to know. If I've missed anyone's sob line please let me know.

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at intel.com>
Signed-off-by: Ian Lister <ian.lister at intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c        | 99 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h | 22 ++++++++
 2 files changed, 121 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 40607da..30cd0ac 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1056,6 +1056,80 @@ int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
 	return intel_logical_ring_begin(request, 0);
 }
 
+static int
+gen8_ring_start_watchdog(struct drm_i915_gem_request *req)
+{
+	int ret;
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	struct intel_engine_cs *ring = ringbuf->ring;
+
+	ret = intel_logical_ring_begin(req, 10);
+	if (ret)
+		return ret;
+
+	/*
+	 * i915_reg.h includes a warning to place a MI_NOOP
+	 * before a MI_LOAD_REGISTER_IMM
+	 */
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	/* Set counter period */
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+	intel_logical_ring_emit(ringbuf, RING_THRESH(ring->mmio_base));
+	intel_logical_ring_emit(ringbuf, ring->watchdog_threshold);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	/* Start counter */
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+	intel_logical_ring_emit(ringbuf, RING_CNTR(ring->mmio_base));
+	intel_logical_ring_emit(ringbuf, I915_WATCHDOG_ENABLE);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_advance(ringbuf);
+
+	return 0;
+}
+
+static int
+gen8_ring_stop_watchdog(struct drm_i915_gem_request *req)
+{
+	int ret;
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	struct intel_engine_cs *ring = ringbuf->ring;
+
+	ret = intel_logical_ring_begin(req, 6);
+	if (ret)
+		return ret;
+
+	/*
+	 * i915_reg.h includes a warning to place a MI_NOOP
+	 * before a MI_LOAD_REGISTER_IMM
+	 */
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+	intel_logical_ring_emit(ringbuf, RING_CNTR(ring->mmio_base));
+
+	switch (ring->id) {
+	default:
+		WARN(1, "%s does not support watchdog timeout! " \
+			"Defaulting to render engine.\n", ring->name);
+	case RCS:
+		intel_logical_ring_emit(ringbuf, GEN6_RCS_WATCHDOG_DISABLE);
+		break;
+	case VCS:
+	case VCS2:
+		intel_logical_ring_emit(ringbuf, GEN8_VCS_WATCHDOG_DISABLE);
+		break;
+	}
+
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_advance(ringbuf);
+
+	return 0;
+}
+
 /**
  * execlists_submission() - submit a batchbuffer for execution, Execlists style
  * @dev: DRM device.
@@ -1085,6 +1159,12 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
 	int instp_mode;
 	u32 instp_mask;
 	int ret;
+	bool watchdog_running = false;
+	/*
+	 * NB: Place-holder until watchdog timeout is enabled through DRM
+	 * execbuf interface
+	 */
+	bool enable_watchdog = false;
 
 	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
 	instp_mask = I915_EXEC_CONSTANTS_MASK;
@@ -1121,6 +1201,18 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
 	if (ret)
 		return ret;
 
+	/* Start watchdog timer */
+	if (enable_watchdog) {
+		if (!intel_ring_supports_watchdog(ring))
+			return -EINVAL;
+
+		ret = gen8_ring_start_watchdog(params->request);
+		if (ret)
+			return ret;
+
+		watchdog_running = true;
+	}
+
 	if (ring == &dev_priv->ring[RCS] &&
 	    instp_mode != dev_priv->relative_constants_mode) {
 		ret = intel_logical_ring_begin(params->request, 4);
@@ -1145,6 +1237,13 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
 
 	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
 
+	/* Cancel watchdog timer */
+	if (watchdog_running) {
+		ret = gen8_ring_stop_watchdog(params->request);
+		if (ret)
+			return ret;
+	}
+
 	i915_gem_execbuffer_move_to_active(vmas, params->request);
 	i915_gem_execbuffer_retire_commands(params);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 4968c93..011ab52 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -31,6 +31,8 @@ struct  intel_hw_status_page {
 	struct		drm_i915_gem_object *obj;
 };
 
+#define I915_WATCHDOG_ENABLE 0
+
 #define I915_READ_TAIL(ring) I915_READ(RING_TAIL((ring)->mmio_base))
 #define I915_WRITE_TAIL(ring, val) I915_WRITE(RING_TAIL((ring)->mmio_base), val)
 
@@ -524,6 +526,26 @@ int intel_ring_save(struct intel_engine_cs *ring,
 int intel_ring_restore(struct intel_engine_cs *ring,
 		struct drm_i915_gem_request *req);
 
+static inline bool intel_ring_supports_watchdog(struct intel_engine_cs *ring)
+{
+	bool ret = false;
+
+	if (WARN_ON(!ring))
+		goto exit;
+
+	ret = (	ring->id == RCS ||
+		ring->id == VCS ||
+		ring->id == VCS2);
+
+	if (!ret)
+		DRM_ERROR("%s does not support watchdog timeout!\n", ring->name);
+
+exit:
+	return ret;
+}
+int intel_ring_start_watchdog(struct intel_engine_cs *ring);
+int intel_ring_stop_watchdog(struct intel_engine_cs *ring);
+
 int __must_check intel_ring_idle(struct intel_engine_cs *ring);
 void intel_ring_init_seqno(struct intel_engine_cs *ring, u32 seqno);
 int intel_ring_flush_all_caches(struct drm_i915_gem_request *req);
-- 
1.9.1


