[Intel-gfx] [RFC 08/11] drm/i915: Watchdog timeout support for gen8.

Mon Jun 8 10:03:26 PDT 2015

Watchdog timeout (or "media engine reset" as it is sometimes called, even
though the render engine is also supported) is a feature that allows userland
applications to enable hang detection on individual batch buffers. The
detection mechanism itself is mostly bound to the hardware and the only thing
that the driver needs to do to support this form of hang detection is to
implement the interrupt handling support as well as watchdog instruction
injection before and after the emitted batch buffer start instruction in the
ring buffer.

The principle of this hang detection mechanism is as follows:

1. Once the decision has been made to enable watchdog timeout for a particular
batch buffer, and the driver is in the process of emitting the batch buffer
start instruction into the ring buffer, it also emits a watchdog timer start
instruction before, and a watchdog timer cancellation instruction after, the
batch buffer start instruction in the ring buffer.

2. Once the GPU execution reaches the watchdog timer start instruction the
hardware watchdog counter is started by the hardware.  The counter keeps
counting until it reaches a previously configured threshold value.

2a. If the counter reaches the threshold value, the hardware fires a watchdog
interrupt that is picked up by the watchdog interrupt service routine in this
commit. This means that a hang has been detected and the driver needs to deal
with it the same way it would deal with an engine hang detected by the periodic
hang checker. The only difference between the two is that a watchdog timeout is
never promoted to a full GPU reset, even if a per-engine reset was attempted
too recently. Thus, the watchdog interrupt handler calls the error handler
directly, passing the engine mask of the hung engine in question, which
immediately results in a per-engine hang recovery being scheduled.

2b. If the batch buffer finishes executing and the execution reaches the
watchdog cancellation instruction before the watchdog counter reaches its
threshold value, the watchdog is cancelled and nothing more comes of it. No hang
was detected.

Currently, watchdog timeout is supported for the render engine and all
available media engines. The specifications allude to the VECS engine being
supported as well, but that is not supported by this commit.

The current default watchdog threshold value is 60 ms, since this has been
empirically determined to be a good compromise between low-latency requirements
and a low rate of false positives.

NOTE: I don't know if Ben Widawsky had any part in this code from 3 years
ago. There have been so many people involved in this already that I am in no
position to know. If I've missed anyone's Signed-off-by line, please let me
know.

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at intel.com>
Signed-off-by: Ian Lister <ian.lister at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |    2 +-
 drivers/gpu/drm/i915/i915_dma.c         |   59 ++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h         |    7 ++-
 drivers/gpu/drm/i915/i915_irq.c         |   86 +++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_reg.h         |    7 +++
 drivers/gpu/drm/i915/intel_lrc.c        |   99 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h |   31 ++++++++++
 include/uapi/drm/i915_drm.h             |    5 +-
 8 files changed, 272 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index e33e105..a89da48 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4183,7 +4183,7 @@ i915_wedged_set(void *data, u64 val)
 
 	intel_runtime_pm_get(dev_priv);
 
-	i915_handle_error(dev, 0x0, val,
+	i915_handle_error(dev, 0x0, false, val,
 			  "Manually setting wedged to %llu", val);
 
 	intel_runtime_pm_put(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index b98abf8..2ec3163 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -791,6 +791,64 @@ i915_hangcheck_init(struct drm_device *dev)
 	}
 }
 
+void i915_watchdog_init(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int freq;
+	int i;
+
+	/*
+	 * From the pre-defined timeout (60 ms for both RCS and BSD engines)
+	 * calculate the timer count thresholds from the timestamp frequency.
+	 *
+	 * For RCS.
+	 * The timestamp resolution changed in Gen7 and beyond to 80ns
+	 * for all pipes. Before that it was 640ns.
+	 */
+
+#define KM_RCS_ENGINE_TIMEOUT_VALUE_IN_MS 60
+#define KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS 60
+#define KM_TIMER_MILLISECOND 1000
+
+	/*
+	 * Timestamp timer resolution = 0.080 uSec,
+	 * or 12500000 counts per second
+	 */
+#define KM_TIMESTAMP_CNTS_PER_SEC_80NS 12500000
+
+	/*
+	 * Timestamp timer resolution = 0.640 uSec,
+	 * or 1562500 counts per second
+	 */
+#define KM_TIMESTAMP_CNTS_PER_SEC_640NS 1562500
+
+	if (INTEL_INFO(dev)->gen >= 7)
+		freq = KM_TIMESTAMP_CNTS_PER_SEC_80NS;
+	else
+		freq = KM_TIMESTAMP_CNTS_PER_SEC_640NS;
+
+	dev_priv->ring[RCS].watchdog_threshold =
+		((KM_RCS_ENGINE_TIMEOUT_VALUE_IN_MS) *
+		(freq / KM_TIMER_MILLISECOND));
+
+	dev_priv->ring[VCS].watchdog_threshold =
+		((KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS) *
+		(freq / KM_TIMER_MILLISECOND));
+
+	dev_priv->ring[VCS2].watchdog_threshold =
+		((KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS) *
+		(freq / KM_TIMER_MILLISECOND));
+
+	for (i = 0; i < I915_NUM_RINGS; i++)
+		dev_priv->ring[i].hangcheck.watchdog_count = 0;
+
+	DRM_INFO("Watchdog Timeout [ms], " \
+			"RCS: 0x%08X, VCS: 0x%08X, VCS2: 0x%08X\n", \
+			KM_RCS_ENGINE_TIMEOUT_VALUE_IN_MS,
+			KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS,
+			KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS);
+}
+
 /**
  * i915_driver_load - setup chip and create an initial config
  * @dev: DRM device
@@ -972,6 +1030,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	i915_gem_load(dev);
 
 	i915_hangcheck_init(dev);
+	i915_watchdog_init(dev);
 
 	/* On the 945G/GM, the chipset reports the MSI capability on the
 	 * integrated graphics even though the support isn't actually there
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index efa43c3..5139daa 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2563,6 +2563,7 @@ extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
 extern void i915_update_gfx_val(struct drm_i915_private *dev_priv);
 int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool on);
 void intel_hpd_cancel_work(struct drm_i915_private *dev_priv);
+void i915_watchdog_init(struct drm_device *dev);
 static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
 {
 	struct intel_ring_hangcheck *hc = &engine->hangcheck;
@@ -2578,9 +2579,9 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
 
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
-__printf(4, 5)
-void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
-		       const char *fmt, ...);
+__printf(5, 6)
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
+		       bool watchdog, bool wedged, const char *fmt, ...);
 
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
 extern void intel_hpd_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 9913c8f..57c8568 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1289,6 +1289,18 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 				intel_lrc_irq_handler(&dev_priv->ring[RCS]);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[RCS]);
+			if (tmp & (GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT)) {
+				struct intel_engine_cs *ring;
+
+				/* Stop the counter to prevent further interrupts */
+				ring = &dev_priv->ring[RCS];
+				I915_WRITE(RING_CNTR(ring->mmio_base),
+					GEN6_RCS_WATCHDOG_DISABLE);
+
+				ring->hangcheck.watchdog_count++;
+				i915_handle_error(ring->dev, intel_ring_flag(ring), true, true,
+					"Render engine watchdog timed out");
+			}
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
 				intel_lrc_irq_handler(&dev_priv->ring[BCS]);
@@ -1308,11 +1320,35 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 				intel_lrc_irq_handler(&dev_priv->ring[VCS]);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[VCS]);
+			if (tmp & (GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT)) {
+				struct intel_engine_cs *ring;
+
+				/* Stop the counter to prevent further interrupts */
+				ring = &dev_priv->ring[VCS];
+				I915_WRITE(RING_CNTR(ring->mmio_base),
+					GEN8_VCS_WATCHDOG_DISABLE);
+
+				ring->hangcheck.watchdog_count++;
+				i915_handle_error(ring->dev, intel_ring_flag(ring), true, true,
+						  "Media engine watchdog timed out");
+			}
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
 				intel_lrc_irq_handler(&dev_priv->ring[VCS2]);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[VCS2]);
+			if (tmp & (GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT)) {
+				struct intel_engine_cs *ring;
+
+				/* Stop the counter to prevent further interrupts */
+				ring = &dev_priv->ring[VCS2];
+				I915_WRITE(RING_CNTR(ring->mmio_base),
+					GEN8_VCS_WATCHDOG_DISABLE);
+
+				ring->hangcheck.watchdog_count++;
+				i915_handle_error(ring->dev, intel_ring_flag(ring), true, true,
+						  "Media engine 2 watchdog timed out");
+			}
 		} else
 			DRM_ERROR("The master control interrupt lied (GT1)!\n");
 	}
@@ -2563,6 +2599,7 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
  *			or if one of the current engine resets fails we fall
  *			back to legacy full GPU reset.
  *
+ * @watchdog: 		true = Engine hang detected by hardware watchdog.
  * @wedged: 		true = Hang detected, invoke hang recovery.
  * @fmt, ...: 		Error message describing reason for error.
  *
@@ -2574,8 +2611,8 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
  * reset the associated engine. Failing that, try to fall back to legacy
  * full GPU reset recovery mode.
  */
-void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
-		       const char *fmt, ...)
+void i915_handle_error(struct drm_device *dev, u32 engine_mask,
+                       bool watchdog, bool wedged, const char *fmt, ...)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	va_list args;
@@ -2607,20 +2644,27 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
 			u32 i;
 
 			for_each_ring(engine, dev_priv, i) {
-				u32 now, last_engine_reset_timediff;
 
 				if (!(intel_ring_flag(engine) & engine_mask))
 					continue;
 
-				/* Measure the time since this engine was last reset */
-				now = get_seconds();
-				last_engine_reset_timediff =
-					now - engine->hangcheck.last_engine_reset_time;
-
-				full_reset = last_engine_reset_timediff <
-					i915.gpu_reset_promotion_time;
-
-				engine->hangcheck.last_engine_reset_time = now;
+				if (!watchdog) {
+					/* Measure the time since this engine was last reset */
+					u32 now = get_seconds();
+					u32 last_engine_reset_timediff =
+						now - engine->hangcheck.last_engine_reset_time;
+
+					full_reset = last_engine_reset_timediff <
+						i915.gpu_reset_promotion_time;
+
+					engine->hangcheck.last_engine_reset_time = now;
+				} else {
+					/*
+					 * Watchdog timeout always results
+					 * in engine reset.
+					 */
+					full_reset = false;
+				}
 
 				/*
 				 * This engine was not reset too recently - go ahead
@@ -2631,10 +2675,11 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
 				 * This can still be overridden by a global
 				 * reset e.g. if per-engine reset fails.
 				 */
-				if (!full_reset)
+				if (watchdog || !full_reset)
 					atomic_set_mask(I915_ENGINE_RESET_IN_PROGRESS,
 						&engine->hangcheck.flags);
-				else
+
+				if (full_reset)
 					break;
 
 			} /* for_each_ring */
@@ -2642,7 +2687,7 @@ void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged,
 
 		if (full_reset) {
 			atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
-					&dev_priv->gpu_error.reset_counter);
+				&dev_priv->gpu_error.reset_counter);
 		}
 
 		/*
@@ -2980,7 +3025,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 	 */
 	tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
-		i915_handle_error(dev, intel_ring_flag(ring), false,
+		i915_handle_error(dev, intel_ring_flag(ring), false, false,
 				  "Kicking stuck wait on %s",
 				  ring->name);
 		I915_WRITE_CTL(ring, tmp);
@@ -2992,7 +3037,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 		default:
 			return HANGCHECK_HUNG;
 		case 1:
-			i915_handle_error(dev, intel_ring_flag(ring), false,
+			i915_handle_error(dev, intel_ring_flag(ring), false, false,
 					  "Kicking stuck semaphore on %s",
 					  ring->name);
 			I915_WRITE_CTL(ring, tmp);
@@ -3134,9 +3179,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	}
 
 	if (engine_mask)
-		i915_handle_error(dev, engine_mask, true, "Ring hung (0x%02x)", engine_mask);
+		i915_handle_error(dev, engine_mask, false, true, "Ring hung (0x%02x)", engine_mask);
 	else if (force_full_gpu_reset)
-		i915_handle_error(dev, 0x0, true,
+		i915_handle_error(dev, 0x0, false, true,
 			"Hang recovery ineffective, falling back to full GPU reset");
 
 	if (busy_count)
@@ -3591,11 +3636,14 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
 {
 	/* These are interrupts we'll toggle with the ring mask register */
 	uint32_t gt_interrupts[] = {
+		GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 		GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 			GT_RENDER_L3_PARITY_ERROR_INTERRUPT |
 			GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT,
+		GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
+		GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
 		GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
 			GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index af9f0ad..d2adb9b 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -1181,6 +1181,8 @@ enum skl_disp_power_wells {
 #define RING_HEAD(base)		((base)+0x34)
 #define RING_START(base)	((base)+0x38)
 #define RING_CTL(base)		((base)+0x3c)
+#define RING_CNTR(base)        ((base)+0x178)
+#define RING_THRESH(base) ((base)+0x17C)
 #define RING_SYNC_0(base)	((base)+0x40)
 #define RING_SYNC_1(base)	((base)+0x44)
 #define RING_SYNC_2(base)	((base)+0x48)
@@ -1584,6 +1586,11 @@ enum skl_disp_power_wells {
 #define GT_BSD_USER_INTERRUPT			(1 << 12)
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1	(1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
 #define GT_CONTEXT_SWITCH_INTERRUPT		(1 <<  8)
+#define GT_GEN6_RENDER_WATCHDOG_INTERRUPT	(1 <<  6)
+#define GT_GEN8_RCS_WATCHDOG_INTERRUPT		(1 <<  6)
+#define   GEN6_RCS_WATCHDOG_DISABLE		1
+#define GT_GEN8_VCS_WATCHDOG_INTERRUPT		(1 <<  6)
+#define   GEN8_VCS_WATCHDOG_DISABLE		0xFFFFFFFF
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 <<  5) /* !snb */
 #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 <<  4)
 #define GT_RENDER_CS_MASTER_ERROR_INTERRUPT	(1 <<  3)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e9940cc..051da09 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1122,6 +1122,78 @@ static int intel_logical_ring_begin(struct intel_ringbuffer *ringbuf,
 	return 0;
 }
 
+static int
+gen8_ring_start_watchdog(struct intel_ringbuffer *ringbuf, struct intel_context *ctx)
+{
+	int ret;
+	struct intel_engine_cs *ring = ringbuf->ring;
+
+	ret = intel_logical_ring_begin(ringbuf, ctx, 10);
+	if (ret)
+		return ret;
+
+	/*
+	 * i915_reg.h includes a warning to place a MI_NOOP
+	 * before a MI_LOAD_REGISTER_IMM
+	 */
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	/* Set counter period */
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+	intel_logical_ring_emit(ringbuf, RING_THRESH(ring->mmio_base));
+	intel_logical_ring_emit(ringbuf, ring->watchdog_threshold);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	/* Start counter */
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+	intel_logical_ring_emit(ringbuf, RING_CNTR(ring->mmio_base));
+	intel_logical_ring_emit(ringbuf, I915_WATCHDOG_ENABLE);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_advance(ringbuf);
+
+	return 0;
+}
+
+static int
+gen8_ring_stop_watchdog(struct intel_ringbuffer *ringbuf, struct intel_context *ctx)
+{
+	int ret;
+	struct intel_engine_cs *ring = ringbuf->ring;
+
+	ret = intel_logical_ring_begin(ringbuf, ctx, 6);
+	if (ret)
+		return ret;
+
+	/*
+	 * i915_reg.h includes a warning to place a MI_NOOP
+	 * before a MI_LOAD_REGISTER_IMM
+	 */
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+	intel_logical_ring_emit(ringbuf, RING_CNTR(ring->mmio_base));
+
+	switch (ring->id) {
+	default:
+		WARN(1, "%s does not support watchdog timeout! " \
+			"Defaulting to render engine.\n", ring->name);
+	case RCS:
+		intel_logical_ring_emit(ringbuf, GEN6_RCS_WATCHDOG_DISABLE);
+		break;
+	case VCS:
+	case VCS2:
+		intel_logical_ring_emit(ringbuf, GEN8_VCS_WATCHDOG_DISABLE);
+		break;
+	}
+
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_advance(ringbuf);
+
+	return 0;
+}
+
 /**
  * execlists_submission() - submit a batchbuffer for execution, Execlists style
  * @dev: DRM device.
@@ -1152,6 +1224,7 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 	int instp_mode;
 	u32 instp_mask;
 	int ret;
+	bool watchdog_running = false;
 
 	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
 	instp_mask = I915_EXEC_CONSTANTS_MASK;
@@ -1203,6 +1276,18 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 	if (ret)
 		return ret;
 
+	/* Start watchdog timer */
+	if (args->flags & I915_EXEC_ENABLE_WATCHDOG) {
+		if (!intel_ring_supports_watchdog(ring))
+			return -EINVAL;
+
+		ret = gen8_ring_start_watchdog(ringbuf, ctx);
+		if (ret)
+			return ret;
+
+		watchdog_running = true;
+	}
+
 	if (ring == &dev_priv->ring[RCS] &&
 	    instp_mode != dev_priv->relative_constants_mode) {
 		ret = intel_logical_ring_begin(ringbuf, ctx, 4);
@@ -1224,6 +1309,13 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 
 	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags);
 
+	/* Cancel watchdog timer */
+	if (watchdog_running) {
+		ret = gen8_ring_stop_watchdog(ringbuf, ctx);
+		if (ret)
+			return ret;
+	}
+
 	i915_gem_execbuffer_move_to_active(vmas, ring);
 	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
 
@@ -1892,6 +1984,9 @@ static int logical_render_ring_init(struct drm_device *dev)
 	if (HAS_L3_DPF(dev))
 		ring->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
 
+	ring->irq_keep_mask |=
+		(GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT);
+
 	if (INTEL_INFO(dev)->gen >= 9)
 		ring->init_hw = gen9_init_render_ring;
 	else
@@ -1930,6 +2025,8 @@ static int logical_bsd_ring_init(struct drm_device *dev)
 		GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
 	ring->irq_keep_mask =
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
+	ring->irq_keep_mask |=
+		(GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT);
 
 	ring->init_hw = gen8_init_common_ring;
 	ring->get_seqno = gen8_get_seqno;
@@ -1959,6 +2056,8 @@ static int logical_bsd2_ring_init(struct drm_device *dev)
 		GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
 	ring->irq_keep_mask =
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
+	ring->irq_keep_mask |=
+		(GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT);
 
 	ring->init_hw = gen8_init_common_ring;
 	ring->get_seqno = gen8_get_seqno;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 35360a4..9058789 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -30,6 +30,8 @@ struct  intel_hw_status_page {
 	struct		drm_i915_gem_object *obj;
 };
 
+#define I915_WATCHDOG_ENABLE 0
+
 #define I915_READ_TAIL(ring) I915_READ(RING_TAIL((ring)->mmio_base))
 #define I915_WRITE_TAIL(ring, val) I915_WRITE(RING_TAIL((ring)->mmio_base), val)
 
@@ -136,6 +138,9 @@ struct intel_ring_hangcheck {
 
 	/* Number of TDR hang detections */
 	u32 tdr_count;
+
+	/* Number of watchdog hang detections for this ring */
+	u32 watchdog_count;
 };
 
 struct intel_ringbuffer {
@@ -338,6 +343,12 @@ struct  intel_engine_cs {
 	/* Saved head value to be restored after reset */
 	u32 saved_head;
 
+	/*
+	 * Watchdog timer threshold values
+	 * only RCS, VCS, VCS2 rings have watchdog timeout support
+	 */
+	uint32_t watchdog_threshold;
+
 	struct {
 		struct drm_i915_gem_object *obj;
 		u32 gtt_offset;
@@ -484,6 +495,26 @@ int intel_ring_save(struct intel_engine_cs *ring,
 int intel_ring_restore(struct intel_engine_cs *ring,
 		struct drm_i915_gem_request *req);
 
+static inline bool intel_ring_supports_watchdog(struct intel_engine_cs *ring)
+{
+	bool ret = false;
+
+	if (WARN_ON(!ring))
+		goto exit;
+
+	ret = (	ring->id == RCS ||
+		ring->id == VCS ||
+		ring->id == VCS2);
+
+	if (!ret)
+		DRM_ERROR("%s does not support watchdog timeout!\n", ring->name);
+
+exit:
+	return ret;
+}
+int intel_ring_start_watchdog(struct intel_engine_cs *ring);
+int intel_ring_stop_watchdog(struct intel_engine_cs *ring);
+
 int __must_check intel_ring_idle(struct intel_engine_cs *ring);
 void intel_ring_init_seqno(struct intel_engine_cs *ring, u32 seqno);
 int intel_ring_flush_all_caches(struct intel_engine_cs *ring);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4851d66..f8af7d2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -760,7 +760,10 @@ struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_BSD_RING1		(1<<13)
 #define I915_EXEC_BSD_RING2		(2<<13)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(1<<15)
+/* Enable watchdog timer for this batch buffer */
+#define I915_EXEC_ENABLE_WATCHDOG       (1<<15)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(1<<16)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
1.7.9.5