[Intel-gfx] [PATCH 08/20] drm/i915: Watchdog timeout: IRQ handler for gen8

Thu Oct 22 18:32:30 PDT 2015

*** General ***

Watchdog timeout (or "media engine reset") is a feature that allows userland
applications to enable hang detection on individual batch buffers. The
detection mechanism itself is mostly bound to the hardware and the only thing
that the driver needs to do to support this form of hang detection is to
implement the interrupt handling support as well as watchdog command emission
before and after the emitted batch buffer start instruction in the ring buffer.

The principle of the hang detection mechanism is as follows:

1. Once the decision has been made to enable watchdog timeout for a particular
batch buffer and the driver is in the process of emitting the batch buffer
start instruction into the ring buffer it also emits a watchdog timer start
instruction before and a watchdog timer cancellation instruction after the
batch buffer start instruction in the ring buffer.

2. Once the GPU execution reaches the watchdog timer start instruction the
hardware watchdog counter is started by the hardware. The counter keeps
counting until either reaching a previously configured threshold value or the
timer cancellation instruction is executed.

2a. If the counter reaches the threshold value the hardware fires a watchdog
interrupt that is picked up by the watchdog interrupt handler. This means that
a hang has been detected and the driver needs to deal with it the same way it
would deal with an engine hang detected by the periodic hang checker. The only
difference between the two is that we never promote to a full GPU reset
following a watchdog timeout, even if a per-engine reset was attempted too
recently. Thus, the watchdog interrupt handler calls the error handler directly,
passing the engine mask of the hung engine in question, which immediately
results in a per-engine hang recovery being scheduled.

2b. If the batch buffer completes and the execution reaches the watchdog
cancellation instruction before the watchdog counter reaches its threshold
value the watchdog is cancelled and nothing more comes of it. No hang is
detected.

*** This patch introduces: ***

1. IRQ handler code for watchdog timeout allowing direct hang recovery based on
hardware-driven hang detection, which then integrates directly with the
per-engine hang recovery path.

2. Watchdog timeout init code patch for setup of watchdog timeout threshold
values and gen-specific register information.

The current default watchdog threshold value is 60 ms, since this has been
empirically determined to be a good compromise between low-latency requirements
and a low rate of false positives.

Currently the render engine and all available media engines support watchdog
timeout. The specifications allude to the VECS engine being supported as well,
but this commit does not currently support it.

NOTE: I don't know if Ben Widawsky had any part in this code from 3 years
ago. There have been so many people involved in this already that I am in no
position to know. If I've missed anyone's sob line please let me know.

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at intel.com>
Signed-off-by: Ian Lister <ian.lister at intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c         | 59 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_irq.c         | 39 ++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_reg.h         |  7 ++++
 drivers/gpu/drm/i915/intel_lrc.c        |  7 ++++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  9 +++++
 6 files changed, 122 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 147964f..1e203e7d 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -871,6 +871,64 @@ static void intel_init_dpio(struct drm_i915_private *dev_priv)
 	}
 }
 
+void i915_watchdog_init(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int freq;
+	int i;
+
+	/*
+	 * Set up per-engine watchdog state: convert the pre-defined timeout
+	 * (60 ms for both the RCS and the BSD engines) into a timestamp
+	 * count threshold for RCS, VCS and VCS2, and zero the watchdog hang
+	 * counter of every ring.
+	 * The timestamp resolution changed in Gen7 and beyond to 80ns
+	 * for all pipes. Before that it was 640ns.
+	 */
+
+#define KM_RCS_ENGINE_TIMEOUT_VALUE_IN_MS 60
+#define KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS 60
+#define KM_TIMER_MILLISECOND 1000
+
+	/*
+	 * Gen7 and later: timestamp timer resolution = 0.080 us,
+	 * i.e. 12500000 counts per second
+	 */
+#define KM_TIMESTAMP_CNTS_PER_SEC_80NS 12500000
+
+	/*
+	 * Before Gen7: timestamp timer resolution = 0.640 us,
+	 * i.e. 1562500 counts per second
+	 */
+#define KM_TIMESTAMP_CNTS_PER_SEC_640NS 1562500
+
+	if (INTEL_INFO(dev)->gen >= 7)
+		freq = KM_TIMESTAMP_CNTS_PER_SEC_80NS;
+	else
+		freq = KM_TIMESTAMP_CNTS_PER_SEC_640NS;
+
+	dev_priv->ring[RCS].watchdog_threshold =
+		((KM_RCS_ENGINE_TIMEOUT_VALUE_IN_MS) *
+		(freq / KM_TIMER_MILLISECOND));
+
+	dev_priv->ring[VCS].watchdog_threshold =
+		((KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS) *
+		(freq / KM_TIMER_MILLISECOND));
+
+	dev_priv->ring[VCS2].watchdog_threshold =
+		((KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS) *
+		(freq / KM_TIMER_MILLISECOND));
+
+	for (i = 0; i < I915_NUM_RINGS; i++)
+		dev_priv->ring[i].hangcheck.watchdog_count = 0;
+
+	DRM_INFO("Watchdog Timeout [ms], " \
+			"RCS: 0x%08X, VCS: 0x%08X, VCS2: 0x%08X\n", \
+			KM_RCS_ENGINE_TIMEOUT_VALUE_IN_MS,
+			KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS,
+			KM_BSD_ENGINE_TIMEOUT_VALUE_IN_MS);
+}
+
 /**
  * i915_driver_load - setup chip and create an initial config
  * @dev: DRM device
@@ -1055,6 +1113,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	i915_gem_load(dev);
 
 	i915_hangcheck_init(dev);
+	i915_watchdog_init(dev);
 
 	/* On the 945G/GM, the chipset reports the MSI capability on the
 	 * integrated graphics even though the support isn't actually there
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b86d34b..9219904 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2741,6 +2741,7 @@ void intel_hpd_init(struct drm_i915_private *dev_priv);
 void intel_hpd_init_work(struct drm_i915_private *dev_priv);
 void intel_hpd_cancel_work(struct drm_i915_private *dev_priv);
 bool intel_hpd_pin_to_port(enum hpd_pin pin, enum port *port);
+void i915_watchdog_init(struct drm_device *dev);
 static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine)
 {
 	struct intel_ring_hangcheck *hc = &engine->hangcheck;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 19ab79e..f35a9b0 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1305,6 +1305,18 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 				intel_lrc_irq_handler(&dev_priv->ring[RCS]);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[RCS]);
+			if (tmp & (GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT)) {
+				struct intel_engine_cs *ring;
+
+				/* Stop the counter to prevent further interrupts */
+				ring = &dev_priv->ring[RCS];
+				I915_WRITE(RING_CNTR(ring->mmio_base),
+					GEN6_RCS_WATCHDOG_DISABLE);
+
+				ring->hangcheck.watchdog_count++;
+				i915_handle_error(ring->dev, intel_ring_flag(ring), true, true,
+					"Render engine watchdog timed out");
+			}
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
 				intel_lrc_irq_handler(&dev_priv->ring[BCS]);
@@ -1324,11 +1336,35 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 				intel_lrc_irq_handler(&dev_priv->ring[VCS]);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[VCS]);
+			if (tmp & (GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT)) {
+				struct intel_engine_cs *ring;
+
+				/* Stop the counter to prevent further interrupts */
+				ring = &dev_priv->ring[VCS];
+				I915_WRITE(RING_CNTR(ring->mmio_base),
+					GEN8_VCS_WATCHDOG_DISABLE);
+
+				ring->hangcheck.watchdog_count++;
+				i915_handle_error(ring->dev, intel_ring_flag(ring), true, true,
+						  "Media engine watchdog timed out");
+			}
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
 				intel_lrc_irq_handler(&dev_priv->ring[VCS2]);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[VCS2]);
+			if (tmp & (GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT)) {
+				struct intel_engine_cs *ring;
+
+				/* Stop the counter to prevent further interrupts */
+				ring = &dev_priv->ring[VCS2];
+				I915_WRITE(RING_CNTR(ring->mmio_base),
+					GEN8_VCS_WATCHDOG_DISABLE);
+
+				ring->hangcheck.watchdog_count++;
+				i915_handle_error(ring->dev, intel_ring_flag(ring), true, true,
+						  "Media engine 2 watchdog timed out");
+			}
 		} else
 			DRM_ERROR("The master control interrupt lied (GT1)!\n");
 	}
@@ -3776,11 +3812,14 @@ static void gen8_gt_irq_postinstall(struct drm_i915_private *dev_priv)
 {
 	/* These are interrupts we'll toggle with the ring mask register */
 	uint32_t gt_interrupts[] = {
+		GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 		GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
 			GT_RENDER_L3_PARITY_ERROR_INTERRUPT |
 			GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT,
+		GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
+		GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
 		GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
 			GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
 			GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT |
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 52b4be4..af44dd7 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -1509,6 +1509,8 @@ enum skl_disp_power_wells {
 #define RING_HEAD(base)		((base)+0x34)
 #define RING_START(base)	((base)+0x38)
 #define RING_CTL(base)		((base)+0x3c)
+#define RING_CNTR(base)        ((base)+0x178)
+#define RING_THRESH(base) ((base)+0x17C)
 #define RING_SYNC_0(base)	((base)+0x40)
 #define RING_SYNC_1(base)	((base)+0x44)
 #define RING_SYNC_2(base)	((base)+0x48)
@@ -1943,6 +1945,11 @@ enum skl_disp_power_wells {
 #define GT_BSD_USER_INTERRUPT			(1 << 12)
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1	(1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
 #define GT_CONTEXT_SWITCH_INTERRUPT		(1 <<  8)
+#define GT_GEN6_RENDER_WATCHDOG_INTERRUPT	(1 <<  6)
+#define GT_GEN8_RCS_WATCHDOG_INTERRUPT		(1 <<  6)
+#define   GEN6_RCS_WATCHDOG_DISABLE		1
+#define GT_GEN8_VCS_WATCHDOG_INTERRUPT		(1 <<  6)
+#define   GEN8_VCS_WATCHDOG_DISABLE		0xFFFFFFFF
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 <<  5) /* !snb */
 #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 <<  4)
 #define GT_RENDER_CS_MASTER_ERROR_INTERRUPT	(1 <<  3)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b06232c..40607da 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2355,6 +2355,9 @@ static int logical_render_ring_init(struct drm_device *dev)
 	if (HAS_L3_DPF(dev))
 		ring->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
 
+	ring->irq_keep_mask |=
+		(GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT);
+
 	if (INTEL_INFO(dev)->gen >= 9)
 		ring->init_hw = gen9_init_render_ring;
 	else
@@ -2415,6 +2418,8 @@ static int logical_bsd_ring_init(struct drm_device *dev)
 		GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
 	ring->irq_keep_mask =
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
+	ring->irq_keep_mask |=
+		(GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT);
 
 	ring->init_hw = gen8_init_common_ring;
 	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
@@ -2449,6 +2454,8 @@ static int logical_bsd2_ring_init(struct drm_device *dev)
 		GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
 	ring->irq_keep_mask =
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
+	ring->irq_keep_mask |=
+		(GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT);
 
 	ring->init_hw = gen8_init_common_ring;
 	ring->get_seqno = gen8_get_seqno;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 1c598ae..4968c93 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -137,6 +137,9 @@ struct intel_ring_hangcheck {
 
 	/* Number of TDR hang detections */
 	u32 tdr_count;
+
+	/* Number of watchdog hang detections for this ring */
+	u32 watchdog_count;
 };
 
 struct intel_ringbuffer {
@@ -363,6 +366,12 @@ struct  intel_engine_cs {
 	/* Saved head value to be restored after reset */
 	u32 saved_head;
 
+	/*
+	 * Watchdog timer threshold values
+	 * only RCS, VCS, VCS2 rings have watchdog timeout support
+	 */
+	uint32_t watchdog_threshold;
+
 	struct {
 		struct drm_i915_gem_object *obj;
 		u32 gtt_offset;
-- 
1.9.1