[Intel-gfx] [PATCH 2/3] drm/i915: Per-engine Timeout detection and recovery on HSW

Siluvery, Arun arun.siluvery at intel.com
Mon Nov 11 15:59:37 CET 2013


From: "Siluvery, Arun" <arun.siluvery at intel.com>

TDR (Timeout Detection and Recovery) provides per-engine hang detection and
recovery. If an engine hangs, the TDR mechanism attempts to reset just that
engine and advance its command streamer to the next instruction in the ring.
If the engine was in the middle of processing a batch buffer, control returns
to the instruction following the batch buffer start command. This is a less
intrusive recovery mechanism because it only impacts the process that caused
the hang.
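
In outline, the per-engine recovery path added below (i915_handle_hung_ring()
in i915_drv.c) performs the following sequence. This is a simplified sketch of
that function, with the error handling and the stuck page flip release omitted:

    /* Simplified outline of i915_handle_hung_ring(); error handling
     * and the page flip release path are omitted. */
    intel_ring_disable(ring);          /* stop the command streamer     */
    intel_ring_save(ring, ring_flags); /* save ring state; ring_flags
                                        * carries FORCE_ADVANCE if the
                                        * head has not moved since the
                                        * previous hang on this ring    */
    intel_ring_reset(ring);            /* reset only this engine        */
    intel_ring_restore(ring);          /* restore the saved ring state  */
    intel_ring_resample(ring);         /* correct driver-side state     */
    intel_ring_enable(ring);           /* restart the engine; execution
                                        * continues at the instruction
                                        * after the hung batch start    */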

uevents are sent so that user mode can detect that something has gone wrong
and take action if required.
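
For illustration, a user mode listener could watch for these uevents over the
udev netlink socket along the following lines. This is a minimal sketch, not
part of this patch; it assumes libudev (build with -ludev) and the "ERROR" /
"RESET" properties that the error work function sends via kobject_uevent_env:

    /* Minimal sketch of a user mode GPU hang/reset uevent listener.
     * Assumes the I915_ERROR_UEVENT ("ERROR") and I915_RESET_UEVENT
     * ("RESET") properties sent from i915_error_work_func(). */
    #include <stdio.h>
    #include <poll.h>
    #include <libudev.h>

    int main(void)
    {
            struct udev *udev = udev_new();
            struct udev_monitor *mon =
                    udev_monitor_new_from_netlink(udev, "kernel");
            struct pollfd pfd;

            udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
            udev_monitor_enable_receiving(mon);
            pfd.fd = udev_monitor_get_fd(mon);
            pfd.events = POLLIN;

            for (;;) {
                    struct udev_device *dev;
                    const char *err, *rst;

                    if (poll(&pfd, 1, -1) <= 0)
                            continue;
                    dev = udev_monitor_receive_device(mon);
                    if (!dev)
                            continue;
                    /* ERROR=1: hang detected; ERROR=0: recovery complete */
                    err = udev_device_get_property_value(dev, "ERROR");
                    rst = udev_device_get_property_value(dev, "RESET");
                    if (err || rst)
                            printf("GPU event: ERROR=%s RESET=%s\n",
                                   err ? err : "-", rst ? rst : "-");
                    udev_device_unref(dev);
            }
    }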

Issues:

1) Full GPU resets can leave the system in a state where the display only
   updates intermittently, or they can lock up the system entirely. This
   problem already existed in the current driver, which can only perform a
   full GPU reset in response to a hang on any of the rings.
   The problem has not been seen with per-engine TDR unless the driver
   falls back to a full GPU reset, either because an individual ring reset
   fails or because the rings are hanging too quickly.

Signed-off-by: Siluvery, Arun <arun.siluvery at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |   6 +-
 drivers/gpu/drm/i915/i915_dma.c         |  16 +-
 drivers/gpu/drm/i915/i915_drv.c         | 195 ++++++++++-
 drivers/gpu/drm/i915/i915_drv.h         |  80 ++++-
 drivers/gpu/drm/i915/i915_gem.c         |  77 ++++-
 drivers/gpu/drm/i915/i915_gpu_error.c   |  25 +-
 drivers/gpu/drm/i915/i915_irq.c         | 555 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_display.c    |  25 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c |  20 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
 drivers/gpu/drm/i915/intel_uncore.c     |   6 +-
 include/drm/drmP.h                      |   7 +
 12 files changed, 724 insertions(+), 290 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6875b7a..0e5bcb4 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2604,9 +2604,13 @@ static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_device *dev = data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
 
 	DRM_INFO("Manually setting wedged to %llu\n", val);
-	i915_handle_error(dev, val);
+	if (val && !i915_reset_in_progress(&dev_priv->gpu_error))
+		i915_handle_error(dev, NULL);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 0cab2d0..694da55 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1466,6 +1466,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	struct intel_device_info *info;
 	int ret = 0, mmio_bar, mmio_size;
 	uint32_t aperture_size;
+	uint32_t i;
 
 	info = (struct intel_device_info *) flags;
 
@@ -1661,6 +1662,17 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 		acpi_video_register();
 	}
 
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		dev_priv->hangcheck[i].count = 0;
+		dev_priv->hangcheck[i].last_acthd = 0;
+		dev_priv->hangcheck[i].ringid = i;
+		dev_priv->hangcheck[i].dev = dev;
+
+		setup_timer(&dev_priv->hangcheck[i].timer,
+			i915_hangcheck_sample,
+			(unsigned long) &dev_priv->hangcheck[i]);
+	}
+
 	if (IS_GEN5(dev))
 		intel_gpu_ips_init(dev_priv);
 
@@ -1703,6 +1715,7 @@ int i915_driver_unload(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
+	uint32_t i;
 
 	intel_gpu_ips_teardown();
 
@@ -1748,9 +1761,10 @@ int i915_driver_unload(struct drm_device *dev)
 	}
 
 	/* Free error state after interrupts are fully disabled. */
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	for (i = 0; i < I915_NUM_RINGS; i++)
+		del_timer_sync(&dev_priv->hangcheck[i].timer);
 	cancel_work_sync(&dev_priv->gpu_error.work);
 	i915_destroy_error_state(dev);
 
 	cancel_delayed_work_sync(&dev_priv->pc8.enable_work);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 24d58b0..1c8b96b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -113,6 +113,59 @@ MODULE_PARM_DESC(enable_hangcheck,
 		"WARNING: Disabling this can cause system wide hangs. "
 		"(default: true)");
 
+unsigned int i915_hangcheck_period __read_mostly = 1000;
+
+int hangcheck_period_set(const char *val, const struct kernel_param *kp)
+{
+	/* Custom set function so we can validate the range */
+	unsigned long num;
+	int ret;
+
+	ret = kstrtoul(val, 0, &num);
+
+	if (ret)
+		return ret;
+
+	/* Enforce the allowed range (in ms) */
+	if (num >= MINIMUM_HANGCHECK_PERIOD &&
+	    num <= MAXIMUM_HANGCHECK_PERIOD) {
+		i915_hangcheck_period = num;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static const struct kernel_param_ops hangcheck_ops = {
+	.set = hangcheck_period_set,
+	.get = param_get_uint,
+};
+
+module_param_cb(i915_hangcheck_period, &hangcheck_ops,
+		&i915_hangcheck_period, 0644);
+MODULE_PARM_DESC(i915_hangcheck_period,
+		"The hangcheck timer period in milliseconds. "
+		"The actual time to detect a hang may be 3 - 4 times "
+		"this value (default = 1000ms)");
+
+unsigned int i915_ring_reset_min_alive_period __read_mostly;
+module_param_named(i915_ring_reset_min_alive_period,
+		i915_ring_reset_min_alive_period, int, 0644);
+MODULE_PARM_DESC(i915_ring_reset_min_alive_period,
+		"Catch excessive ring resets. Each ring maintains a timestamp of "
+		"the last time it was reset. If it hangs again within this period "
+		"then switch to full GPU reset to try and clear the hang."
+		"default=0 seconds (disabled)");
+
+unsigned int i915_gpu_reset_min_alive_period __read_mostly;
+module_param_named(i915_gpu_reset_min_alive_period,
+		i915_gpu_reset_min_alive_period, int, 0644);
+MODULE_PARM_DESC(i915_gpu_reset_min_alive_period,
+		"Catch excessive GPU resets. If the GPU hangs again within this period "
+		"following the previous GPU reset then declare it wedged and "
+		"prevent further resets. "
+		"default=0 seconds (disabled)");
+
 int i915_enable_ppgtt __read_mostly = -1;
 module_param_named(i915_enable_ppgtt, i915_enable_ppgtt, int, 0600);
 MODULE_PARM_DESC(i915_enable_ppgtt,
@@ -726,6 +779,142 @@ int i915_resume(struct drm_device *dev)
 	return 0;
 }
 
+int i915_handle_hung_ring(struct drm_device *dev, uint32_t ringid)
+{
+	/* TDR Version 1:
+	 * Reset the ring that is hung.
+	 *
+	 * WARNING: dev->struct_mutex must be held before
+	 *          entering this function.
+	 */
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct intel_ring_buffer *ring = &dev_priv->ring[ringid];
+	struct drm_crtc *crtc;
+	struct intel_crtc *intel_crtc;
+	int ret = 0;
+	int pipe = 0;
+	struct intel_unpin_work *unpin_work;
+	uint32_t ring_flags = 0;
+	uint32_t head;
+
+	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
+
+	/* Take wake lock to prevent power saving mode */
+	gen6_gt_force_wake_get(dev_priv);
+
+	/* Check if the ring has hung on a MI_DISPLAY_FLIP command.
+	 * The pipe value will be stored in the HWS page if it has.
+	 * At the moment this should only happen for the blitter, but
+	 * each ring has its own status page so this should work for
+	 * all rings */
+	pipe = intel_read_status_page(ring, I915_GEM_PGFLIP_INDEX);
+	if (pipe) {
+		/* Clear it to avoid responding to it twice */
+		intel_write_status_page(ring, I915_GEM_PGFLIP_INDEX, 0);
+	}
+
+	/* Clear any simulated hang flags */
+	if (dev_priv->stop_rings) {
+		DRM_DEBUG_TDR("Simulated gpu hang, rst stop_rings bits %08x\n",
+			(0x1 << ringid));
+		dev_priv->stop_rings &= ~(0x1 << ringid);
+	}
+
+	DRM_DEBUG_TDR("Resetting ring %d\n", ringid);
+
+	ret = intel_ring_disable(ring);
+	if (ret != 0) {
+		DRM_ERROR("Failed to disable ring %d\n", ringid);
+		goto handle_hung_ring_error;
+	}
+
+	/* Sample the current ring head position */
+	head = I915_READ(RING_HEAD(ring->mmio_base)) & HEAD_ADDR;
+	DRM_DEBUG_TDR("head 0x%08X, last_head 0x%08X\n",
+		      head, dev_priv->hangcheck[ringid].last_head);
+	if (head == dev_priv->hangcheck[ringid].last_head) {
+		/* The ring has not advanced since the last
+		 * time it hung, so force it to advance to the
+		 * next QWORD. In most cases the ring head
+		 * pointer will automatically advance to the
+		 * next instruction as soon as it has read the
+		 * current instruction, without waiting for it
+		 * to complete. This seems to be the default
+		 * behaviour; however, an MBOX wait inserted
+		 * directly into the VCS/BCS rings does not behave
+		 * in the same way. Instead the head pointer
+		 * will still be pointing at the MBOX instruction
+		 * until it completes. */
+		ring_flags = FORCE_ADVANCE;
+		DRM_DEBUG_TDR("Force ring head to advance\n");
+	}
+	dev_priv->hangcheck[ringid].last_head = head;
+
+	ret = intel_ring_save(ring, ring_flags);
+	if (ret != 0) {
+		DRM_ERROR("Failed to save ring state\n");
+		goto handle_hung_ring_error;
+	}
+
+	ret = intel_ring_reset(ring);
+	if (ret != 0) {
+		DRM_ERROR("Failed to reset ring\n");
+		goto handle_hung_ring_error;
+	}
+
+	DRM_ERROR("Reset ring %d (GPU Hang)\n", ringid);
+
+	/* Clear last_acthd in hangcheck timer for this ring */
+	dev_priv->hangcheck[ringid].last_acthd = 0;
+
+	/* Clear reset to allow future hangchecks */
+	atomic_set(&dev_priv->hangcheck[ringid].reset, 0);
+
+	ret = intel_ring_restore(ring);
+	if (ret != 0) {
+		DRM_ERROR("Failed to restore ring state\n");
+		goto handle_hung_ring_error;
+	}
+
+	/* Correct driver state */
+	intel_ring_resample(ring);
+
+	ret = intel_ring_enable(ring);
+	if (ret != 0) {
+		DRM_ERROR("Failed to enable ring\n");
+		goto handle_hung_ring_error;
+	}
+
+	/* Wake up anything waiting on this ring's queue */
+	wake_up_all(&ring->irq_queue);
+
+	if (pipe &&
+		((pipe - 1) < ARRAY_SIZE(dev_priv->pipe_to_crtc_mapping))) {
+		/* The pipe value in the status page is offset by 1 */
+		pipe -= 1;
+
+		/* The ring hung on a page flip command, so we
+		 * must manually release the pending flip queue */
+		crtc = dev_priv->pipe_to_crtc_mapping[pipe];
+		intel_crtc = to_intel_crtc(crtc);
+		unpin_work = intel_crtc->unpin_work;
+
+		if (unpin_work && unpin_work->pending_flip_obj) {
+			intel_prepare_page_flip(dev, intel_crtc->pipe);
+			intel_finish_page_flip(dev, intel_crtc->pipe);
+			DRM_DEBUG_TDR("Released stuck page flip for pipe %d\n",
+				pipe);
+		}
+	}
+
+handle_hung_ring_error:
+
+	/* Release power lock */
+	gen6_gt_force_wake_put(dev_priv);
+
+	return ret;
+}
+
 /**
  * i915_reset - reset chip after a hang
  * @dev: drm device to reset
@@ -759,7 +948,11 @@ int i915_reset(struct drm_device *dev)
 	ret = intel_gpu_reset(dev);
 
 	/* Also reset the gpu hangman. */
-	if (simulated) {
+	if (!simulated && (get_seconds() - dev_priv->gpu_error.last_reset)
+	    < i915_gpu_reset_min_alive_period) {
+		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
+		ret = -ENODEV;
+	} else if (simulated) {
 		DRM_INFO("Simulated gpu hang, resetting stop_rings\n");
 		dev_priv->gpu_error.stop_rings = 0;
 		if (ret == -ENODEV) {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b0a244d..c539509 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -307,7 +307,7 @@ struct drm_i915_error_state {
 	u32 ctl[I915_NUM_RINGS];
 	u32 ipeir[I915_NUM_RINGS];
 	u32 ipehr[I915_NUM_RINGS];
-	u32 instdone[I915_NUM_RINGS];
+	u32 instdone[I915_NUM_RINGS][I915_NUM_INSTDONE_REG];
 	u32 acthd[I915_NUM_RINGS];
 	u32 semaphore_mboxes[I915_NUM_RINGS][I915_NUM_RINGS - 1];
 	u32 semaphore_seqno[I915_NUM_RINGS][I915_NUM_RINGS - 1];
@@ -1042,6 +1042,13 @@ struct i915_gem_mm {
 	 */
 	bool interruptible;
 
+	/**
+	 * This is set when the error_recovery function is running.
+	 * It prevents command submission from occurring and makes
+	 * every pending request fail
+	 */
+	atomic_t wedged;
+
 	/** Bit 6 swizzling required for X tiling */
 	uint32_t bit_6_swizzle_x;
 	/** Bit 6 swizzling required for Y tiling */
@@ -1072,8 +1079,10 @@ struct i915_error_state_file_priv {
 
 struct i915_gpu_error {
 	/* For hangcheck timer */
+#define MINIMUM_HANGCHECK_PERIOD 100   /* 100ms */
+#define MAXIMUM_HANGCHECK_PERIOD 30000 /* 30s */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
-#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
+#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(i915_hangcheck_period)
 	/* Hang gpu twice in this window and your context gets banned */
 #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
@@ -1087,6 +1096,7 @@ struct i915_gpu_error {
 
 
 	unsigned long missed_irq_rings;
+	unsigned long last_reset;
 
 	/**
 	 * State variable and reset counter controlling the reset flow
@@ -1304,6 +1314,41 @@ struct intel_pipe_crc {
 	wait_queue_head_t wq;
 };
 
+struct intel_hangcheck {
+	/* The ring being monitored */
+	uint32_t ringid;
+
+	/* Parent drm_device */
+	struct drm_device *dev;
+
+	/* Timer for this ring only */
+	struct timer_list timer;
+
+	/* Count of consecutive hang detections
+	 * (reset flag set once count exceeds threshold) */
+#define HANGCHECK_THRESHOLD      1
+#define MBOX_HANGCHECK_THRESHOLD 4
+	int count;
+
+	/* Last sampled head and active head */
+	uint32_t last_acthd;
+	uint32_t last_hd;
+
+	/* Last recorded ring head index from the previous ring hang.
+	 * This is only ever a ring index, whereas the active
+	 * head may be a graphics address in a ring buffer */
+	uint32_t last_head;
+
+	/* Last recorded instdone */
+	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
+
+	/* Flag to indicate if a ring reset is required */
+	atomic_t reset;
+
+	/* Keep a record of the last time the ring was reset */
+	unsigned long last_reset;
+};
+
 typedef struct drm_i915_private {
 	struct drm_device *dev;
 	struct kmem_cache *slab;
@@ -1372,6 +1417,17 @@ typedef struct drm_i915_private {
 
 	int num_plane;
 
+	/* For hangcheck timer */
+	struct intel_hangcheck hangcheck[I915_NUM_RINGS];
+
+	unsigned int stop_rings;
+
+	unsigned long cfb_size;
+	unsigned int cfb_fb;
+	enum plane cfb_plane;
+	int cfb_y;
+	struct intel_fbc_work *fbc_work;
+
 	struct i915_fbc fbc;
 	struct intel_opregion opregion;
 	struct intel_vbt_data vbt;
@@ -1397,6 +1453,11 @@ typedef struct drm_i915_private {
 
 	unsigned int fsb_freq, mem_freq, is_ddr3;
 
+	struct work_struct error_work;
+	atomic_t full_reset;
+	uint32_t total_resets;
+
+	wait_queue_head_t error_queue;
 	/**
 	 * wq - Driver workqueue for GEM.
 	 *
@@ -1864,6 +1925,9 @@ extern int i915_vbt_sdvo_panel_type __read_mostly;
 extern int i915_enable_rc6 __read_mostly;
 extern int i915_enable_fbc __read_mostly;
 extern bool i915_enable_hangcheck __read_mostly;
+extern unsigned int i915_hangcheck_period __read_mostly;
+extern unsigned int i915_ring_reset_min_alive_period __read_mostly;
+extern unsigned int i915_gpu_reset_min_alive_period __read_mostly;
 extern int i915_enable_ppgtt __read_mostly;
 extern int i915_enable_psr __read_mostly;
 extern unsigned int i915_preliminary_hw_support __read_mostly;
@@ -1899,6 +1963,7 @@ extern int i915_emit_box(struct drm_device *dev,
 			 struct drm_clip_rect *box,
 			 int DR1, int DR4);
 extern int intel_gpu_reset(struct drm_device *dev);
+extern int i915_handle_hung_ring(struct drm_device *dev, uint32_t ringid);
 extern int i915_reset(struct drm_device *dev);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
@@ -1909,7 +1974,10 @@ extern void intel_console_resume(struct work_struct *work);
 
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
-void i915_handle_error(struct drm_device *dev, bool wedged);
+void i915_hangcheck_sample(unsigned long data);
+void i915_handle_error(struct drm_device *dev, struct intel_hangcheck *hc);
+
+
 
 extern void intel_irq_init(struct drm_device *dev);
 extern void intel_pm_init(struct drm_device *dev);
@@ -2067,7 +2135,8 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
 bool i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
 int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
-				      bool interruptible);
+				      bool interruptible,
+				      struct intel_ring_buffer *ring);
 static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
 {
 	return unlikely(atomic_read(&error->reset_counter)
@@ -2312,7 +2381,8 @@ void i915_error_state_get(struct drm_device *dev,
 void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
 void i915_destroy_error_state(struct drm_device *dev);
 
-void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
+void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone,
+	struct intel_ring_buffer *ring);
 const char *i915_cache_level_str(int type);
 
 /* i915_suspend.c */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 40d9dcf..b7e5a8e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -142,14 +142,38 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
 	return 0;
 }
 
-int i915_mutex_lock_interruptible(struct drm_device *dev)
+int i915_gem_wedged(struct drm_device *dev, bool interruptible)
 {
+	/* Warning: This function can only give an indication
+	 * of whether the GPU is wedged at a particular instant
+	 * in time. The hangcheck process is asynchronous, so a
+	 * hang may be detected just after the flags have been
+	 * sampled */
+	unsigned i;
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	int err = !interruptible ? -EIO : -EAGAIN;
+
+	/* Full reset requested */
+	if (i915_reset_in_progress(&dev_priv->gpu_error))
+		return err;
+
+	/* Check for an individual ring which has hung */
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		if (atomic_read(&dev_priv->hangcheck[i].reset))
+			return err;
+	}
+
+	return 0;
+}
+
+int i915_mutex_lock_interruptible(struct drm_device *dev)
+{
 	int ret;
 
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
-	if (ret)
-		return ret;
+	/* There should be no need to call i915_gem_wait_for_error,
+	 * as the error recovery handler takes dev->struct_mutex,
+	 * so if it is active we will wait on the
+	 * mutex_lock_interruptible call instead.
+	 */
 
 	ret = mutex_lock_interruptible(&dev->struct_mutex);
 	if (ret)
@@ -935,9 +959,15 @@ unlock:
 
 int
 i915_gem_check_wedge(struct i915_gpu_error *error,
-		     bool interruptible)
+		     bool interruptible,
+		     struct intel_ring_buffer *ring)
 {
-	if (i915_reset_in_progress(error)) {
+	drm_i915_private_t *dev_priv;
+
+	dev_priv = container_of(error, drm_i915_private_t, gpu_error);
+
+	if ((ring && atomic_read(&dev_priv->hangcheck[ring->id].reset)) ||
+	    i915_reset_in_progress(error)) {
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
@@ -1054,7 +1084,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
 			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
 			 * is truely gone. */
-			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible, ring);
 			if (ret == 0)
 				ret = -EAGAIN;
 			break;
@@ -1124,7 +1154,7 @@ i915_wait_seqno(struct intel_ring_buffer *ring, uint32_t seqno)
 	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 	BUG_ON(seqno == 0);
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+	ret = i915_gem_wedged(dev, interruptible);
 	if (ret)
 		return ret;
 
@@ -1201,7 +1231,7 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 	if (seqno == 0)
 		return 0;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
+	ret = i915_gem_check_wedge(&dev_priv->gpu_error, true, ring);
 	if (ret)
 		return ret;
 
@@ -1213,8 +1243,9 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 	mutex_unlock(&dev->struct_mutex);
 	ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, file->driver_priv);
 	mutex_lock(&dev->struct_mutex);
-	if (ret)
+	if (ret) {
 		return ret;
+	}
 
 	return i915_gem_object_wait_rendering__tail(obj, ring);
 }
@@ -2180,8 +2211,6 @@ int __i915_add_request(struct intel_ring_buffer *ring,
 	ring->preallocated_lazy_request = NULL;
 
 	if (!dev_priv->ums.mm_suspended) {
-		i915_queue_hangcheck(ring->dev);
-
 		if (was_empty) {
 			cancel_delayed_work_sync(&dev_priv->mm.idle_work);
 			queue_delayed_work(dev_priv->wq,
@@ -3810,7 +3839,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	if (ret)
 		return ret;
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
+	ret = i915_gem_check_wedge(&dev_priv->gpu_error, false, NULL);
 	if (ret)
 		return ret;
 
@@ -3828,9 +3857,16 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	if (seqno == 0)
 		return 0;
 
-	ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, NULL);
-	if (ret == 0)
-		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
+	if (ring) {
+		if (i915_gem_wedged(dev, 1) != 0)
+			return -EIO;
+
+		ret = __wait_seqno(ring, seqno, reset_counter, true, NULL,
+				   file->driver_priv);
+		if (ret == 0)
+			queue_delayed_work(dev_priv->wq,
+					   &dev_priv->mm.retire_work, 0);
+	}
 
 	return ret;
 }
@@ -4275,6 +4311,7 @@ i915_gem_suspend(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret = 0;
+	int i;
 
 	mutex_lock(&dev->struct_mutex);
 	if (dev_priv->ums.mm_suspended)
@@ -4301,7 +4338,8 @@ i915_gem_suspend(struct drm_device *dev)
 							     DRIVER_MODESET);
 	mutex_unlock(&dev->struct_mutex);
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	for (i = 0; i < I915_NUM_RINGS; i++)
+		del_timer_sync(&dev_priv->hangcheck[i].timer);
 	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
 	cancel_delayed_work_sync(&dev_priv->mm.idle_work);
 
@@ -4530,6 +4568,7 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
+	int i;
 
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return 0;
@@ -4537,6 +4576,10 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 	if (i915_reset_in_progress(&dev_priv->gpu_error)) {
 		DRM_ERROR("Reenabling wedged hardware, good luck\n");
 		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
+		for (i = 0; i < I915_NUM_RINGS; i++) {
+			/* Clear the reset flag */
+			atomic_set(&dev_priv->hangcheck[i].reset, 0);
+		}
 	}
 
 	mutex_lock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 79dcb8f..4c8ad4f 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -723,7 +723,6 @@ static void i915_record_ring_state(struct drm_device *dev,
 		error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
 		error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
 		error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
-		error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
 		error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
 		if (ring->id == RCS)
 			error->bbaddr = I915_READ64(BB_ADDR);
@@ -732,9 +731,10 @@ static void i915_record_ring_state(struct drm_device *dev,
 		error->faddr[ring->id] = I915_READ(DMA_FADD_I8XX);
 		error->ipeir[ring->id] = I915_READ(IPEIR);
 		error->ipehr[ring->id] = I915_READ(IPEHR);
-		error->instdone[ring->id] = I915_READ(INSTDONE);
 	}
 
+	i915_get_extra_instdone(dev, error->instdone[ring->id],
+		&dev_priv->ring[ring->id]);
 	error->waiting[ring->id] = waitqueue_active(&ring->irq_queue);
 	error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
 	error->seqno[ring->id] = ring->get_seqno(ring, false);
@@ -899,6 +899,7 @@ void i915_capture_error_state(struct drm_device *dev)
 	struct drm_i915_error_state *error;
 	unsigned long flags;
 	int pipe;
+	int i;
 
 	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
 	error = dev_priv->gpu_error.first_error;
@@ -957,7 +958,9 @@ void i915_capture_error_state(struct drm_device *dev)
 	if (INTEL_INFO(dev)->gen == 7)
 		error->err_int = I915_READ(GEN7_ERR_INT);
 
-	i915_get_extra_instdone(dev, error->extra_instdone);
+	for (i = 0; i < I915_NUM_RINGS; i++)
+		i915_get_extra_instdone(dev, error->instdone[i],
+					&dev_priv->ring[i]);
 
 	i915_gem_capture_buffers(dev_priv, error);
 	i915_gem_record_fences(dev, error);
@@ -1026,7 +1029,9 @@ const char *i915_cache_level_str(int type)
 }
 
 /* NB: please notice the memset */
-void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone)
+void i915_get_extra_instdone(struct drm_device *dev,
+			     uint32_t *instdone,
+			     struct intel_ring_buffer *ring)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	memset(instdone, 0, sizeof(*instdone) * I915_NUM_INSTDONE_REG);
@@ -1046,10 +1051,14 @@ void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone)
 		WARN_ONCE(1, "Unsupported platform\n");
 	case 7:
 	case 8:
-		instdone[0] = I915_READ(GEN7_INSTDONE_1);
-		instdone[1] = I915_READ(GEN7_SC_INSTDONE);
-		instdone[2] = I915_READ(GEN7_SAMPLER_INSTDONE);
-		instdone[3] = I915_READ(GEN7_ROW_INSTDONE);
+		instdone[0] =
+			I915_READ(RING_INSTDONE(ring->mmio_base));
+
+		if (ring->id == RCS) {
+			instdone[1] = I915_READ(GEN7_SC_INSTDONE);
+			instdone[2] = I915_READ(GEN7_SAMPLER_INSTDONE);
+			instdone[3] = I915_READ(GEN7_ROW_INSTDONE);
+		}
 		break;
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ac57d6d..78b2cbb 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -957,7 +957,6 @@ static void notify_ring(struct drm_device *dev,
 	trace_i915_gem_request_complete(ring);
 
 	wake_up_all(&ring->irq_queue);
-	i915_queue_hangcheck(dev);
 }
 
 static void gen6_pm_rps_work(struct work_struct *work)
@@ -1155,12 +1154,14 @@ static void snb_gt_irq_handler(struct drm_device *dev,
 	if (gt_iir & GT_BLT_USER_INTERRUPT)
 		notify_ring(dev, &dev_priv->ring[BCS]);
 
-	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
-		      GT_BSD_CS_ERROR_INTERRUPT |
-		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
-		DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
-		i915_handle_error(dev, false);
-	}
+	if (gt_iir & GT_RENDER_CS_MASTER_ERROR_INTERRUPT)
+		i915_handle_error(dev, &dev_priv->hangcheck[RCS]);
+
+	if (gt_iir & GT_BSD_CS_ERROR_INTERRUPT)
+		i915_handle_error(dev, &dev_priv->hangcheck[VCS]);
+
+	if (gt_iir & GT_BLT_CS_ERROR_INTERRUPT)
+		i915_handle_error(dev, &dev_priv->hangcheck[BCS]);
 
 	if (gt_iir & GT_PARITY_ERROR(dev))
 		ivybridge_parity_error_irq_handler(dev, gt_iir);
@@ -1403,7 +1404,7 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
 
 		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
 			DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
-			i915_handle_error(dev_priv->dev, false);
+			i915_handle_error(dev_priv->dev, &dev_priv->hangcheck[VECS]);
 		}
 	}
 }
@@ -1946,9 +1947,41 @@ static void i915_error_work_func(struct work_struct *work)
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
 	int ret;
+	int i;
+	int pipe;
+	struct drm_crtc *crtc;
+	struct intel_crtc *intel_crtc;
+	struct intel_unpin_work *unpin_work;
+
+	/* Set this flag to force any waiting processes to release
+	 * dev->struct_mutex if they are holding it */
+	atomic_set(&dev_priv->mm.wedged, 1);
+
+	mutex_lock(&dev->struct_mutex);
 
 	kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
 
+	/* Skip individual ring reset requests if a full reset was requested */
+	if (!i915_reset_in_progress(error)) {
+		/* Check each ring for a pending reset condition */
+		for (i = 0; i < I915_NUM_RINGS; i++) {
+			if (atomic_read(&dev_priv->hangcheck[i].reset)) {
+				DRM_DEBUG_TDR("resetting ring %d\n", i);
+
+				if (i915_handle_hung_ring(dev, i) != 0) {
+					DRM_ERROR("ring %d reset failed", i);
+					atomic_set_mask(
+					I915_RESET_IN_PROGRESS_FLAG,
+					&dev_priv->gpu_error.reset_counter);
+					break;
+				}
+			}
+		}
+	}
+	/* Release dev->struct_mutex for the full GPU reset. The reset
+	 * path will take it itself when it needs it */
+	mutex_unlock(&dev->struct_mutex);
+
 	/*
 	 * Note that there's only one work item which does gpu resets, so we
 	 * need not worry about concurrent gpu resets potentially incrementing
@@ -1988,8 +2021,35 @@ static void i915_error_work_func(struct work_struct *work)
 			smp_mb__before_atomic_inc();
 			atomic_inc(&dev_priv->gpu_error.reset_counter);
 
-			kobject_uevent_env(&dev->primary->kdev->kobj,
-					   KOBJ_CHANGE, reset_done_event);
+			for (i = 0; i < I915_NUM_RINGS; i++) {
+				/* Clear individual ring reset flags */
+				atomic_set(&dev_priv->hangcheck[i].reset, 0);
+			}
+
+			mutex_lock(&dev->mode_config.mutex);
+			/* Release any pending page flip.
+			 * This is particularly important if ring_stop was set.
+			 *
+			 * WARNING: This code could retire a page flip that
+			 * arrives just after reset. In that case we will get
+			 * an extra page flip interrupt that is not expected.
+			 * If another page flip request arrives before the
+			 * interrupt then the unpin work could happen sooner
+			 * than expected.
+			 */
+			for_each_pipe(pipe) {
+				crtc = dev_priv->pipe_to_crtc_mapping[pipe];
+				intel_crtc = to_intel_crtc(crtc);
+				unpin_work = intel_crtc->unpin_work;
+
+				if (unpin_work &&
+				    unpin_work->pending_flip_obj) {
+					intel_prepare_page_flip(dev, pipe);
+					intel_finish_page_flip(dev, pipe);
+					DRM_DEBUG_TDR("Cleared pending page flip\n");
+				}
+			}
+			mutex_unlock(&dev->mode_config.mutex);
 		} else {
 			atomic_set(&error->reset_counter, I915_WEDGED);
 		}
@@ -2000,21 +2060,36 @@ static void i915_error_work_func(struct work_struct *work)
 		 */
 		i915_error_wake_up(dev_priv, true);
 	}
+
+	/* Clear wedged condition and wake up waiters */
+	atomic_set(&dev_priv->mm.wedged, 0);
+
+	kobject_uevent_env(&dev->primary->kdev->kobj,
+				KOBJ_CHANGE, reset_done_event);
+
+	/* Wake anyone waiting on error handling completion */
+	wake_up_all(&dev_priv->error_queue);
+
+	DRM_DEBUG_TDR("End recovery work\n");
 }
 
 static void i915_report_and_clear_eir(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t instdone[I915_NUM_INSTDONE_REG];
-	u32 eir = I915_READ(EIR);
+	u32 eir;
 	int pipe, i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
 
+	eir = I915_READ(EIR);
 	if (!eir)
-		return;
+		goto i915_report_and_clear_eir_exit;
 
 	pr_err("render error detected, EIR: 0x%08x\n", eir);
 
-	i915_get_extra_instdone(dev, instdone);
+	i915_get_extra_instdone(dev, instdone, &dev_priv->ring[RCS]);
 
 	if (IS_G4X(dev)) {
 		if (eir & (GM45_ERROR_MEM_PRIV | GM45_ERROR_CP_PRIV)) {
@@ -2092,6 +2167,9 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
 		I915_WRITE(EMR, I915_READ(EMR) | eir);
 		I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
 	}
+
+i915_report_and_clear_eir_exit:
+	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
 }
 
 /**
@@ -2104,39 +2182,74 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
  * so userspace knows something bad happened (should trigger collection
  * of a ring dump etc.).
  */
-void i915_handle_error(struct drm_device *dev, bool wedged)
+void i915_handle_error(struct drm_device *dev, struct intel_hangcheck *hc)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	int full_reset = 0;
+	unsigned long cur_time;
+	unsigned long last_reset;
 
 	i915_capture_error_state(dev);
 	i915_report_and_clear_eir(dev);
 
-	if (wedged) {
+	/* Currently we only support individual ring reset for GEN7 onwards;
+	 * older chips revert to a full reset.
+	 * Error interrupts trigger a full reset (hc == NULL). */
+	if ((INTEL_INFO(dev)->gen >= 7) && hc) {
+		cur_time = get_seconds();
+		last_reset = hc->last_reset;
+		hc->last_reset = cur_time;
+
+		if ((cur_time - last_reset) <
+		    i915_ring_reset_min_alive_period) {
+			/* This ring is hanging too frequently.
+			 * Opt for a full reset instead */
+			DRM_DEBUG_TDR("Ring %d hanging too quickly...\n",
+				hc->ringid);
+			full_reset = 1;
+		} else {
+			if (atomic_read(&hc->reset)) {
+				/* Reset already in progress for this ring */
+				return;
+			}
+
+			atomic_set(&hc->reset, 1);
+			DRM_DEBUG_TDR("Reset Ring %d\n", hc->ringid);
+		}
+	} else {
+		full_reset = 1;
+	}
+
+	if (!hc || full_reset) {
+		if (i915_reset_in_progress(&dev_priv->gpu_error))
+			return;
+
 		atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
 				&dev_priv->gpu_error.reset_counter);
-
-		/*
-		 * Wakeup waiting processes so that the reset work function
-		 * i915_error_work_func doesn't deadlock trying to grab various
-		 * locks. By bumping the reset counter first, the woken
-		 * processes will see a reset in progress and back off,
-		 * releasing their locks and then wait for the reset completion.
-		 * We must do this for _all_ gpu waiters that might hold locks
-		 * that the reset work needs to acquire.
-		 *
-		 * Note: The wake_up serves as the required memory barrier to
-		 * ensure that the waiters see the updated value of the reset
-		 * counter atomic_t.
-		 */
-		i915_error_wake_up(dev_priv, false);
+		DRM_DEBUG_TDR("Full reset of GPU requested\n");
 	}
 
 	/*
+	 * Wakeup waiting processes so that the reset work function
+	 * i915_error_work_func doesn't deadlock trying to grab various
+	 * locks. By bumping the reset counter first, the woken
+	 * processes will see a reset in progress and back off,
+	 * releasing their locks and then wait for the reset completion.
+	 * We must do this for _all_ gpu waiters that might hold locks
+	 * that the reset work needs to acquire.
+	 *
+	 * Note: The wake_up serves as the required memory barrier to
+	 * ensure that the waiters see the updated value of the reset
+	 * counter atomic_t.
+	 */
+	i915_error_wake_up(dev_priv, false);
+
+	/*
 	 * Our reset work can grab modeset locks (since it needs to reset the
 	 * state of outstanding pageflips). Hence it must not be run on our own
 	 * dev-priv->wq work queue for otherwise the flush_work in the pageflip
 	 * code will deadlock.
 	 */
+	DRM_DEBUG_TDR("Schedule error recovery work\n");
 	schedule_work(&dev_priv->gpu_error.work);
 }
 
@@ -2339,245 +2452,191 @@ ring_last_seqno(struct intel_ring_buffer *ring)
 			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool
-ring_idle(struct intel_ring_buffer *ring, u32 seqno)
-{
-	return (list_empty(&ring->request_list) ||
-		i915_seqno_passed(seqno, ring_last_seqno(ring)));
-}
-
-static struct intel_ring_buffer *
-semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
-{
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	u32 cmd, ipehr, acthd, acthd_min;
-
-	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
-	if ((ipehr & ~(0x3 << 16)) !=
-	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-		return NULL;
-
-	/* ACTHD is likely pointing to the dword after the actual command,
-	 * so scan backwards until we find the MBOX.
-	 */
-	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-	acthd_min = max((int)acthd - 3 * 4, 0);
-	do {
-		cmd = ioread32(ring->virtual_start + acthd);
-		if (cmd == ipehr)
-			break;
-
-		acthd -= 4;
-		if (acthd < acthd_min)
-			return NULL;
-	} while (1);
-
-	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
-	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-}
-
-static int semaphore_passed(struct intel_ring_buffer *ring)
-{
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	struct intel_ring_buffer *signaller;
-	u32 seqno, ctl;
-
-	ring->hangcheck.deadlock = true;
-
-	signaller = semaphore_waits_for(ring, &seqno);
-	if (signaller == NULL || signaller->hangcheck.deadlock)
-		return -1;
-
-	/* cursory check for an unkickable deadlock */
-	ctl = I915_READ_CTL(signaller);
-	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
-		return -1;
-
-	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
-}
-
-static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+void i915_queue_hangcheck(struct drm_device *dev)
 {
-	struct intel_ring_buffer *ring;
-	int i;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	if (!i915_enable_hangcheck)
+		return;
 
-	for_each_ring(ring, dev_priv, i)
-		ring->hangcheck.deadlock = false;
+	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 }
 
-static enum intel_ring_hangcheck_action
-ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
+static bool kick_ring(struct intel_ring_buffer *ring)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp;
-
-	if (ring->hangcheck.acthd != acthd)
-		return HANGCHECK_ACTIVE;
-
-	if (IS_GEN2(dev))
-		return HANGCHECK_HUNG;
-
-	/* Is the chip hanging on a WAIT_FOR_EVENT?
-	 * If so we can simply poke the RB_WAIT bit
-	 * and break the hang. This should work on
-	 * all but the second generation chipsets.
-	 */
-	tmp = I915_READ_CTL(ring);
+	u32 tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
 		DRM_ERROR("Kicking stuck wait on %s\n",
 			  ring->name);
-		i915_handle_error(dev, false);
 		I915_WRITE_CTL(ring, tmp);
-		return HANGCHECK_KICK;
+		return true;
 	}
+	return false;
+}
 
-	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
-		switch (semaphore_passed(ring)) {
-		default:
-			return HANGCHECK_HUNG;
-		case 1:
-			DRM_ERROR("Kicking stuck semaphore on %s\n",
-				  ring->name);
-			i915_handle_error(dev, false);
-			I915_WRITE_CTL(ring, tmp);
-			return HANGCHECK_KICK;
-		case 0:
-			return HANGCHECK_WAIT;
+/* This function is called when the TDR algorithm detects that the hardware
+ * has not advanced during the last sampling period. */
+static bool i915_hangcheck_hung(struct intel_hangcheck *hc)
+{
+	struct drm_device *dev = hc->dev;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	uint32_t mbox_wait;
+	uint32_t threshold;
+	struct intel_ring_buffer *ring;
+
+	DRM_DEBUG_TDR("Ring [%d] hc->count = %d\n", hc->ringid, hc->count);
+
+	ring = &dev_priv->ring[hc->ringid];
+
+	/* Is this ring waiting on a semaphore mbox?
+	 * If so, give it a bit longer as it may be waiting on another
+	 * ring which has actually hung. Give the other ring a chance to
+	 * reset and clear the hang.
+	 */
+	mbox_wait = ((I915_READ(RING_CTL(ring->mmio_base)) >> 10) & 0x1);
+	threshold = mbox_wait ? MBOX_HANGCHECK_THRESHOLD : HANGCHECK_THRESHOLD;
+
+	if (hc->count++ > threshold) {
+		bool hung = true;
+
+		DRM_DEBUG_TDR("Hangcheck timer elapsed... ring %d hung\n",
+			hc->ringid);
+		/* Reset the counter*/
+		hc->count = 0;
+
+		if (!IS_GEN2(dev)) {
+			/* If the ring is hanging on a WAIT_FOR_EVENT
+			 * then simply poke the RB_WAIT bit
+			 * and break the hang. This should work on
+			 * all but the second generation chipsets.
+			 */
+			ring = &dev_priv->ring[hc->ringid];
+			hung &= !kick_ring(ring);
+			DRM_DEBUG_TDR("hung=%d after kick ring\n", hung);
+		}
+		if (hung) {
+			i915_handle_error(dev, hc);
 		}
+		return hung;
 	}
-
-	return HANGCHECK_HUNG;
-}
+	return false;
+}
 
 /**
- * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. We keep track per ring seqno progress and
- * if there are no progress, hangcheck score for that ring is increased.
- * Further, acthd is inspected to see if the ring is stuck. On stuck case
- * we kick the ring. If we see no progress on three subsequent calls
- * we assume chip is wedged and try to fix it by resetting the chip.
+ * This is called from the hangcheck timer for each ring.
+ * It samples the current state of the hardware to make
+ * sure that it is progressing.
  */
-static void i915_hangcheck_elapsed(unsigned long data)
-{
-	struct drm_device *dev = (struct drm_device *)data;
-	drm_i915_private_t *dev_priv = dev->dev_private;
+void i915_hangcheck_sample(unsigned long data)
+{
+	struct intel_hangcheck *hc = (struct intel_hangcheck *)data;
+	struct drm_device *dev;
+	drm_i915_private_t *dev_priv;
+	uint32_t head, tail, acthd, instdone[I915_NUM_INSTDONE_REG];
+	uint32_t cur_seqno = 0;
+	uint32_t last_seqno = 0;
 	struct intel_ring_buffer *ring;
-	int i;
-	int busy_count = 0, rings_hung = 0;
-	bool stuck[I915_NUM_RINGS] = { 0 };
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define FIRE 30
-
-	if (!i915_enable_hangcheck)
-		return;
-
-	for_each_ring(ring, dev_priv, i) {
-		u32 seqno, acthd;
-		bool busy = true;
-
-		semaphore_clear_deadlocks(dev_priv);
-
-		seqno = ring->get_seqno(ring, false);
-		acthd = intel_ring_get_active_head(ring);
-
-		if (ring->hangcheck.seqno == seqno) {
-			if (ring_idle(ring, seqno)) {
-				ring->hangcheck.action = HANGCHECK_IDLE;
-
-				if (waitqueue_active(&ring->irq_queue)) {
-					/* Issue a wake-up to catch stuck h/w. */
-					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
-						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
-							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-								  ring->name);
-						else
-							DRM_INFO("Fake missed irq on %s\n",
-								 ring->name);
-						wake_up_all(&ring->irq_queue);
-					}
-					/* Safeguard against driver failure */
-					ring->hangcheck.score += BUSY;
-				} else
-					busy = false;
+	bool idle;
+	int instdone_cmp;
+	int pending_work = 1;
+	int resched_timer = 1;
+	int empty;
+
+	if (!i915_enable_hangcheck || !hc)
+		return;
+
+	dev = hc->dev;
+	dev_priv = dev->dev_private;
+
+	ring = &dev_priv->ring[hc->ringid];
+
+	/* Sample the current state */
+	head = I915_READ_HEAD(ring) & HEAD_ADDR;
+	tail = I915_READ_TAIL(ring) & TAIL_ADDR;
+	acthd = intel_ring_get_active_head(ring);
+	empty = list_empty(&ring->request_list);
+
+	i915_get_extra_instdone(dev, instdone, ring);
+	instdone_cmp = (memcmp(hc->prev_instdone,
+			instdone, sizeof(instdone)) == 0) ? 1 : 0;
+
+	if (!empty) {
+		/* Examine the seqnos to see where the HW has got to.
+		 * (Only call ring_last_seqno when the list is non-empty) */
+		cur_seqno = ring->get_seqno(ring, false);
+		last_seqno = ring_last_seqno(ring);
+	}
+
+	if (empty || i915_seqno_passed(cur_seqno, last_seqno)) {
+		/* If the request list is empty, or the HW has passed the
+		 * last seqno of the last item in the request list, then the
+		 * HW is considered idle.
+		 * The driver may not have cleaned up the request list yet */
+		pending_work = 0;
+	}
+
+	idle = ((head == tail) && (pending_work == 0));
+
+	DRM_DEBUG_TDR("[%d] HD: 0x%08x 0x%08x, ACTHD: 0x%08x 0x%08x IC: %d\n",
+		ring->id, head, hc->last_hd, acthd, hc->last_acthd,
+		instdone_cmp);
+	DRM_DEBUG_TDR("E:%d PW:%d TL:0x%08x Csq:0x%08x Lsq:0x%08x Idle: %d\n",
+		empty, pending_work, tail, cur_seqno, last_seqno, idle);
+
+	/* Check both head and active head.
+	 * Neither is enough on its own - acthd can be pointing within the
+	 * batch buffer so is more likely to be moving, but the same
+	 * underlying buffer object could be submitted more than once.
+	 * If it happens to pause at exactly the same place in the batch
+	 * buffer and we sample it at that moment then we could see it as
+	 * hung over 3 sample periods that do not belong to the same
+	 * batch submission - this would result in a false positive.
+	 * We know that the head pointer will have advanced for each
+	 * batch buffer as the ring has to contain a new MI_BATCH_BUFFER_START
+	 * for every do_exec call, so by combining head and active head we can
+	 * ensure that the hang detection distinguishes between batch buffers */
+	if ((hc->last_acthd == acthd) &&
+	    (hc->last_hd == head) &&
+	    instdone_cmp) {
+		/* Ring hasn't advanced in this sampling period */
+		if (idle) {
+			/* The hardware is idle */
+			if (waitqueue_active(&ring->irq_queue)) {
+				/* We expect the wait queue to drain
+				 * if the hardware has remained idle
+				 * for 3 consecutive samples. Wake up
+				 * the queue on each sample to try and
+				 * release it, but if it persists then
+				 * trigger a reset */
+
+				DRM_DEBUG_TDR("Possible stuck wait (0x%08x)\n",
+					ring->last_irq_seqno);
+				wake_up_all(&ring->irq_queue);
+				i915_hangcheck_hung(hc);
 			} else {
-				/* We always increment the hangcheck score
-				 * if the ring is busy and still processing
-				 * the same request, so that no single request
-				 * can run indefinitely (such as a chain of
-				 * batches). The only time we do not increment
-				 * the hangcheck score on this ring, if this
-				 * ring is in a legitimate wait for another
-				 * ring. In that case the waiting ring is a
-				 * victim and we want to be sure we catch the
-				 * right culprit. Then every time we do kick
-				 * the ring, add a small increment to the
-				 * score so that we can catch a batch that is
-				 * being repeatedly kicked and so responsible
-				 * for stalling the machine.
-				 */
-				ring->hangcheck.action = ring_stuck(ring,
-								    acthd);
-
-				switch (ring->hangcheck.action) {
-				case HANGCHECK_IDLE:
-				case HANGCHECK_WAIT:
-					break;
-				case HANGCHECK_ACTIVE:
-					ring->hangcheck.score += BUSY;
-					break;
-				case HANGCHECK_KICK:
-					ring->hangcheck.score += KICK;
-					break;
-				case HANGCHECK_HUNG:
-					ring->hangcheck.score += HUNG;
-					stuck[i] = true;
-					break;
-				}
+				/* Hardware and driver both idle */
+				hc->count = 0;
+				resched_timer = 0;
 			}
 		} else {
-			ring->hangcheck.action = HANGCHECK_ACTIVE;
-
-			/* Gradually reduce the count so that we catch DoS
-			 * attempts across multiple batches.
-			 */
-			if (ring->hangcheck.score > 0)
-				ring->hangcheck.score--;
-		}
-
-		ring->hangcheck.seqno = seqno;
-		ring->hangcheck.acthd = acthd;
-		busy_count += busy;
-	}
-
-	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score > FIRE) {
-			DRM_INFO("%s on %s\n",
-				 stuck[i] ? "stuck" : "no progress",
-				 ring->name);
-			rings_hung++;
+			/* The hardware is busy but has not advanced
+			* since the last sample - possible hang*/
+			i915_hangcheck_hung(hc);
 		}
+	} else {
+		/* The state has changed so the hardware is active */
+		hc->count = 0;
 	}
 
-	if (rings_hung)
-		return i915_handle_error(dev, true);
-
-	if (busy_count)
-		/* Reset timer case chip hangs without another request
-		 * being added */
-		i915_queue_hangcheck(dev);
-}
-
-void i915_queue_hangcheck(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	if (!i915_enable_hangcheck)
-		return;
+	/* Always update last sampled state */
+	hc->last_hd = head;
+	hc->last_acthd = acthd;
+	memcpy(hc->prev_instdone, instdone, sizeof(instdone));
 
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if (resched_timer)
+		mod_timer(&hc->timer, jiffies + DRM_I915_HANGCHECK_JIFFIES);
 }
 
 static void ibx_irq_preinstall(struct drm_device *dev)
@@ -3189,7 +3248,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, NULL);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3371,7 +3430,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, NULL);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3616,7 +3675,7 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, NULL);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3775,10 +3834,8 @@ void intel_irq_init(struct drm_device *dev)
 	INIT_WORK(&dev_priv->gpu_error.work, i915_error_work_func);
 	INIT_WORK(&dev_priv->rps.work, gen6_pm_rps_work);
 	INIT_WORK(&dev_priv->l3_parity.error_work, ivybridge_parity_work);
+	init_waitqueue_head(&dev_priv->error_queue);
 
-	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
-		    i915_hangcheck_elapsed,
-		    (unsigned long) dev);
 	setup_timer(&dev_priv->hotplug_reenable_timer, i915_reenable_hotplug_timer_func,
 		    (unsigned long) dev_priv);
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 2df2366..c5131ce 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -8484,7 +8484,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 		goto err_unpin;
 	}
 
-	len = 4;
+	len = 12;
 	if (ring->id == RCS)
 		len += 6;
 
@@ -8512,11 +8512,34 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 		intel_ring_emit(ring, ring->scratch.gtt_offset + 256);
 	}
 
+	/* Set a flag to indicate that a page flip interrupt is expected.
+	 * The flag is used by the TDR logic to detect whether the blitter hung
+	 * on a page flip command, in which case it will need to manually
+	 * complete the page flip.
+	 * The 'flag' is actually the pipe value associated with this page
+	 * flip + 1, so that the TDR code knows which pipe failed to flip.
+	 * A value of 0 indicates that a flip is not currently in progress on
+	 * the HW. */
+	intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+	intel_ring_emit(ring, I915_GEM_PGFLIP_INDEX <<
+				MI_STORE_DWORD_INDEX_SHIFT);
+	intel_ring_emit(ring, intel_crtc->pipe + 1);
+	intel_ring_emit(ring, MI_NOOP);
+
 	intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit);
 	intel_ring_emit(ring, (fb->pitches[0] | obj->tiling_mode));
 	intel_ring_emit(ring, i915_gem_obj_ggtt_offset(obj) + intel_crtc->dspaddr_offset);
 	intel_ring_emit(ring, (MI_NOOP));
 
+	/* Clear the flag as soon as we pass over the page flip command.
+	 * If we passed over the command without hanging, then an interrupt
+	 * should be received to complete the page flip. */
+	intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+	intel_ring_emit(ring, I915_GEM_PGFLIP_INDEX <<
+				MI_STORE_DWORD_INDEX_SHIFT);
+	intel_ring_emit(ring, 0);
+	intel_ring_emit(ring, MI_NOOP);
+
 	intel_mark_page_flip_active(intel_crtc);
 	__intel_ring_advance(ring);
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index cce29d0..37c3ed6 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -47,6 +47,14 @@ void __intel_ring_advance(struct intel_ring_buffer *ring)
 
 	ring->tail &= ring->size - 1;
 
+	/* Re-schedule the hangcheck timer each time the ring is given new work,
+	 * so that we can detect hangs caused by commands inserted directly
+	 * into the ring as well as by bad batch buffers */
+	if (!dev_priv->ums.mm_suspended && i915_enable_hangcheck) {
+		mod_timer(&dev_priv->hangcheck[ring->id].timer,
+			jiffies + DRM_I915_HANGCHECK_JIFFIES);
+	}
+
 	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
 		return;
 	ring->write_tail(ring, ring->tail);
@@ -1591,7 +1599,7 @@ static int ring_wait_for_space(struct intel_ring_buffer *ring, int n)
 		msleep(1);
 
 		ret = i915_gem_check_wedge(&dev_priv->gpu_error,
-					   dev_priv->mm.interruptible);
+					   dev_priv->mm.interruptible, ring);
 		if (ret)
 			return ret;
 	} while (!time_after(jiffies, end));
@@ -1691,7 +1699,7 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
 	int ret;
 
 	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
-				   dev_priv->mm.interruptible);
+				   dev_priv->mm.interruptible, ring);
 	if (ret)
 		return ret;
 
@@ -2010,7 +2018,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
 		/* Spin waiting for the device to ack the reset request */
 		ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
 					& GEN6_GRDOM_RENDER) == 0, 500);
-		DRM_DEBUG("RCS Reset\n");
+		DRM_DEBUG_TDR("RCS Reset\n");
 		break;
 
 
@@ -2020,7 +2028,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
 		/* Spin waiting for the device to ack the reset request */
 		ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
 				& GEN6_GRDOM_BLT) == 0, 500);
-		DRM_DEBUG("BCS Reset\n");
+		DRM_DEBUG_TDR("BCS Reset\n");
 		break;
 
 
@@ -2030,7 +2038,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
 		/* Spin waiting for the device to ack the reset request */
 		ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
 					& GEN6_GRDOM_MEDIA) == 0, 500);
-		DRM_DEBUG("VCS Reset\n");
+		DRM_DEBUG_TDR("VCS Reset\n");
 		break;
 
 	case VECS:
@@ -2039,7 +2047,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
 		/* Spin waiting for the device to ack the reset request */
 		ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
 					& GEN6_GRDOM_VEBOX) == 0, 500);
-		DRM_DEBUG("VECS Reset\n");
+		DRM_DEBUG_TDR("VECS Reset\n");
 		break;
 
 	default:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index cd96ad9..6d45b61 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -198,6 +198,8 @@ struct  intel_ring_buffer {
 	u32 saved_state[I915_RING_CONTEXT_SIZE];
 	struct intel_ring_hangcheck hangcheck;
 
+	uint32_t last_irq_seqno;
+
 	struct {
 		struct drm_i915_gem_object *obj;
 		u32 gtt_offset;
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 5349215..7c71fc0 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -806,9 +806,13 @@ static int gen6_do_reset(struct drm_device *dev)
 
 int intel_gpu_reset(struct drm_device *dev)
 {
+	drm_i915_private_t *dev_priv = dev->dev_private;
 	switch (INTEL_INFO(dev)->gen) {
 	case 7:
-	case 6: return gen6_do_reset(dev);
+	case 6:
+		dev_priv->total_resets++;
+		DRM_DEBUG_TDR("total_resets %d\n", dev_priv->total_resets);
+		return gen6_do_reset(dev);
 	case 5: return ironlake_do_reset(dev);
 	case 4: return i965_do_reset(dev);
 	default: return -ENODEV;
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 1d4a920..73309c5 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -90,6 +90,7 @@ struct videomode;
 #define DRM_UT_DRIVER		0x02
 #define DRM_UT_KMS		0x04
 #define DRM_UT_PRIME		0x08
+#define DRM_UT_TDR		0x10
 /*
  * Three debug levels are defined.
  * drm_core, drm_driver, drm_kms
@@ -211,6 +212,11 @@ int drm_err(const char *func, const char *format, ...);
 		drm_ut_debug_printk(DRM_UT_PRIME, DRM_NAME,		\
 					__func__, fmt, ##args);		\
 	} while (0)
+#define DRM_DEBUG_TDR(fmt, args...)					\
+	do {								\
+		drm_ut_debug_printk(DRM_UT_TDR, DRM_NAME,		\
+					__func__, fmt, ##args);		\
+	} while (0)
 #define DRM_LOG(fmt, args...)						\
 	do {								\
 		drm_ut_debug_printk(DRM_UT_CORE, NULL,			\
@@ -235,6 +241,7 @@ int drm_err(const char *func, const char *format, ...);
 #define DRM_DEBUG_DRIVER(fmt, args...) do { } while (0)
 #define DRM_DEBUG_KMS(fmt, args...)	do { } while (0)
 #define DRM_DEBUG_PRIME(fmt, args...)	do { } while (0)
+#define DRM_DEBUG_TDR(fmt, args...)	do { } while (0)
 #define DRM_DEBUG(fmt, arg...)		 do { } while (0)
 #define DRM_LOG(fmt, arg...)		do { } while (0)
 #define DRM_LOG_KMS(fmt, args...) do { } while (0)
-- 
1.8.4