[Intel-gfx] [RFC 06/13] drm/i915: Communicating reset requests
Lister, Ian
ian.lister at intel.com
Mon Dec 16 17:03:03 CET 2013
From 0f1092b19d66cfe803b22f9e982c5e5be5a5ff23 Mon Sep 17 00:00:00 2001
Message-Id: <0f1092b19d66cfe803b22f9e982c5e5be5a5ff23.1387201899.git.ian.lister at intel.com>
In-Reply-To: <cover.1387201899.git.ian.lister at intel.com>
References: <cover.1387201899.git.ian.lister at intel.com>
From: ian-lister <ian.lister at intel.com>
Date: Tue, 10 Dec 2013 11:53:25 +0000
Subject: [RFC 06/13] drm/i915: Communicating reset requests
Per-engine recovery requires us to pass the hung ring information through
to i915_error_work_func. The reset_counter was sufficient on its own when
we were just dealing with global reset, but for per-engine reset we need
to signal reset requests for each engine.
The reset_counter offers a very tidy way of communicating to the
rest of the driver that a reset is in-progress or that a reset has
happened since the count was last checked, without using locks. Ideally
we want to keep this mechanism.
To resolve this, the reset_counter is now shared by global reset and
per-engine reset. It is an indication that some sort of recovery work
is in-progress. A new variable "reset_requests" has been added to pass
the specific reset details from i915_handle_error to i915_error_work_func.
Both of these functions take gpu_error.lock whilst modifying these
variables, which guarantees that the I915_RESET_IN_PROGRESS flag
remains set for as long as gpu_error.reset_requests is non-zero.
This retains the best of the original scheme as the rest of the driver
can remain lockless when checking the reset_counter.
This patch is part of a set and i915_error_work_func doesn't yet attempt
to do per-engine reset recovery (it will convert all requests into
global resets).
Signed-off-by: ian-lister <ian.lister at intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 57 +++++++++++
drivers/gpu/drm/i915/i915_drv.h | 11 +++
drivers/gpu/drm/i915/i915_gem.c | 3 +-
drivers/gpu/drm/i915/i915_irq.c | 166 +++++++++++++++++++++++---------
drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +
drivers/gpu/drm/i915/intel_uncore.c | 22 ++++-
6 files changed, 210 insertions(+), 51 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 311c647..f00b19c 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -896,6 +896,62 @@ static const struct file_operations i915_error_state_fops = {
.release = i915_error_state_release,
};
+static ssize_t
+i915_ring_hangcheck_read(struct file *filp,
+ char __user *ubuf,
+ size_t max,
+ loff_t *ppos)
+{
+ /* Returns the total number of times the rings
+ * have hung and been reset since boot */
+ struct drm_device *dev = filp->private_data;
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ char buf[100];
+ int len;
+
+ len = scnprintf(buf, sizeof(buf),
+ "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X\n",
+ dev_priv->gpu_error.global_resets,
+ dev_priv->ring[RCS].hangcheck.total,
+ dev_priv->ring[VCS].hangcheck.total,
+ dev_priv->ring[BCS].hangcheck.total);
+
+ return simple_read_from_buffer(ubuf, max, ppos, buf, len);
+}
+
+static ssize_t
+i915_ring_hangcheck_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt,
+ loff_t *ppos)
+{
+ struct drm_device *dev = filp->private_data;
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ uint32_t i;
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
+
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ /* Reset the hangcheck counters */
+ dev_priv->ring[i].hangcheck.total = 0;
+ }
+
+ dev_priv->gpu_error.global_resets = 0;
+
+ spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
+
+ return cnt;
+}
+
+static const struct file_operations i915_ring_hangcheck_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = i915_ring_hangcheck_read,
+ .write = i915_ring_hangcheck_write,
+ .llseek = default_llseek,
+};
+
static int
i915_next_seqno_get(void *data, u64 *val)
{
@@ -3139,6 +3195,7 @@ static const struct i915_debugfs_files {
{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
{"i915_ring_test_irq", &i915_ring_test_irq_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
+ {"i915_ring_hangcheck", &i915_ring_hangcheck_fops},
{"i915_error_state", &i915_error_state_fops},
{"i915_next_seqno", &i915_next_seqno_fops},
{"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0d8805e..1087e4e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1132,6 +1132,13 @@ struct i915_gpu_error {
*/
atomic_t reset_counter;
+ /* Tracking of per-engine reset requests.
+ * It must be synchronised with reset_counter by taking gpu_error.lock
+ * to ensure that the I915_RESET_IN_PROGRESS flag is always set if
+ * reset_requests is non-zero.*/
+#define I915_GLOBAL_RESET_REQUEST (1 << 31)
+ u32 reset_requests;
+
/**
* Special values/flags for reset_counter
*
@@ -1146,6 +1153,9 @@ struct i915_gpu_error {
* to global reset in case per-engine reset happens too frequently */
u32 score;
+ /* A count of global resets for debugfs stats */
+ u32 global_resets;
+
/**
* Waitqueue to signal when the reset has completed. Used by clients
* that wait for dev_priv->mm.wedged to settle.
@@ -1936,6 +1946,7 @@ extern int intel_gpu_reset(struct drm_device *dev);
extern int intel_gpu_engine_reset(struct drm_device *dev,
enum intel_ring_id engine);
extern int i915_reset(struct drm_device *dev);
+extern int i915_handle_hung_ring(struct intel_ring_buffer *ring);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a29b355..42a1647 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2351,7 +2351,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
struct drm_i915_gem_request,
list);
- if (request->seqno > completed_seqno)
+ if (!ring->hangcheck.status_updated
+ && (request->seqno > completed_seqno))
i915_set_reset_status(ring, request, acthd);
i915_gem_free_request(request);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 0131423..f550b1e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2206,63 +2206,117 @@ static void i915_error_work_func(struct work_struct *work)
drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
gpu_error);
struct drm_device *dev = dev_priv->dev;
+ struct intel_ring_buffer *ring;
char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
int ret;
-
- kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
-
+ u32 reset_requests;
+ unsigned long irqflags;
+ int i;
+
/*
* Note that there's only one work item which does gpu resets, so we
* need not worry about concurrent gpu resets potentially incrementing
* error->reset_counter twice. We only need to take care of another
- * racing irq/hangcheck declaring the gpu dead for a second time. A
- * quick check for that is good enough: schedule_work ensures the
- * correct ordering between hang detection and this work item, and since
- * the reset in-progress bit is only ever set by code outside of this
- * work we don't need to worry about any other races.
+ * racing irq/hangcheck declaring another ring hung.
+ * The spinlock ensures synchronisation with i915_handle_error and
+ * makes sure that the reset_counter doesn't get incremented until *all*
+ * pending reset requests have been processed.
*/
- if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
- DRM_DEBUG_DRIVER("resetting chip\n");
- kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
- reset_event);
+ spin_lock_irqsave(&error->lock, irqflags);
+ reset_requests = error->reset_requests;
+ error->reset_requests = 0;
+ spin_unlock_irqrestore(&error->lock, irqflags);
- /*
- * All state reset _must_ be completed before we update the
- * reset counter, for otherwise waiters might miss the reset
- * pending state and not properly drop locks, resulting in
- * deadlocks with the reset work.
- */
- ret = i915_reset(dev);
+ kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
- intel_display_handle_reset(dev);
+ if (reset_requests && !i915_terminally_wedged(error)) {
+ DRM_DEBUG_TDR("Processing reset requests: 0x%08x\n",
+ reset_requests);
- if (ret == 0) {
- /*
- * After all the gem state is reset, increment the reset
- * counter and wake up everyone waiting for the reset to
- * complete.
- *
- * Since unlock operations are a one-sided barrier only,
- * we need to insert a barrier here to order any seqno
- * updates before
- * the counter increment.
- */
- smp_mb__before_atomic_inc();
- atomic_inc(&dev_priv->gpu_error.reset_counter);
+ /* Grab mutex whilst processing individual reset requests*/
+ mutex_lock(&dev->struct_mutex);
- kobject_uevent_env(&dev->primary->kdev->kobj,
- KOBJ_CHANGE, reset_done_event);
- } else {
- atomic_set(&error->reset_counter, I915_WEDGED);
+ for_each_ring(ring, dev_priv, i) {
+ /* Reset the status update flag. It ensures that
+ * the status is only updated once for each ring per
+ * call to error_work_func even if we fall back to
+ * a global reset.
+ */
+ ring->hangcheck.status_updated = 0;
+
+ /* Global reset takes precedence */
+ if (reset_requests & I915_GLOBAL_RESET_REQUEST)
+ continue;
+
+ if (reset_requests & (0x1 << ring->id)) {
+ DRM_DEBUG_TDR("reset request for %s\n",
+ ring->name);
+
+ if (i915_handle_hung_ring(ring) != 0) {
+ DRM_ERROR("%s reset failed\n",
+ ring->name);
+
+ /* Try a global reset instead */
+ reset_requests =
+ I915_GLOBAL_RESET_REQUEST;
+ continue;
+ }
+ }
}
- /*
- * Note: The wake_up also serves as a memory barrier so that
- * waiters see the update value of the reset counter atomic_t.
+ mutex_unlock(&dev->struct_mutex);
+
+ if (reset_requests & I915_GLOBAL_RESET_REQUEST) {
+ DRM_DEBUG_DRIVER("resetting chip\n");
+ kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
+ reset_event);
+
+ /*
+ * All state reset _must_ be completed before we update the
+ * reset counter, for otherwise waiters might miss the reset
+ * pending state and not properly drop locks, resulting in
+ * deadlocks with the reset work.
+ * After all the gem state is reset, increment the reset
+ * counter and wake up everyone waiting for the reset to
+ * complete.
+ */
+ ret = i915_reset(dev);
+
+ intel_display_handle_reset(dev);
+
+ if (ret != 0)
+ atomic_set(&error->reset_counter, I915_WEDGED);
+ }
+
+ /* Only increment reset counter if no new work has been added
+ * since we sampled reset_requests. If more work has been
+ * added then i915_error_work_func will be scheduled again so
+ * leave the in-progress flag set.
*/
- i915_error_wake_up(dev_priv, true);
+ if (!i915_terminally_wedged(error)) {
+ spin_lock_irqsave(&error->lock, irqflags);
+ if (error->reset_requests == 0) {
+ /* Since unlock operations are a one-sided barrier only,
+ * we need to insert a barrier here to order any seqno
+ * updates before the counter increment.
+ */
+ smp_mb__before_atomic_inc();
+ atomic_inc(&error->reset_counter);
+ }
+ spin_unlock_irqrestore(&error->lock, irqflags);
+
+ /*
+ * Note: The wake_up also serves as a memory barrier so that
+ * waiters see the update value of the reset counter atomic_t.
+ */
+ i915_error_wake_up(dev_priv, true);
+ }
+
+ /* Send uevent to indicate that we have finished recovery work*/
+ kobject_uevent_env(&dev->primary->kdev->kobj,
+ KOBJ_CHANGE, reset_done_event);
}
}
@@ -2379,13 +2433,25 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
void i915_handle_error(struct drm_device *dev, u32 mask)
{
struct drm_i915_private *dev_priv = dev->dev_private;
+ unsigned long irqflags;
i915_capture_error_state(dev);
i915_report_and_clear_eir(dev);
- if (mask > 0) {
+ if (mask) {
+ /* Once the error work function executes it will sample the
+ * current value of reset_requests and start processing them.
+ * If we detect another hang before it completes then we need
+ * to ensure that it doesn't clear the IN_PROGRESS flag
+ * prematurely as the rest of the driver relies on reset_counter
+ * alone. A spinlock is used to synchronise between reset_counter
+ * and reset_requests.
+ */
+ spin_lock_irqsave(&dev_priv->gpu_error.lock, irqflags);
+ dev_priv->gpu_error.reset_requests = mask;
atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
- &dev_priv->gpu_error.reset_counter);
+ &dev_priv->gpu_error.reset_counter);
+ spin_unlock_irqrestore(&dev_priv->gpu_error.lock, irqflags);
/*
* Wakeup waiting processes so that the reset work function
@@ -2405,9 +2471,13 @@ void i915_handle_error(struct drm_device *dev, u32 mask)
/*
* Our reset work can grab modeset locks (since it needs to reset the
- * state of outstanding pagelips). Hence it must not be run on our own
- * dev-priv->wq work queue for otherwise the flush_work in the pageflip
- * code will deadlock.
+ * state of outstanding pageflips). Hence it must not be run on our own
+ * dev_priv->wq work queue for otherwise the flush_work in the pageflip
+ * code will deadlock. If error_work is already in the work queue then
+ * it will not be added again. If it isn't in the queue or it is
+ * currently executing then this call will add it to the queue again so
+ * that even if it misses the reset flags during the current call it
+ * is guaranteed to see them on the next call.
*/
schedule_work(&dev_priv->gpu_error.work);
}
@@ -2921,12 +2991,12 @@ static void i915_hangcheck_elapsed(unsigned long data)
if (INTEL_INFO(dev)->gen < 7) {
/* Per-engine reset not supported */
dev_priv->gpu_error.score = 0;
- hang_mask = GLOBAL_RESET_REQUEST;
+ hang_mask = I915_GLOBAL_RESET_REQUEST;
} else if (dev_priv->gpu_error.score > FIRE) {
/* Per-engine reset occurring too frequently. Fall back
* to a global reset to try and resolve the hang */
dev_priv->gpu_error.score = 0;
- hang_mask = GLOBAL_RESET_REQUEST;
+ hang_mask = I915_GLOBAL_RESET_REQUEST;
DRM_DEBUG_TDR("Rings hanging too frequently\n");
}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index b9125dc..7235720 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -66,7 +66,9 @@ struct intel_ring_hangcheck {
int score;
enum intel_ring_hangcheck_action action;
u32 resets; /* Total resets applied to this ring/engine*/
+ u32 total; /* Lifetime count of successful resets, reported via debugfs */
u32 last_head; /* Head value recorded at last hang */
+ u32 status_updated;
};
struct intel_ring_buffer {
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index c51e389..8ab5385 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -975,6 +975,9 @@ static int gen6_do_reset(struct drm_device *dev)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST) & GEN6_GRDOM_FULL) == 0, 500);
+ if (ret == 0)
+ dev_priv->gpu_error.global_resets++;
+
intel_uncore_forcewake_reset(dev);
spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
@@ -998,8 +1001,6 @@ static int gen6_do_engine_reset(struct drm_device *dev,
*/
spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
- ring->hangcheck.resets++;
-
/* Reset the engine.
* GEN6_GDRST is not in the gt power well so no need to check
* for fifo space for the write or forcewake the chip for
@@ -1012,6 +1013,9 @@ static int gen6_do_engine_reset(struct drm_device *dev,
ret = wait_for_atomic((__raw_i915_read32(dev_priv,
GEN6_GDRST)
& GEN6_GRDOM_RENDER) == 0, 500);
+ if (ret == 0)
+ ring->hangcheck.total++;
+
DRM_DEBUG_TDR("RCS Reset\n");
break;
@@ -1022,6 +1026,9 @@ static int gen6_do_engine_reset(struct drm_device *dev,
ret = wait_for_atomic((__raw_i915_read32(dev_priv,
GEN6_GDRST)
& GEN6_GRDOM_BLT) == 0, 500);
+ if (ret == 0)
+ ring->hangcheck.total++;
+
DRM_DEBUG_TDR("BCS Reset\n");
break;
@@ -1032,6 +1039,9 @@ static int gen6_do_engine_reset(struct drm_device *dev,
ret = wait_for_atomic((__raw_i915_read32(dev_priv,
GEN6_GDRST)
& GEN6_GRDOM_MEDIA) == 0, 500);
+ if (ret == 0)
+ ring->hangcheck.total++;
+
DRM_DEBUG_TDR("VCS Reset\n");
break;
@@ -1041,6 +1051,9 @@ static int gen6_do_engine_reset(struct drm_device *dev,
/* Spin waiting for the device to ack the reset request */
ret = wait_for_atomic((I915_READ_NOTRACE(GEN6_GDRST)
& GEN6_GRDOM_VEBOX) == 0, 500);
+ if (ret == 0)
+ ring->hangcheck.total++;
+
DRM_DEBUG_TDR("VECS Reset\n");
break;
@@ -1145,6 +1158,11 @@ int i915_handle_hung_ring(struct intel_ring_buffer *ring)
i915_set_reset_status(ring, request, acthd);
}
+ /* Flag that we have updated the stats for this ring in case
+ * a global reset comes in to prevent it from processing stats
+ * for this ring a second time */
+ ring->hangcheck.status_updated = 1;
+
/* Check if the ring has hung on an MI_DISPLAY_FLIP command.
* The pipe value will be stored in the HWS page if it has.*/
pipe = intel_read_status_page(ring, I915_GEM_PGFLIP_INDEX);
--
1.8.5.1
More information about the Intel-gfx
mailing list