[Intel-gfx] [PATCH 1/2] drm/i915: Fix __wait_seqno to use true infinite timeouts
Ben Widawsky
ben at bwidawsk.net
Tue Aug 27 08:11:01 CEST 2013
On Mon, Aug 26, 2013 at 02:51:53PM +0100, Chris Wilson wrote:
> When we switched to always using a timeout in conjunction with
> wait_seqno, we lost the ability to detect missed interrupts. Since, we
> have had issues with interrupts on a number of generations, and they are
> required to be delivered in a timely fashion for a smooth UX, it is
> important that we do log errors found in the wild and prevent the
> display stalling for upwards of 1s every time the seqno interrupt is
> missed.
>
> Rather than continue to fix up the timeouts to work around the interface
> impedence in wait_event_*(), open code the combination of
> wait_event[_interruptible][_timeout], and use the exposed timer to
> poll for seqno should we detect a lost interrupt.
>
> v2: In order to satisfy the debug requirement of logging missed
> interrupts with the real world requirments of making machines work even
> if interrupts are hosed, we revert to polling after detecting a missed
> interrupt.
>
> v3: Throw in a debugfs interface to simulate broken hw not reporting
> interrupts.
This interface should be a separate patch.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 68 ++++++++++++++++++++
> drivers/gpu/drm/i915/i915_drv.h | 5 ++
> drivers/gpu/drm/i915/i915_gem.c | 114 ++++++++++++++++++++--------------
> drivers/gpu/drm/i915/i915_gpu_error.c | 1 +
> drivers/gpu/drm/i915/i915_irq.c | 11 ++--
> 5 files changed, 148 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 55ab924..e823169 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
> i915_ring_stop_get, i915_ring_stop_set,
> "0x%08llx\n");
>
> +static int
> +i915_ring_missed_irq_get(void *data, u64 *val)
> +{
> + struct drm_device *dev = data;
> + drm_i915_private_t *dev_priv = dev->dev_private;
> +
> + *val = dev_priv->gpu_error.missed_irq_rings;
> + return 0;
> +}
> +
> +static int
> +i915_ring_missed_irq_set(void *data, u64 val)
> +{
> + struct drm_device *dev = data;
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + int ret;
> +
> + /* Lock against concurrent debugfs callers */
> + ret = mutex_lock_interruptible(&dev->struct_mutex);
> + if (ret)
> + return ret;
> + dev_priv->gpu_error.missed_irq_rings = val;
> + mutex_unlock(&dev->struct_mutex);
> +
> + return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
> + i915_ring_missed_irq_get, i915_ring_missed_irq_set,
> + "0x%08llx\n");
> +
> +static int
> +i915_ring_test_irq_get(void *data, u64 *val)
> +{
> + struct drm_device *dev = data;
> + drm_i915_private_t *dev_priv = dev->dev_private;
> +
> + *val = dev_priv->gpu_error.test_irq_rings;
> +
> + return 0;
> +}
> +
> +static int
> +i915_ring_test_irq_set(void *data, u64 val)
> +{
> + struct drm_device *dev = data;
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + int ret;
> +
> + DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
> +
> + /* Lock against concurrent debugfs callers */
> + ret = mutex_lock_interruptible(&dev->struct_mutex);
> + if (ret)
> + return ret;
> +
> + dev_priv->gpu_error.test_irq_rings = val;
> + mutex_unlock(&dev->struct_mutex);
> +
> + return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
> + i915_ring_test_irq_get, i915_ring_test_irq_set,
> + "0x%08llx\n");
> +
> #define DROP_UNBOUND 0x1
> #define DROP_BOUND 0x2
> #define DROP_RETIRE 0x4
> @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files {
> {"i915_min_freq", &i915_min_freq_fops},
> {"i915_cache_sharing", &i915_cache_sharing_fops},
> {"i915_ring_stop", &i915_ring_stop_fops},
> + {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
> + {"i915_ring_test_irq", &i915_ring_test_irq_fops},
> {"i915_gem_drop_caches", &i915_drop_caches_fops},
> {"i915_error_state", &i915_error_state_fops},
> {"i915_next_seqno", &i915_next_seqno_fops},
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 84b95b1..ac6db6e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -993,6 +993,8 @@ struct i915_gpu_error {
>
> unsigned long last_reset;
>
> + unsigned long missed_irq_rings;
> +
> /**
> * State variable and reset counter controlling the reset flow
> *
> @@ -1031,6 +1033,9 @@ struct i915_gpu_error {
>
> /* For gpu hang simulation. */
> unsigned int stop_rings;
> +
> + /* For missed irq/seqno simulation. */
> + unsigned int test_irq_rings;
> };
>
> enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ccfebf5..4ed782f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
> return ret;
> }
>
> +static void fake_irq(unsigned long data)
> +{
> + wake_up_process((struct task_struct *)data);
> +}
> +
> +static bool missed_irq(struct drm_i915_private *dev_priv,
> + struct intel_ring_buffer *ring)
> +{
> + return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> +}
> +
> /**
> * __wait_seqno - wait until execution of seqno has finished
> * @ring: the ring expected to report seqno
> @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
> bool interruptible, struct timespec *timeout)
> {
> drm_i915_private_t *dev_priv = ring->dev->dev_private;
> - struct timespec before, now, wait_time={1,0};
> - unsigned long timeout_jiffies;
> - long end;
> - bool wait_forever = true;
> + struct timespec before, now;
> + DEFINE_WAIT(wait);
> + long timeout_jiffies;
> int ret;
>
> WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
> @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
> if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
> return 0;
>
> - trace_i915_gem_request_wait_begin(ring, seqno);
> -
> - if (timeout != NULL) {
> - wait_time = *timeout;
> - wait_forever = false;
> - }
> -
> - timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
> + timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
>
> - if (WARN_ON(!ring->irq_get(ring)))
> + if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
> + WARN_ON(!ring->irq_get(ring)))
> return -ENODEV;
>
> - /* Record current time in case interrupted by signal, or wedged * */
> + /* Record current time in case interrupted by signal, or wedged */
> + trace_i915_gem_request_wait_begin(ring, seqno);
> getrawmonotonic(&before);
> + for (;;) {
> + struct timer_list timer;
> + unsigned long expire;
>
> -#define EXIT_COND \
> - (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
> - i915_reset_in_progress(&dev_priv->gpu_error) || \
> - reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
> - do {
> - if (interruptible)
> - end = wait_event_interruptible_timeout(ring->irq_queue,
> - EXIT_COND,
> - timeout_jiffies);
> - else
> - end = wait_event_timeout(ring->irq_queue, EXIT_COND,
> - timeout_jiffies);
> + prepare_to_wait(&ring->irq_queue, &wait,
> + interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
>
> /* We need to check whether any gpu reset happened in between
> * the caller grabbing the seqno and now ... */
> - if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
> - end = -EAGAIN;
> + if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
> + /* ... but upgrade the -EGAIN to an -EIO if the gpu
^ EAGAIN (I realize it's
copypasta)
> + * is truely gone. */
^ truly
> + ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> + if (ret == 0)
> + ret = -EAGAIN;
> + break;
> + }
>
> - /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely
> - * gone. */
> - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> - if (ret)
> - end = ret;
> - } while (end == 0 && wait_forever);
> + if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
> + ret = 0;
> + break;
> + }
> +
> + if (interruptible && signal_pending(current)) {
> + ret = -ERESTARTSYS;
> + break;
> + }
> +
Not sure, but I think the signal check should come first. As long as
you've thought about it, I'm okay with whatever.
> + if (timeout_jiffies <= 0) {
> + ret = -ETIME;
> + break;
> + }
>
> + timer.function = NULL;
> + if (timeout || missed_irq(dev_priv, ring)) {
> + setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
> + expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
> + mod_timer(&timer, expire);
> + }
Isn't this just a fancy schedule_timeout_interruptible?
> +
> + schedule();
> +
> + if (timeout)
> + timeout_jiffies = expire - jiffies;
> +
> + if (timer.function) {
> + del_singleshot_timer_sync(&timer);
> + destroy_timer_on_stack(&timer);
> + }
> + }
> getrawmonotonic(&now);
> + trace_i915_gem_request_wait_end(ring, seqno);
>
> ring->irq_put(ring);
> - trace_i915_gem_request_wait_end(ring, seqno);
> -#undef EXIT_COND
> +
> + finish_wait(&ring->irq_queue, &wait);
>
> if (timeout) {
> struct timespec sleep_time = timespec_sub(now, before);
> @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
> set_normalized_timespec(timeout, 0, 0);
> }
>
> - switch (end) {
> - case -EIO:
> - case -EAGAIN: /* Wedged */
> - case -ERESTARTSYS: /* Signal */
> - return (int)end;
> - case 0: /* Timeout */
> - return -ETIME;
> - default: /* Completed */
> - WARN_ON(end < 0); /* We're not aware of other errors */
> - return 0;
> - }
> + return ret;
> }
>
> /**
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 558e568..62368f4 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
> err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
> err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
> err_printf(m, "CCID: 0x%08x\n", error->ccid);
> + err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
>
> for (i = 0; i < dev_priv->num_fence_regs; i++)
> err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index a03b445..7ebf105 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
> if (ring_idle(ring, seqno)) {
> if (waitqueue_active(&ring->irq_queue)) {
> /* Issue a wake-up to catch stuck h/w. */
> - DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> - ring->name);
> - wake_up_all(&ring->irq_queue);
> - ring->hangcheck.score += HUNG;
> + if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> + DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> + ring->name);
> + wake_up_all(&ring->irq_queue);
> + }
> + /* Safeguard against driver failure */
> + ring->hangcheck.score += BUSY;
> } else
> busy = false;
> } else {
Honestly, I don't think using the scheduler APIs is a step in the right
direction wrt readability - but I have no better solution, and it seems
to do what you want.
Anyway, I tried to review it, and failed. If you want to split it up
more, I can try again.
Meanwhile, the idea seems okay to me.
--
Ben Widawsky, Intel Open Source Technology Center
More information about the Intel-gfx
mailing list