[Intel-gfx] [RFC 0/8] Force preemption

Fri Mar 16 20:53:01 UTC 2018

Quoting jeff.mcgee at intel.com (2018-03-16 18:30:57)
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> Force preemption uses engine reset to enforce a limit on the time
> that a request targeted for preemption can block. This feature is
> a requirement in automotive systems where the GPU may be shared by
> clients of critically high priority and clients of low priority that
> may not have been curated to be preemption friendly. There may be
> more general applications of this feature. I'm sharing as an RFC to
> stimulate that discussion and also to get any technical feedback
> that I can before submitting to the product kernel that needs this.
> I have developed the patches for ease of rebase, given that this is
> for the moment considered a non-upstreamable feature. It would be
> possible to refactor hangcheck to fully incorporate force preemption
> as another tier of patience (or impatience) with the running request.

Last night I spent 15 minutes and wrote a similar RFC as Joonas kept
muttering and I thought he was looking for code not that was some...

The only real difference is the approach to handling the pending timer.
I do not agree with your reset changes as they appear to percolate even
more uncertainty through the code. If the preempt-context is in flight
when the reset is triggered, the last active request is still defined by
the incomplete request; the new request has not been submitted. This is
serialised by checking for EXECLISTS_ACTIVE_PREEMPT. Up until the point
that flag is cleared and the next request is submitted, we should reset
the engine and resubmit. The race in tasklet handling is immaterial at
that point; you have already paid the cost of using a timer, the
workqueue and serialising for the reset.

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 76ea1da923bd..d9da68efbc86 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3969,8 +3969,8 @@ i915_wedged_set(void *data, u64 val)
                engine->hangcheck.stalled = true;
        }
 
-       i915_handle_error(i915, val, "Manually set wedged engine mask = %llx",
-                         val);
+       i915_handle_error(i915, val, I915_ERROR_CAPTURE,
+                         "Manually set wedged engine mask = %llx", val);
 
        wait_on_bit(&i915->gpu_error.flags,
                    I915_RESET_HANDOFF,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 26c1dd4b542e..d817b5e55c96 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2755,10 +2755,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
                           &dev_priv->gpu_error.hangcheck_work, delay);
 }
 
-__printf(3, 4)
+__printf(4, 5)
 void i915_handle_error(struct drm_i915_private *dev_priv,
                       u32 engine_mask,
+                      unsigned long flags,
                       const char *fmt, ...);
+#define I915_ERROR_CAPTURE BIT(0)
 
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
 extern void intel_irq_fini(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5aaf356fd27f..0a255ffe4c51 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2593,6 +2593,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
  * i915_handle_error - handle a gpu error
  * @dev_priv: i915 device private
  * @engine_mask: mask representing engines that are hung
+ * @flags: control flags
  * @fmt: Error message format string
  *
  * Do some basic checking of register state at error time and
@@ -2603,16 +2604,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
  */
 void i915_handle_error(struct drm_i915_private *dev_priv,
                       u32 engine_mask,
+                      unsigned long flags,
                       const char *fmt, ...)
 {
        struct intel_engine_cs *engine;
        unsigned int tmp;
-       va_list args;
-       char error_msg[80];
-
-       va_start(args, fmt);
-       vscnprintf(error_msg, sizeof(error_msg), fmt, args);
-       va_end(args);
 
        /*
         * In most cases it's guaranteed that we get here with an RPM
@@ -2624,8 +2620,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
        intel_runtime_pm_get(dev_priv);
 
        engine_mask &= DEVICE_INFO(dev_priv)->ring_mask;
-       i915_capture_error_state(dev_priv, engine_mask, error_msg);
-       i915_clear_error_registers(dev_priv);
+
+       if (flags & I915_ERROR_CAPTURE) {
+               char error_msg[80];
+               va_list args;
+
+               va_start(args, fmt);
+               vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+               va_end(args);
+
+               i915_capture_error_state(dev_priv, engine_mask, error_msg);
+               i915_clear_error_registers(dev_priv);
+       }
 
        /*
         * Try engine reset when available. We fall back to full reset if
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 42e45ae87393..13d1a269c771 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -246,7 +246,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
         */
        tmp = I915_READ_CTL(engine);
        if (tmp & RING_WAIT) {
-               i915_handle_error(dev_priv, 0,
+               i915_handle_error(dev_priv, 0, 0,
                                  "Kicking stuck wait on %s",
                                  engine->name);
                I915_WRITE_CTL(engine, tmp);
@@ -258,7 +258,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
                default:
                        return ENGINE_DEAD;
                case 1:
-                       i915_handle_error(dev_priv, 0,
+                       i915_handle_error(dev_priv, 0, 0,
                                          "Kicking stuck semaphore on %s",
                                          engine->name);
                        I915_WRITE_CTL(engine, tmp);
@@ -392,7 +392,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
                                 "%s, ", engine->name);
        msg[len-2] = '\0';
 
-       return i915_handle_error(i915, hung, "%s", msg);
+       return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 396abef55dd3..55f25241a199 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -529,8 +529,35 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
        if (execlists->ctrl_reg)
                writel(EL_CTRL_LOAD, execlists->ctrl_reg);
 
-       execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
-       execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
+       execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
+       execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
+
+       /* Set a timer to force preemption vs hostile userspace */
+       if (execlists->preempt_timeout_ns)
+               hrtimer_start(&execlists->preempt_timer,
+                             ktime_set(0, execlists->preempt_timeout_ns),
+                             HRTIMER_MODE_REL);
+}
+
+static enum hrtimer_restart preempt_timeout(struct hrtimer *hrtimer)
+{
+       struct intel_engine_cs *engine =
+               container_of(hrtimer, typeof(*engine), execlists.preempt_timer);
+
+       if (execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
+               queue_work(system_highpri_wq, &engine->execlists.preempt_reset);
+
+       return HRTIMER_NORESTART;
+}
+
+static void preempt_reset(struct work_struct *work)
+{
+       struct intel_engine_cs *engine =
+               container_of(work, typeof(*engine), execlists.preempt_reset);
+
+       if (execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
+               i915_handle_error(engine->i915, BIT(engine->id), 0,
+                                 "preemption timed out on %s", engine->name);
 }
 
 static void update_rps(struct intel_engine_cs *engine)
@@ -946,6 +973,7 @@ static void execlists_submission_tasklet(unsigned long data)
                                                                EXECLISTS_ACTIVE_PREEMPT));
                                execlists_clear_active(execlists,
                                                       EXECLISTS_ACTIVE_PREEMPT);
+                               hrtimer_try_to_cancel(&execlists->preempt_timer);
                                continue;
                        }
 
@@ -2180,6 +2208,11 @@ logical_ring_setup(struct intel_engine_cs *engine)
        tasklet_init(&engine->execlists.tasklet,
                     execlists_submission_tasklet, (unsigned long)engine);
 
+       INIT_WORK(&engine->execlists.preempt_reset, preempt_reset);
+       hrtimer_init(&engine->execlists.preempt_timer,
+                    CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       engine->execlists.preempt_timer.function = preempt_timeout;
+
        logical_ring_default_vfuncs(engine);
        logical_ring_default_irqs(engine);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 495b21fc33db..74fcff8a2a6e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -307,6 +307,10 @@ struct intel_engine_execlists {
         * @preempt_complete_status: expected CSB upon completing preemption
         */
        u32 preempt_complete_status;
+
+       struct hrtimer preempt_timer;
+       struct work_struct preempt_reset;
+       unsigned long preempt_timeout_ns;
 };
 
 #define INTEL_ENGINE_CS_MAX_NAME 8
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index c61a51e46eb2..e99705068190 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1091,7 +1091,7 @@ static int igt_handle_error(void *arg)
        engine->hangcheck.stalled = true;
        engine->hangcheck.seqno = intel_engine_get_seqno(engine);
 
-       i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
+       i915_handle_error(i915, intel_engine_flag(engine), 0, "%s", __func__);
 
        xchg(&i915->gpu_error.first_error, error);