[Intel-gfx] [RFC 0/8] Force preemption
Chris Wilson
chris at chris-wilson.co.uk
Fri Mar 16 20:53:01 UTC 2018
Quoting jeff.mcgee at intel.com (2018-03-16 18:30:57)
> From: Jeff McGee <jeff.mcgee at intel.com>
>
> Force preemption uses engine reset to enforce a limit on the time
> that a request targeted for preemption can block. This feature is
> a requirement in automotive systems where the GPU may be shared by
> clients of critically high priority and clients of low priority that
> may not have been curated to be preemption friendly. There may be
> more general applications of this feature. I'm sharing as an RFC to
> stimulate that discussion and also to get any technical feedback
> that I can before submitting to the product kernel that needs this.
> I have developed the patches for ease of rebase, given that this is
> for the moment considered a non-upstreamable feature. It would be
> possible to refactor hangcheck to fully incorporate force preemption
> as another tier of patience (or impatience) with the running request.
Last night I spent 15mins and wrote a similar RFC as Joonas keep
muttering and I thought he was looking for code not that was some...
The real only difference is the approach to handling the pending timer.
I do not agree with your reset changes as they appear to percolate even
more uncertainity through the code. If the preempt-context is in flight
when the reset is triggered, the last active request is still defined by
incomplete request; the new request has not been submitted. This is
serialised by checking for EXECLISTS_ACTIVE_PREEMPT. Up until the point
that is cleared and the next request submitted; we should reset the
engine and resubmit. The race in tasklet handling is immaterial at that
point, you have already paid the cost in using a timer, the workqueue
and serialising for the reset.
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 76ea1da923bd..d9da68efbc86 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3969,8 +3969,8 @@ i915_wedged_set(void *data, u64 val)
engine->hangcheck.stalled = true;
}
- i915_handle_error(i915, val, "Manually set wedged engine mask = %llx",
- val);
+ i915_handle_error(i915, val, I915_ERROR_CAPTURE,
+ "Manually set wedged engine mask = %llx", val);
wait_on_bit(&i915->gpu_error.flags,
I915_RESET_HANDOFF,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 26c1dd4b542e..d817b5e55c96 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2755,10 +2755,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
&dev_priv->gpu_error.hangcheck_work, delay);
}
-__printf(3, 4)
+__printf(4, 5)
void i915_handle_error(struct drm_i915_private *dev_priv,
u32 engine_mask,
+ unsigned long flags,
const char *fmt, ...);
+#define I915_ERROR_CAPTURE BIT(0)
extern void intel_irq_init(struct drm_i915_private *dev_priv);
extern void intel_irq_fini(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5aaf356fd27f..0a255ffe4c51 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2593,6 +2593,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
* i915_handle_error - handle a gpu error
* @dev_priv: i915 device private
* @engine_mask: mask representing engines that are hung
+ * @flags: control flags
* @fmt: Error message format string
*
* Do some basic checking of register state at error time and
@@ -2603,16 +2604,11 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
*/
void i915_handle_error(struct drm_i915_private *dev_priv,
u32 engine_mask,
+ unsigned long flags,
const char *fmt, ...)
{
struct intel_engine_cs *engine;
unsigned int tmp;
- va_list args;
- char error_msg[80];
-
- va_start(args, fmt);
- vscnprintf(error_msg, sizeof(error_msg), fmt, args);
- va_end(args);
/*
* In most cases it's guaranteed that we get here with an RPM
@@ -2624,8 +2620,18 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
intel_runtime_pm_get(dev_priv);
engine_mask &= DEVICE_INFO(dev_priv)->ring_mask;
- i915_capture_error_state(dev_priv, engine_mask, error_msg);
- i915_clear_error_registers(dev_priv);
+
+ if (flags & I915_ERROR_CAPTURE) {
+ char error_msg[80];
+ va_list args;
+
+ va_start(args, fmt);
+ vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+ va_end(args);
+
+ i915_capture_error_state(dev_priv, engine_mask, error_msg);
+ i915_clear_error_registers(dev_priv);
+ }
/*
* Try engine reset when available. We fall back to full reset if
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 42e45ae87393..13d1a269c771 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -246,7 +246,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
*/
tmp = I915_READ_CTL(engine);
if (tmp & RING_WAIT) {
- i915_handle_error(dev_priv, 0,
+ i915_handle_error(dev_priv, 0, 0,
"Kicking stuck wait on %s",
engine->name);
I915_WRITE_CTL(engine, tmp);
@@ -258,7 +258,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
default:
return ENGINE_DEAD;
case 1:
- i915_handle_error(dev_priv, 0,
+ i915_handle_error(dev_priv, 0, 0,
"Kicking stuck semaphore on %s",
engine->name);
I915_WRITE_CTL(engine, tmp);
@@ -392,7 +392,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
"%s, ", engine->name);
msg[len-2] = '\0';
- return i915_handle_error(i915, hung, "%s", msg);
+ return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
}
/*
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 396abef55dd3..55f25241a199 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -529,8 +529,35 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
if (execlists->ctrl_reg)
writel(EL_CTRL_LOAD, execlists->ctrl_reg);
- execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
- execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
+ execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
+ execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
+
+ /* Set a timer to force preemption vs hostile userspace */
+ if (execlists->preempt_timeout_ns)
+ hrtimer_start(&execlists->preempt_timer,
+ ktime_set(0, execlists->preempt_timeout_ns),
+ HRTIMER_MODE_REL);
+}
+
+static enum hrtimer_restart preempt_timeout(struct hrtimer *hrtimer)
+{
+ struct intel_engine_cs *engine =
+ container_of(hrtimer, typeof(*engine), execlists.preempt_timer);
+
+ if (execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
+ queue_work(system_highpri_wq, &engine->execlists.preempt_reset);
+
+ return HRTIMER_NORESTART;
+}
+
+static void preempt_reset(struct work_struct *work)
+{
+ struct intel_engine_cs *engine =
+ container_of(work, typeof(*engine), execlists.preempt_reset);
+
+ if (execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
+ i915_handle_error(engine->i915, BIT(engine->id), 0,
+ "preemption timed out on %s", engine->name);
}
static void update_rps(struct intel_engine_cs *engine)
@@ -946,6 +973,7 @@ static void execlists_submission_tasklet(unsigned long data)
EXECLISTS_ACTIVE_PREEMPT));
execlists_clear_active(execlists,
EXECLISTS_ACTIVE_PREEMPT);
+ hrtimer_try_to_cancel(&execlists->preempt_timer);
continue;
}
@@ -2180,6 +2208,11 @@ logical_ring_setup(struct intel_engine_cs *engine)
tasklet_init(&engine->execlists.tasklet,
execlists_submission_tasklet, (unsigned long)engine);
+ INIT_WORK(&engine->execlists.preempt_reset, preempt_reset);
+ hrtimer_init(&engine->execlists.preempt_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ engine->execlists.preempt_timer.function = preempt_timeout;
+
logical_ring_default_vfuncs(engine);
logical_ring_default_irqs(engine);
}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 495b21fc33db..74fcff8a2a6e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -307,6 +307,10 @@ struct intel_engine_execlists {
* @preempt_complete_status: expected CSB upon completing preemption
*/
u32 preempt_complete_status;
+
+ struct hrtimer preempt_timer;
+ struct work_struct preempt_reset;
+ unsigned long preempt_timeout_ns;
};
#define INTEL_ENGINE_CS_MAX_NAME 8
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index c61a51e46eb2..e99705068190 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1091,7 +1091,7 @@ static int igt_handle_error(void *arg)
engine->hangcheck.stalled = true;
engine->hangcheck.seqno = intel_engine_get_seqno(engine);
- i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
+ i915_handle_error(i915, intel_engine_flag(engine), 0, "%s", __func__);
xchg(&i915->gpu_error.first_error, error);
More information about the Intel-gfx
mailing list