[Intel-gfx] [RFC 8/8] drm/i915: Force preemption to complete via engine reset
Chris Wilson
chris at chris-wilson.co.uk
Fri Mar 16 20:39:42 UTC 2018
Quoting jeff.mcgee at intel.com (2018-03-16 18:31:05)
> From: Jeff McGee <jeff.mcgee at intel.com>
>
> The hardware can complete the requested preemption at only certain points
> in execution. Thus an uncooperative request that avoids those points can
> block a preemption indefinitely. Our only option to bound the preemption
> latency is to trigger reset and recovery just as we would if a request had
> hung the hardware. This is so-called forced preemption. This change adds
> that capability as an option for systems with extremely strict scheduling
> latency requirements for its high priority requests. This option must be
> used with great care. The force-preempted requests will be discarded at
> the point of reset, resulting in various degrees of disruption to the
> owning application up to and including crash.
>
> The option is enabled by specifying a non-zero value for new i915 module
> parameter fpreempt_timeout. This value becomes the time in milliseconds
> after initiation of preemption at which the reset is triggered if the
> preemption has not completed normally.
>
> Test: Run IGT gem_exec_fpreempt.
> Change-Id: Iafd3609012621c57fa9e490dfeeac46ae541b5c2
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.c | 14 ++++++++-
> drivers/gpu/drm/i915/i915_drv.h | 2 ++
> drivers/gpu/drm/i915/i915_gem.c | 37 +++++++++++++++++++++--
> drivers/gpu/drm/i915/i915_params.c | 3 ++
> drivers/gpu/drm/i915/i915_params.h | 1 +
> drivers/gpu/drm/i915/intel_engine_cs.c | 53 +++++++++++++++++++++++++++++++++
> drivers/gpu/drm/i915/intel_lrc.c | 24 ++++++++++++++-
> drivers/gpu/drm/i915/intel_ringbuffer.h | 6 ++++
> 8 files changed, 136 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index b627370d5a9c..3f2394d61ea2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -811,8 +811,16 @@ static int i915_workqueues_init(struct drm_i915_private *dev_priv)
> if (dev_priv->hotplug.dp_wq == NULL)
> goto out_free_wq;
>
> + if (INTEL_INFO(dev_priv)->has_logical_ring_preemption) {
> + dev_priv->fpreempt_wq = alloc_ordered_workqueue("i915-fpreempt",
> + WQ_HIGHPRI);
> + if (dev_priv->fpreempt_wq == NULL)
> + goto out_free_dp_wq;
Doesn't require ordering, the resets are if either required to fall back
to a full reset.
> + }
> return 0;
>
> +out_free_dp_wq:
> + destroy_workqueue(dev_priv->hotplug.dp_wq);
> out_free_wq:
> destroy_workqueue(dev_priv->wq);
> out_err:
> @@ -832,6 +840,8 @@ static void i915_engines_cleanup(struct drm_i915_private *i915)
>
> static void i915_workqueues_cleanup(struct drm_i915_private *dev_priv)
> {
> + if (INTEL_INFO(dev_priv)->has_logical_ring_preemption)
> + destroy_workqueue(dev_priv->fpreempt_wq);
> destroy_workqueue(dev_priv->hotplug.dp_wq);
> destroy_workqueue(dev_priv->wq);
> }
> @@ -2007,7 +2017,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags)
>
> if (!(flags & I915_RESET_QUIET)) {
> dev_notice(engine->i915->drm.dev,
> - "Resetting %s after gpu hang\n", engine->name);
> + "Resetting %s %s\n", engine->name,
> + engine->fpreempt_active ?
> + "for force preemption" : "after gpu hang");
> }
> error->reset_engine_count[engine->id]++;
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ade09f97be5c..514e640d8406 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2290,6 +2290,8 @@ struct drm_i915_private {
> */
> struct workqueue_struct *wq;
>
> + struct workqueue_struct *fpreempt_wq;
> +
> /* Display functions */
> struct drm_i915_display_funcs display;
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 9780d9026ce6..d556743c578a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2811,9 +2811,21 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
> return request;
> }
>
> -static bool engine_stalled(struct intel_engine_cs *engine)
> +static bool engine_stalled(struct intel_engine_cs *engine,
> + struct drm_i915_gem_request *request)
> {
> if (!intel_vgpu_active(engine->i915)) {
> + if (engine->fpreempt_active) {
> + /* Pardon the request if it managed to complete or
> + * preempt prior to the reset.
> + */
> + if (i915_gem_request_completed(request) ||
> + intel_engine_preempt_finished(engine))
> + return false;
> +
No, once you pull the trigger just go through with it. You should never
pull the trigger if you are worried about the consequences.
> + return true;
> + }
> +
> if (!engine->hangcheck.stalled)
> return false;
>
> @@ -2858,6 +2870,13 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> tasklet_kill(&engine->execlists.irq_tasklet);
> tasklet_disable(&engine->execlists.irq_tasklet);
>
> + /* There may be a force preemption timer active on this engine but
> + * not yet expired, i.e. not the reason we are about to reset this
> + * engine. Cancel it. If force preemption timeout is the reason we
> + * are resetting the engine, this call will have no efffect.
> + */
Done by just clearing execlists->active.
> + intel_engine_cancel_fpreempt(engine);
> +
> if (engine->irq_seqno_barrier)
> engine->irq_seqno_barrier(engine);
>
> @@ -2958,7 +2977,7 @@ i915_gem_reset_request(struct intel_engine_cs *engine,
> * subsequent hangs.
> */
>
> - if (engine_stalled(engine)) {
> + if (engine_stalled(engine, request)) {
> i915_gem_context_mark_guilty(request->ctx);
> skip_request(request);
>
> @@ -2966,6 +2985,13 @@ i915_gem_reset_request(struct intel_engine_cs *engine,
> if (i915_gem_context_is_banned(request->ctx))
> engine_skip_context(request);
> } else {
> + /* If the request that we just pardoned was the target of a
> + * force preemption there is no possibility of the next
> + * request in line having started.
> + */
> + if (engine->fpreempt_active)
> + return NULL;
> +
> /*
> * Since this is not the hung engine, it may have advanced
> * since the hang declaration. Double check by refinding
> @@ -3035,6 +3061,13 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
>
> void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> {
> + /* Mark any active force preemption as complete then kick
> + * the tasklet.
> + */
> + engine->fpreempt_active = false;
> + if (engine->execlists.first)
> + tasklet_schedule(&engine->execlists.irq_tasklet);
Stray chunk.
> +
> tasklet_enable(&engine->execlists.irq_tasklet);
> kthread_unpark(engine->breadcrumbs.signaler);
> }
> diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
> index 63751a1de74a..9a0deb6e3920 100644
> --- a/drivers/gpu/drm/i915/i915_params.c
> +++ b/drivers/gpu/drm/i915/i915_params.c
> @@ -247,3 +247,6 @@ MODULE_PARM_DESC(enable_conformance_check, "To toggle the GVT guest conformance
>
> module_param_named(disable_gvt_fw_loading, i915_modparams.disable_gvt_fw_loading, bool, 0400);
> MODULE_PARM_DESC(disable_gvt_fw_loading, "Disable GVT-g fw loading.");
> +
> +i915_param_named(fpreempt_timeout, uint, 0600,
> + "Wait time in msecs before forcing a preemption with reset (0:never force [default])");
> diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h
> index e8c2ba4cb1e6..65fbffa6333c 100644
> --- a/drivers/gpu/drm/i915/i915_params.h
> +++ b/drivers/gpu/drm/i915/i915_params.h
> @@ -54,6 +54,7 @@
> func(int, edp_vswing); \
> func(int, reset); \
> func(unsigned int, inject_load_failure); \
> + func(unsigned int, fpreempt_timeout); \
> /* leave bools at the end to not create holes */ \
> func(bool, alpha_support); \
> func(bool, enable_cmd_parser); \
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c9bf2347f7a4..af6cc2d0f7e9 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -411,6 +411,58 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
> execlists->first = NULL;
> }
>
> +void intel_engine_queue_fpreempt(struct intel_engine_cs *engine)
> +{
> + unsigned int timeout = i915_modparams.fpreempt_timeout;
> +
> + if (!timeout)
> + return;
> +
> + GEM_BUG_ON(engine->fpreempt_active);
> + hrtimer_start(&engine->fpreempt_timer,
> + ms_to_ktime(timeout), HRTIMER_MODE_REL);
> +}
> +
> +bool intel_engine_cancel_fpreempt(struct intel_engine_cs *engine)
> +{
> + hrtimer_cancel(&engine->fpreempt_timer);
> +
> + return !engine->fpreempt_active;
> +}
> +
> +static enum hrtimer_restart
> +intel_engine_fpreempt_timer(struct hrtimer *hrtimer)
> +{
> + struct intel_engine_cs *engine =
> + container_of(hrtimer, struct intel_engine_cs,
> + fpreempt_timer);
> +
> + engine->fpreempt_active = true;
> + queue_work(engine->i915->fpreempt_wq, &engine->fpreempt_work);
Ah I see you didn't check.
> +
> + return HRTIMER_NORESTART;
> +}
> +
> +static void intel_engine_fpreempt_work(struct work_struct *work)
> +{
> + struct intel_engine_cs *engine =
> + container_of(work, struct intel_engine_cs,
> + fpreempt_work);
> +
> + i915_handle_reset(engine->i915, intel_engine_flag(engine));
> +}
> +
> +static void intel_engine_init_fpreempt(struct intel_engine_cs *engine)
> +{
> + if (!INTEL_INFO(engine->i915)->has_logical_ring_preemption)
> + return;
> +
> + hrtimer_init(&engine->fpreempt_timer,
> + CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + engine->fpreempt_timer.function = intel_engine_fpreempt_timer;
> + INIT_WORK(&engine->fpreempt_work, intel_engine_fpreempt_work);
Just set the few pointers. It's not like we are saving anything.
> +}
> +
> /**
> * intel_engines_setup_common - setup engine state not requiring hw access
> * @engine: Engine to setup.
> @@ -426,6 +478,7 @@ void intel_engine_setup_common(struct intel_engine_cs *engine)
>
> intel_engine_init_timeline(engine);
> intel_engine_init_hangcheck(engine);
> + intel_engine_init_fpreempt(engine);
> i915_gem_batch_pool_init(engine, &engine->batch_pool);
>
> intel_engine_init_cmd_parser(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 581483886153..17487f8e8b4c 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -628,6 +628,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> inject_preempt_context(engine);
> execlists_set_active(execlists,
> EXECLISTS_ACTIVE_PREEMPT);
> + intel_engine_queue_fpreempt(engine);
> goto unlock;
> } else {
> /*
> @@ -846,6 +847,7 @@ static void intel_lrc_irq_handler(unsigned long data)
> const u32 *buf =
> &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> unsigned int head, tail;
> + bool defer = false;
>
> /* However GVT emulation depends upon intercepting CSB mmio */
> if (unlikely(execlists->csb_use_mmio)) {
> @@ -919,6 +921,21 @@ static void intel_lrc_irq_handler(unsigned long data)
>
> if (status & GEN8_CTX_STATUS_ACTIVE_IDLE &&
> buf[2*head + 1] == PREEMPT_ID) {
> + /* Try to cancel any pending force preemption.
> + * If we are too late, hold off on processing
> + * the completed preemption until reset has
> + * run its course. It should recognize that
> + * the engine has preempted to idle then abort
> + * the reset. Then we can resume processing
> + * at this CSB head.
> + */
This can be much simpler than implied here.
-Chris
More information about the Intel-gfx
mailing list