[Intel-gfx] [PATCH 1/4] drm/i915: Do a synchronous switch-to-kernel-context on idling

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Mar 8 09:50:56 UTC 2019


On 08/03/2019 09:36, Chris Wilson wrote:
> When the system idles, we switch to the kernel context as a defensive
> measure (no users are harmed if the kernel context is lost). Currently,
> we issue a switch to kernel context and then come back later to see if
> the kernel context is still current and the system is idle. However,
> if we are no longer privy to the runqueue ordering, then we have to
> relax our assumptions about the logical state of the GPU, and the only
> way to ensure that the kernel context is currently loaded is to issue
> a request to run after all others and wait for it to complete, all
> while preventing anyone else from issuing their own requests.
> 
> v2: Pull wedging into switch_to_kernel_context_sync() but only after
> waiting (though only for the same short delay) for the active context to
> finish.
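
To summarise for anyone skimming the archive: idling becomes a synchronous
operation. Condensed from the patch below (a simplified sketch; the locking,
tracing and assertion paths are elided), the new flow is roughly:

  static bool switch_to_kernel_context_sync(struct drm_i915_private *i915)
  {
  	bool result = true;

  	/* Emit a switch-to-kernel-context request to run after all others... */
  	if (i915_gem_switch_to_kernel_context(i915))
  		result = false;

  	/* ...and synchronously wait for it, instead of checking back later. */
  	if (i915_gem_wait_for_idle(i915,
  				   I915_WAIT_LOCKED |
  				   I915_WAIT_FOR_IDLE_BOOST,
  				   I915_GEM_IDLE_TIMEOUT))
  		result = false;

  	/* On any failure, cancel outstanding work and leave the GPU quiet. */
  	if (!result)
  		i915_gem_set_wedged(i915);

  	i915_retire_requests(i915); /* ensure we flush after wedging */
  	return result;
  }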

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko

> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.c           |  14 +-
>   drivers/gpu/drm/i915/i915_drv.h           |   2 +-
>   drivers/gpu/drm/i915/i915_gem.c           | 154 +++++++++-------------
>   drivers/gpu/drm/i915/i915_gem_context.c   |   4 +
>   drivers/gpu/drm/i915/selftests/i915_gem.c |   9 +-
>   5 files changed, 73 insertions(+), 110 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 777a9a19414d..0d743907e7bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -714,8 +714,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
>   	return 0;
>   
>   cleanup_gem:
> -	if (i915_gem_suspend(dev_priv))
> -		DRM_ERROR("failed to idle hardware; continuing to unload!\n");
> +	i915_gem_suspend(dev_priv);
>   	i915_gem_fini(dev_priv);
>   cleanup_modeset:
>   	intel_modeset_cleanup(dev);
> @@ -1900,8 +1899,7 @@ void i915_driver_unload(struct drm_device *dev)
>   	/* Flush any external code that still may be under the RCU lock */
>   	synchronize_rcu();
>   
> -	if (i915_gem_suspend(dev_priv))
> -		DRM_ERROR("failed to idle hardware; continuing to unload!\n");
> +	i915_gem_suspend(dev_priv);
>   
>   	drm_atomic_helper_shutdown(dev);
>   
> @@ -2009,7 +2007,6 @@ static bool suspend_to_idle(struct drm_i915_private *dev_priv)
>   static int i915_drm_prepare(struct drm_device *dev)
>   {
>   	struct drm_i915_private *i915 = to_i915(dev);
> -	int err;
>   
>   	/*
>   	 * NB intel_display_suspend() may issue new requests after we've
> @@ -2017,12 +2014,9 @@ static int i915_drm_prepare(struct drm_device *dev)
>   	 * split out that work and pull it forward so that after point,
>   	 * the GPU is not woken again.
>   	 */
> -	err = i915_gem_suspend(i915);
> -	if (err)
> -		dev_err(&i915->drm.pdev->dev,
> -			"GEM idle failed, suspend/resume might fail\n");
> +	i915_gem_suspend(i915);
>   
> -	return err;
> +	return 0;
>   }
>   
>   static int i915_drm_suspend(struct drm_device *dev)
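
Worth noting on the callers' side: with i915_gem_suspend() now returning
void, a failure to idle is handled by wedging inside the helper, so the
error plumbing in the suspend/prepare paths above simply disappears. For
instance, the prepare hook reduces to (a sketch of the post-patch result):

  static int i915_drm_prepare(struct drm_device *dev)
  {
  	/* GEM idling can no longer fail; on timeout we wedge instead. */
  	i915_gem_suspend(to_i915(dev));

  	return 0;
  }
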
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a5b314a0c415..b8a5281d8adf 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3032,7 +3032,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv);
>   void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
>   int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
>   			   unsigned int flags, long timeout);
> -int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
> +void i915_gem_suspend(struct drm_i915_private *dev_priv);
>   void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
>   void i915_gem_resume(struct drm_i915_private *dev_priv);
>   vm_fault_t i915_gem_fault(struct vm_fault *vmf);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index df2f4f65c2a4..f22de3b5a1f3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2828,13 +2828,6 @@ i915_gem_retire_work_handler(struct work_struct *work)
>   				   round_jiffies_up_relative(HZ));
>   }
>   
> -static inline bool
> -new_requests_since_last_retire(const struct drm_i915_private *i915)
> -{
> -	return (READ_ONCE(i915->gt.active_requests) ||
> -		work_pending(&i915->gt.idle_work.work));
> -}
> -
>   static void assert_kernel_context_is_current(struct drm_i915_private *i915)
>   {
>   	struct intel_engine_cs *engine;
> @@ -2843,7 +2836,8 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
>   	if (i915_reset_failed(i915))
>   		return;
>   
> -	GEM_BUG_ON(i915->gt.active_requests);
> +	i915_retire_requests(i915);
> +
>   	for_each_engine(engine, i915, id) {
>   		GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
>   		GEM_BUG_ON(engine->last_retired_context !=
> @@ -2851,77 +2845,86 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
>   	}
>   }
>   
> +static bool switch_to_kernel_context_sync(struct drm_i915_private *i915)
> +{
> +	bool result = true;
> +
> +	/*
> +	 * Even if we fail to switch, give whatever is running a small chance
> +	 * to save itself before we report the failure. Yes, this may be a
> +	 * false positive due to e.g. ENOMEM, caveat emptor!
> +	 */
> +	if (i915_gem_switch_to_kernel_context(i915))
> +		result = false;
> +
> +	if (i915_gem_wait_for_idle(i915,
> +				   I915_WAIT_LOCKED |
> +				   I915_WAIT_FOR_IDLE_BOOST,
> +				   I915_GEM_IDLE_TIMEOUT))
> +		result = false;
> +
> +	if (result) {
> +		assert_kernel_context_is_current(i915);
> +	} else {
> +		/* Forcibly cancel outstanding work and leave the gpu quiet. */
> +		dev_err(i915->drm.dev,
> +			"Failed to idle engines, declaring wedged!\n");
> +		GEM_TRACE_DUMP();
> +		i915_gem_set_wedged(i915);
> +	}
> +
> +	i915_retire_requests(i915); /* ensure we flush after wedging */
> +	return result;
> +}
> +
>   static void
>   i915_gem_idle_work_handler(struct work_struct *work)
>   {
> -	struct drm_i915_private *dev_priv =
> -		container_of(work, typeof(*dev_priv), gt.idle_work.work);
> +	struct drm_i915_private *i915 =
> +		container_of(work, typeof(*i915), gt.idle_work.work);
>   	bool rearm_hangcheck;
>   
> -	if (!READ_ONCE(dev_priv->gt.awake))
> +	if (!READ_ONCE(i915->gt.awake))
>   		return;
>   
> -	if (READ_ONCE(dev_priv->gt.active_requests))
> +	if (READ_ONCE(i915->gt.active_requests))
>   		return;
>   
> -	/*
> -	 * Flush out the last user context, leaving only the pinned
> -	 * kernel context resident. When we are idling on the kernel_context,
> -	 * no more new requests (with a context switch) are emitted and we
> -	 * can finally rest. A consequence is that the idle work handler is
> -	 * always called at least twice before idling (and if the system is
> -	 * idle that implies a round trip through the retire worker).
> -	 */
> -	mutex_lock(&dev_priv->drm.struct_mutex);
> -	i915_gem_switch_to_kernel_context(dev_priv);
> -	mutex_unlock(&dev_priv->drm.struct_mutex);
> -
> -	GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
> -		  READ_ONCE(dev_priv->gt.active_requests));
> -
> -	/*
> -	 * Wait for last execlists context complete, but bail out in case a
> -	 * new request is submitted. As we don't trust the hardware, we
> -	 * continue on if the wait times out. This is necessary to allow
> -	 * the machine to suspend even if the hardware dies, and we will
> -	 * try to recover in resume (after depriving the hardware of power,
> -	 * it may be in a better mood).
> -	 */
> -	__wait_for(if (new_requests_since_last_retire(dev_priv)) return,
> -		   intel_engines_are_idle(dev_priv),
> -		   I915_IDLE_ENGINES_TIMEOUT * 1000,
> -		   10, 500);
> -
>   	rearm_hangcheck =
> -		cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> +		cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
>   
> -	if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
> +	if (!mutex_trylock(&i915->drm.struct_mutex)) {
>   		/* Currently busy, come back later */
> -		mod_delayed_work(dev_priv->wq,
> -				 &dev_priv->gt.idle_work,
> +		mod_delayed_work(i915->wq,
> +				 &i915->gt.idle_work,
>   				 msecs_to_jiffies(50));
>   		goto out_rearm;
>   	}
>   
>   	/*
> -	 * New request retired after this work handler started, extend active
> -	 * period until next instance of the work.
> +	 * Flush out the last user context, leaving only the pinned
> +	 * kernel context resident. Should anything unfortunate happen
> +	 * while we are idle (such as the GPU being power cycled), no users
> +	 * will be harmed.
>   	 */
> -	if (new_requests_since_last_retire(dev_priv))
> -		goto out_unlock;
> +	if (!work_pending(&i915->gt.idle_work.work) &&
> +	    !i915->gt.active_requests) {
> +		++i915->gt.active_requests; /* don't requeue idle */
>   
> -	__i915_gem_park(dev_priv);
> +		switch_to_kernel_context_sync(i915);
>   
> -	assert_kernel_context_is_current(dev_priv);
> +		if (!--i915->gt.active_requests) {
> +			__i915_gem_park(i915);
> +			rearm_hangcheck = false;
> +		}
> +	}
>   
> -	rearm_hangcheck = false;
> -out_unlock:
> -	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	mutex_unlock(&i915->drm.struct_mutex);
>   
>   out_rearm:
>   	if (rearm_hangcheck) {
> -		GEM_BUG_ON(!dev_priv->gt.awake);
> -		i915_queue_hangcheck(dev_priv);
> +		GEM_BUG_ON(!i915->gt.awake);
> +		i915_queue_hangcheck(i915);
>   	}
>   }
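
One subtlety in the rewritten handler worth spelling out: bumping
gt.active_requests before the synchronous switch stops the retirement of
our own switch request from being seen as new activity and re-queuing the
idle work, and only if the count falls back to zero afterwards do we
actually park. In outline (condensed from the hunk above):

  ++i915->gt.active_requests;          /* pretend to be busy: don't requeue idle */
  switch_to_kernel_context_sync(i915); /* may emit and retire its own request */
  if (!--i915->gt.active_requests) {   /* no one snuck in: safe to park */
  	__i915_gem_park(i915);
  	rearm_hangcheck = false;
  }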
>   
> @@ -3128,7 +3131,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
>   			return err;
>   
>   		i915_retire_requests(i915);
> -		GEM_BUG_ON(i915->gt.active_requests);
>   	}
>   
>   	return 0;
> @@ -4340,10 +4342,9 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
>   	mutex_unlock(&i915->drm.struct_mutex);
>   }
>   
> -int i915_gem_suspend(struct drm_i915_private *i915)
> +void i915_gem_suspend(struct drm_i915_private *i915)
>   {
>   	intel_wakeref_t wakeref;
> -	int ret;
>   
>   	GEM_TRACE("\n");
>   
> @@ -4363,23 +4364,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
>   	 * state. Fortunately, the kernel_context is disposable and we do
>   	 * not rely on its state.
>   	 */
> -	if (!i915_reset_failed(i915)) {
> -		ret = i915_gem_switch_to_kernel_context(i915);
> -		if (ret)
> -			goto err_unlock;
> -
> -		ret = i915_gem_wait_for_idle(i915,
> -					     I915_WAIT_INTERRUPTIBLE |
> -					     I915_WAIT_LOCKED |
> -					     I915_WAIT_FOR_IDLE_BOOST,
> -					     I915_GEM_IDLE_TIMEOUT);
> -		if (ret == -EINTR)
> -			goto err_unlock;
> -
> -		/* Forcibly cancel outstanding work and leave the gpu quiet. */
> -		i915_gem_set_wedged(i915);
> -	}
> -	i915_retire_requests(i915); /* ensure we flush after wedging */
> +	switch_to_kernel_context_sync(i915);
>   
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	i915_reset_flush(i915);
> @@ -4399,12 +4384,6 @@ int i915_gem_suspend(struct drm_i915_private *i915)
>   	GEM_BUG_ON(i915->gt.awake);
>   
>   	intel_runtime_pm_put(i915, wakeref);
> -	return 0;
> -
> -err_unlock:
> -	mutex_unlock(&i915->drm.struct_mutex);
> -	intel_runtime_pm_put(i915, wakeref);
> -	return ret;
>   }
>   
>   void i915_gem_suspend_late(struct drm_i915_private *i915)
> @@ -4670,20 +4649,11 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
>   			goto err_active;
>   	}
>   
> -	err = i915_gem_switch_to_kernel_context(i915);
> -	if (err)
> -		goto err_active;
> -
> -	if (i915_gem_wait_for_idle(i915,
> -				   I915_WAIT_LOCKED,
> -				   I915_GEM_IDLE_TIMEOUT)) {
> -		i915_gem_set_wedged(i915);
> +	if (!switch_to_kernel_context_sync(i915)) {
>   		err = -EIO; /* Caller will declare us wedged */
>   		goto err_active;
>   	}
>   
> -	assert_kernel_context_is_current(i915);
> -
>   	/*
>   	 * Immediately park the GPU so that we enable powersaving and
>   	 * treat it as idle. The next time we issue a request, we will
> @@ -4927,7 +4897,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   err_init_hw:
>   	mutex_unlock(&dev_priv->drm.struct_mutex);
>   
> -	WARN_ON(i915_gem_suspend(dev_priv));
> +	i915_gem_suspend(dev_priv);
>   	i915_gem_suspend_late(dev_priv);
>   
>   	i915_gem_drain_workqueue(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index b9f321947982..9a3eb4f66d85 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -767,6 +767,10 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
>   	lockdep_assert_held(&i915->drm.struct_mutex);
>   	GEM_BUG_ON(!i915->kernel_context);
>   
> +	/* Inoperable, so presume the GPU is safely pointing into the void! */
> +	if (i915_terminally_wedged(i915))
> +		return 0;
> +
>   	i915_retire_requests(i915);
>   
>   	for_each_engine(engine, i915, id) {
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c
> index e77b7ed449ae..50bb7bbd26d3 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem.c
> @@ -84,14 +84,9 @@ static void simulate_hibernate(struct drm_i915_private *i915)
>   
>   static int pm_prepare(struct drm_i915_private *i915)
>   {
> -	int err = 0;
> -
> -	if (i915_gem_suspend(i915)) {
> -		pr_err("i915_gem_suspend failed\n");
> -		err = -EINVAL;
> -	}
> +	i915_gem_suspend(i915);
>   
> -	return err;
> +	return 0;
>   }
>   
>   static void pm_suspend(struct drm_i915_private *i915)
> 
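
A final note on the i915_gem_context.c hunk: returning 0 immediately for a
terminally wedged device means switch_to_kernel_context_sync() degrades
gracefully when suspending a dead GPU; the "switch" trivially succeeds and
we fall straight through to the bounded wait-for-idle.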

