[Intel-xe] [PATCH] drm/xe: Fix an invalid locking wait context bug
Matthew Auld
matthew.auld at intel.com
Wed Jul 26 11:38:53 UTC 2023
On 25/07/2023 21:17, Rodrigo Vivi wrote:
> We cannot have spin locks around xe_irq_reset, since it will
> call the intel_display_power_is_enabled() function, and
> that needs a mutex lock. Hence causing the undesired
> "[ BUG: Invalid wait context ]"
>
> We cannot convert i915's power domain lock to spin lock
> due to the nested dependency of non-atomic context waits.
>
> So, let's move the xe_irq_reset functions from the
> critical area, while still ensuring that we are protecting
> the irq.enabled and ensuring the right serialization
> in the irq handlers.
>
> v2: On the first version, I had missed the fact that
> irq.enabled is checked on the xe/display glue layer,
> and that i915 display code is actually using the irq
> spin lock properly. So, this got changed to a version
> suggested by Matthew Auld, along with introducing
> the lockdep_assert_held at the display glue code.
>
> Suggested-by: Matthew Auld <matthew.auld at intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> ---
> drivers/gpu/drm/xe/display/ext/i915_irq.c | 1 +
> drivers/gpu/drm/xe/xe_irq.c | 32 ++++++++++++++++++-----
> 2 files changed, 27 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/display/ext/i915_irq.c b/drivers/gpu/drm/xe/display/ext/i915_irq.c
> index fbb0e99143f6..ac435f7f854b 100644
> --- a/drivers/gpu/drm/xe/display/ext/i915_irq.c
> +++ b/drivers/gpu/drm/xe/display/ext/i915_irq.c
> @@ -107,6 +107,7 @@ void intel_display_irq_uninstall(struct drm_i915_private *dev_priv)
>
> bool intel_irqs_enabled(struct xe_device *xe)
> {
> + lockdep_assert_held(&xe->irq.lock);
> return xe->irq.enabled;
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
> index eae190cb0969..d6b001f6a3c5 100644
> --- a/drivers/gpu/drm/xe/xe_irq.c
> +++ b/drivers/gpu/drm/xe/xe_irq.c
> @@ -309,6 +309,13 @@ static irqreturn_t xelp_irq_handler(int irq, void *arg)
> unsigned long intr_dw[2];
> u32 identity[32];
>
> + spin_lock_irq(&xe->irq.lock);
> + if (!xe->irq.enabled) {
> + spin_unlock_irq(&xe->irq.lock);
> + return IRQ_NONE;
> + }
> + spin_unlock_irq(&xe->irq.lock);
I guess spin_lock() or spin_lock_irqsave() here and below, since this
is inside the hard irq handler?
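
Roughly what I have in mind, as an untested sketch (spin_lock_irqsave()
shown since it is always safe; a plain spin_lock() would also do, given
interrupts are already off in the hard irq handler):

	unsigned long flags;

	spin_lock_irqsave(&xe->irq.lock, flags);
	if (!xe->irq.enabled) {
		spin_unlock_irqrestore(&xe->irq.lock, flags);
		return IRQ_NONE;
	}
	spin_unlock_irqrestore(&xe->irq.lock, flags);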
> +
> master_ctl = xelp_intr_disable(xe);
> if (!master_ctl) {
> xelp_intr_enable(xe, false);
> @@ -371,6 +378,13 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>
> /* TODO: This really shouldn't be copied+pasted */
>
> + spin_lock_irq(&xe->irq.lock);
> + if (!xe->irq.enabled) {
> + spin_unlock_irq(&xe->irq.lock);
> + return IRQ_NONE;
> + }
> + spin_unlock_irq(&xe->irq.lock);
> +
> master_tile_ctl = dg1_intr_disable(xe);
> if (!master_tile_ctl) {
> dg1_intr_enable(xe, false);
> @@ -574,10 +588,14 @@ void xe_irq_shutdown(struct xe_device *xe)
>
> void xe_irq_suspend(struct xe_device *xe)
> {
> + int irq = to_pci_dev(xe->drm.dev)->irq;
> +
> spin_lock_irq(&xe->irq.lock);
> - xe->irq.enabled = false;
> - xe_irq_reset(xe);
> + xe->irq.enabled = false; /* no new irqs */
> spin_unlock_irq(&xe->irq.lock);
> +
> + synchronize_irq(irq); /* flush irqs */
> + xe_irq_reset(xe); /* turn irqs off */
> }
>
> void xe_irq_resume(struct xe_device *xe)
> @@ -585,13 +603,15 @@ void xe_irq_resume(struct xe_device *xe)
> struct xe_gt *gt;
> int id;
>
> - spin_lock_irq(&xe->irq.lock);
> + /*
> + * lock not needed:
> + * 1. no irq will arrive before the postinstall
> + * 2. display is not yet resumed
> + */
> xe->irq.enabled = true;
> xe_irq_reset(xe);
> - xe_irq_postinstall(xe);
> + xe_irq_postinstall(xe); /* turn irqs on */
>
> for_each_gt(gt, xe, id)
> xe_irq_enable_hwe(gt);
> -
> - spin_unlock_irq(&xe->irq.lock);
> }
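
For completeness, with the new lockdep_assert_held() the display glue is
expected to sample irq.enabled only while holding the lock. A hypothetical
caller on a non-irq path would look roughly like this (illustration only,
not part of the patch):

	bool enabled;

	spin_lock_irq(&xe->irq.lock);
	enabled = intel_irqs_enabled(xe);
	spin_unlock_irq(&xe->irq.lock);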