[Intel-gfx] [RFC] drm/i915: Check if we hold a wakeref during ioread32/iowrite32
Imre Deak
imre.deak at intel.com
Fri Feb 26 15:42:58 UTC 2016
On Tue, 2016-02-23 at 16:54 +0000, Chris Wilson wrote:
> On Tue, Feb 23, 2016 at 05:09:29PM +0200, Imre Deak wrote:
> > On Tue, 2016-02-23 at 14:55 +0000, Chris Wilson wrote:
> > > On Tue, Feb 23, 2016 at 04:47:17PM +0200, Imre Deak wrote:
> > [...]
> > > How's the separation of struct_mutex from rpm going so that we can
> > > forgo adding assertions and use explicit power management instead?
> >
> > It's planned to be done, but no one is working on that yet. This is
> > something we could still have regardless, similarly to other helpers
> > accessing the device.
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 31b600d31158..b8687b6a6acb 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1430,24 +1430,6 @@ static int intel_runtime_suspend(struct device *device)
>  
>  	DRM_DEBUG_KMS("Suspending device\n");
>  
> -	/*
> -	 * We could deadlock here in case another thread holding struct_mutex
> -	 * calls RPM suspend concurrently, since the RPM suspend will wait
> -	 * first for this RPM suspend to finish. In this case the concurrent
> -	 * RPM resume will be followed by its RPM suspend counterpart. Still
> -	 * for consistency return -EAGAIN, which will reschedule this suspend.
> -	 */
> -	if (!mutex_trylock(&dev->struct_mutex)) {
> -		DRM_DEBUG_KMS("device lock contention, deffering suspend\n");
> -		/*
> -		 * Bump the expiration timestamp, otherwise the suspend won't
> -		 * be rescheduled.
> -		 */
> -		pm_runtime_mark_last_busy(device);
> -
> -		return -EAGAIN;
> -	}
> -
>  	disable_rpm_wakeref_asserts(dev_priv);
>  
>  	/*
> @@ -1455,7 +1437,6 @@ static int intel_runtime_suspend(struct device *device)
>  	 * an RPM reference.
>  	 */
>  	i915_gem_release_all_mmaps(dev_priv);
> -	mutex_unlock(&dev->struct_mutex);
>  
>  	intel_guc_suspend(dev);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 79706621e6e4..4f6127466822 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1670,9 +1670,13 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
>  	/* Serialisation between user GTT access and our code depends upon
>  	 * revoking the CPU's PTE whilst the mutex is held. The next user
>  	 * pagefault then has to wait until we release the mutex.
> +	 *
> +	 * Note that RPM complicates this somewhat by adding an additional
> +	 * requirement that operations to the GGTT be made holding the RPM
> +	 * wakeref. This in turn allows us to release the mmap from within
> +	 * the RPM suspend code, ignoring the struct_mutex serialisation in
> +	 * lieu of the RPM barriers.
>  	 */
> -	lockdep_assert_held(&obj->base.dev->struct_mutex);
> -
>  	if (!obj->fault_mappable)
>  		return;
>  
> @@ -1685,11 +1689,21 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
>  	obj->fault_mappable = false;
>  }
>  
> +static void assert_rpm_release_all_mmaps(struct drm_i915_private *dev_priv)
> +{
> +	assert_rpm_wakelock_held(dev_priv);
> +}
> +
>  void
>  i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv)
>  {
>  	struct drm_i915_gem_object *obj;
>  
> +	/* This should only be called by RPM, as we require the bound_list
> +	 * to be protected by the RPM barriers and not struct_mutex.
> +	 * We check that we are holding the wakeref whenever we manipulate
> +	 * the dev_priv->mm.bound_list (via assert_rpm_release_all_mmaps).
> +	 */
>  	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
>  		i915_gem_release_mmap(obj);
>  }
> @@ -2224,9 +2238,11 @@ i915_gem_object_retire__read(struct i915_gem_active *active,
>  	 * so that we don't steal from recently used but inactive objects
>  	 * (unless we are forced to ofc!)
>  	 */
> -	if (obj->bind_count)
> +	if (obj->bind_count) {
> +		assert_rpm_release_all_mmaps(request->i915);
>  		list_move_tail(&obj->global_list, &request->i915->mm.bound_list);
> +	}
>  
>  	if (i915_gem_object_has_active_reference(obj)) {
>  		i915_gem_object_unset_active_reference(obj);
> @@ -2751,9 +2767,11 @@ int i915_vma_unbind(struct i915_vma *vma)
>  
>  	/* Since the unbound list is global, only move to that list if
>  	 * no more VMAs exist. */
> -	if (--obj->bind_count == 0)
> +	if (--obj->bind_count == 0) {
> +		assert_rpm_release_all_mmaps(to_i915(obj->base.dev));
>  		list_move_tail(&obj->global_list,
>  			       &to_i915(obj->base.dev)->mm.unbound_list);
> +	}
>  
>  	/* And finally now the object is completely decoupled from this vma,
>  	 * we can drop its hold on the backing storage and allow it to be
> @@ -2913,6 +2931,7 @@ i915_vma_insert(struct i915_vma *vma,
>  		goto err_remove_node;
>  	}
>  
> +	assert_rpm_release_all_mmaps(dev_priv);
>  	list_move_tail(&obj->global_list, &dev_priv->mm.bound_list);
>  	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
>  	obj->bind_count++;
Ok, so IIUC with the above we'd have the locking rule that to modify
mm.bound_list, or obj->fault_mappable for an object on mm.bound_list,
you need to hold both a wakeref _and_ struct_mutex. To iterate through
mm.bound_list you only need to hold either a wakeref _or_ struct_mutex.
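Just to spell the rule out, it could be captured in a pair of asserts,
something like the following (untested sketch; the helper names and the
read-side check are mine, not part of your patch):

/* Writers to bound_list need struct_mutex _and_ an RPM wakeref,
 * readers iterating it need only one of the two.
 */
static void assert_bound_list_locked_for_write(struct drm_i915_private *dev_priv)
{
	lockdep_assert_held(&dev_priv->dev->struct_mutex);
	assert_rpm_wakelock_held(dev_priv);
}

static void assert_bound_list_locked_for_read(struct drm_i915_private *dev_priv)
{
	/* Either lock is enough for iteration; complain only if we
	 * hold neither.
	 */
	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex) &&
		!atomic_read(&dev_priv->pm.wakeref_count));
}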
Based on these rules I don't see a problem with the above, but I would
also add the assert to i915_gem_object_bind_to_vm() and
i915_gem_shrink(). In fact, I can't see that we take a wakeref anywhere
on the i915_gem_shrink() path.
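If it's really missing, we could take the wakeref around the shrinking,
something along these lines (untested sketch; the wrapper name is mine,
and whether taking a wakeref on the shrinker path is safe wrt. reclaim
still needs to be checked):

/* Hold a wakeref across the shrink so that the bound_list
 * manipulation done via i915_vma_unbind() satisfies the new assert.
 */
static unsigned long
i915_gem_shrink_with_wakeref(struct drm_i915_private *dev_priv,
			     unsigned long target, unsigned flags)
{
	unsigned long freed;

	intel_runtime_pm_get(dev_priv);
	freed = i915_gem_shrink(dev_priv, target, flags);
	intel_runtime_pm_put(dev_priv);

	return freed;
}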
Did you plan to send a complete patch?
--Imre