[Intel-gfx] [PATCH] drm/i915: Fix random -ENOSPC eviction errors due to locked vma objects

Thomas Hellström thomas.hellstrom at linux.intel.com
Wed Aug 17 07:34:40 UTC 2022


On Wed, 2022-08-17 at 09:55 +0300, Sviatoslav Peleshko wrote:
> The i915_gem_object_trylock we had in the grab_vma() makes it return
> false
> when the vma->obj is already locked. In this case we'll skip this vma
> during eviction, and eventually might be forced to return -ENOSPC
> even
> though we could've evicted this vma if we waited for the lock a bit.
> 
> To fix this, replace the i915_gem_object_trylock with
> i915_gem_object_lock.
> And because we have to worry about the potential deadlock now,
> bubble-up
> the error code, so it will be correctly handled by the WW mechanism.
> 
> This fixes the issue
> https://gitlab.freedesktop.org/drm/intel/-/issues/6564
> 
> Fixes: 7e00897be8bf ("drm/i915: Add object locking to
> i915_gem_evict_for_node and i915_gem_evict_something, v2.")
> Signed-off-by: Sviatoslav Peleshko
> <sviatoslav.peleshko at globallogic.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_evict.c | 69 ++++++++++++++++++-------
> --
>  1 file changed, 46 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
> b/drivers/gpu/drm/i915/i915_gem_evict.c
> index f025ee4fa526..9d43f213f68f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -55,49 +55,58 @@ static int ggtt_flush(struct intel_gt *gt)
>         return intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
>  }
>  
> -static bool grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
> +static int grab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
>  {
> +       int ret = 0;
> +
>         /*
>          * We add the extra refcount so the object doesn't drop to
> zero until
>          * after ungrab_vma(), this way trylock is always paired with
> unlock.
>          */
>         if (i915_gem_object_get_rcu(vma->obj)) {
> -               if (!i915_gem_object_trylock(vma->obj, ww)) {
> +               ret = i915_gem_object_lock(vma->obj, ww);

Isn't the vm->mutex held here? If so, then this would be a lockdep
violation.

/Thomas




> +               if (ret)
>                         i915_gem_object_put(vma->obj);
> -                       return false;
> -               }
>         } else {
>                 /* Dead objects don't need pins */
>                 atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
>         }
>  
> -       return true;
> +       return ret;
>  }
>  
> -static void ungrab_vma(struct i915_vma *vma)
> +static void ungrab_vma(struct i915_vma *vma, struct i915_gem_ww_ctx
> *ww)
>  {
>         if (dying_vma(vma))
>                 return;
>  
> -       i915_gem_object_unlock(vma->obj);
> +       if (!ww)
> +               i915_gem_object_unlock(vma->obj);
> +
>         i915_gem_object_put(vma->obj);
>  }
>  
> -static bool
> +static int
>  mark_free(struct drm_mm_scan *scan,
>           struct i915_gem_ww_ctx *ww,
>           struct i915_vma *vma,
>           unsigned int flags,
>           struct list_head *unwind)
>  {
> +       int err;
> +
>         if (i915_vma_is_pinned(vma))
> -               return false;
> +               return -ENOSPC;
>  
> -       if (!grab_vma(vma, ww))
> -               return false;
> +       err = grab_vma(vma, ww);
> +       if (err)
> +               return err;
>  
>         list_add(&vma->evict_link, unwind);
> -       return drm_mm_scan_add_block(scan, &vma->node);
> +       if (!drm_mm_scan_add_block(scan, &vma->node))
> +               return -ENOSPC;
> +
> +       return 0;
>  }
>  
>  static bool defer_evict(struct i915_vma *vma)
> @@ -150,6 +159,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>         enum drm_mm_insert_mode mode;
>         struct i915_vma *active;
>         int ret;
> +       int err = 0;
>  
>         lockdep_assert_held(&vm->mutex);
>         trace_i915_gem_evict(vm, min_size, alignment, flags);
> @@ -210,17 +220,23 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>                         continue;
>                 }
>  
> -               if (mark_free(&scan, ww, vma, flags, &eviction_list))
> +               err = mark_free(&scan, ww, vma, flags,
> &eviction_list);
> +               if (!err)
>                         goto found;
> +               if (err == -EDEADLK)
> +                       break;
>         }
>  
>         /* Nothing found, clean up and bail out! */
>         list_for_each_entry_safe(vma, next, &eviction_list,
> evict_link) {
>                 ret = drm_mm_scan_remove_block(&scan, &vma->node);
>                 BUG_ON(ret);
> -               ungrab_vma(vma);
> +               ungrab_vma(vma, ww);
>         }
>  
> +       if (err == -EDEADLK)
> +               return err;
> +
>         /*
>          * Can we unpin some objects such as idle hw contents,
>          * or pending flips? But since only the GGTT has global
> entries
> @@ -267,7 +283,7 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>                         __i915_vma_pin(vma);
>                 } else {
>                         list_del(&vma->evict_link);
> -                       ungrab_vma(vma);
> +                       ungrab_vma(vma, ww);
>                 }
>         }
>  
> @@ -277,17 +293,21 @@ i915_gem_evict_something(struct
> i915_address_space *vm,
>                 __i915_vma_unpin(vma);
>                 if (ret == 0)
>                         ret = __i915_vma_unbind(vma);
> -               ungrab_vma(vma);
> +               ungrab_vma(vma, ww);
>         }
>  
>         while (ret == 0 && (node = drm_mm_scan_color_evict(&scan))) {
>                 vma = container_of(node, struct i915_vma, node);
>  
>                 /* If we find any non-objects (!vma), we cannot evict
> them */
> -               if (vma->node.color != I915_COLOR_UNEVICTABLE &&
> -                   grab_vma(vma, ww)) {
> -                       ret = __i915_vma_unbind(vma);
> -                       ungrab_vma(vma);
> +               if (vma->node.color != I915_COLOR_UNEVICTABLE) {
> +                       ret = grab_vma(vma, ww);
> +                       if (!ret) {
> +                               ret = __i915_vma_unbind(vma);
> +                               ungrab_vma(vma, ww);
> +                       } else if (ret != -EDEADLK) {
> +                               ret = -ENOSPC;
> +                       }
>                 } else {
>                         ret = -ENOSPC;
>                 }
> @@ -382,8 +402,11 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
>                         break;
>                 }
>  
> -               if (!grab_vma(vma, ww)) {
> -                       ret = -ENOSPC;
> +               ret = grab_vma(vma, ww);
> +               if (ret) {
> +                       if (ret != -EDEADLK)
> +                               ret = -ENOSPC;
> +
>                         break;
>                 }
>  
> @@ -405,7 +428,7 @@ int i915_gem_evict_for_node(struct
> i915_address_space *vm,
>                 if (ret == 0)
>                         ret = __i915_vma_unbind(vma);
>  
> -               ungrab_vma(vma);
> +               ungrab_vma(vma, ww);
>         }
>  
>         return ret;



More information about the Intel-gfx mailing list