[Intel-gfx] [PATCH 27/66] drm/i915/gem: Pull execbuf dma resv under a single critical section

Thomas Hellström (Intel) thomas_os at shipmail.org
Thu Jul 30 12:57:40 UTC 2020


On 7/28/20 5:16 PM, Chris Wilson wrote:
> Quoting Thomas Hellström (Intel) (2020-07-27 19:08:39)
>> On 7/15/20 1:51 PM, Chris Wilson wrote:
>>> Acquire all the objects and their backing storage, and page directories,
>>> as used by execbuf under a single common ww_mutex. Albeit we have to
>>> restart the critical section a few times in order to handle various
>>> restrictions (such as avoiding copy_(from|to)_user and mmap_sem).
>>>
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> ---
>>>    .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 168 +++++++++---------
>>>    .../i915/gem/selftests/i915_gem_execbuffer.c  |   8 +-
>>>    2 files changed, 87 insertions(+), 89 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> index ebabc0746d50..db433f3f18ec 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
>>> @@ -20,6 +20,7 @@
>>>    #include "gt/intel_gt_pm.h"
>>>    #include "gt/intel_gt_requests.h"
>>>    #include "gt/intel_ring.h"
>>> +#include "mm/i915_acquire_ctx.h"
>>>    
>>>    #include "i915_drv.h"
>>>    #include "i915_gem_clflush.h"
>>> @@ -244,6 +245,8 @@ struct i915_execbuffer {
>>>        struct intel_context *context; /* logical state for the request */
>>>        struct i915_gem_context *gem_context; /** caller's context */
>>>    
>>> +     struct i915_acquire_ctx acquire; /** lock for _all_ DMA reservations */
>>> +
>>>        struct i915_request *request; /** our request to build */
>>>        struct eb_vma *batch; /** identity of the batch obj/vma */
>>>    
>>> @@ -389,42 +392,6 @@ static void eb_vma_array_put(struct eb_vma_array *arr)
>>>        kref_put(&arr->kref, eb_vma_array_destroy);
>>>    }
>>>    
>>> -static int
>>> -eb_lock_vma(struct i915_execbuffer *eb, struct ww_acquire_ctx *acquire)
>>> -{
>>> -     struct eb_vma *ev;
>>> -     int err = 0;
>>> -
>>> -     list_for_each_entry(ev, &eb->submit_list, submit_link) {
>>> -             struct i915_vma *vma = ev->vma;
>>> -
>>> -             err = ww_mutex_lock_interruptible(&vma->resv->lock, acquire);
>>> -             if (err == -EDEADLK) {
>>> -                     struct eb_vma *unlock = ev, *en;
>>> -
>>> -                     list_for_each_entry_safe_continue_reverse(unlock, en,
>>> -                                                               &eb->submit_list,
>>> -                                                               submit_link) {
>>> -                             ww_mutex_unlock(&unlock->vma->resv->lock);
>>> -                             list_move_tail(&unlock->submit_link, &eb->submit_list);
>>> -                     }
>>> -
>>> -                     GEM_BUG_ON(!list_is_first(&ev->submit_link, &eb->submit_list));
>>> -                     err = ww_mutex_lock_slow_interruptible(&vma->resv->lock,
>>> -                                                            acquire);
>>> -             }
>>> -             if (err) {
>>> -                     list_for_each_entry_continue_reverse(ev,
>>> -                                                          &eb->submit_list,
>>> -                                                          submit_link)
>>> -                             ww_mutex_unlock(&ev->vma->resv->lock);
>>> -                     break;
>>> -             }
>>> -     }
>>> -
>>> -     return err;
>>> -}
>>> -
>>>    static int eb_create(struct i915_execbuffer *eb)
>>>    {
>>>        /* Allocate an extra slot for use by the sentinel */
>>> @@ -668,6 +635,25 @@ eb_add_vma(struct i915_execbuffer *eb,
>>>        }
>>>    }
>>>    
>>> +static int eb_lock_mm(struct i915_execbuffer *eb)
>>> +{
>>> +     struct eb_vma *ev;
>>> +     int err;
>>> +
>>> +     list_for_each_entry(ev, &eb->bind_list, bind_link) {
>>> +             err = i915_acquire_ctx_lock(&eb->acquire, ev->vma->obj);
>>> +             if (err)
>>> +                     return err;
>>> +     }
>>> +
>>> +     return 0;
>>> +}
>>> +
>>> +static int eb_acquire_mm(struct i915_execbuffer *eb)
>>> +{
>>> +     return i915_acquire_mm(&eb->acquire);
>>> +}
>>> +
>>>    struct eb_vm_work {
>>>        struct dma_fence_work base;
>>>        struct eb_vma_array *array;
>>> @@ -1390,7 +1376,15 @@ static int eb_reserve_vm(struct i915_execbuffer *eb)
>>>        unsigned long count;
>>>        struct eb_vma *ev;
>>>        unsigned int pass;
>>> -     int err = 0;
>>> +     int err;
>>> +
>>> +     err = eb_lock_mm(eb);
>>> +     if (err)
>>> +             return err;
>>> +
>>> +     err = eb_acquire_mm(eb);
>>> +     if (err)
>>> +             return err;
>>>    
>>>        count = 0;
>>>        INIT_LIST_HEAD(&unbound);
>>> @@ -1416,10 +1410,15 @@ static int eb_reserve_vm(struct i915_execbuffer *eb)
>>>        if (count == 0)
>>>                return 0;
>>>    
>>> +     /* We need to reserve page directories, release all, start over */
>>> +     i915_acquire_ctx_fini(&eb->acquire);
>>> +
>>>        pass = 0;
>>>        do {
>>>                struct eb_vm_work *work;
>>>    
>>> +             i915_acquire_ctx_init(&eb->acquire);
>> Couldn't we do a i915_acquire_ctx_rollback() here to avoid losing our
>> ticket? That would mean deferring i915_acquire_ctx_done() until all
>> potential rollbacks have been performed.
> We need to completely drop the acquire-class for catching up with userptr
> (and anything else deferred that doesn't meet the current fence semantics).
Hmm, what other deferred stuff are we doing that doesn't meet the 
current fence semantics? (Which I interpret as "everything deferred that 
we spawn during a CS needs to be either synced before we publish the 
fence or considered part of the fence signaling critical path")
>
> I thought it was sensible to drop all around the waits in this loop, and
> tidier to always reacquire at the beginning of each loop.
>
>> Or even better if we defer _ctx_done(), couldn't we just continue
>> locking the pts here instead of dropping and re-acquiring everything?
> Userptr would like to have word. If you just mean these do lines, then
> yes fini/init is overkill -- it just looked simpler than doing it at the
> end of the loop. The steady-state load is not meant to get past the
> optimistic fastpath.

Indeed, complicating the code to avoid dropping these locks outside of the 
fastpath doesn't sound like a good idea.

However, if we drop the acquire ctx we might be subject to starvation, and 
I figure waiting for userptr would still work as long as we drop all the 
locks.

> -Chris

/Thomas




More information about the Intel-gfx mailing list