[Intel-gfx] [PATCH 27/28] drm/i915: Allow vma binding to occur asynchronously
Chris Wilson
chris at chris-wilson.co.uk
Mon Jun 10 19:51:31 UTC 2019
Quoting Matthew Auld (2019-06-10 18:34:52)
> On Mon, 10 Jun 2019 at 08:21, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> >
> > If we let pages be allocated asynchronously, we also then want to push
> > the binding process into an asynchronous task. Make it so, utilising the
> > recent improvements to fence error tracking and struct_mutex reduction.
> >
> > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>
> [snip]
>
> > +static int queue_async_bind(struct i915_vma *vma,
> > +			    enum i915_cache_level cache_level,
> > +			    u32 flags)
> > +{
> > +	bool ready = true;
> > +
> > +	/* We are not allowed to shrink inside vm->mutex! */
> > +	vma->async.dma = kmalloc(sizeof(*vma->async.dma),
> > +				 GFP_NOWAIT | __GFP_NOWARN);
> > +	if (!vma->async.dma)
> > +		return -ENOMEM;
> > +
> > +	dma_fence_init(vma->async.dma,
> > +		       &async_bind_ops,
> > +		       &async_lock,
> > +		       vma->vm->i915->mm.unordered_timeline,
> > +		       0);
> > +
> > +	/* XXX find and avoid allocations under reservation_object locks */
> > +	if (!i915_vma_trylock(vma)) {
> > +		kfree(fetch_and_zero(&vma->async.dma));
> > +		return -EAGAIN;
> > +	}
> > +
> > +	if (rcu_access_pointer(vma->resv->fence_excl)) { /* async pages */
> > +		struct dma_fence *f = reservation_object_get_excl(vma->resv);
> > +
> > +		if (!dma_fence_add_callback(f,
> > +					    &vma->async.cb,
> > +					    __queue_async_bind))
> > +			ready = false;
> > +	}
> > +	reservation_object_add_excl_fence(vma->resv, vma->async.dma);
> > +	i915_vma_unlock(vma);
> > +
> > +	i915_vm_get(vma->vm);
> > +	i915_vma_get(vma);
> > +	__i915_vma_pin(vma); /* avoid being shrunk */
> > +
> > +	vma->async.cache_level = cache_level;
> > +	vma->async.flags = flags;
>
> Do we need to move this stuff to before the add_callback?
Yes. I'm used to having an i915_sw_fence_commit phase so that everything
is in place before I let go :(
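i.e. the tail of queue_async_bind() wants reordering along these lines
(untested sketch, just shuffling the statements quoted above so that the
async state and the references are published before the callback can
fire):

	/* Everything __queue_async_bind() consumes must be set first */
	vma->async.cache_level = cache_level;
	vma->async.flags = flags;

	i915_vm_get(vma->vm);
	i915_vma_get(vma);
	__i915_vma_pin(vma); /* avoid being shrunk */

	if (rcu_access_pointer(vma->resv->fence_excl)) { /* async pages */
		struct dma_fence *f = reservation_object_get_excl(vma->resv);

		if (!dma_fence_add_callback(f,
					    &vma->async.cb,
					    __queue_async_bind))
			ready = false;
	}
	reservation_object_add_excl_fence(vma->resv, vma->async.dma);
	i915_vma_unlock(vma);

	if (ready)
		__queue_async_bind(vma->async.dma, &vma->async.cb);

	return 0;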
> > +	if (ready)
> > +		__queue_async_bind(vma->async.dma, &vma->async.cb);
> > +
> > +	return 0;
> > +}
> > +
> >  /**
> >   * i915_vma_bind - Sets up PTEs for a VMA in its corresponding address space.
> >   * @vma: VMA to map
> > @@ -293,17 +405,12 @@ int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level,
> >  	u32 vma_flags;
> >  	int ret;
> >
> > +	GEM_BUG_ON(!flags);
> >  	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
> >  	GEM_BUG_ON(vma->size > vma->node.size);
> > -
> > -	if (GEM_DEBUG_WARN_ON(range_overflows(vma->node.start,
> > -					      vma->node.size,
> > -					      vma->vm->total)))
> > -		return -ENODEV;
> > -
> > -	if (GEM_DEBUG_WARN_ON(!flags))
> > -		return -EINVAL;
> > -
> > +	GEM_BUG_ON(range_overflows(vma->node.start,
> > +				   vma->node.size,
> > +				   vma->vm->total));
> >  	bind_flags = 0;
> >  	if (flags & PIN_GLOBAL)
> >  		bind_flags |= I915_VMA_GLOBAL_BIND;
> > @@ -318,14 +425,18 @@ int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level,
>
> Are we aiming for vma_bind/vma_bind_async/vma_pin/vma_pin_async etc ?
At the moment, it's a pretty straightforward split between global being
synchronous and user being asynchronous. Now, user should always remain
async, if only because we have to play games with avoiding allocations
underneath vm->mutex; but we may want to make global optionally async
(if we have some caller that cares).
The implication of the question is that you would rather it be a flag
upfront so that we don't have to retrofit one later on :)
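If we do grow such a caller, it should just be a matter of feeding a new
pin flag into the existing selection in i915_vma_bind(), something like
(PIN_ASYNC being a purely hypothetical flag, not in this series):

	/* Caller opts in to async binding via a hypothetical PIN_ASYNC */
	if ((flags & PIN_ASYNC) || (bind_flags & I915_VMA_ALLOC_BIND))
		ret = queue_async_bind(vma, cache_level, bind_flags);
	else
		ret = __vma_bind(vma, cache_level, bind_flags);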
> >  	if (bind_flags == 0)
> >  		return 0;
> >
> > -	GEM_BUG_ON(!vma->pages);
> > +	if ((bind_flags & ~vma_flags) & I915_VMA_LOCAL_BIND)
> > +		bind_flags |= I915_VMA_ALLOC_BIND;
> >
> >  	trace_i915_vma_bind(vma, bind_flags);
> > -	ret = vma->ops->bind_vma(vma, cache_level, bind_flags);
> > +	if (bind_flags & I915_VMA_ALLOC_BIND)
> > +		ret = queue_async_bind(vma, cache_level, bind_flags);
> > +	else
> > +		ret = __vma_bind(vma, cache_level, bind_flags);
> >  	if (ret)
> >  		return ret;
>
> Looks like clear_pages() is called unconditionally in vma_remove() if
> we error out here, even though that is now set as part of the worker?
Right, I moved the ->set_pages, so I should move the ->clear_pages to match.
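Until then, a stopgap in i915_vma_remove() would be to only undo what was
actually done (sketch, untested):

	/* ->set_pages now runs in the async worker, so the pages may
	 * not have been set by the time we unwind an error here.
	 */
	if (vma->pages)
		vma->ops->clear_pages(vma);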
> > -	vma->flags |= bind_flags;
> > +	vma->flags |= bind_flags & ~I915_VMA_ALLOC_BIND;
> >  	return 0;
> >  }
> >
> > @@ -569,7 +680,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
> >  	}
> >
> >  	if (vma->obj) {
> > -		ret = i915_gem_object_pin_pages(vma->obj);
> > +		ret = i915_gem_object_pin_pages_async(vma->obj);
> >  		if (ret)
> >  			return ret;
> >
> > @@ -578,25 +689,19 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
> >  		cache_level = 0;
> >  	}
> >
> > -	GEM_BUG_ON(vma->pages);
> > -
> > -	ret = vma->ops->set_pages(vma);
> > -	if (ret)
> > -		goto err_unpin;
> > -
>
> I guess one casualty here is the !offset_fixed path for
> huge-gtt-pages, since we need to inspect the vma->page_sizes below.
> Though probably not the end of the world if that's the case.
Whoops. That's a nuisance, as there is no way we can know the final page
sizes at this stage, so we'll just have to assume the worst.
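i.e. in i915_vma_insert() we would pad the alignment up front instead of
inspecting vma->page_sizes, something like (sketch only; the exact
condition for "could end up with huge pages" is a guess):

	/* The backing store is not allocated yet, so assume the worst
	 * case and leave room for the largest page size we might get.
	 */
	if (vma->obj && i915_vm_is_4lvl(vma->vm))
		alignment = max_t(u64, alignment, I915_GTT_PAGE_SIZE_2M);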
-Chris