[Intel-gfx] [PATCH 2/2] drm/i915: allow sync points within batches

Jesse Barnes jbarnes at virtuousgeek.org
Wed Sep 3 17:41:06 CEST 2014


On Wed, 3 Sep 2014 08:01:55 +0100
Chris Wilson <chris at chris-wilson.co.uk> wrote:

> On Tue, Sep 02, 2014 at 02:32:41PM -0700, Jesse Barnes wrote:
> > Use a new reloc type to allow userspace to insert sync points within
> > batches before they're submitted.  The corresponding fence fds are
> > returned in the offset field of the returned reloc tree, and can be
> > operated on with the sync fence APIs.
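
As a rough sketch of the intended userspace flow (how the reloc entry
gets marked as a sync point is elided here, since the uapi bits aren't
quoted below; the fence fd coming back in the offset field, and waiting
on it via the sync fence fd, follow the commit message):

	#include <poll.h>
	#include <unistd.h>
	#include <drm/i915_drm.h>

	struct drm_i915_gem_relocation_entry reloc = {
		.target_handle = batch_handle,  /* hypothetical batch bo handle */
		.offset = sync_point_offset,    /* where the sync commands go */
		/* ... plus whatever the new uapi uses to mark this entry
		 * as a sync point ... */
	};

	/* attach the reloc to the exec object and submit the batch;
	 * do_execbuf() is a hypothetical wrapper around EXECBUFFER2 */
	do_execbuf(drm_fd, batch_handle, &reloc, 1);

	/* on return, the kernel has replaced offset with a sync fence fd */
	int fence_fd = (int)reloc.offset;
	struct pollfd pfd = { .fd = fence_fd, .events = POLLIN };
	poll(&pfd, 1, -1);   /* block until the sync point signals */
	close(fence_fd);
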
> > 
> > Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h            |   4 +
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
> >  drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
> >  include/uapi/drm/i915_drm.h                |  11 ++-
> >  4 files changed, 167 insertions(+), 31 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 6eb119e..410eedf 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2284,6 +2284,9 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
> >  void i915_sync_fini(struct drm_i915_private *dev_priv);
> >  int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
> >  				 struct drm_file *file);
> > +int i915_sync_fence_create(struct intel_engine_cs *ring,
> > +			   struct intel_context *ctx,
> > +			   u32 seqno);
> >  
> >  #define PIN_MAPPABLE 0x1
> >  #define PIN_NONBLOCK 0x2
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index 60998fc..32ec599 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -32,6 +32,7 @@
> >  #include "i915_trace.h"
> >  #include "intel_drv.h"
> >  #include <linux/dma_remapping.h>
> > +#include "../../../staging/android/sync.h"
> >  
> >  #define  __EXEC_OBJECT_HAS_PIN (1<<31)
> >  #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
> > @@ -262,6 +263,68 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
> >  		!obj->map_and_fenceable ||
> >  		obj->cache_level != I915_CACHE_NONE);
> >  }
> > +
> > +static int
> > +emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
> > +		  struct drm_i915_gem_relocation_entry *reloc)
> > +{
> > +	uint32_t page_offset = offset_in_page(reloc->offset);
> > +	char *vaddr;
> > +	int ret;
> > +
> > +	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> > +				reloc->offset >> PAGE_SHIFT));
> > +	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
> > +	*(uint32_t *)(vaddr + page_offset + 4) =
> > +		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
> > +	*(uint32_t *)(vaddr + page_offset + 8) =
> > +		obj->ring->outstanding_lazy_seqno;
> > +	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
> > +
> > +	kunmap_atomic(vaddr);
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
> > +		  struct drm_i915_gem_relocation_entry *reloc)
> > +{
> > +	struct drm_device *dev = obj->base.dev;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	uint32_t __iomem *reloc_entry;
> > +	void __iomem *reloc_page;
> > +	int ret;
> > +
> > +	ret = i915_gem_object_set_to_gtt_domain(obj, true);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = i915_gem_object_put_fence(obj);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* Map the page containing the relocation we're going to perform.  */
> > +	reloc->offset += i915_gem_obj_ggtt_offset(obj);
> > +	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
> > +			reloc->offset & PAGE_MASK);
> > +
> > +	reloc_entry = (uint32_t __iomem *)
> > +		(reloc_page + offset_in_page(reloc->offset));
> > +	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
> > +	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
> > +		  reloc_entry + 1);
> > +	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry + 2);
> > +	iowrite32(MI_USER_INTERRUPT, reloc_entry + 3);
> > +
> > +	io_mapping_unmap_atomic(reloc_page);
> 
> These commands are illegal/invalid inside the object, only valid inside
> the ring.

Hm, we ought to be able to write to non-privileged space with
STORE_DWORD, but that does mean moving to context-specific pages in
process space, or at least adding them to our existing scheme.

I haven't tried MI_USER_INTERRUPT from a batch; if we can't do it from
a non-privileged batch, that nixes one of the other neat features we
could have (fine-grained intra-batch userspace synchronization).
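
For reference, the same four-dword sequence emitted from the ring --
where these commands are actually valid, per the point above -- looks
roughly like the existing i9xx_add_request() path (the function name
here is made up):

	static int
	emit_sync_point_ring(struct intel_engine_cs *ring)
	{
		int ret;

		/* reserve ring space instead of patching the batch object */
		ret = intel_ring_begin(ring, 4);
		if (ret)
			return ret;

		/* store the seqno into the hardware status page... */
		intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
		intel_ring_emit(ring,
				I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
		intel_ring_emit(ring, ring->outstanding_lazy_seqno);
		/* ... and raise an interrupt so the fence can signal */
		intel_ring_emit(ring, MI_USER_INTERRUPT);
		__intel_ring_advance(ring);

		return 0;
	}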

> > +	return 0;
> > +}
> >  
> >  static int
> >  relocate_entry_cpu(struct drm_i915_gem_object *obj,
> > @@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
> >  static int
> >  i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> >  				   struct eb_vmas *eb,
> > -				   struct drm_i915_gem_relocation_entry *reloc)
> > +				   struct drm_i915_gem_relocation_entry *reloc,
> > +				   struct intel_context *ctx)
> 
> Hmm. That's a nuisance. But no, you only use it to automatically create
> a fence not to patch the batch, so you can just use an object-flag.
> 
> This fits neatly into requests.

Most definitely.  What do you think of the potential upside of this in
the DDX, assuming we get dword writes from batches working?
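
For concreteness, the object-flag variant might look something like
this (the flag name and the fd-return slot are made up, and ring/ctx
would need to be plumbed through):

	#define EXEC_OBJECT_SYNC_POINT (1<<3)	/* hypothetical uapi flag */

	/* in the execbuffer completion path, roughly: */
	list_for_each_entry(vma, vmas, exec_list) {
		struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;

		if (entry->flags & EXEC_OBJECT_SYNC_POINT) {
			/* create a fence for this request and return the fd
			 * to userspace, e.g. via entry->rsvd2 */
			entry->rsvd2 = i915_sync_fence_create(ring, ctx,
					intel_ring_get_seqno(ring));
		}
	}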

-- 
Jesse Barnes, Intel Open Source Technology Center
