[Intel-gfx] [PATCH 2/2] drm/i915: allow sync points within batches

Jesse Barnes jbarnes at virtuousgeek.org
Tue Sep 2 23:32:41 CEST 2014


Use a new reloc type to allow userspace to insert sync points within
batches before they're submitted.  The corresponding fence fds are
returned in the presumed_offset field of the relocation entries written
back to userspace, and can be operated on with the sync fence APIs.

Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>
---
 drivers/gpu/drm/i915/i915_drv.h            |   4 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 125 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_sync.c           |  58 ++++++++++---
 include/uapi/drm/i915_drm.h                |  11 ++-
 4 files changed, 167 insertions(+), 31 deletions(-)
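
For illustration, a rough (untested) sketch of the userspace side, using
only the uapi bits added below.  The helper names, the choice to point the
relocation at the batch itself, and the poll()-based wait are illustrative
assumptions on my part, not something this patch mandates; error handling
is mostly elided:

#include <poll.h>
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <i915_drm.h>

/*
 * Submit a batch containing a sync point.  'sync_point_offset' is the
 * byte offset of four reserved dwords the kernel will overwrite with the
 * MI_STORE_DWORD_INDEX / seqno / MI_USER_INTERRUPT sequence.  Returns the
 * fence fd reported back in presumed_offset, or -1 on error.
 */
static int submit_with_sync_point(int drm_fd, uint32_t batch_handle,
				  uint32_t batch_len,
				  uint32_t sync_point_offset,
				  uint32_t ctx_id)
{
	struct drm_i915_gem_relocation_entry reloc;
	struct drm_i915_gem_exec_object2 obj;
	struct drm_i915_gem_execbuffer2 execbuf;

	memset(&reloc, 0, sizeof(reloc));
	reloc.target_handle = batch_handle;	/* any handle in the exec list */
	reloc.offset = sync_point_offset;	/* where the kernel writes the commands */
	reloc.write_domain = I915_GEM_DOMAIN_SYNC_OBJ;

	memset(&obj, 0, sizeof(obj));
	obj.handle = batch_handle;
	obj.relocation_count = 1;
	obj.relocs_ptr = (uintptr_t)&reloc;
	obj.flags = EXEC_OBJECT_SYNC_OBJ;	/* force a reloc pass for the seqno */

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = (uintptr_t)&obj;
	execbuf.buffer_count = 1;
	execbuf.batch_len = batch_len;
	execbuf.flags = I915_EXEC_RENDER;
	i915_execbuffer2_set_context_id(execbuf, ctx_id);

	if (drmIoctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
		return -1;

	/* the kernel stored the fence fd in the written-back reloc entry */
	return (int)reloc.presumed_offset;
}

/* Sync fence fds are pollable; wait until the sync point signals. */
static int wait_sync_point(int fence_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fence_fd, .events = POLLIN };

	return poll(&pfd, 1, timeout_ms);
}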

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6eb119e..410eedf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2284,6 +2284,10 @@ int i915_sync_init(struct drm_i915_private *dev_priv);
 void i915_sync_fini(struct drm_i915_private *dev_priv);
 int i915_sync_create_fence_ioctl(struct drm_device *dev, void *data,
 				 struct drm_file *file);
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+			   struct intel_context *ctx,
+			   u32 seqno);
+
 
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 60998fc..32ec599 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "../../../staging/android/sync.h"
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -262,6 +263,68 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 		!obj->map_and_fenceable ||
 		obj->cache_level != I915_CACHE_NONE);
 }
+
+static int
+emit_sync_obj_cpu(struct drm_i915_gem_object *obj,
+		  struct drm_i915_gem_relocation_entry *reloc)
+{
+	uint32_t page_offset = offset_in_page(reloc->offset);
+	char *vaddr;
+	int ret;
+
+	ret = i915_gem_object_set_to_cpu_domain(obj, true);
+	if (ret)
+		return ret;
+
+	vaddr = kmap_atomic(i915_gem_object_get_page(obj,
+				reloc->offset >> PAGE_SHIFT));
+	*(uint32_t *)(vaddr + page_offset) = MI_STORE_DWORD_INDEX;
+	*(uint32_t *)(vaddr + page_offset + 4) =
+		I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	*(uint32_t *)(vaddr + page_offset + 8) =
+		obj->ring->outstanding_lazy_seqno;
+	*(uint32_t *)(vaddr + page_offset + 12) = MI_USER_INTERRUPT;
+
+	kunmap_atomic(vaddr);
+
+	return 0;
+}
+
+static int
+emit_sync_obj_gtt(struct drm_i915_gem_object *obj,
+		  struct drm_i915_gem_relocation_entry *reloc)
+{
+	struct drm_device *dev = obj->base.dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t __iomem *reloc_entry;
+	void __iomem *reloc_page;
+	int ret;
+
+	ret = i915_gem_object_set_to_gtt_domain(obj, true);
+	if (ret)
+		return ret;
+
+	ret = i915_gem_object_put_fence(obj);
+	if (ret)
+		return ret;
+
+	/* Map the page containing the relocation we're going to perform.  */
+	reloc->offset += i915_gem_obj_ggtt_offset(obj);
+	reloc_page = io_mapping_map_atomic_wc(dev_priv->gtt.mappable,
+			reloc->offset & PAGE_MASK);
+
+	reloc_entry = (uint32_t __iomem *)
+		(reloc_page + offset_in_page(reloc->offset));
+	iowrite32(MI_STORE_DWORD_INDEX, reloc_entry);
+	iowrite32(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT,
+		  reloc_entry + 1);
+	iowrite32(obj->ring->outstanding_lazy_seqno, reloc_entry + 2);
+	iowrite32(MI_USER_INTERRUPT, reloc_entry + 3);
+
+	io_mapping_unmap_atomic(reloc_page);
+
+	return 0;
+}
 
 static int
 relocate_entry_cpu(struct drm_i915_gem_object *obj,
@@ -349,7 +411,8 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj,
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_vmas *eb,
-				   struct drm_i915_gem_relocation_entry *reloc)
+				   struct drm_i915_gem_relocation_entry *reloc,
+				   struct intel_context *ctx)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_gem_object *target_obj;
@@ -433,23 +496,39 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	if (obj->active && in_atomic())
 		return -EFAULT;
 
-	if (use_cpu_reloc(obj))
-		ret = relocate_entry_cpu(obj, reloc, target_offset);
-	else
-		ret = relocate_entry_gtt(obj, reloc, target_offset);
+	if (reloc->write_domain & I915_GEM_DOMAIN_SYNC_OBJ) {
+		int fd;
+
+		/* get a new seqno */
+		intel_ring_begin(obj->ring, 0);
+
+		if (use_cpu_reloc(obj))
+			ret = emit_sync_obj_cpu(obj, reloc);
+		else
+			ret = emit_sync_obj_gtt(obj, reloc);
+
+		fd = i915_sync_fence_create(obj->ring, ctx,
+					    obj->ring->outstanding_lazy_seqno);
+		reloc->presumed_offset = fd;
+	} else {
+		if (use_cpu_reloc(obj))
+			ret = relocate_entry_cpu(obj, reloc, target_offset);
+		else
+			ret = relocate_entry_gtt(obj, reloc, target_offset);
+		/* and update the user's relocation entry */
+		reloc->presumed_offset = target_offset;
+	}
 
 	if (ret)
 		return ret;
 
-	/* and update the user's relocation entry */
-	reloc->presumed_offset = target_offset;
-
 	return 0;
 }
 
 static int
 i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
-				 struct eb_vmas *eb)
+				 struct eb_vmas *eb,
+				 struct intel_context *ctx)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
 	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
@@ -473,7 +552,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 		do {
 			u64 offset = r->presumed_offset;
 
-			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r);
+			ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, ctx);
 			if (ret)
 				return ret;
 
@@ -496,13 +575,14 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
 static int
 i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
 				      struct eb_vmas *eb,
-				      struct drm_i915_gem_relocation_entry *relocs)
+				      struct drm_i915_gem_relocation_entry *relocs,
+				      struct intel_context *ctx)
 {
 	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	int i, ret;
 
 	for (i = 0; i < entry->relocation_count; i++) {
-		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]);
+		ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], ctx);
 		if (ret)
 			return ret;
 	}
@@ -511,7 +591,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma,
 }
 
 static int
-i915_gem_execbuffer_relocate(struct eb_vmas *eb)
+i915_gem_execbuffer_relocate(struct eb_vmas *eb, struct intel_context *ctx)
 {
 	struct i915_vma *vma;
 	int ret = 0;
@@ -525,7 +605,7 @@ i915_gem_execbuffer_relocate(struct eb_vmas *eb)
 	 */
 	pagefault_disable();
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
-		ret = i915_gem_execbuffer_relocate_vma(vma, eb);
+		ret = i915_gem_execbuffer_relocate_vma(vma, eb, ctx);
 		if (ret)
 			break;
 	}
@@ -664,6 +744,13 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *ring,
 			obj->tiling_mode != I915_TILING_NONE;
 		need_mappable = need_fence || need_reloc_mappable(vma);
 
+		/*
+		 * If we're emitting a sync obj, we always need a reloc
+		 * pass to write the seqno.
+		 */
+		if (entry->flags & EXEC_OBJECT_SYNC_OBJ)
+			*need_relocs = true;
+
 		if (need_mappable)
 			list_move(&vma->exec_list, &ordered_vmas);
 		else
@@ -734,7 +821,8 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 				  struct drm_file *file,
 				  struct intel_engine_cs *ring,
 				  struct eb_vmas *eb,
-				  struct drm_i915_gem_exec_object2 *exec)
+				  struct drm_i915_gem_exec_object2 *exec,
+				  struct intel_context *ctx)
 {
 	struct drm_i915_gem_relocation_entry *reloc;
 	struct i915_address_space *vm;
@@ -830,7 +918,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 	list_for_each_entry(vma, &eb->vmas, exec_list) {
 		int offset = vma->exec_entry - exec;
 		ret = i915_gem_execbuffer_relocate_vma_slow(vma, eb,
-							    reloc + reloc_offset[offset]);
+							    reloc + reloc_offset[offset], ctx);
 		if (ret)
 			goto err;
 	}
@@ -1340,17 +1428,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
 	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
+
 	ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs);
 	if (ret)
 		goto err;
 
 	/* The objects are in their final locations, apply the relocations. */
 	if (need_relocs)
-		ret = i915_gem_execbuffer_relocate(eb);
+		ret = i915_gem_execbuffer_relocate(eb, ctx);
 	if (ret) {
 		if (ret == -EFAULT) {
 			ret = i915_gem_execbuffer_relocate_slow(dev, args, file, ring,
-								eb, exec);
+								eb, exec, ctx);
 			BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 		}
 		if (ret)
diff --git a/drivers/gpu/drm/i915/i915_sync.c b/drivers/gpu/drm/i915/i915_sync.c
index 4938616..bd54fca 100644
--- a/drivers/gpu/drm/i915/i915_sync.c
+++ b/drivers/gpu/drm/i915/i915_sync.c
@@ -195,32 +195,72 @@ static struct fence_ops i915_fence_ops = {
 	.timeline_value_str =	i915_fence_timeline_value_str,
 };
 
-static struct fence *i915_fence_create(struct intel_engine_cs *ring,
-				       struct intel_context *ctx)
+static struct i915_fence *__i915_fence_create(struct intel_engine_cs *ring,
+					      struct intel_context *ctx,
+					      u32 seqno)
 {
 	struct i915_fence *fence;
-	int ret;
 
 	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
 	if (!fence)
 		return NULL;
 
+	fence->ring = ring;
+	fence->ctx = ctx;
+	fence->seqno = seqno;
+	fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
+		   fence->seqno);
+
+	return fence;
+}
+
+static struct fence *i915_fence_create(struct intel_engine_cs *ring,
+				       struct intel_context *ctx)
+{
+	struct i915_fence *fence;
+	int ret;
+
 	ret = ring->add_request(ring);
 	if (ret) {
 		DRM_ERROR("add_request failed\n");
-		fence_free((struct fence *)fence);
 		return NULL;
 	}
 
-	fence->ring = ring;
-	fence->ctx = ctx;
-	fence->seqno = ring->outstanding_lazy_seqno;
-	fence_init(&fence->base, &i915_fence_ops, &fence_lock, ctx->user_handle,
-		   fence->seqno);
+	fence = __i915_fence_create(ring, ctx, ring->outstanding_lazy_seqno);
 
 	return &fence->base;
 }
 
+int i915_sync_fence_create(struct intel_engine_cs *ring,
+			   struct intel_context *ctx,
+			   u32 seqno)
+{
+	struct i915_fence *fence;
+	struct sync_fence *sfence;
+	char name[64];
+	int fd = get_unused_fd_flags(O_CLOEXEC);
+
+	fence = __i915_fence_create(ring, ctx, seqno);
+	if (!fence) {
+		fd = -ENOMEM;
+		goto err;
+	}
+
+	snprintf(name, sizeof(name), "0x%08x:0x%08x",
+		 ctx->user_handle, seqno);
+	sfence = sync_fence_create_dma(name, &fence->base);
+	if (!sfence) {
+		fence_free((struct fence *)fence);
+		fd = -ENOMEM;
+		goto err;
+	}
+
+	sync_fence_install(sfence, fd);
+
+err:
+	return fd;
+}
+
 /**
  * i915_sync_create_fence_ioctl - fence creation function
  * @dev: drm device
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 65bd271..edadab2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -585,6 +585,8 @@ struct drm_i915_gem_relocation_entry {
 #define I915_GEM_DOMAIN_VERTEX		0x00000020
 /** GTT domain - aperture and scanout */
 #define I915_GEM_DOMAIN_GTT		0x00000040
+/** Sync object - special for inline fences */
+#define I915_GEM_DOMAIN_SYNC_OBJ	0x00000080
 /** @} */
 
 struct drm_i915_gem_exec_object {
@@ -661,10 +663,11 @@ struct drm_i915_gem_exec_object2 {
 	 */
 	__u64 offset;
 
-#define EXEC_OBJECT_NEEDS_FENCE (1<<0)
-#define EXEC_OBJECT_NEEDS_GTT	(1<<1)
-#define EXEC_OBJECT_WRITE	(1<<2)
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1)
+#define EXEC_OBJECT_NEEDS_FENCE (1<<0) /* requires fence register */
+#define EXEC_OBJECT_NEEDS_GTT	(1<<1) /* needs global GTT mapping */
+#define EXEC_OBJECT_WRITE	(1<<2) /* object will be written */
+#define EXEC_OBJECT_SYNC_OBJ	(1<<3) /* emit a sync obj instead */
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SYNC_OBJ<<1)
 	__u64 flags;
 
 	__u64 rsvd1;
-- 
1.9.1



