[Intel-gfx] [PATCH 15/15] drm/i915: Async GPU relocation processing
Chris Wilson
chris at chris-wilson.co.uk
Thu Feb 23 16:18:30 UTC 2017
If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If the buffers are active on
the GPU at the time, we wait under the struct_mutex for them to finish
executing before we rewrite their contents. This arises, for example,
when relocation trees are shared between contexts with separate address
spaces (so the buffers have different addresses in each): the 3D state
must then be adjusted between executions on the different contexts.
However, we do not need the CPU to do the relocation patching; we can
instead queue commands on the GPU to perform it and use fences to
serialise that operation against both current and future activity, so
the GPU-side rewrite appears just as atomic as performing it
immediately. Performing the relocation rewrites on the GPU is not free:
in terms of pure throughput, the number of relocations/s is roughly
halved - but, more importantly, so is the time spent under the
struct_mutex.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
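Notes (not for the commit message): as a condensed illustration of what
relocate_entry() now emits on gen8+, each relocation becomes a single
qword MI_STORE_DWORD_IMM queued in a pool batch instead of a CPU poke.
Roughly, modulo error handling and the unaligned-offset split handled
below (emit_reloc_gen8 is just a label for this sketch, not a function
added by the patch):

	/* Write the presumed 64b address of the target into the buffer
	 * at 'addr', from the GPU's command streamer. (1 << 21) selects
	 * the qword variant of MI_STORE_DWORD_IMM_GEN4; the +1 extends
	 * the instruction length for the extra data dword.
	 */
	static void emit_reloc_gen8(u32 *batch, u64 addr, u64 value)
	{
		*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
		*batch++ = lower_32_bits(addr);
		*batch++ = upper_32_bits(addr);
		*batch++ = lower_32_bits(value);
		*batch++ = upper_32_bits(value);
	}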
drivers/gpu/drm/i915/i915_gem.c | 1 -
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 217 +++++++++++++++++++++++++++--
2 files changed, 208 insertions(+), 10 deletions(-)
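The serialisation half of the claim rests on the exclusive fence:
reloc_gpu() marks the target vma as written by the relocation request,
so any later request (or CPU access) that consults the reservation
object waits for the rewrite to complete, exactly as if it had been
performed synchronously. In miniature, using the same calls as the hunk
below:

	/* Anyone who waits on vma->obj->resv now waits for the
	 * relocation batch to execute before they can see (or clobber)
	 * the patched contents.
	 */
	i915_vma_move_to_active(vma, rq, true);
	reservation_object_lock(vma->obj->resv, NULL);
	reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
	reservation_object_unlock(vma->obj->resv);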
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f46bad7680ec..7f33ef1cfca8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4106,7 +4106,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
GEM_BUG_ON(i915_gem_object_is_active(obj));
list_for_each_entry_safe(vma, vn,
&obj->vma_list, obj_link) {
- GEM_BUG_ON(!i915_vma_is_ggtt(vma));
GEM_BUG_ON(i915_vma_is_active(vma));
vma->flags &= ~I915_VMA_PIN_MASK;
i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index dbb53281ddb1..058e0bf6697a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -41,7 +41,12 @@
#include "intel_drv.h"
#include "intel_frontbuffer.h"
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+ FORCE_CPU_RELOC = 1,
+ FORCE_GTT_RELOC,
+ FORCE_GPU_RELOC,
+};
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
#define __EXEC_OBJECT_HAS_PIN BIT(31)
#define __EXEC_OBJECT_HAS_FENCE BIT(30)
@@ -184,9 +189,14 @@ struct i915_execbuffer {
struct drm_mm_node node;
unsigned long vaddr;
unsigned int page;
- bool use_64bit_reloc;
- bool has_llc;
- bool has_fence;
+ unsigned int gen;
+ bool use_64bit_reloc : 1;
+ bool has_llc : 1;
+ bool has_fence : 1;
+
+ struct drm_i915_gem_request *rq;
+ u32 *rq_cmd;
+ unsigned int rq_size;
} reloc_cache;
u64 invalid_flags;
u32 context_flags;
@@ -430,8 +440,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
if (!i915_gem_object_has_struct_page(obj))
return false;
- if (DBG_USE_CPU_RELOC)
- return DBG_USE_CPU_RELOC > 0;
+ if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+ return true;
+
+ if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+ return false;
return (cache->has_llc ||
obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
@@ -803,10 +816,13 @@ static void reloc_cache_init(struct reloc_cache *cache,
cache->page = -1;
cache->vaddr = 0;
/* Must be a variable in the struct to allow GCC to unroll. */
+ cache->gen = INTEL_GEN(i915);
cache->has_llc = HAS_LLC(i915);
- cache->has_fence = INTEL_GEN(i915) < 4;
- cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+ cache->has_fence = cache->gen < 4;
+ cache->use_64bit_reloc = cache->gen >= 8;
cache->node.allocated = false;
+ cache->rq = NULL;
+ cache->rq_size = 0;
}
static inline void *unmask_page(unsigned long p)
@@ -828,10 +844,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
return &i915->ggtt;
}
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+ GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+ cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+ i915_gem_object_unpin_map(cache->rq->batch->obj);
+ i915_gem_chipset_flush(cache->rq->i915);
+
+ __i915_add_request(cache->rq, true);
+ cache->rq = NULL;
+}
+
static void reloc_cache_reset(struct reloc_cache *cache)
{
void *vaddr;
+ if (cache->rq)
+ reloc_gpu_flush(cache);
+
if (!cache->vaddr)
return;
@@ -995,6 +1025,112 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
*addr = value;
}
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+ struct i915_vma *vma,
+ unsigned int len)
+{
+ struct reloc_cache *cache = &eb->reloc_cache;
+ u32 *cmd;
+
+ if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+ reloc_gpu_flush(cache);
+
+ if (!cache->rq) {
+ struct drm_i915_gem_object *obj;
+ struct drm_i915_gem_request *rq;
+ struct i915_vma *batch;
+ int err;
+
+ GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+ obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
+ PAGE_SIZE);
+ if (IS_ERR(obj))
+ return ERR_CAST(obj);
+
+ cmd = i915_gem_object_pin_map(obj,
+ cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+ i915_gem_object_unpin_pages(obj);
+ if (IS_ERR(cmd))
+ return ERR_CAST(cmd);
+
+ err = i915_gem_object_set_to_gtt_domain(obj, false);
+ if (err) {
+err_unmap:
+ i915_gem_object_unpin_map(obj);
+ return ERR_PTR(err);
+ }
+
+ batch = i915_vma_instance(obj, vma->vm, NULL);
+ if (IS_ERR(batch)) {
+ err = PTR_ERR(batch);
+ goto err_unmap;
+ }
+
+ err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+ if (err)
+ goto err_unmap;
+
+ rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+err_unpin:
+ i915_vma_unpin(batch);
+ goto err_unmap;
+ }
+
+ err = i915_gem_request_await_object(rq,
+ vma->obj,
+ EXEC_OBJECT_WRITE);
+ if (err) {
+err_request:
+ i915_add_request(rq);
+ goto err_unpin;
+ }
+
+ err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+ if (err)
+ goto err_request;
+
+ err = i915_switch_context(rq);
+ if (err)
+ goto err_request;
+
+ err = eb->engine->emit_bb_start(rq,
+ batch->node.start, PAGE_SIZE,
+ cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+ if (err)
+ goto err_request;
+
+ GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv,
+ true));
+ i915_vma_move_to_active(batch, rq, 0);
+ reservation_object_lock(obj->resv, NULL);
+ reservation_object_add_excl_fence(obj->resv, &rq->fence);
+ reservation_object_unlock(obj->resv);
+ i915_vma_unpin(batch);
+
+ i915_vma_move_to_active(vma, rq, true);
+ reservation_object_lock(vma->obj->resv, NULL);
+ reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+ reservation_object_unlock(vma->obj->resv);
+
+ vma->obj->base.write_domain = 0;
+ vma->obj->base.read_domains = I915_GEM_GPU_DOMAINS;
+
+ rq->batch = batch;
+
+ cache->rq = rq;
+ cache->rq_cmd = cmd;
+ cache->rq_size = 0;
+ }
+
+ cmd = cache->rq_cmd + cache->rq_size;
+ cache->rq_size += len;
+
+ return cmd;
+}
+
static u64
relocate_entry(struct i915_vma *vma,
const struct drm_i915_gem_relocation_entry *reloc,
@@ -1007,6 +1143,67 @@ relocate_entry(struct i915_vma *vma,
bool wide = eb->reloc_cache.use_64bit_reloc;
void *vaddr;
+ if (!eb->reloc_cache.vaddr &&
+ (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+ !reservation_object_test_signaled_rcu(obj->resv, true))) {
+ const unsigned int gen = eb->reloc_cache.gen;
+ unsigned int len;
+ u32 *batch;
+ u64 addr;
+
+ if (wide)
+ len = offset & 7 ? 8 : 5;
+ else if (gen >= 4)
+ len = 4;
+ else if (gen >= 3)
+ len = 3;
+ else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+ goto repeat;
+
+ batch = reloc_gpu(eb, vma, len);
+ if (IS_ERR(batch))
+ goto repeat;
+
+ addr = gen8_canonical_addr(vma->node.start + offset);
+ if (wide) {
+ if (offset & 7) {
+ *batch++ = MI_STORE_DWORD_IMM_GEN4;
+ *batch++ = lower_32_bits(addr);
+ *batch++ = upper_32_bits(addr);
+ *batch++ = lower_32_bits(target_offset);
+
+ addr = gen8_canonical_addr(addr + 4);
+
+ *batch++ = MI_STORE_DWORD_IMM_GEN4;
+ *batch++ = lower_32_bits(addr);
+ *batch++ = upper_32_bits(addr);
+ *batch++ = upper_32_bits(target_offset);
+ } else {
+ *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+ *batch++ = lower_32_bits(addr);
+ *batch++ = upper_32_bits(addr);
+ *batch++ = lower_32_bits(target_offset);
+ *batch++ = upper_32_bits(target_offset);
+ }
+ } else if (gen >= 6) {
+ *batch++ = MI_STORE_DWORD_IMM_GEN4;
+ *batch++ = 0;
+ *batch++ = addr;
+ *batch++ = target_offset;
+ } else if (gen >= 4) {
+ *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+ *batch++ = 0;
+ *batch++ = addr;
+ *batch++ = target_offset;
+ } else {
+ *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+ *batch++ = addr;
+ *batch++ = target_offset;
+ }
+
+ goto out;
+ }
+
repeat:
vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
if (IS_ERR(vaddr))
@@ -1023,6 +1220,7 @@ relocate_entry(struct i915_vma *vma,
goto repeat;
}
+out:
return gen8_canonical_addr(target->node.start) | 1;
}
@@ -1083,7 +1281,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
/* If the relocation already has the right value in it, no
* more work needs to be done.
*/
- if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+ if (!DBG_FORCE_RELOC &&
+ gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
return 0;
/* Check that the relocation address is valid... */
--
2.11.0