[Intel-gfx] [CI 35/35] drm/i915/cmdparser: Accelerate copies from WC memory
Chris Wilson
chris@chris-wilson.co.uk
Thu Aug 18 05:45:57 UTC 2016
If we need to use clflush to prepare our batch for reads from memory, we
can bypass the cache instead by using non-temporal copies.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld <matthew.william.auld@gmail.com>
---
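For reference: reads from write-combined (WC) memory bypass the cache
entirely, so copying a batch out of a WC mapping with ordinary loads is
very slow; SSE4.1's MOVNTDQA instead streams aligned 16-byte loads
through a small set of streaming buffers without allocating cachelines.
Below is a minimal userspace sketch of such a copy loop, as an
illustration only and not the kernel's i915_memcpy_from_wc; it assumes
SSE4.1 (build with -msse4.1), 16-byte-aligned src/dst, and a length
that is a multiple of 16:

#include <smmintrin.h>	/* SSE4.1: _mm_stream_load_si128 (MOVNTDQA) */
#include <stddef.h>

/* Hypothetical helper, not the i915 implementation: pull aligned
 * 16-byte chunks out of a WC mapping with non-temporal loads so the
 * source is never pulled into the cache hierarchy.
 */
static void copy_from_wc_sketch(void *dst, const void *src, size_t len)
{
	__m128i *d = dst;
	const __m128i *s = src;

	while (len >= 16) {
		_mm_store_si128(d++, _mm_stream_load_si128((__m128i *)s));
		s++;
		len -= 16;
	}
}

An in-kernel version additionally has to guard its use of the SSE
registers (kernel_fpu_begin()/kernel_fpu_end()).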
drivers/gpu/drm/i915/i915_cmd_parser.c | 73 +++++++++++++++++++++-------------
1 file changed, 46 insertions(+), 27 deletions(-)
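A note on the zero-length i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)
call in the second hunk: the function refuses to copy, returning false,
unless SSE4.1 is available and dst, src and len are all 16-byte
aligned, so calling it with len == 0 acts as a pure "is the WC fast
path usable for this batch_start_offset?" predicate. A sketch of that
shape (my paraphrase, not the i915_memcpy.c source; has_sse4_1 and
stream_copy are stand-in names):

#include <stdbool.h>

extern bool has_sse4_1;		/* stand-in: cached CPUID result */
extern void stream_copy(void *dst, const void *src, unsigned long len);

static bool wc_memcpy_sketch(void *dst, const void *src, unsigned long len)
{
	if (!has_sse4_1)
		return false;
	if (((unsigned long)dst | (unsigned long)src | len) & 15)
		return false;		/* MOVNTDQA needs 16-byte alignment */

	stream_copy(dst, src, len);	/* no-op when len == 0 */
	return true;
}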
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index e128e3ab8452..7d1424e514be 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -965,8 +965,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 {
 	unsigned int src_needs_clflush;
 	unsigned int dst_needs_clflush;
-	void *dst, *ptr;
-	int offset, n;
+	void *dst;
 	int ret;
 
 	ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
@@ -983,31 +982,51 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 	if (IS_ERR(dst))
 		goto unpin_dst;
 
-	ptr = dst;
-	offset = offset_in_page(batch_start_offset);
-
-	/* We can avoid clflushing partial cachelines before the write if we
-	 * only every write full cache-lines. Since we know that both the
-	 * source and destination are in multiples of PAGE_SIZE, we can simply
-	 * round up to the next cacheline. We don't care about copying too much
-	 * here as we only validate up to the end of the batch.
-	 */
-	if (dst_needs_clflush & CLFLUSH_BEFORE)
-		batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
-
-	for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
-		int len = min_t(int, batch_len, PAGE_SIZE - offset);
-		void *vaddr;
-
-		vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
-		if (src_needs_clflush)
-			drm_clflush_virt_range(vaddr + offset, len);
-		memcpy(ptr, vaddr + offset, len);
-		kunmap_atomic(vaddr);
-
-		ptr += len;
-		batch_len -= len;
-		offset = 0;
+	if (src_needs_clflush &&
+	    i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)) {
+		void *src;
+
+		src = i915_gem_object_pin_map(src_obj, I915_MAP_WC);
+		if (IS_ERR(src))
+			goto shmem_copy;
+
+		i915_memcpy_from_wc(dst,
+				    src + batch_start_offset,
+				    ALIGN(batch_len, 16));
+		i915_gem_object_unpin_map(src_obj);
+	} else {
+		void *ptr;
+		int offset, n;
+
+shmem_copy:
+		offset = offset_in_page(batch_start_offset);
+
+		/* We can avoid clflushing partial cachelines before the write
+		 * if we only ever write full cache-lines. Since we know that
+		 * both the source and destination are in multiples of
+		 * PAGE_SIZE, we can simply round up to the next cacheline.
+		 * We don't care about copying too much here as we only
+		 * validate up to the end of the batch.
+		 */
+		if (dst_needs_clflush & CLFLUSH_BEFORE)
+			batch_len = roundup(batch_len,
+					    boot_cpu_data.x86_clflush_size);
+
+		ptr = dst;
+		for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
+			int len = min_t(int, batch_len, PAGE_SIZE - offset);
+			void *vaddr;
+
+			vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
+			if (src_needs_clflush)
+				drm_clflush_virt_range(vaddr + offset, len);
+			memcpy(ptr, vaddr + offset, len);
+			kunmap_atomic(vaddr);
+
+			ptr += len;
+			batch_len -= len;
+			offset = 0;
+		}
 	}
 
 	/* dst_obj is returned with vmap pinned */
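A worked example of the CLFLUSH_BEFORE rounding in the shmem fallback,
with hypothetical numbers: if boot_cpu_data.x86_clflush_size is 64, a
100-byte batch_len is rounded up to 128, so the copy only ever writes
whole cachelines and no clflush of partial destination lines is needed
before the write; the extra bytes are harmless since validation stops
at the real end of the batch. As a standalone check:

#include <stdio.h>

/* roundup() as in include/linux/kernel.h */
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned int x86_clflush_size = 64;	/* hypothetical cacheline size */
	unsigned int batch_len = 100;		/* hypothetical batch length */

	/* prints "100 -> 128": two full cachelines are written */
	printf("%u -> %u\n", batch_len, roundup(batch_len, x86_clflush_size));
	return 0;
}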
--
2.9.3