[igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC

Chris Wilson chris at chris-wilson.co.uk
Wed Feb 28 09:00:16 UTC 2018


Reading from write-combining (WC) memory is awfully slow, as each access
is uncached and so performed synchronously, stalling for the memory load.
To address this, x86 introduced streaming-load instructions in SSE 4.1
(used here via _mm_stream_load_si128) that employ a small internal buffer
to accelerate reading back a cacheline at a time from uncached memory.

v2: Don't be lazy; handle misaligned source addresses.
v3: Switch out of the sse4.1 target before emitting the generic memcpy
    routine.
v4: Replace the open-coded WC copy routines in the tests with
    igt_memcpy_from_wc.
v5: Always flush the internal buffer before use. (Eric)
v6: Assume bulk moves, so check for dst alignment.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Eric Anholt <eric at anholt.net>
---
 lib/igt_fb.c                   |   3 +-
 lib/igt_x86.c                  | 124 +++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h                  |   2 +
 tests/gem_fence_thrash.c       |  63 +--------------------
 tests/gem_mmap_gtt.c           |  37 +-----------
 tests/gem_tiled_pread_pwrite.c |  37 +-----------
 6 files changed, 132 insertions(+), 134 deletions(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..54539456 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#define min(x, y) ({                            \
+	typeof(x) _min1 = (x);                  \
+	typeof(y) _min2 = (y);                  \
+	(void) (&_min1 == &_min2);              \
+	_min1 < _min2 ? _min1 : _min2;		\
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+	char buf[16];
+
+	/* Flush the internal buffer of potential stale gfx data */
+	__builtin_ia32_mfence();
+
+	if ((uintptr_t)src & 15) {
+		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
+		unsigned long misalign = (uintptr_t)src & 15;
+		unsigned long copy = min(len, 16 - misalign);
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128(S));
+
+		memcpy(dst, buf + misalign, copy);
+
+		dst += copy;
+		src += copy;
+		len -= copy;
+	}
+
+	/* We assume we are doing bulk transfers, so prefer aligned moves */
+	if (((uintptr_t)dst & 15) == 0) {
+		while (len >= 64) {
+			__m128i *S = (__m128i *)src;
+			__m128i *D = (__m128i *)dst;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S + 0);
+			tmp[1] = _mm_stream_load_si128(S + 1);
+			tmp[2] = _mm_stream_load_si128(S + 2);
+			tmp[3] = _mm_stream_load_si128(S + 3);
+
+			_mm_store_si128(D + 0, tmp[0]);
+			_mm_store_si128(D + 1, tmp[1]);
+			_mm_store_si128(D + 2, tmp[2]);
+			_mm_store_si128(D + 3, tmp[3]);
+
+			src += 64;
+			dst += 64;
+			len -= 64;
+		}
+	} else {
+		while (len >= 64) {
+			__m128i *S = (__m128i *)src;
+			__m128i *D = (__m128i *)dst;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S + 0);
+			tmp[1] = _mm_stream_load_si128(S + 1);
+			tmp[2] = _mm_stream_load_si128(S + 2);
+			tmp[3] = _mm_stream_load_si128(S + 3);
+
+			_mm_storeu_si128(D + 0, tmp[0]);
+			_mm_storeu_si128(D + 1, tmp[1]);
+			_mm_storeu_si128(D + 2, tmp[2]);
+			_mm_storeu_si128(D + 3, tmp[3]);
+
+			src += 64;
+			dst += 64;
+			len -= 64;
+		}
+	}
+
+	while (len >= 16) {
+		_mm_storeu_si128((__m128i *)dst,
+				 _mm_stream_load_si128((__m128i *)src));
+
+		src += 16;
+		dst += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128((__m128i *)src));
+		memcpy(dst, buf, len);
+	}
+}
+
+#pragma GCC pop_options
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+	if (igt_x86_features() & SSE4_1)
+		return memcpy_from_wc_sse41;
+
+	return memcpy_from_wc;
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+	__attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
 #endif /* IGT_X86_H */
diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
index c8ff961d..2d7fb2ff 100644
--- a/tests/gem_fence_thrash.c
+++ b/tests/gem_fence_thrash.c
@@ -107,75 +107,16 @@ bo_copy (void *_arg)
 	return NULL;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-
-#define MOVNT 512
-
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-static void copy_wc_cacheline(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-		__m128i tmp[4];
-
-		tmp[0] = _mm_stream_load_si128(S++);
-		tmp[1] = _mm_stream_load_si128(S++);
-		tmp[2] = _mm_stream_load_si128(S++);
-		tmp[3] = _mm_stream_load_si128(S++);
-
-		_mm_store_si128(D++, tmp[0]);
-		_mm_store_si128(D++, tmp[1]);
-		_mm_store_si128(D++, tmp[2]);
-		_mm_store_si128(D++, tmp[3]);
-	} else
-		memcpy(dst, src, CACHELINE);
-}
-
-#pragma GCC pop_options
-
-#else
-
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
 
 static void copy_wc_cacheline(void *dst, const void *src)
 {
-	memcpy(dst, src, CACHELINE);
+	igt_memcpy_from_wc(dst, src, CACHELINE);
 }
 
-#endif
-
 static void
 _bo_write_verify(struct test *t)
 {
diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
index 0f598125..6a332b25 100644
--- a/tests/gem_mmap_gtt.c
+++ b/tests/gem_mmap_gtt.c
@@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
 	munmap(linear_pattern, PAGE_SIZE);
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 static unsigned int tile_row_size(int tiling, unsigned int stride)
 {
diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
index 7b5577fd..313daa38 100644
--- a/tests/gem_tiled_pread_pwrite.c
+++ b/tests/gem_tiled_pread_pwrite.c
@@ -100,45 +100,10 @@ create_bo(int fd)
 	return handle;
 }
 
-#if defined(__x86_64__) && !defined(__clang__)
-#define MOVNT 512
-
-#pragma GCC push_options
-#pragma GCC target("sse4.1")
-
-#include <smmintrin.h>
-__attribute__((noinline))
-static void copy_wc_page(void *dst, void *src)
-{
-	if (igt_x86_features() & SSE4_1) {
-		__m128i *S = (__m128i *)src;
-		__m128i *D = (__m128i *)dst;
-
-		for (int i = 0; i < PAGE_SIZE/64; i++) {
-			__m128i tmp[4];
-
-			tmp[0] = _mm_stream_load_si128(S++);
-			tmp[1] = _mm_stream_load_si128(S++);
-			tmp[2] = _mm_stream_load_si128(S++);
-			tmp[3] = _mm_stream_load_si128(S++);
-
-			_mm_store_si128(D++, tmp[0]);
-			_mm_store_si128(D++, tmp[1]);
-			_mm_store_si128(D++, tmp[2]);
-			_mm_store_si128(D++, tmp[3]);
-		}
-	} else
-		memcpy(dst, src, PAGE_SIZE);
-}
-
-#pragma GCC pop_options
-
-#else
 static void copy_wc_page(void *dst, const void *src)
 {
-	memcpy(dst, src, PAGE_SIZE);
+	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
 }
-#endif
 
 igt_simple_main
 {
-- 
2.16.2



More information about the igt-dev mailing list