[igt-dev] [PATCH igt v2] lib: Provide an accelerated routine for readback from WC

Chris Wilson chris at chris-wilson.co.uk
Tue Feb 27 22:17:49 UTC 2018


Reading from write-combining (WC) memory is awfully slow, as each access is
uncached and therefore performed synchronously, stalling on the memory load.
SSE 4.1 introduced a streaming-load instruction (MOVNTDQA) backed by a small
internal buffer, specifically to accelerate reading back a cacheline at a
time from uncached memory.

v2: Don't be lazy; handle misaligned source addresses and lengths.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 lib/igt_fb.c  |  3 +-
 lib/igt_x86.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/igt_x86.h |  2 ++
 3 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
 #include "drmtest.h"
 #include "igt_fb.h"
 #include "igt_kms.h"
+#include "igt_x86.h"
 #include "ioctl_wrappers.h"
 #include "intel_chipset.h"
 
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
 	 * it's faster to copy the whole BO to a temporary buffer and convert
 	 * from there.
 	 */
-	memcpy(buf, blit->linear.map, blit->linear.size);
+	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
 	y = &buf[blit->linear.offsets[0]];
 	uv = &buf[blit->linear.offsets[1]];
 
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..e15034da 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
 #endif
 
 #include "igt_x86.h"
+
+#include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * SECTION:igt_x86
@@ -174,3 +177,99 @@ char *igt_x86_features_to_string(unsigned features, char *line)
 	return ret;
 }
 #endif
+
+#if defined(__x86_64__) && !defined(__clang__) /* accelerated path: x86-64 builds with a non-clang compiler only */
+#define MOVNT 512 /* NOTE(review): not referenced anywhere in the code shown here */
+
+#pragma GCC push_options /* save the current target options; restored by the matching pop_options */
+#pragma GCC target("sse4.1") /* allow SSE4.1 intrinsics in the functions that follow */
+#pragma GCC diagnostic ignored "-Wpointer-arith" /* the copy routine below does arithmetic on void * */
+
+#define min(x, y) ({                            /* smaller of x and y (GNU statement expression) */ \
+	typeof(x) _min1 = (x);                  /* each argument is evaluated exactly once */ \
+	typeof(y) _min2 = (y);                  \
+	(void) (&_min1 == &_min2);              /* pointer comparison makes the compiler complain if x and y differ in type */ \
+	_min1 < _min2 ? _min1 : _min2;		/* value of the whole expression */ \
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len) /* copy len bytes from src to dst using 16-byte streaming loads */
+{
+	if ((uintptr_t)src & 15) { /* head: src is not 16-byte aligned */
+		char buf[16];
+		__m128i *S = (__m128i *)((uintptr_t)src & ~15); /* src rounded down to a 16-byte boundary */
+		unsigned long misalign = (uintptr_t)src & 15; /* offset of src inside that 16-byte block */
+		unsigned long copy = min(len, 16 - misalign); /* bytes to take from this block, capped at len */
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128(S)); /* load the whole aligned block into buf */
+
+		memcpy(dst, buf + misalign, copy); /* keep only the bytes from the original src onwards */
+
+		dst += copy; /* void * arithmetic (GNU C extension) */
+		src += copy; /* src is now 16-byte aligned unless len has dropped to 0 */
+		len -= copy;
+	}
+
+	while (len >= 64) { /* bulk: 64 bytes per iteration */
+		__m128i *S = (__m128i *)src;
+		__m128i *D = (__m128i *)dst;
+		__m128i tmp[4];
+
+		tmp[0] = _mm_stream_load_si128(S + 0); /* four 16-byte streaming loads first... */
+		tmp[1] = _mm_stream_load_si128(S + 1);
+		tmp[2] = _mm_stream_load_si128(S + 2);
+		tmp[3] = _mm_stream_load_si128(S + 3);
+
+		_mm_storeu_si128(D + 0, tmp[0]); /* ...then four unaligned stores, so dst needs no particular alignment */
+		_mm_storeu_si128(D + 1, tmp[1]);
+		_mm_storeu_si128(D + 2, tmp[2]);
+		_mm_storeu_si128(D + 3, tmp[3]);
+
+		src += 64;
+		dst += 64;
+		len -= 64;
+	}
+
+	while (len >= 16) { /* remaining whole 16-byte blocks, one at a time */
+		_mm_storeu_si128((__m128i *)dst,
+				 _mm_stream_load_si128((__m128i *)src));
+
+		src += 16;
+		dst += 16;
+		len -= 16;
+	}
+
+	if (len) { /* tail: fewer than 16 bytes remain */
+		char buf[16];
+
+		_mm_storeu_si128((__m128i *)buf,
+				 _mm_stream_load_si128((__m128i *)src)); /* still loads a full 16 bytes from src */
+		memcpy(dst, buf, len); /* but only len bytes are written to dst */
+	}
+}
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len) /* generic fallback with the same signature as the SSE4.1 routine */
+{
+	memcpy(dst, src, len); /* plain libc copy of len bytes */
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long) /* returns a pointer to the copy routine to use */
+{
+	if (igt_x86_features() & SSE4_1) /* SSE4_1 bit set in the mask returned by igt_x86_features() */
+		return memcpy_from_wc_sse41;
+
+	return memcpy_from_wc; /* otherwise use the plain memcpy wrapper */
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len) /* public entry point; no body here, it is bound via ifunc */
+	__attribute__((ifunc("resolve_memcpy_from_wc"))); /* GNU indirect function: the implementation is whatever resolve_memcpy_from_wc() returns */
+
+#pragma GCC pop_options
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len) /* #else branch: build without the SSE4.1 path */
+{
+	memcpy(dst, src, len); /* plain libc copy of len bytes */
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
 }
 #endif
 
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len); /* copy len bytes from src to dst; memcpy-style arguments */
+
 #endif /* IGT_X86_H */
-- 
2.16.2



More information about the igt-dev mailing list