[igt-dev] [PATCH igt v3] lib: Provide an accelerated routine for readback from WC
Chris Wilson
chris at chris-wilson.co.uk
Tue Feb 27 22:20:57 UTC 2018
Reading from write-combining (WC) memory is awfully slow, as each access
is uncached and so performed synchronously, stalling for the memory load.
For this purpose, SSE4.1 on x86 introduced a streaming-load instruction
(MOVNTDQA) that uses a small internal buffer to accelerate reading back
a cacheline at a time from uncached memory.
v2: Don't be lazy: handle a misaligned source address and length.
v3: Switch out of the sse4.1 target options before emitting the generic
    memcpy fallback routine.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
lib/igt_fb.c | 3 +-
lib/igt_x86.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/igt_x86.h | 2 ++
3 files changed, 103 insertions(+), 1 deletion(-)
diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
#include "drmtest.h"
#include "igt_fb.h"
#include "igt_kms.h"
+#include "igt_x86.h"
#include "ioctl_wrappers.h"
#include "intel_chipset.h"
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
* it's faster to copy the whole BO to a temporary buffer and convert
* from there.
*/
- memcpy(buf, blit->linear.map, blit->linear.size);
+ igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
y = &buf[blit->linear.offsets[0]];
uv = &buf[blit->linear.offsets[1]];
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..eba4c898 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
#endif
#include "igt_x86.h"
+
+#include <stdint.h>
#include <stdio.h>
+#include <string.h>
/**
* SECTION:igt_x86
@@ -174,3 +177,99 @@ char *igt_x86_features_to_string(unsigned features, char *line)
return ret;
}
#endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; \
+})
+
+#include <smmintrin.h>
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+ if ((uintptr_t)src & 15) {
+ char buf[16];
+ __m128i *S = (__m128i *)((uintptr_t)src & ~15);
+ unsigned long misalign = (uintptr_t)src & 15;
+ unsigned long copy = min(len, 16 - misalign);
+
+ _mm_storeu_si128((__m128i *)buf,
+ _mm_stream_load_si128(S));
+
+ memcpy(dst, buf + misalign, copy);
+
+ dst += copy;
+ src += copy;
+ len -= copy;
+ }
+
+ while (len >= 64) {
+ __m128i *S = (__m128i *)src;
+ __m128i *D = (__m128i *)dst;
+ __m128i tmp[4];
+
+ tmp[0] = _mm_stream_load_si128(S + 0);
+ tmp[1] = _mm_stream_load_si128(S + 1);
+ tmp[2] = _mm_stream_load_si128(S + 2);
+ tmp[3] = _mm_stream_load_si128(S + 3);
+
+ _mm_storeu_si128(D + 0, tmp[0]);
+ _mm_storeu_si128(D + 1, tmp[1]);
+ _mm_storeu_si128(D + 2, tmp[2]);
+ _mm_storeu_si128(D + 3, tmp[3]);
+
+ src += 64;
+ dst += 64;
+ len -= 64;
+ }
+
+ while (len >= 16) {
+ _mm_storeu_si128((__m128i *)dst,
+ _mm_stream_load_si128((__m128i *)src));
+
+ src += 16;
+ dst += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ char buf[16];
+
+ _mm_storeu_si128((__m128i *)buf,
+ _mm_stream_load_si128((__m128i *)src));
+ memcpy(dst, buf, len);
+ }
+}
+
+#pragma GCC pop_options
+
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+ memcpy(dst, src, len);
+}
+
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+ if (igt_x86_features() & SSE4_1)
+ return memcpy_from_wc_sse41;
+
+ return memcpy_from_wc;
+}
+
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+ __attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#else
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+ memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
}
#endif
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
#endif /* IGT_X86_H */
--
2.16.2
More information about the igt-dev mailing list