[igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC
Chris Wilson
chris at chris-wilson.co.uk
Tue Feb 27 21:50:40 UTC 2018
Reading from write-combining (WC) memory is awfully slow, as each access
is uncached and so performed synchronously, stalling for the memory load.
For exactly this purpose, SSE4.1 introduced a streaming load instruction
(MOVNTDQA) that uses a small internal buffer to accelerate reading back
a cacheline at a time from uncached memory.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
lib/igt_fb.c | 3 ++-
lib/igt_x86.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
lib/igt_x86.h | 2 ++
3 files changed, 50 insertions(+), 1 deletion(-)
diff --git a/lib/igt_fb.c b/lib/igt_fb.c
index ecd73053..7404ba7c 100644
--- a/lib/igt_fb.c
+++ b/lib/igt_fb.c
@@ -32,6 +32,7 @@
#include "drmtest.h"
#include "igt_fb.h"
#include "igt_kms.h"
+#include "igt_x86.h"
#include "ioctl_wrappers.h"
#include "intel_chipset.h"
@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
* it's faster to copy the whole BO to a temporary buffer and convert
* from there.
*/
- memcpy(buf, blit->linear.map, blit->linear.size);
+ igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
y = &buf[blit->linear.offsets[0]];
uv = &buf[blit->linear.offsets[1]];
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..b7b57284 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,10 @@
#endif
#include "igt_x86.h"
+
+#include <stdint.h>
#include <stdio.h>
+#include <string.h>
/**
* SECTION:igt_x86
@@ -174,3 +177,85 @@ char *igt_x86_features_to_string(unsigned features, char *line)
return ret;
}
#endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+
+#include <smmintrin.h>
+
+/**
+ * igt_memcpy_from_wc:
+ * @dst: destination buffer
+ * @src: source buffer, expected to be a write-combining (WC) mapping
+ * @len: number of bytes to copy
+ *
+ * Copies @len bytes from @src to @dst. When the CPU reports SSE4.1, the
+ * bulk of the copy is done 64 bytes at a time with MOVNTDQA streaming
+ * loads; an unaligned head and the sub-64-byte tail go through memcpy().
+ * Without SSE4.1 this is a plain memcpy().
+ */
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	const uint8_t *s = src;
+	uint8_t *d = dst;
+
+	if (igt_x86_features() & SSE4_1) {
+		/*
+		 * MOVNTDQA faults on addresses that are not 16-byte aligned.
+		 * Instead of abandoning the fast path for an unaligned @src,
+		 * copy the (at most 15) leading bytes normally first.
+		 */
+		unsigned long head = -(uintptr_t)s & 15;
+
+		if (head > len)
+			head = len;
+		memcpy(d, s, head);
+		s += head;
+		d += head;
+		len -= head;
+
+		/*
+		 * Streaming loads are weakly ordered and served from an
+		 * internal buffer; fence first so that lines written by
+		 * another agent are not read back stale.
+		 */
+		if (len >= 64)
+			_mm_mfence();
+
+		while (len >= 64) {
+			__m128i *in = (__m128i *)s;
+			__m128i *out = (__m128i *)d;
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(in + 0);
+			tmp[1] = _mm_stream_load_si128(in + 1);
+			tmp[2] = _mm_stream_load_si128(in + 2);
+			tmp[3] = _mm_stream_load_si128(in + 3);
+
+			_mm_storeu_si128(out + 0, tmp[0]);
+			_mm_storeu_si128(out + 1, tmp[1]);
+			_mm_storeu_si128(out + 2, tmp[2]);
+			_mm_storeu_si128(out + 3, tmp[3]);
+
+			s += 64;
+			d += 64;
+			len -= 64;
+		}
+	}
+
+	/* Tail, or the whole copy when SSE4.1 is unavailable. */
+	memcpy(d, s, len);
+}
+
+#pragma GCC pop_options
+
+#else
+/* Fallback for non-x86-64 or clang builds: no streaming loads, plain copy. */
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+#endif
diff --git a/lib/igt_x86.h b/lib/igt_x86.h
index 27b7f0fd..d4f8c343 100644
--- a/lib/igt_x86.h
+++ b/lib/igt_x86.h
@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
}
#endif
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
+
#endif /* IGT_X86_H */
--
2.16.2
More information about the igt-dev
mailing list