[igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC
Ville Syrjälä
ville.syrjala at linux.intel.com
Wed Feb 28 17:12:44 UTC 2018
On Wed, Feb 28, 2018 at 09:00:16AM +0000, Chris Wilson wrote:
> Reading from write-combining (WC) memory is awfully slow, as each access is
> uncached and so is performed synchronously, stalling for the memory load.
> To address this, x86 introduced new instructions in SSE 4.1 that use a
> small internal buffer to accelerate reading back a cacheline at a time
> from uncached memory.
>
> v2: Don't be lazy and handle misalignment.
> v3: Switch out of sse41 before emitting the generic memcpy routine
> v4: Replace the open-coded memcpy_from_wc routines in the tests
> v5: Always flush the internal buffer before use (Eric)
> v6: Assume bulk moves, so check for dst alignment.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Eric Anholt <eric at anholt.net>
> ---
> lib/igt_fb.c | 3 +-
> lib/igt_x86.c | 124 +++++++++++++++++++++++++++++++++++++++++
> lib/igt_x86.h | 2 +
> tests/gem_fence_thrash.c | 63 +--------------------
> tests/gem_mmap_gtt.c | 37 +-----------
> tests/gem_tiled_pread_pwrite.c | 37 +-----------
> 6 files changed, 132 insertions(+), 134 deletions(-)
>
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
> index ecd73053..7404ba7c 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -32,6 +32,7 @@
> #include "drmtest.h"
> #include "igt_fb.h"
> #include "igt_kms.h"
> +#include "igt_x86.h"
> #include "ioctl_wrappers.h"
> #include "intel_chipset.h"
>
> @@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
> * it's faster to copy the whole BO to a temporary buffer and convert
> * from there.
> */
> - memcpy(buf, blit->linear.map, blit->linear.size);
> + igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
> y = &buf[blit->linear.offsets[0]];
> uv = &buf[blit->linear.offsets[1]];
>
> diff --git a/lib/igt_x86.c b/lib/igt_x86.c
> index 0ed3c6f1..54539456 100644
> --- a/lib/igt_x86.c
> +++ b/lib/igt_x86.c
> @@ -36,7 +36,10 @@
> #endif
>
> #include "igt_x86.h"
> +
> +#include <stdint.h>
> #include <stdio.h>
> +#include <string.h>
>
> /**
> * SECTION:igt_x86
> @@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
> return ret;
> }
> #endif
> +
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
What is this MOVNT define for? It doesn't appear to be used anywhere in the new code.
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +#pragma GCC diagnostic ignored "-Wpointer-arith"
> +
> +#define min(x, y) ({ \
> + typeof(x) _min1 = (x); \
> + typeof(y) _min2 = (y); \
> + (void) (&_min1 == &_min2); \
> + _min1 < _min2 ? _min1 : _min2; \
> +})
I believe igt_aux.h already provides a min() macro, so this local definition should be unnecessary.
> +
> +#include <smmintrin.h>
> +static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
> +{
> + char buf[16];
> +
> + /* Flush the internal buffer of potential stale gfx data */
> + __builtin_ia32_mfence();
Isn't there a _mm_mfence() intrinsic that could be used here instead of the builtin?
Apart from those points, everything looks all right to me.
Reviewed-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
> +
> + if ((uintptr_t)src & 15) {
> + __m128i *S = (__m128i *)((uintptr_t)src & ~15);
> + unsigned long misalign = (uintptr_t)src & 15;
> + unsigned long copy = min(len, 16 - misalign);
> +
> + _mm_storeu_si128((__m128i *)buf,
> + _mm_stream_load_si128(S));
> +
> + memcpy(dst, buf + misalign, copy);
> +
> + dst += copy;
> + src += copy;
> + len -= copy;
> + }
> +
> + /* We assume we are doing bulk transfers, so prefer aligned moves */
> + if (((uintptr_t)dst & 15) == 0) {
> + while (len >= 64) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S + 0);
> + tmp[1] = _mm_stream_load_si128(S + 1);
> + tmp[2] = _mm_stream_load_si128(S + 2);
> + tmp[3] = _mm_stream_load_si128(S + 3);
> +
> + _mm_store_si128(D + 0, tmp[0]);
> + _mm_store_si128(D + 1, tmp[1]);
> + _mm_store_si128(D + 2, tmp[2]);
> + _mm_store_si128(D + 3, tmp[3]);
> +
> + src += 64;
> + dst += 64;
> + len -= 64;
> + }
> + } else {
> + while (len >= 64) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S + 0);
> + tmp[1] = _mm_stream_load_si128(S + 1);
> + tmp[2] = _mm_stream_load_si128(S + 2);
> + tmp[3] = _mm_stream_load_si128(S + 3);
> +
> + _mm_storeu_si128(D + 0, tmp[0]);
> + _mm_storeu_si128(D + 1, tmp[1]);
> + _mm_storeu_si128(D + 2, tmp[2]);
> + _mm_storeu_si128(D + 3, tmp[3]);
> +
> + src += 64;
> + dst += 64;
> + len -= 64;
> + }
> + }
> +
> + while (len >= 16) {
> + _mm_storeu_si128((__m128i *)dst,
> + _mm_stream_load_si128((__m128i *)src));
> +
> + src += 16;
> + dst += 16;
> + len -= 16;
> + }
> +
> + if (len) {
> + _mm_storeu_si128((__m128i *)buf,
> + _mm_stream_load_si128((__m128i *)src));
> + memcpy(dst, buf, len);
> + }
> +}
> +
> +#pragma GCC pop_options
> +
> +static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> + memcpy(dst, src, len);
> +}
> +
> +static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
> +{
> + if (igt_x86_features() & SSE4_1)
> + return memcpy_from_wc_sse41;
> +
> + return memcpy_from_wc;
> +}
> +
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> + __attribute__((ifunc("resolve_memcpy_from_wc")));
> +
> +#else
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> + memcpy(dst, src, len);
> +}
> +#endif
> diff --git a/lib/igt_x86.h b/lib/igt_x86.h
> index 27b7f0fd..d4f8c343 100644
> --- a/lib/igt_x86.h
> +++ b/lib/igt_x86.h
> @@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
> }
> #endif
>
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
> #endif /* IGT_X86_H */
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index c8ff961d..2d7fb2ff 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -107,75 +107,16 @@ bo_copy (void *_arg)
> return NULL;
> }
>
> -#if defined(__x86_64__) && !defined(__clang__)
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -
> -#define MOVNT 512
> -
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> -
> - for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - }
> - } else
> - memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -static void copy_wc_cacheline(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - } else
> - memcpy(dst, src, CACHELINE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> -
> static void copy_wc_page(void *dst, const void *src)
> {
> - memcpy(dst, src, PAGE_SIZE);
> + igt_memcpy_from_wc(dst, src, PAGE_SIZE);
> }
>
> static void copy_wc_cacheline(void *dst, const void *src)
> {
> - memcpy(dst, src, CACHELINE);
> + igt_memcpy_from_wc(dst, src, CACHELINE);
> }
>
> -#endif
> -
> static void
> _bo_write_verify(struct test *t)
> {
> diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
> index 0f598125..6a332b25 100644
> --- a/tests/gem_mmap_gtt.c
> +++ b/tests/gem_mmap_gtt.c
> @@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
> munmap(linear_pattern, PAGE_SIZE);
> }
>
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> -
> - for (int i = 0; i < PAGE_SIZE/64; i++) {
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - }
> - } else
> - memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> static void copy_wc_page(void *dst, const void *src)
> {
> - memcpy(dst, src, PAGE_SIZE);
> + igt_memcpy_from_wc(dst, src, PAGE_SIZE);
> }
> -#endif
>
> static unsigned int tile_row_size(int tiling, unsigned int stride)
> {
> diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
> index 7b5577fd..313daa38 100644
> --- a/tests/gem_tiled_pread_pwrite.c
> +++ b/tests/gem_tiled_pread_pwrite.c
> @@ -100,45 +100,10 @@ create_bo(int fd)
> return handle;
> }
>
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> -
> - for (int i = 0; i < PAGE_SIZE/64; i++) {
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - }
> - } else
> - memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> static void copy_wc_page(void *dst, const void *src)
> {
> - memcpy(dst, src, PAGE_SIZE);
> + igt_memcpy_from_wc(dst, src, PAGE_SIZE);
> }
> -#endif
>
> igt_simple_main
> {
> --
> 2.16.2
>
> _______________________________________________
> igt-dev mailing list
> igt-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
--
Ville Syrjälä
Intel OTC
More information about the igt-dev mailing list