[igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC

Ville Syrjälä ville.syrjala at linux.intel.com
Wed Feb 28 17:12:44 UTC 2018


On Wed, Feb 28, 2018 at 09:00:16AM +0000, Chris Wilson wrote:
> Reading from write-combining (WC) memory is awfully slow as each access is
> uncached and so performed synchronously, stalling for the memory load. To
> address this, x86 introduced new instructions in SSE 4.1 that provide a
> small internal buffer to accelerate reading back a cacheline at a time
> from uncached memory.
> 
> v2: Don't be lazy and handle misalignment.
> v3: Switch out of sse41 before emitting the generic memcpy routine
> v4: Replace open-coded memcpy_from_wc
> v5: Always flush the internal buffer before use (Eric)
> v6: Assume bulk moves, so check for dst alignment.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Eric Anholt <eric at anholt.net>
> ---
>  lib/igt_fb.c                   |   3 +-
>  lib/igt_x86.c                  | 124 +++++++++++++++++++++++++++++++++++++++++
>  lib/igt_x86.h                  |   2 +
>  tests/gem_fence_thrash.c       |  63 +--------------------
>  tests/gem_mmap_gtt.c           |  37 +-----------
>  tests/gem_tiled_pread_pwrite.c |  37 +-----------
>  6 files changed, 132 insertions(+), 134 deletions(-)
> 
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
> index ecd73053..7404ba7c 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -32,6 +32,7 @@
>  #include "drmtest.h"
>  #include "igt_fb.h"
>  #include "igt_kms.h"
> +#include "igt_x86.h"
>  #include "ioctl_wrappers.h"
>  #include "intel_chipset.h"
>  
> @@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
>  	 * it's faster to copy the whole BO to a temporary buffer and convert
>  	 * from there.
>  	 */
> -	memcpy(buf, blit->linear.map, blit->linear.size);
> +	igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
>  	y = &buf[blit->linear.offsets[0]];
>  	uv = &buf[blit->linear.offsets[1]];
>  
> diff --git a/lib/igt_x86.c b/lib/igt_x86.c
> index 0ed3c6f1..54539456 100644
> --- a/lib/igt_x86.c
> +++ b/lib/igt_x86.c
> @@ -36,7 +36,10 @@
>  #endif
>  
>  #include "igt_x86.h"
> +
> +#include <stdint.h>
>  #include <stdio.h>
> +#include <string.h>
>  
>  /**
>   * SECTION:igt_x86
> @@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
>  	return ret;
>  }
>  #endif
> +
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512

What's this MOVNT define?

> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +#pragma GCC diagnostic ignored "-Wpointer-arith"
> +
> +#define min(x, y) ({                            \
> +	typeof(x) _min1 = (x);                  \
> +	typeof(y) _min2 = (y);                  \
> +	(void) (&_min1 == &_min2);              \
> +	_min1 < _min2 ? _min1 : _min2;		\
> +})

I believe igt_aux.h already provides this min() macro.

> +
> +#include <smmintrin.h>
> +static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
> +{
> +	char buf[16];
> +
> +	/* Flush the internal buffer of potential stale gfx data */
> +	__builtin_ia32_mfence();

Isn't there a _mm_mfence()?

Apart from those everything looks all right to me.
Reviewed-by: Ville Syrjälä <ville.syrjala at linux.intel.com>

> +
> +	if ((uintptr_t)src & 15) {
> +		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
> +		unsigned long misalign = (uintptr_t)src & 15;
> +		unsigned long copy = min(len, 16 - misalign);
> +
> +		_mm_storeu_si128((__m128i *)buf,
> +				 _mm_stream_load_si128(S));
> +
> +		memcpy(dst, buf + misalign, copy);
> +
> +		dst += copy;
> +		src += copy;
> +		len -= copy;
> +	}
> +
> +	/* We assume we are doing bulk transfers, so prefer aligned moves */
> +	if (((uintptr_t)dst & 15) == 0) {
> +		while (len >= 64) {
> +			__m128i *S = (__m128i *)src;
> +			__m128i *D = (__m128i *)dst;
> +			__m128i tmp[4];
> +
> +			tmp[0] = _mm_stream_load_si128(S + 0);
> +			tmp[1] = _mm_stream_load_si128(S + 1);
> +			tmp[2] = _mm_stream_load_si128(S + 2);
> +			tmp[3] = _mm_stream_load_si128(S + 3);
> +
> +			_mm_store_si128(D + 0, tmp[0]);
> +			_mm_store_si128(D + 1, tmp[1]);
> +			_mm_store_si128(D + 2, tmp[2]);
> +			_mm_store_si128(D + 3, tmp[3]);
> +
> +			src += 64;
> +			dst += 64;
> +			len -= 64;
> +		}
> +	} else {
> +		while (len >= 64) {
> +			__m128i *S = (__m128i *)src;
> +			__m128i *D = (__m128i *)dst;
> +			__m128i tmp[4];
> +
> +			tmp[0] = _mm_stream_load_si128(S + 0);
> +			tmp[1] = _mm_stream_load_si128(S + 1);
> +			tmp[2] = _mm_stream_load_si128(S + 2);
> +			tmp[3] = _mm_stream_load_si128(S + 3);
> +
> +			_mm_storeu_si128(D + 0, tmp[0]);
> +			_mm_storeu_si128(D + 1, tmp[1]);
> +			_mm_storeu_si128(D + 2, tmp[2]);
> +			_mm_storeu_si128(D + 3, tmp[3]);
> +
> +			src += 64;
> +			dst += 64;
> +			len -= 64;
> +		}
> +	}
> +
> +	while (len >= 16) {
> +		_mm_storeu_si128((__m128i *)dst,
> +				 _mm_stream_load_si128((__m128i *)src));
> +
> +		src += 16;
> +		dst += 16;
> +		len -= 16;
> +	}
> +
> +	if (len) {
> +		_mm_storeu_si128((__m128i *)buf,
> +				 _mm_stream_load_si128((__m128i *)src));
> +		memcpy(dst, buf, len);
> +	}
> +}
> +
> +#pragma GCC pop_options
> +
> +static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	memcpy(dst, src, len);
> +}
> +
> +static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
> +{
> +	if (igt_x86_features() & SSE4_1)
> +		return memcpy_from_wc_sse41;
> +
> +	return memcpy_from_wc;
> +}
> +
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +	__attribute__((ifunc("resolve_memcpy_from_wc")));
> +
> +#else
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> +	memcpy(dst, src, len);
> +}
> +#endif
> diff --git a/lib/igt_x86.h b/lib/igt_x86.h
> index 27b7f0fd..d4f8c343 100644
> --- a/lib/igt_x86.h
> +++ b/lib/igt_x86.h
> @@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
>  }
>  #endif
>  
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
>  #endif /* IGT_X86_H */
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index c8ff961d..2d7fb2ff 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -107,75 +107,16 @@ bo_copy (void *_arg)
>  	return NULL;
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -
> -#define MOVNT 512
> -
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -static void copy_wc_cacheline(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -		__m128i tmp[4];
> -
> -		tmp[0] = _mm_stream_load_si128(S++);
> -		tmp[1] = _mm_stream_load_si128(S++);
> -		tmp[2] = _mm_stream_load_si128(S++);
> -		tmp[3] = _mm_stream_load_si128(S++);
> -
> -		_mm_store_si128(D++, tmp[0]);
> -		_mm_store_si128(D++, tmp[1]);
> -		_mm_store_si128(D++, tmp[2]);
> -		_mm_store_si128(D++, tmp[3]);
> -	} else
> -		memcpy(dst, src, CACHELINE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> -
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
>  
>  static void copy_wc_cacheline(void *dst, const void *src)
>  {
> -	memcpy(dst, src, CACHELINE);
> +	igt_memcpy_from_wc(dst, src, CACHELINE);
>  }
>  
> -#endif
> -
>  static void
>  _bo_write_verify(struct test *t)
>  {
> diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
> index 0f598125..6a332b25 100644
> --- a/tests/gem_mmap_gtt.c
> +++ b/tests/gem_mmap_gtt.c
> @@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
>  	munmap(linear_pattern, PAGE_SIZE);
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/64; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
> -#endif
>  
>  static unsigned int tile_row_size(int tiling, unsigned int stride)
>  {
> diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
> index 7b5577fd..313daa38 100644
> --- a/tests/gem_tiled_pread_pwrite.c
> +++ b/tests/gem_tiled_pread_pwrite.c
> @@ -100,45 +100,10 @@ create_bo(int fd)
>  	return handle;
>  }
>  
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> -	if (igt_x86_features() & SSE4_1) {
> -		__m128i *S = (__m128i *)src;
> -		__m128i *D = (__m128i *)dst;
> -
> -		for (int i = 0; i < PAGE_SIZE/64; i++) {
> -			__m128i tmp[4];
> -
> -			tmp[0] = _mm_stream_load_si128(S++);
> -			tmp[1] = _mm_stream_load_si128(S++);
> -			tmp[2] = _mm_stream_load_si128(S++);
> -			tmp[3] = _mm_stream_load_si128(S++);
> -
> -			_mm_store_si128(D++, tmp[0]);
> -			_mm_store_si128(D++, tmp[1]);
> -			_mm_store_si128(D++, tmp[2]);
> -			_mm_store_si128(D++, tmp[3]);
> -		}
> -	} else
> -		memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
>  static void copy_wc_page(void *dst, const void *src)
>  {
> -	memcpy(dst, src, PAGE_SIZE);
> +	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
>  }
> -#endif
>  
>  igt_simple_main
>  {
> -- 
> 2.16.2
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev

-- 
Ville Syrjälä
Intel OTC


More information about the igt-dev mailing list