[Mesa-dev] [PATCH] i965: Provide sse2 version for rgba8 <-> bgra8 swizzle

Roland Scheidegger sroland at vmware.com
Fri Jan 15 22:34:01 PST 2016


There's actually a small bug in it: the argument order to
_mm_andnot_si128 is swapped. _mm_andnot_si128(a, b) computes (~a) & b,
so to extract the red/blue bytes the mask must come first, as in
_mm_andnot_si128(agmask, srcreg); as written, the code computes
~srcreg & agmask instead. Otherwise it should be ok...
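
For reference, here is the aligned-dst helper with the arguments in the
right order (a minimal sketch of the fix described above; the aligned-src
variant needs the same one-line change, and <emmintrin.h> is assumed to
be included as in the patch below):

static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, r, b;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   /* (~agmask) & srcreg keeps the red and blue bytes */
   rb = _mm_andnot_si128(agmask, srcreg);
   /* srcreg & agmask keeps the alpha and green bytes */
   ag = _mm_and_si128(srcreg, agmask);
   /* shift the red and blue bytes into each other's position */
   r = _mm_srli_epi32(rb, 16);
   b = _mm_slli_epi32(rb, 16);
   dstreg = _mm_or_si128(ag, r);
   dstreg = _mm_or_si128(dstreg, b);

   _mm_store_si128((__m128i *)dst, dstreg);
}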

Roland

On 16.01.2016 at 03:58, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
> 
> The existing code used ssse3, but since it isn't built in a separate
> file compiled with that instruction set enabled, it is usually not used
> (that, of course, could be fixed), whereas sse2 is always present, at
> least on 64-bit builds. The swizzle is actually trivial to do with sse2
> without pshufb, and on some cpus (I'm looking at you, atom!) it might
> not even be slower.
> This is compile-tested only, and it doesn't actually do what I really
> want, which is glReadPixels without byte access to an uncached region.
> That's what you get on intel chips without llc when the cpu doesn't
> support sse41 (with sse41 the rb would be copied over with movntdqa
> instead of mapped, so mesa's format_utils byte-swapping conversion
> would then access the cached copy instead of the uncached one). A
> proper fix really needs sse2-optimized convert_ubyte functions;
> without those, google maps in firefox is reduced to below 1 fps. But
> hey, why not. I don't even know how to hit this path...
> ---
>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 73 +++++++++++++++++++++-----
>  1 file changed, 61 insertions(+), 12 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index 2383401..3268e07 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -36,10 +36,13 @@
>  #include "brw_context.h"
>  #include "intel_tiled_memcpy.h"
>  
> -#ifdef __SSSE3__
> +#if defined(__SSSE3__)
>  #include <tmmintrin.h>
> +#elif defined(__SSE2__)
> +#include <emmintrin.h>
>  #endif
>  
> +
>  #define FILE_DEBUG_FLAG DEBUG_TEXTURE
>  
>  #define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
> @@ -56,23 +59,69 @@ static const uint32_t ytile_width = 128;
>  static const uint32_t ytile_height = 32;
>  static const uint32_t ytile_span = 16;
>  
> -#ifdef __SSSE3__
> +#if defined(__SSSE3__)
>  static const uint8_t rgba8_permutation[16] =
>     { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
>  
>  /* NOTE: dst must be 16-byte aligned. src may be unaligned. */
> -#define rgba8_copy_16_aligned_dst(dst, src)                            \
> -   _mm_store_si128((__m128i *)(dst),                                   \
> -                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
> -                                    *(__m128i *) rgba8_permutation))
> +static inline void
> +rgba8_copy_16_aligned_dst(void *dst, const void *src)
> +{
> +   __m128i reg;
> +   reg = _mm_loadu_si128((__m128i *)src);
> +   reg = _mm_shuffle_epi8(reg, *(__m128i *)rgba8_permutation);
> +   _mm_store_si128((__m128i *)dst, reg);
> +}
>  
>  /* NOTE: src must be 16-byte aligned. dst may be unaligned. */
> -#define rgba8_copy_16_aligned_src(dst, src)                            \
> -   _mm_storeu_si128((__m128i *)(dst),                                  \
> -                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
> -                                     *(__m128i *) rgba8_permutation))
> +static inline void
> +rgba8_copy_16_aligned_src(void *dst, const void *src)
> +{
> +   __m128i reg;
> +   reg = _mm_load_si128((__m128i *)src);
> +   reg = _mm_shuffle_epi8(reg, *(__m128i *)rgba8_permutation);
> +   _mm_storeu_si128((__m128i *)dst, reg);
> +}
> +
> +#elif defined(__SSE2__)
> +static inline void
> +rgba8_copy_16_aligned_dst(void *dst, const void *src)
> +{
> +   __m128i srcreg, dstreg, agmask, ag, rb, r, b;
> +
> +   agmask = _mm_set1_epi32(0xFF00FF00);
> +   srcreg = _mm_loadu_si128((__m128i *)src);
> +
> +   rb = _mm_andnot_si128(srcreg, agmask);
> +   ag = _mm_and_si128(srcreg, agmask);
> +   r = _mm_srli_epi32(rb, 16);
> +   b = _mm_slli_epi32(rb, 16);
> +   dstreg = _mm_or_si128(ag, r);
> +   dstreg = _mm_or_si128(dstreg, b);
> +
> +   _mm_store_si128((__m128i *)dst, dstreg);
> +}
> +
> +static inline void
> +rgba8_copy_16_aligned_src(void *dst, const void *src)
> +{
> +   __m128i srcreg, dstreg, agmask, ag, rb, r, b;
> +
> +   agmask = _mm_set1_epi32(0xFF00FF00);
> +   srcreg = _mm_load_si128((__m128i *)src);
> +
> +   rb = _mm_andnot_si128(srcreg, agmask);
> +   ag = _mm_and_si128(srcreg, agmask);
> +   r = _mm_srli_epi32(rb, 16);
> +   b = _mm_slli_epi32(rb, 16);
> +   dstreg = _mm_or_si128(ag, r);
> +   dstreg = _mm_or_si128(dstreg, b);
> +
> +   _mm_storeu_si128((__m128i *)dst, dstreg);
> +}
>  #endif
>  
> +
>  /**
>   * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
>   */
> @@ -82,7 +131,7 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
>     uint8_t *d = dst;
>     uint8_t const *s = src;
>  
> -#ifdef __SSSE3__
> +#if defined(__SSSE3__) || defined(__SSE2__)
>     if (bytes == 16) {
>        assert(!(((uintptr_t)dst) & 0xf));
>        rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
> @@ -120,7 +169,7 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
>     uint8_t *d = dst;
>     uint8_t const *s = src;
>  
> -#ifdef __SSSE3__
> +#if defined(__SSSE3__) || defined(__SSE2__)
>     if (bytes == 16) {
>        assert(!(((uintptr_t)src) & 0xf));
>        rgba8_copy_16_aligned_src(d+ 0, s+ 0);
> 
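
For anyone wanting to sanity-check either vector path: the pshufb
permutation and the sse2 mask-and-shift sequence both implement the same
per-pixel byte swap, equivalent to this scalar loop (illustrative only,
not part of the patch):

#include <stddef.h>
#include <stdint.h>

static void
rgba8_swizzle_ref(uint8_t *dst, const uint8_t *src, size_t bytes)
{
   size_t i;
   /* Swap bytes 0 and 2 of each 32-bit pixel (R <-> B), matching the
    * { 2,1,0,3, ... } shuffle used by the ssse3 path. */
   for (i = 0; i < bytes; i += 4) {
      dst[i + 0] = src[i + 2];
      dst[i + 1] = src[i + 1];
      dst[i + 2] = src[i + 0];
      dst[i + 3] = src[i + 3];
   }
}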
