[Mesa-dev] [PATCH 2/4] i965: Provide sse2 version for rgba8 <-> bgra8 swizzle

sroland at vmware.com sroland at vmware.com
Sun Jan 17 13:49:47 PST 2016


From: Roland Scheidegger <sroland at vmware.com>

The existing code used ssse3, and because it isn't compiled in a separate
file built with that flag, it is usually not used (that, of course, could
be fixed...), whereas sse2 is always present, at least with 64-bit builds.
The swizzle is actually trivial to do with sse2 without pshufb, and on some
cpus (I'm looking at you, atom!) it might not even be slower.
This is compile-tested only. It doesn't actually do what I really want,
which is glReadPixels without byte access to an uncached region - that is
what you get on intel chips without llc when the cpu doesn't support sse41
(with sse41 the rb would be copied over with movntdqa instead of mapped, so
the mesa format_utils byte-swapping conversion would then access the cached
copy instead of the uncached one). A proper fix really needs sse2-optimized
convert_ubyte functions, otherwise google maps in firefox is reduced to
below 1 fps. But hey, why not. I don't have a gpu which could possibly hit
this, albeit I successfully used the exact same code elsewhere.

v2: fix andnot argument order, add comments
---
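(Not part of the patch - a self-contained sketch for reviewers of the sse2
mask/shift swizzle used below, together with a scalar reference to check it
against; the function names here are purely illustrative:)

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar reference: swap the R and B bytes of one RGBA8 pixel. */
static uint32_t
swap_rb_scalar(uint32_t p)
{
   return (p & 0xFF00FF00u) | ((p >> 16) & 0xFFu) | ((p & 0xFFu) << 16);
}

/* SSE2 version, same trick as the patch: keep G/A in place, isolate R/B,
 * swap them by shifting within each 32-bit lane, then merge.
 * Both pointers are assumed 16-byte aligned in this sketch.
 */
static void
swap_rb_sse2(uint32_t dst[4], const uint32_t src[4])
{
   __m128i agmask = _mm_set1_epi32(0xFF00FF00);
   __m128i pix = _mm_load_si128((const __m128i *)src);
   __m128i ag = _mm_and_si128(agmask, pix);
   __m128i rb = _mm_andnot_si128(agmask, pix);
   __m128i swapped = _mm_or_si128(_mm_srli_epi32(rb, 16),
                                  _mm_slli_epi32(rb, 16));
   _mm_store_si128((__m128i *)dst, _mm_or_si128(ag, swapped));
}

int
main(void)
{
   uint32_t src[4] __attribute__((aligned(16))) =
      { 0x01020304, 0x11223344, 0xdeadbeef, 0xffffffff };
   uint32_t dst[4] __attribute__((aligned(16)));

   swap_rb_sse2(dst, src);
   for (int i = 0; i < 4; i++)
      printf("%08x -> %08x (scalar %08x)\n",
             src[i], dst[i], swap_rb_scalar(src[i]));
   return 0;
}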
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c  | 18 +++++++
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 73 +++++++++++++++++++++-----
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 108dd87..5fc4212 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2773,6 +2773,24 @@ intel_miptree_map(struct brw_context *brw,
    } else if (!(mode & GL_MAP_WRITE_BIT) &&
               !mt->compressed && cpu_has_sse4_1 &&
               (mt->pitch % 16 == 0)) {
+      /*
+       * XXX: without sse4_1, in some situations we still really want to
+       * copy regardless. Presumably it is not done for performance
+       * reasons: with movntdqa, thanks to the 64-byte streaming load
+       * buffer, the uncached->cached copy followed by the later
+       * cached->cached one is always faster than an "ordinary"
+       * uncached->cached copy. Without movntdqa an additional copy of
+       * course doesn't help, albeit the uncached->cached copy is an order
+       * of magnitude slower than the later cached->cached one in any case.
+       * But mesa may not do a simple memcpy on that memory later - some
+       * glReadPixels paths, for instance, will hit per-byte access, which
+       * is a complete performance killer on uncached memory. In those
+       * cases we really want to copy regardless, unless the map below
+       * could play some tricks to make the memory cached.
+       * (Or otherwise ensure mesa can't hit these paths often; for
+       * instance glReadPixels requiring conversions could be handled by
+       * meta, so in the end it really would be just a memcpy.)
+       */
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 2383401..42fdde1 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -36,10 +36,13 @@
 #include "brw_context.h"
 #include "intel_tiled_memcpy.h"
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__)
 #include <tmmintrin.h>
+#elif defined(__SSE2__)
+#include <emmintrin.h>
 #endif
 
+
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
 
 #define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
@@ -56,23 +59,69 @@ static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__)
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
 
 /* NOTE: dst must be 16-byte aligned. src may be unaligned. */
-#define rgba8_copy_16_aligned_dst(dst, src)                            \
-   _mm_store_si128((__m128i *)(dst),                                   \
-                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
-                                    *(__m128i *) rgba8_permutation))
+static inline void
+rgba8_copy_16_aligned_dst(void *dst, const void *src)
+{
+   __m128i reg;
+   reg = _mm_loadu_si128((__m128i *)src);
+   reg = _mm_shuffle_epi8(reg, *(__m128i *)rgba8_permutation);
+   _mm_store_si128((__m128i *)dst, reg);
+}
 
 /* NOTE: src must be 16-byte aligned. dst may be unaligned. */
-#define rgba8_copy_16_aligned_src(dst, src)                            \
-   _mm_storeu_si128((__m128i *)(dst),                                  \
-                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
-                                     *(__m128i *) rgba8_permutation))
+static inline void
+rgba8_copy_16_aligned_src(void *dst, const void *src)
+{
+   __m128i reg;
+   reg = _mm_load_si128((__m128i *)src);
+   reg = _mm_shuffle_epi8(reg, *(__m128i *)rgba8_permutation);
+   _mm_storeu_si128((__m128i *)dst, reg);
+}
+
+#elif defined(__SSE2__)
+static inline void
+rgba8_copy_16_aligned_dst(void *dst, const void *src)
+{
+   __m128i srcreg, dstreg, agmask, ag, rb, r, b;
+
+   agmask = _mm_set1_epi32(0xFF00FF00);
+   srcreg = _mm_loadu_si128((__m128i *)src);
+
+   rb = _mm_andnot_si128(agmask, srcreg);
+   ag = _mm_and_si128(agmask, srcreg);
+   r = _mm_srli_epi32(rb, 16);
+   b = _mm_slli_epi32(rb, 16);
+   dstreg = _mm_or_si128(ag, r);
+   dstreg = _mm_or_si128(dstreg, b);
+
+   _mm_store_si128((__m128i *)dst, dstreg);
+}
+
+static inline void
+rgba8_copy_16_aligned_src(void *dst, const void *src)
+{
+   __m128i srcreg, dstreg, agmask, ag, rb, r, b;
+
+   agmask = _mm_set1_epi32(0xFF00FF00);
+   srcreg = _mm_load_si128((__m128i *)src);
+
+   rb = _mm_andnot_si128(agmask, srcreg);
+   ag = _mm_and_si128(agmask, srcreg);
+   r = _mm_srli_epi32(rb, 16);
+   b = _mm_slli_epi32(rb, 16);
+   dstreg = _mm_or_si128(ag, r);
+   dstreg = _mm_or_si128(dstreg, b);
+
+   _mm_storeu_si128((__m128i *)dst, dstreg);
+}
 #endif
 
+
 /**
  * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
  */
@@ -82,7 +131,7 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__) || defined(__SSE2__)
    if (bytes == 16) {
       assert(!(((uintptr_t)dst) & 0xf));
       rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
@@ -120,7 +169,7 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__) || defined(__SSE2__)
    if (bytes == 16) {
       assert(!(((uintptr_t)src) & 0xf));
       rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-- 
2.1.4
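
(Also not part of the patch: the XXX comment added to intel_miptree_map
above refers to copying the rb out of the uncached mapping with movntdqa
when sse4_1 is available. A minimal sketch of that streaming-copy pattern,
assuming a 16-byte-aligned source and a size that is a multiple of 64
bytes - the function name is illustrative, not the actual
intel_mipmap_tree.c code:)

#include <smmintrin.h>  /* _mm_stream_load_si128 (movntdqa) needs sse4_1 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Pull data out of an uncached/write-combined mapping into ordinary
 * cached memory. movntdqa fills a 64-byte streaming load buffer, so four
 * consecutive 16-byte loads per line are cheap, and any later access
 * (even per-byte) then hits the cached copy instead of uncached memory.
 */
static void
streaming_copy(void *dst, void *src, size_t bytes)
{
   char *d = dst;
   char *s = src;

   assert(((uintptr_t)s % 16) == 0 && bytes % 64 == 0);

   for (size_t i = 0; i < bytes; i += 64) {
      __m128i r0 = _mm_stream_load_si128((__m128i *)(s + i +  0));
      __m128i r1 = _mm_stream_load_si128((__m128i *)(s + i + 16));
      __m128i r2 = _mm_stream_load_si128((__m128i *)(s + i + 32));
      __m128i r3 = _mm_stream_load_si128((__m128i *)(s + i + 48));
      _mm_storeu_si128((__m128i *)(d + i +  0), r0);
      _mm_storeu_si128((__m128i *)(d + i + 16), r1);
      _mm_storeu_si128((__m128i *)(d + i + 32), r2);
      _mm_storeu_si128((__m128i *)(d + i + 48), r3);
   }
}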