[Mesa-dev] [PATCH 4/4] main: add sse2/ssse3 code for handling all 4 channel ubyte unorm swizzles

Sun Jan 17 13:49:49 PST 2016

From: Roland Scheidegger <sroland at vmware.com>

Like the previous patch, but this time instead of direct format pack
functions, this handles convert_ubyte if the destination and source
are both ubyte unorm with 4 channels (so this can do things like
bgrx8->rgba8, i.e. besides swizzling it also fills in 1's for alpha).
The biggest challenge is actually integrating this code into the macro
madness, can't say I found a clean solution for that but everything
compiles without warnings now...
Obviously, handling other channel numbers / conversions / channel sizes
would be nice, but that would need some more elaborate code generation
(and would still be quite difficult), probably effort would be better
spent juicing up meta to handle that.
This helps the webgl demo at webglsamples.org/aquarium/aquarium.html
by roughly a factor of 10 when using a non-sse41 capable cpu on my gm45.
Again, the huge gain is by not accessing uncached memory bytewise, but
even when using a cpu with sse41 (so all cached accesses) this is quite
noticeably faster and helps performance by about 20-30% here (cpu usage
of convert_ubyte dropped by roughly a factor of 3 in that case, down from
20% (which was about as expensive as the separate movntdqa copy) to 7% or
so). (Those numbers were using the sse2 version; the ssse3 code only seems
to gain about another 10% on that function even though it should execute
much faster, I suspect due to hitting memory limitations.)
---
 src/mesa/main/format_utils.c | 163 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 162 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/format_utils.c b/src/mesa/main/format_utils.c
index 5fdabd5..d440ef6 100644
--- a/src/mesa/main/format_utils.c
+++ b/src/mesa/main/format_utils.c
@@ -27,6 +27,14 @@
 #include "format_pack.h"
 #include "format_unpack.h"
 
+#define HAVE_UBYTE_SSE_CONV 0
+
+#if defined(__SSSE3__)
+#include <tmmintrin.h>
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
 const mesa_array_format RGBA32_FLOAT =
    MESA_ARRAY_FORMAT(4, 1, 1, 1, 4, 0, 1, 2, 3);
 
@@ -758,6 +766,202 @@ swizzle_convert_try_memcpy(void *dst,
       }                                           \
    } while (0)
 
+
+#ifndef __SSE2__
+/**
+ * No-op stand-in for builds without SSE2.
+ *
+ * It exists only so that the call in the swizzle-convert macro still
+ * compiles; HAVE_UBYTE_SSE_CONV is only set to 1 when __SSE2__ is defined,
+ * so this version is not expected to be reached at run time.
+ */
+static inline void
+swizzle_convert_loop_sse(void *void_dst, const void *void_src,
+                         const uint8_t swizzle[4], uint8_t tmp[7],
+                         unsigned count)
+{
+   (void)void_dst;
+   (void)void_src;
+   (void)swizzle;
+   (void)tmp;
+   (void)count;
+}
+
+#else
+/**
+ * Swizzle a single 4-channel ubyte pixel without SIMD.
+ *
+ * The source bytes are copied to tmp[0..3] and every destination byte is
+ * then fetched as tmp[swizzle[j]], so selectors above 3 pick up whatever
+ * the caller stored in the upper tmp slots.
+ */
+static inline void
+swizzle_convert_pixel_ubyte(uint8_t *dst, const uint8_t *src,
+                            const uint8_t swizzle[4], uint8_t tmp[7])
+{
+   int j;
+
+   for (j = 0; j < 4; ++j) {
+      tmp[j] = src[j];
+   }
+   for (j = 0; j < 4; ++j) {
+      dst[j] = tmp[swizzle[j]];
+   }
+}
+
+/**
+ * Swizzle count 4-channel ubyte pixels with SSE2 (SSSE3 if enabled).
+ *
+ * Pixels are handled one at a time until the source pointer is 16-byte
+ * aligned. If at least eight pixels remain after that, they are handled
+ * four at a time (aligned load, unaligned store). Whatever is left is
+ * again handled one at a time.
+ *
+ * \param void_dst  destination pixels, 4 bytes each
+ * \param void_src  source pixels, 4 bytes each
+ * \param swizzle   selector per destination channel: 0..3 pick a source
+ *                  channel, 4 produces zero, 5 produces tmp[5]
+ * \param tmp       scratch array; tmp[0..3] are overwritten, tmp[5] is
+ *                  read as the "one" value
+ * \param count     number of pixels to convert
+ *
+ * NOTE(review): the vector loops emit a literal 0 for selector 4 while the
+ * scalar loops emit tmp[4]; this assumes the caller set tmp[4] to 0.
+ * Selector 6 is not treated the same way by all paths -- confirm it never
+ * reaches this function with a meaningful tmp[6].
+ */
+static inline void
+swizzle_convert_loop_sse(void *void_dst, const void *void_src,
+                         const uint8_t swizzle[4], uint8_t tmp[7],
+                         unsigned count)
+{
+   const uint8_t *typed_src = void_src;
+   uint8_t *typed_dst = void_dst;
+
+   /* Scalar head: advance until the source is 16-byte aligned. */
+   while (((uintptr_t)typed_src & 0xf) && count) {
+      swizzle_convert_pixel_ubyte(typed_dst, typed_src, swizzle, tmp);
+      typed_src += 4;
+      typed_dst += 4;
+      count--;
+   }
+
+#ifndef __SSSE3__
+   if (count >= 8) {
+      const uint32_t swizzled = (uint32_t)swizzle[0] |
+                                ((uint32_t)swizzle[1] << 8) |
+                                ((uint32_t)swizzle[2] << 16) |
+                                ((uint32_t)swizzle[3] << 24);
+      const __m128i lowbitsmask = _mm_set1_epi32(0xFF);
+      const __m128i swizzles = _mm_set1_epi32((int)swizzled);
+      /* Byte lanes whose selector is 5 receive the caller's "one" value. */
+      const __m128i onemask =
+         _mm_and_si128(_mm_cmpeq_epi8(swizzles, _mm_set1_epi8(5)),
+                       _mm_set1_epi8((char)tmp[5]));
+      __m128i shiftc[4];
+      int j;
+
+      /*
+       * This is kind of overkill, could do branchy code instead
+       * of pretending we don't know anything about the swizzle
+       * (even if gcc couldn't figure out the branches statically,
+       * branch predictors would have a field day...).
+       * Or of course, just a single pshufb (plus one cmp and or to
+       * handle the one swizzles) and call it a day...
+       * It isn't too bad though.
+       *
+       * _mm_srl_epi32 takes its count from the low 64 bits of the count
+       * operand, so each count must sit alone in an otherwise zeroed
+       * register: a nonzero count replicated into every 32-bit lane
+       * reads as a value above 31 and shifts everything out.
+       */
+      for (j = 0; j < 4; ++j) {
+         shiftc[j] = _mm_cvtsi32_si128(swizzle[j] << 3);
+      }
+
+      while (count / 4) {
+         __m128i vals[4], srcr, dstr;
+         srcr = _mm_load_si128((const __m128i *)typed_src);
+         /*
+          * Note a lot of intel cpus hate shifts for some odd reason, in
+          * particular the non-immediate version. This would be especially
+          * useless on the first-gen atom which takes longer for that than
+          * I can do it by hand. The big cores are better but with the
+          * exception of Skylake any puny Jaguar is probably going to be
+          * faster here, at least per clock...
+          * The solution would be to figure out how to properly integrate
+          * ssse3 code in a separate function/file through the macro
+          * madness and just use that...
+          */
+         vals[0] = _mm_srl_epi32(srcr, shiftc[0]);
+         vals[1] = _mm_srl_epi32(srcr, shiftc[1]);
+         vals[2] = _mm_srl_epi32(srcr, shiftc[2]);
+         vals[3] = _mm_srl_epi32(srcr, shiftc[3]);
+         vals[0] = _mm_and_si128(lowbitsmask, vals[0]);
+         vals[1] = _mm_slli_epi32(_mm_and_si128(lowbitsmask, vals[1]), 8);
+         vals[2] = _mm_slli_epi32(_mm_and_si128(lowbitsmask, vals[2]), 16);
+         vals[3] = _mm_slli_epi32(vals[3], 24);
+         dstr = _mm_or_si128(vals[0], vals[1]);
+         dstr = _mm_or_si128(dstr, vals[2]);
+         dstr = _mm_or_si128(dstr, vals[3]);
+         /*
+          * Selectors 4 and above give shift counts of 32 or more, which
+          * already zero the lane, so only the "one" value is OR'ed in.
+          */
+         dstr = _mm_or_si128(dstr, onemask);
+         _mm_storeu_si128((__m128i *)typed_dst, dstr);
+
+         typed_src += 16;
+         typed_dst += 16;
+         count -= 4;
+      }
+   }
+#else
+   /* ssse3 path... */
+   if (count >= 8) {
+      const uint32_t swizzled = (uint32_t)swizzle[0] |
+                                ((uint32_t)swizzle[1] << 8) |
+                                ((uint32_t)swizzle[2] << 16) |
+                                ((uint32_t)swizzle[3] << 24);
+      const __m128i three = _mm_set1_epi8(3);
+      const __m128i four = _mm_set1_epi8(4);
+      const __m128i swiz_add = _mm_set_epi8(12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0);
+      __m128i swizzles = _mm_set1_epi32((int)swizzled);
+      /* Lanes not sourced from a channel; pshufb zeroes them (bit 7 set). */
+      const __m128i constmask = _mm_cmpgt_epi8(swizzles, three);
+      /* Lanes with a selector above 4 receive the caller's "one" value. */
+      const __m128i onemask =
+         _mm_and_si128(_mm_cmpgt_epi8(swizzles, four),
+                       _mm_set1_epi8((char)tmp[5]));
+
+      /* Turn per-pixel channel numbers into byte indices in the register. */
+      swizzles = _mm_add_epi8(swizzles, swiz_add);
+      swizzles = _mm_or_si128(swizzles, constmask);
+
+      while (count / 4) {
+         __m128i srcr, dstr;
+         srcr = _mm_load_si128((const __m128i *)typed_src);
+         dstr = _mm_shuffle_epi8(srcr, swizzles);
+         dstr = _mm_or_si128(dstr, onemask);
+         _mm_storeu_si128((__m128i *)typed_dst, dstr);
+
+         typed_src += 16;
+         typed_dst += 16;
+         count -= 4;
+      }
+   }
+#endif
+
+   /* Scalar tail for whatever the vector loop left over. */
+   while (count) {
+      swizzle_convert_pixel_ubyte(typed_dst, typed_src, swizzle, tmp);
+      typed_src += 4;
+      typed_dst += 4;
+      count--;
+   }
+}
+#endif
+
 /**
  * Represents a single swizzle-and-convert operation
  *
@@ -848,7 +998,10 @@ swizzle_convert_try_memcpy(void *dst,
             SWIZZLE_CONVERT_LOOP(DST_TYPE, 4, SRC_TYPE, 3, CONV); \
             break;                                                \
          case 4:                                                  \
-            SWIZZLE_CONVERT_LOOP(DST_TYPE, 4, SRC_TYPE, 4, CONV); \
+            if (HAVE_UBYTE_SSE_CONV)                              \
+               swizzle_convert_loop_sse(void_dst, void_src, swizzle, (void *)tmp, count); \
+            else                                                                          \
+               SWIZZLE_CONVERT_LOOP(DST_TYPE, 4, SRC_TYPE, 4, CONV);                      \
             break;                                                \
          }                                                        \
          break;                                                   \
@@ -1002,7 +1155,15 @@ convert_ubyte(void *void_dst, int num_dst_channels,
       }
       break;
    case MESA_ARRAY_FORMAT_TYPE_UBYTE:
+#if defined(__SSE2__)
+#undef HAVE_UBYTE_SSE_CONV
+#define HAVE_UBYTE_SSE_CONV 1
+#endif
       SWIZZLE_CONVERT(uint8_t, uint8_t, src);
+#if defined(__SSE2__)
+#undef HAVE_UBYTE_SSE_CONV
+#define HAVE_UBYTE_SSE_CONV 0
+#endif
       break;
    case MESA_ARRAY_FORMAT_TYPE_BYTE:
       if (normalized) {
-- 
2.1.4