[Mesa-dev] [PATCH 3/4] main: add some sse2 ubyte pack functions for rgba8 / rgbx8 unorm formats

Sun Jan 17 13:49:48 PST 2016

From: Roland Scheidegger <sroland at vmware.com>

This certainly isn't as generic as it ideally would be, but we have to start
somewhere...
Handles just rgba8/rgbx8 formats (so just swizzling). Even when using
cached regions, these functions are definitely quite a bit faster than
the c ones (for larger counts, obviously) (about 3 times or so).
They can't quite match my manual assembly skills (e.g. the rgba8 ->bgra8 one
ends up with 2 shifts, 3 ors, 4 ands in the generated code, whereas my
manually generated version is just 2 shifts, 2 ands (or andnots) plus one or,
not to mention requiring 4 masks instead of just one albeit this is outside
the loop), but I see little performance impact from that (maybe hitting
memory limits).
This helps WebGL performance in Firefox (without layer acceleration) on a GM45
with a CPU without SSE4.1 by a factor of 10 or so iff the glReadPixels path hits
such a format (Google Maps does, making it usable again). www.fishgl.com
also hits this; there I get about a 4x perf improvement and CPU usage
simultaneously drops by more than half. Make no mistake though, the perf
improvement in these cases comes nearly entirely from not accessing uncached
regions byte-wise (fetching 16 bytes from there is super-slow, but not any
slower than fetching a single byte...).
webglsamples.org/aquarium/aquarium.html, however, does not hit this path
(it needs a bgrx->bgra conversion, which isn't handled by pack and ends
up in convert_ubyte instead). So this certainly doesn't solve the uncached
memory access problem in general for i965: whenever there is an actual
conversion, or transfer ops are involved, all accesses would still be done
piecewise.
I suppose such conversion / swizzling would ideally be better handled by
meta, though...
---
 src/mesa/main/format_pack.py | 75 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/mesa/main/format_pack.py b/src/mesa/main/format_pack.py
index 2f43a30..0055c7c 100644
--- a/src/mesa/main/format_pack.py
+++ b/src/mesa/main/format_pack.py
@@ -50,6 +50,10 @@ string = """/*
 #include "../../gallium/auxiliary/util/u_format_r11g11b10f.h"
 #include "util/format_srgb.h"
 
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
 #define UNPACK(SRC, OFFSET, BITS) (((SRC) >> (OFFSET)) & MAX_UINT(BITS))
 #define PACK(SRC, OFFSET, BITS) (((SRC) & MAX_UINT(BITS)) << (OFFSET))
 
@@ -136,6 +140,66 @@ pack_ubyte_${f.short_name()}(const GLubyte src[4], void *dst)
       <% assert False %>
    %endif
 }
+
+   %if f.name in ('MESA_FORMAT_A8B8G8R8_UNORM', 'MESA_FORMAT_X8B8G8R8_UNORM', 'MESA_FORMAT_R8G8B8A8_UNORM', 'MESA_FORMAT_R8G8B8X8_UNORM', 'MESA_FORMAT_B8G8R8A8_UNORM', 'MESA_FORMAT_B8G8R8X8_UNORM', 'MESA_FORMAT_A8R8G8B8_UNORM', 'MESA_FORMAT_X8R8G8B8_UNORM'):
+
+#if defined(__SSE2__)
+/* Pack 4 ubyte RGBA pixels (16 bytes) at once with SSE2.  src must be 16-byte
+ * aligned (aligned load); dst may have any alignment (unaligned store). */
+static inline void
+pack_ubyte_${f.short_name()}_sse_aligned_4(const void *src, void *dst)
+{
+   __m128i srcreg = _mm_load_si128((const __m128i *)src);
+   __m128i dstreg = _mm_setzero_si128(); /* skipped 'x' channels stay zero */
+      %for (i, c) in enumerate(f.channels):
+         %if c.type != 'x':
+   __m128i ${c.name}, ${c.name}mask;
+         %endif
+      %endfor
+   /* Per channel: mask source byte j, shift it to bit c.shift, OR it in. */
+      %for (i, c) in enumerate(f.channels):
+         <% j = f.swizzle.inverse()[i] %>
+         %if c.type != 'x':
+   ${c.name}mask = _mm_set1_epi32((int)(0xFFu << ${j*8})); /* unsigned: no UB at bit 31 */
+   ${c.name} = _mm_and_si128(${c.name}mask, srcreg);
+            %if j*8 - c.shift > 0:
+   ${c.name} = _mm_srli_epi32(${c.name}, ${j*8 - c.shift});
+            %elif j*8 - c.shift < 0:
+   ${c.name} = _mm_slli_epi32(${c.name}, ${c.shift - j*8});
+            %endif
+   dstreg = _mm_or_si128(${c.name}, dstreg);
+         %endif
+      %endfor
+   _mm_storeu_si128((__m128i *)dst, dstreg);
+}
+
+/* Pack count pixels from s to d: scalar until s is 16-byte aligned, then
+ * four pixels per SSE2 step, then scalar for whatever is left over. */
+static inline void
+pack_ubyte_${f.short_name()}_sse(const uint8_t *s, uint8_t *d, unsigned count)
+{
+   /* Head: single pixels while s is misaligned and pixels remain. */
+   for (; count != 0 && ((uintptr_t)s & 0xf) != 0; count--) {
+      pack_ubyte_${f.short_name()}(s, d);
+      s += 4;
+      d += 4;
+   }
+   /* Body: 16 bytes (4 pixels) per call to the SSE2 helper. */
+   for (; count >= 4; count -= 4) {
+      pack_ubyte_${f.short_name()}_sse_aligned_4(s, d);
+      s += 16;
+      d += 16;
+   }
+   /* Tail: fewer than 4 pixels remain. */
+   for (; count != 0; count--) {
+      pack_ubyte_${f.short_name()}(s, d);
+      s += 4;
+      d += 4;
+   }
+}
+
+#endif
+   %endif
 %endfor
 
 static inline void
@@ -352,10 +416,21 @@ _mesa_pack_ubyte_rgba_row(mesa_format format, GLuint n,
    %endif
 
    case ${f.name}:
+   %if f.name in ('MESA_FORMAT_A8B8G8R8_UNORM', 'MESA_FORMAT_X8B8G8R8_UNORM', 'MESA_FORMAT_R8G8B8A8_UNORM', 'MESA_FORMAT_R8G8B8X8_UNORM', 'MESA_FORMAT_B8G8R8A8_UNORM', 'MESA_FORMAT_B8G8R8X8_UNORM', 'MESA_FORMAT_A8R8G8B8_UNORM', 'MESA_FORMAT_X8R8G8B8_UNORM'):
+#if defined(__SSE2__)
+      pack_ubyte_${f.short_name()}_sse((const void *)src[0], d, n);
+#else
+      for (i = 0; i < n; ++i) {
+         pack_ubyte_${f.short_name()}(src[i], d);
+         d += ${f.block_size() / 8};
+      }
+#endif
+   %else:
       for (i = 0; i < n; ++i) {
          pack_ubyte_${f.short_name()}(src[i], d);
          d += ${f.block_size() / 8};
       }
+   %endif:
       break;
 %endfor
    default:
-- 
2.1.4