[Mesa-dev] [PATCH 3/4] main: add some sse2 ubyte pack functions for rgba8 / rgbx8 unorm formats
sroland at vmware.com
sroland at vmware.com
Sun Jan 17 13:49:48 PST 2016
From: Roland Scheidegger <sroland at vmware.com>
This certainly isn't as generic as it would be ideally, but got to start
somewhere...
Handles just rgba8/rgbx8 formats (so just swizzling). Even when using
cached regions, these functions are definitely quite a bit faster than
the c ones (for larger counts, obviously) (about 3 times or so).
They can't quite match my manual assembly skills (e.g. the rgba8 -> bgra8 one
ends up with 2 shifts, 3 ors, 4 ands in the generated code, whereas my
manually generated version is just 2 shifts, 2 ands (or andnots) plus one or,
not to mention requiring 4 masks instead of just one albeit this is outside
the loop), but I see little performance impact from that (maybe hitting
memory limits).
This helps webgl performance in firefox (without layer accel) on a gm45 with
a cpu without sse41 by a factor of 10 or so iff the glReadPixels path hits
such a format (Google Maps does, making it usable again). www.fishgl.com
also hits this, I get about a 4x perf improvement and cpu usage
simultaneously drops by more than half. Make no mistake though, the perf
improvement in these cases is nearly all from not accessing uncached regions
byte-wise (fetching 16 bytes from there is super-slow, but not any slower than
fetching a single byte...).
webglsamples.org/aquarium/aquarium.html, however, does not hit this, for
instance (it's some bgrx->bgra conversion which isn't handled by pack and ends
up in convert_ubyte instead). So this certainly doesn't solve the uncached
memory access problem in general for i965 (when there are actual conversion or
transfer ops, it would still do all accesses piecewise).
Albeit I suppose such conversion / swizzling would be better handled by meta
ideally...
---
src/mesa/main/format_pack.py | 75 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 75 insertions(+)
diff --git a/src/mesa/main/format_pack.py b/src/mesa/main/format_pack.py
index 2f43a30..0055c7c 100644
--- a/src/mesa/main/format_pack.py
+++ b/src/mesa/main/format_pack.py
@@ -50,6 +50,10 @@ string = """/*
#include "../../gallium/auxiliary/util/u_format_r11g11b10f.h"
#include "util/format_srgb.h"
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
#define UNPACK(SRC, OFFSET, BITS) (((SRC) >> (OFFSET)) & MAX_UINT(BITS))
#define PACK(SRC, OFFSET, BITS) (((SRC) & MAX_UINT(BITS)) << (OFFSET))
@@ -136,6 +140,66 @@ pack_ubyte_${f.short_name()}(const GLubyte src[4], void *dst)
<% assert False %>
%endif
}
+
+ %if f.name in ('MESA_FORMAT_A8B8G8R8_UNORM', 'MESA_FORMAT_X8B8G8R8_UNORM', 'MESA_FORMAT_R8G8B8A8_UNORM', 'MESA_FORMAT_R8G8B8X8_UNORM', 'MESA_FORMAT_B8G8R8A8_UNORM', 'MESA_FORMAT_B8G8R8X8_UNORM', 'MESA_FORMAT_A8R8G8B8_UNORM', 'MESA_FORMAT_X8R8G8B8_UNORM'):
+/* Swizzle 16 bytes (four 4-byte pixels) from src to dst with SSE2; src must be 16-byte aligned (aligned load), dst need not be. */
+#if defined(__SSE2__)
+static inline void
+pack_ubyte_${f.short_name()}_sse_aligned_4(const void *src, void *dst)
+{
+ __m128i srcreg, dstreg;
+ %for (i, c) in enumerate(f.channels):
+ %if c.type != 'x':
+ __m128i ${c.name}, ${c.name}mask;
+ %endif
+ %endfor
+ /* Each non-'x' channel is masked out of its source byte, shifted to its destination bit offset and ORed in; 'x' bytes stay zero. */
+ srcreg = _mm_load_si128((__m128i *)src);
+ dstreg = _mm_setzero_si128();
+ %for (i, c) in enumerate(f.channels):
+ <% j = f.swizzle.inverse()[i] %>
+ %if c.type != 'x':
+ ${c.name}mask = _mm_set1_epi32((int)(0xFFu << ${j*8})); /* unsigned constant: 0xFF << 24 would overflow a signed int (undefined behaviour) */
+ ${c.name} = _mm_and_si128(${c.name}mask, srcreg);
+ %if j*8 - c.shift > 0:
+ ${c.name} = _mm_srli_epi32(${c.name}, ${j*8 - c.shift});
+ %elif j*8 - c.shift < 0:
+ ${c.name} = _mm_slli_epi32(${c.name}, ${c.shift - j*8});
+ %endif
+ dstreg = _mm_or_si128(${c.name}, dstreg);
+ %endif
+ %endfor
+ /* Unaligned store, so dst carries no alignment requirement. */
+ _mm_storeu_si128((__m128i *)dst, dstreg);
+}
+/* Pack count 4-byte pixels from s to d: scalar until s is 16-byte aligned, then four pixels per SSE2 call, then a scalar tail. */
+static inline void
+pack_ubyte_${f.short_name()}_sse(const uint8_t *s, uint8_t *d, unsigned count)
+{
+ while (((uintptr_t)s & 0xf) && count) {
+ pack_ubyte_${f.short_name()}(s, d);
+ s += 4;
+ d += 4;
+ count--;
+ }
+ /* Bulk: s is now 16-byte aligned (or count is 0); convert four pixels (16 bytes) per iteration. */
+ while(count / 4) {
+ pack_ubyte_${f.short_name()}_sse_aligned_4(s, d);
+ s += 16;
+ d += 16;
+ count -= 4;
+ }
+ /* Tail: at most three pixels remain; finish them one at a time. */
+ while (count) {
+ pack_ubyte_${f.short_name()}(s, d);
+ s += 4;
+ d += 4;
+ count--;
+ }
+}
+
+#endif
+ %endif
%endfor
static inline void
@@ -352,10 +416,21 @@ _mesa_pack_ubyte_rgba_row(mesa_format format, GLuint n,
%endif
case ${f.name}:
+ %if f.name in ('MESA_FORMAT_A8B8G8R8_UNORM', 'MESA_FORMAT_X8B8G8R8_UNORM', 'MESA_FORMAT_R8G8B8A8_UNORM', 'MESA_FORMAT_R8G8B8X8_UNORM', 'MESA_FORMAT_B8G8R8A8_UNORM', 'MESA_FORMAT_B8G8R8X8_UNORM', 'MESA_FORMAT_A8R8G8B8_UNORM', 'MESA_FORMAT_X8R8G8B8_UNORM'):
+#if defined(__SSE2__)
+ pack_ubyte_${f.short_name()}_sse((const void *)src[0], d, n);
+#else
+ for (i = 0; i < n; ++i) {
+ pack_ubyte_${f.short_name()}(src[i], d);
+ d += ${f.block_size() / 8};
+ }
+#endif
+ %else:
for (i = 0; i < n; ++i) {
pack_ubyte_${f.short_name()}(src[i], d);
d += ${f.block_size() / 8};
}
+ %endif:
break;
%endfor
default:
--
2.1.4
More information about the mesa-dev
mailing list