[Mesa-dev] [PATCH V2 2/2] i965: add runtime check for SSSE3 rgba8_copy

Fri Nov 7 04:21:11 PST 2014

Callgrind cpu usage results from pts benchmarks:

For ytile_copy_faster()

Nexuiz 1.6.1: 2.16% -> 1.20%

V2:
- put back the if statements and add one for the SSSE3 rgba8_copy
- move some header file includes out of the header
- don't indent the preprocessor tests
- change copyright to Google and add author Frank Henigman

Signed-off-by: Timothy Arceri <t_arceri at yahoo.com.au>
---
 src/mesa/Makefile.am                           |  8 +++
 src/mesa/drivers/dri/i965/intel_tex_subimage.c | 80 ++++++++++++-------------
 src/mesa/main/fast_rgba8_copy.c                | 83 ++++++++++++++++++++++++++
 src/mesa/main/fast_rgba8_copy.h                | 38 ++++++++++++
 4 files changed, 169 insertions(+), 40 deletions(-)
 create mode 100644 src/mesa/main/fast_rgba8_copy.c
 create mode 100644 src/mesa/main/fast_rgba8_copy.h

 Thanks to everyone who took the time to provide feedback on the first
 patchset and pointed out that the baby had been thrown out with the
 bathwater.

 I've had another go at it and the performance regression should be fixed.
 With the tests available I couldn't spot any real difference. However, the
 downside is that the SSSE3 code can't be inlined, so there will be a small
 trade-off compared to the current way of building with SSSE3 enabled. But
 for the majority of builds, which are done without SSSE3, it should be a win.

diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index e71bccb..cf87289 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -107,6 +107,10 @@ AM_CXXFLAGS = $(LLVM_CFLAGS) $(VISIBILITY_CXXFLAGS)
 
 ARCH_LIBS =
 
+if SSSE3_SUPPORTED
+ARCH_LIBS += libmesa_ssse3.la
+endif
+
 if SSE41_SUPPORTED
 ARCH_LIBS += libmesa_sse41.la
 endif
@@ -150,6 +154,10 @@ libmesagallium_la_LIBADD = \
 	$(top_builddir)/src/glsl/libglsl.la \
 	$(ARCH_LIBS)
 
+libmesa_ssse3_la_SOURCES = \
+	main/fast_rgba8_copy.c
+libmesa_ssse3_la_CFLAGS = $(AM_CFLAGS) -mssse3
+
 libmesa_sse41_la_SOURCES = \
 	main/streaming-load-memcpy.c
 libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index cb5738a..81a2310 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -27,6 +27,7 @@
  **************************************************************************/
 
 #include "main/bufferobj.h"
+#include "main/fast_rgba8_copy.h"
 #include "main/image.h"
 #include "main/macros.h"
 #include "main/mtypes.h"
@@ -42,9 +43,7 @@
 #include "intel_mipmap_tree.h"
 #include "intel_blit.h"
 
-#ifdef __SSSE3__
-#include <tmmintrin.h>
-#endif
+#include "x86/common_x86_asm.h"
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
 
@@ -175,18 +174,6 @@ err:
    return false;
 }
 
-#ifdef __SSSE3__
-static const uint8_t rgba8_permutation[16] =
-   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
-
-/* NOTE: dst must be 16 byte aligned */
-#define rgba8_copy_16(dst, src)                     \
-   *(__m128i *)(dst) = _mm_shuffle_epi8(            \
-      (__m128i) _mm_loadu_ps((float *)(src)),       \
-      *(__m128i *) rgba8_permutation                \
-   )
-#endif
-
 /**
  * Copy RGBA to BGRA - swap R and B.
  */
@@ -196,29 +183,6 @@ rgba8_copy(void *dst, const void *src, size_t bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   /* Fast copying for tile spans.
-    *
-    * As long as the destination texture is 16 aligned,
-    * any 16 or 64 spans we get here should also be 16 aligned.
-    */
-
-   if (bytes == 16) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16(d+ 0, s+ 0);
-      return dst;
-   }
-
-   if (bytes == 64) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16(d+ 0, s+ 0);
-      rgba8_copy_16(d+16, s+16);
-      rgba8_copy_16(d+32, s+32);
-      rgba8_copy_16(d+48, s+48);
-      return dst;
-   }
-#endif
-
    while (bytes >= 4) {
       d[0] = s[2];
       d[1] = s[1];
@@ -355,6 +319,12 @@ xtile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(USE_SSSE3)
+      else if (mem_copy == _mesa_fast_rgba8_copy)
+         return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
+                           dst, src, src_pitch, swizzle_bit,
+                           _mesa_fast_rgba8_copy);
+      #endif
       else if (mem_copy == rgba8_copy)
          return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
@@ -362,6 +332,12 @@ xtile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return xtile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(USE_SSSE3)
+      else if (mem_copy == _mesa_fast_rgba8_copy)
+         return xtile_copy(x0, x1, x2, x3, y0, y1,
+                           dst, src, src_pitch, swizzle_bit,
+                           _mesa_fast_rgba8_copy);
+      #endif
       else if (mem_copy == rgba8_copy)
          return xtile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
@@ -391,6 +367,12 @@ ytile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(USE_SSSE3)
+      else if (mem_copy == _mesa_fast_rgba8_copy)
+         return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
+                           dst, src, src_pitch, swizzle_bit,
+                           _mesa_fast_rgba8_copy);
+      #endif
       else if (mem_copy == rgba8_copy)
          return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
@@ -398,6 +380,12 @@ ytile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return ytile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(USE_SSSE3)
+      else if (mem_copy == _mesa_fast_rgba8_copy)
+         return ytile_copy(x0, x1, x2, x3, y0, y1,
+                           dst, src, src_pitch, swizzle_bit,
+                           _mesa_fast_rgba8_copy);
+      #endif
       else if (mem_copy == rgba8_copy)
          return ytile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
@@ -582,7 +570,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
       if (format == GL_BGRA) {
          mem_copy = memcpy;
       } else if (format == GL_RGBA) {
-         mem_copy = rgba8_copy;
+         #if defined(USE_SSSE3)
+         if (cpu_has_ssse3) {
+            mem_copy = _mesa_fast_rgba8_copy;
+         }
+         else
+         #endif
+            mem_copy = rgba8_copy;
       }
    } else if ((texImage->TexFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
               (texImage->TexFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
@@ -591,7 +585,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
           * use the same function.
           */
-         mem_copy = rgba8_copy;
+         #if defined(USE_SSSE3)
+         if (cpu_has_ssse3) {
+            mem_copy = _mesa_fast_rgba8_copy;
+         }
+         else
+         #endif
+            mem_copy = rgba8_copy;
       } else if (format == GL_RGBA) {
          mem_copy = memcpy;
       }
diff --git a/src/mesa/main/fast_rgba8_copy.c b/src/mesa/main/fast_rgba8_copy.c
new file mode 100644
index 0000000..e3c7fd1
--- /dev/null
+++ b/src/mesa/main/fast_rgba8_copy.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2013 Google
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author:
+ *    Frank Henigman <fjhenigman at google.com>
+ *
+ */
+
+#ifdef __SSSE3__
+#include "main/fast_rgba8_copy.h"
+#include <assert.h>
+#include <stdint.h>
+#include <tmmintrin.h>
+
+static const uint8_t rgba8_permutation[16] =
+   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
+
+/* Swap bytes 0/2 of 4 pixels (16 bytes). NOTE: dst must be 16 byte aligned */
+#define rgba8_copy_16(dst, src)                     \
+   *(__m128i *)(dst) = _mm_shuffle_epi8(            \
+      (__m128i) _mm_loadu_ps((float *)(src)),       \
+      *(__m128i *) rgba8_permutation                \
+   )
+
+/* Swap bytes 0 and 2 of each 4-byte pixel (RGBA <-> BGRA); returns dst.
+ * Spans of exactly 16 or 64 bytes use SSSE3 and assert a 16-byte aligned
+ * dst: as long as the destination texture is 16 aligned, such spans should
+ * be too.  Other sizes go pixel by pixel; a trailing partial pixel is skipped.
+ */
+void *
+_mesa_fast_rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+   uint8_t *d = dst;
+   uint8_t const *s = src;
+
+   if (bytes == 16) { /* one 16-byte shuffle: 4 pixels */
+      assert(!(((uintptr_t)dst) & 0xf));
+      rgba8_copy_16(d+ 0, s+ 0);
+      return dst;
+   }
+
+   if (bytes == 64) { /* four 16-byte shuffles: 16 pixels */
+      assert(!(((uintptr_t)dst) & 0xf));
+      rgba8_copy_16(d+ 0, s+ 0);
+      rgba8_copy_16(d+16, s+16);
+      rgba8_copy_16(d+32, s+32);
+      rgba8_copy_16(d+48, s+48);
+      return dst;
+   }
+
+   while (bytes >= 4) { /* scalar fallback: one whole pixel per iteration */
+      d[0] = s[2];
+      d[1] = s[1];
+      d[2] = s[0];
+      d[3] = s[3];
+      d += 4;
+      s += 4;
+      bytes -= 4;
+   }
+   return dst; /* the original dst, not the advanced cursor d */
+}
+#endif
diff --git a/src/mesa/main/fast_rgba8_copy.h b/src/mesa/main/fast_rgba8_copy.h
new file mode 100644
index 0000000..4e3328b
--- /dev/null
+++ b/src/mesa/main/fast_rgba8_copy.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2013 Google
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author:
+ *    Frank Henigman <fjhenigman at google.com>
+ *
+ */
+
+#include <stddef.h>
+
+/* Fast copying for tile spans: copies n bytes of 4-byte pixels from src to
+ * dst with bytes 0 and 2 of each pixel swapped, and returns dst.
+ * As long as the destination texture is 16 aligned,
+ * any 16 or 64 spans we get here should also be 16 aligned.
+ */
+void *
+_mesa_fast_rgba8_copy(void *dst, const void *src, size_t n);
-- 
1.9.3