[Mesa-dev] [PATCH V4 3/3] i965: add runtime check for SSSE3 rgba8_copy

Timothy Arceri t_arceri at yahoo.com.au
Sun Nov 9 03:55:50 PST 2014


Callgrind CPU usage results from Phoronix Test Suite (PTS) benchmarks:

For ytile_copy_faster()
Nexuiz 1.6.1: 2.48% -> 0.97%

The following are the only discernible results from the Mesa teximage demo:

Without the patch, using Mesa's default build flags -
TexSubImage(BGRA/ubyte 256 x 256): 6122.6 images/sec, 1530.6 MB/sec

With the patch, using the runtime SSSE3 check -
TexSubImage(BGRA/ubyte 256 x 256): 9288.0 images/sec, 2322.0 MB/sec

V4:
- fix a slight regression when building with the SSSE3 compile flag by
  wrapping the fallback if statements in #ifndef __SSSE3__
- add Mesa demo teximage results to the commit message

V3:
- rather than putting the SSSE3 code in a separate file just so it can
  be compiled with SSSE3 enabled, use GCC pragmas for per-function
  optimisations (a rough sketch is included after these notes). This
  gives improved performance and has less impact on builds that do not
  need the runtime SSSE3 check.

V2:
- put back the if statements and add one for the SSSE3 rgba8_copy
- move some includes out of the header
- don't indent the preprocessor tests
- change the copyright to Google and add Frank Henigman as an author

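The x86/x86_function_opt.h header used below is not part of this diff
(presumably it is introduced elsewhere in this series), so the following
is only a rough sketch, under the assumption that SSSE3_FUNC_OPT_START /
SSSE3_FUNC_OPT_END map onto GCC's push_options/target/pop_options
pragmas; the real header may well differ:

#if defined(__GNUC__) && defined(__SSSE3__)
/* Whole file is already built with -mssse3: the brackets are no-ops. */
#define SSSE3_FUNC_OPT_START
#define SSSE3_FUNC_OPT_END
#elif defined(__GNUC__)
/* Enable SSSE3 only for the bracketed functions, leaving the rest of
 * the file compiled with the default flags (needs GCC >= 4.4 for the
 * push_options/target/pop_options pragmas). */
#define SSSE3_FUNC_OPT_START        \
   _Pragma("GCC push_options")      \
   _Pragma("GCC target(\"ssse3\")")
#define SSSE3_FUNC_OPT_END          \
   _Pragma("GCC pop_options")
#endif
/* Compilers without support leave SSSE3_FUNC_OPT_START undefined, which
 * is why the .c code guards on #if defined(SSSE3_FUNC_OPT_START). */
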
Signed-off-by: Timothy Arceri <t_arceri at yahoo.com.au>
---
 src/mesa/drivers/dri/i965/intel_tex_subimage.c | 96 ++++++++++++++++++++++----
 1 file changed, 81 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index cb5738a..4c9ca18 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -42,8 +42,13 @@
 #include "intel_mipmap_tree.h"
 #include "intel_blit.h"
 
-#ifdef __SSSE3__
+#include "x86/common_x86_asm.h"
+#include "x86/x86_function_opt.h"
+
+#if defined(SSSE3_FUNC_OPT_START)
+SSSE3_FUNC_OPT_START
 #include <tmmintrin.h>
+SSSE3_FUNC_OPT_END
 #endif
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
@@ -175,7 +180,8 @@ err:
    return false;
 }
 
-#ifdef __SSSE3__
+#if defined(SSSE3_FUNC_OPT_START)
+SSSE3_FUNC_OPT_START
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
 
@@ -185,24 +191,18 @@ static const uint8_t rgba8_permutation[16] =
       (__m128i) _mm_loadu_ps((float *)(src)),       \
       *(__m128i *) rgba8_permutation                \
    )
-#endif
 
-/**
- * Copy RGBA to BGRA - swap R and B.
+/* Fast copying for tile spans.
+ *
+ * As long as the destination texture is 16 aligned,
+ * any 16 or 64 spans we get here should also be 16 aligned.
  */
 static inline void *
-rgba8_copy(void *dst, const void *src, size_t bytes)
+ssse3_fast_rgba8_copy(void *dst, const void *src, size_t bytes)
 {
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   /* Fast copying for tile spans.
-    *
-    * As long as the destination texture is 16 aligned,
-    * any 16 or 64 spans we get here should also be 16 aligned.
-    */
-
    if (bytes == 16) {
       assert(!(((uintptr_t)dst) & 0xf));
       rgba8_copy_16(d+ 0, s+ 0);
@@ -217,8 +217,30 @@ rgba8_copy(void *dst, const void *src, size_t bytes)
       rgba8_copy_16(d+48, s+48);
       return dst;
    }
+
+   while (bytes >= 4) {
+      d[0] = s[2];
+      d[1] = s[1];
+      d[2] = s[0];
+      d[3] = s[3];
+      d += 4;
+      s += 4;
+      bytes -= 4;
+   }
+   return dst;
+}
+SSSE3_FUNC_OPT_END
 #endif
 
+/**
+ * Copy RGBA to BGRA - swap R and B.
+ */
+static inline void *
+rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+   uint8_t *d = dst;
+   uint8_t const *s = src;
+
    while (bytes >= 4) {
       d[0] = s[2];
       d[1] = s[1];
@@ -355,16 +377,32 @@ xtile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(SSSE3_FUNC_OPT_START)
+      else if (mem_copy == ssse3_fast_rgba8_copy)
+         return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
+                           dst, src, src_pitch, swizzle_bit,
+                           ssse3_fast_rgba8_copy);
+      #endif
+      #ifndef __SSSE3__
       else if (mem_copy == rgba8_copy)
          return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
+      #endif
    } else {
       if (mem_copy == memcpy)
          return xtile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(SSSE3_FUNC_OPT_START)
+      else if (mem_copy == ssse3_fast_rgba8_copy)
+         return xtile_copy(x0, x1, x2, x3, y0, y1,
+                           dst, src, src_pitch, swizzle_bit,
+                           ssse3_fast_rgba8_copy);
+      #endif
+      #ifndef __SSSE3__
       else if (mem_copy == rgba8_copy)
          return xtile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
+      #endif
    }
    xtile_copy(x0, x1, x2, x3, y0, y1,
               dst, src, src_pitch, swizzle_bit, mem_copy);
@@ -391,16 +429,32 @@ ytile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(SSSE3_FUNC_OPT_START)
+      else if (mem_copy == ssse3_fast_rgba8_copy)
+         return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
+                           dst, src, src_pitch, swizzle_bit,
+                           ssse3_fast_rgba8_copy);
+      #endif
+      #ifndef __SSSE3__
       else if (mem_copy == rgba8_copy)
          return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
+      #endif
    } else {
       if (mem_copy == memcpy)
          return ytile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, memcpy);
+      #if defined(SSSE3_FUNC_OPT_START)
+      else if (mem_copy == ssse3_fast_rgba8_copy)
+         return ytile_copy(x0, x1, x2, x3, y0, y1,
+                           dst, src, src_pitch, swizzle_bit,
+                           ssse3_fast_rgba8_copy);
+      #endif
+      #ifndef __SSSE3__
       else if (mem_copy == rgba8_copy)
          return ytile_copy(x0, x1, x2, x3, y0, y1,
                            dst, src, src_pitch, swizzle_bit, rgba8_copy);
+      #endif
    }
    ytile_copy(x0, x1, x2, x3, y0, y1,
               dst, src, src_pitch, swizzle_bit, mem_copy);
@@ -582,7 +636,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
       if (format == GL_BGRA) {
          mem_copy = memcpy;
       } else if (format == GL_RGBA) {
-         mem_copy = rgba8_copy;
+         #if defined(SSSE3_FUNC_OPT_START)
+         if (cpu_has_ssse3) {
+            mem_copy = ssse3_fast_rgba8_copy;
+         }
+         else
+         #endif
+            mem_copy = rgba8_copy;
       }
    } else if ((texImage->TexFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
               (texImage->TexFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
@@ -591,7 +651,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
           * use the same function.
           */
-         mem_copy = rgba8_copy;
+         #if defined(SSSE3_FUNC_OPT_START)
+         if (cpu_has_ssse3) {
+            mem_copy = ssse3_fast_rgba8_copy;
+         }
+         else
+         #endif
+            mem_copy = rgba8_copy;
       } else if (format == GL_RGBA) {
          mem_copy = memcpy;
       }
-- 
1.9.3