[Mesa-dev] [PATCH V4 3/3] i965: add runtime check for SSSE3 rgba8_copy
Timothy Arceri
t_arceri at yahoo.com.au
Sun Nov 9 03:55:50 PST 2014
Callgrind cpu usage results from pts benchmarks:
For ytile_copy_faster()
Nexuiz 1.6.1: 2.48% -> 0.97%
The following are the only discernible results from teximage:
Without patch and mesa default build flags -
TexSubImage(BGRA/ubyte 256 x 256): 6122.6 images/sec, 1530.6 MB/sec
With patch runtime ssse3 -
TexSubImage(BGRA/ubyte 256 x 256): 9288.0 images/sec, 2322.0 MB/sec
V4:
- fix slight regression when building with ssse3 compile flag by
wrapping fallback if statements with #ifndef __SSSE3__
- add Mesa demo teximage results to commit message
V3:
- rather than putting the ssse3 code in a different file in order
to compile it, make use of the gcc pragma for per-function
optimisations. This results in improved performance and less
impact on those not needing runtime ssse3 checks.
V2:
- put back the if statements and add one for the SSSE3 rgba8_copy
- move some header files out of the header
- don't indent the preprocessor tests
- changed copyright to Google and add author Frank Henigman
Signed-off-by: Timothy Arceri <t_arceri at yahoo.com.au>
---
src/mesa/drivers/dri/i965/intel_tex_subimage.c | 96 ++++++++++++++++++++++----
1 file changed, 81 insertions(+), 15 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index cb5738a..4c9ca18 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -42,8 +42,13 @@
#include "intel_mipmap_tree.h"
#include "intel_blit.h"
-#ifdef __SSSE3__
+#include "x86/common_x86_asm.h"
+#include "x86/x86_function_opt.h"
+
+#if defined(SSSE3_FUNC_OPT_START)
+SSSE3_FUNC_OPT_START
#include <tmmintrin.h>
+SSSE3_FUNC_OPT_END
#endif
#define FILE_DEBUG_FLAG DEBUG_TEXTURE
@@ -175,7 +180,8 @@ err:
return false;
}
-#ifdef __SSSE3__
+#if defined(SSSE3_FUNC_OPT_START)
+SSSE3_FUNC_OPT_START
static const uint8_t rgba8_permutation[16] =
{ 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
@@ -185,24 +191,18 @@ static const uint8_t rgba8_permutation[16] =
(__m128i) _mm_loadu_ps((float *)(src)), \
*(__m128i *) rgba8_permutation \
)
-#endif
-/**
- * Copy RGBA to BGRA - swap R and B.
+/* Fast copying for tile spans.
+ *
+ * As long as the destination texture is 16 aligned,
+ * any 16 or 64 spans we get here should also be 16 aligned.
*/
static inline void *
-rgba8_copy(void *dst, const void *src, size_t bytes)
+ssse3_fast_rgba8_copy(void *dst, const void *src, size_t bytes)
{
uint8_t *d = dst;
uint8_t const *s = src;
-#ifdef __SSSE3__
- /* Fast copying for tile spans.
- *
- * As long as the destination texture is 16 aligned,
- * any 16 or 64 spans we get here should also be 16 aligned.
- */
-
if (bytes == 16) {
assert(!(((uintptr_t)dst) & 0xf));
rgba8_copy_16(d+ 0, s+ 0);
@@ -217,8 +217,30 @@ rgba8_copy(void *dst, const void *src, size_t bytes)
rgba8_copy_16(d+48, s+48);
return dst;
}
+
+ while (bytes >= 4) {
+ d[0] = s[2];
+ d[1] = s[1];
+ d[2] = s[0];
+ d[3] = s[3];
+ d += 4;
+ s += 4;
+ bytes -= 4;
+ }
+ return dst;
+}
+SSSE3_FUNC_OPT_END
#endif
+/**
+ * Copy RGBA to BGRA - swap R and B.
+ */
+static inline void *
+rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+ uint8_t *d = dst;
+ uint8_t const *s = src;
+
while (bytes >= 4) {
d[0] = s[2];
d[1] = s[1];
@@ -355,16 +377,32 @@ xtile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
if (mem_copy == memcpy)
return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
dst, src, src_pitch, swizzle_bit, memcpy);
+ #if defined(SSSE3_FUNC_OPT_START)
+ else if (mem_copy == ssse3_fast_rgba8_copy)
+ return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
+ dst, src, src_pitch, swizzle_bit,
+ ssse3_fast_rgba8_copy);
+ #endif
+ #ifndef __SSSE3__
else if (mem_copy == rgba8_copy)
return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
dst, src, src_pitch, swizzle_bit, rgba8_copy);
+ #endif
} else {
if (mem_copy == memcpy)
return xtile_copy(x0, x1, x2, x3, y0, y1,
dst, src, src_pitch, swizzle_bit, memcpy);
+ #if defined(SSSE3_FUNC_OPT_START)
+ else if (mem_copy == ssse3_fast_rgba8_copy)
+ return xtile_copy(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ ssse3_fast_rgba8_copy);
+ #endif
+ #ifndef __SSSE3__
else if (mem_copy == rgba8_copy)
return xtile_copy(x0, x1, x2, x3, y0, y1,
dst, src, src_pitch, swizzle_bit, rgba8_copy);
+ #endif
}
xtile_copy(x0, x1, x2, x3, y0, y1,
dst, src, src_pitch, swizzle_bit, mem_copy);
@@ -391,16 +429,32 @@ ytile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
if (mem_copy == memcpy)
return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit, memcpy);
+ #if defined(SSSE3_FUNC_OPT_START)
+ else if (mem_copy == ssse3_fast_rgba8_copy)
+ return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
+ dst, src, src_pitch, swizzle_bit,
+ ssse3_fast_rgba8_copy);
+ #endif
+ #ifndef __SSSE3__
else if (mem_copy == rgba8_copy)
return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit, rgba8_copy);
+ #endif
} else {
if (mem_copy == memcpy)
return ytile_copy(x0, x1, x2, x3, y0, y1,
dst, src, src_pitch, swizzle_bit, memcpy);
+ #if defined(SSSE3_FUNC_OPT_START)
+ else if (mem_copy == ssse3_fast_rgba8_copy)
+ return ytile_copy(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ ssse3_fast_rgba8_copy);
+ #endif
+ #ifndef __SSSE3__
else if (mem_copy == rgba8_copy)
return ytile_copy(x0, x1, x2, x3, y0, y1,
dst, src, src_pitch, swizzle_bit, rgba8_copy);
+ #endif
}
ytile_copy(x0, x1, x2, x3, y0, y1,
dst, src, src_pitch, swizzle_bit, mem_copy);
@@ -582,7 +636,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
if (format == GL_BGRA) {
mem_copy = memcpy;
} else if (format == GL_RGBA) {
- mem_copy = rgba8_copy;
+ #if defined(SSSE3_FUNC_OPT_START)
+ if (cpu_has_ssse3) {
+ mem_copy = ssse3_fast_rgba8_copy;
+ }
+ else
+ #endif
+ mem_copy = rgba8_copy;
}
} else if ((texImage->TexFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
(texImage->TexFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
@@ -591,7 +651,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
/* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
* use the same function.
*/
- mem_copy = rgba8_copy;
+ #if defined(SSSE3_FUNC_OPT_START)
+ if (cpu_has_ssse3) {
+ mem_copy = ssse3_fast_rgba8_copy;
+ }
+ else
+ #endif
+ mem_copy = rgba8_copy;
} else if (format == GL_RGBA) {
mem_copy = memcpy;
}
--
1.9.3
More information about the mesa-dev
mailing list