[Mesa-dev] [PATCH 1/2] i965/tiled_memcpy: Add alignment assumption decorations

Tue Apr 5 01:04:48 UTC 2016

Annotate the tiled-side pointers of the full-span and trailing copies
with __builtin_assume_aligned.  This should help GCC choose an aligned
version of memcpy when it inlines the call.  It should also prevent
potential performance issues in the next commit.
---
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 30 +++++++++++++++++++-------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 3135458..19079d0 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -56,6 +56,8 @@ static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
 
+#define assume_aligned(x, n) __builtin_assume_aligned(x, n)
+
 #ifdef __SSSE3__
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
@@ -200,10 +202,12 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
 
       for (xo = x1; xo < x2; xo += xtile_span) {
-         mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
+         mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), xtile_span),
+                  src + xo, xtile_span);
       }
 
-      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+      mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), xtile_span),
+               src + x2, x3 - x2);
 
       src += src_pitch;
    }
@@ -259,12 +263,14 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
+         mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), ytile_span),
+                  src + x, ytile_span);
          xo += bytes_per_column;
          swizzle ^= swizzle_bit;
       }
 
-      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+      mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), ytile_span),
+               src + x2, x3 - x2);
 
       src += src_pitch;
    }
@@ -302,10 +308,14 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
 
       for (xo = x1; xo < x2; xo += xtile_span) {
-         mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
+         mem_copy(dst + xo,
+                  assume_aligned(src + ((xo + yo) ^ swizzle), xtile_span),
+                  xtile_span);
       }
 
-      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+      mem_copy(dst + x2,
+               assume_aligned(src + ((xo + yo) ^ swizzle), xtile_span),
+               x3 - x2);
 
       dst += dst_pitch;
    }
@@ -361,12 +371,16 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
+         mem_copy(dst + x,
+                  assume_aligned(src + ((xo + yo) ^ swizzle), ytile_span),
+                  ytile_span);
          xo += bytes_per_column;
          swizzle ^= swizzle_bit;
       }
 
-      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+      mem_copy(dst + x2,
+               assume_aligned(src + ((xo + yo) ^ swizzle), ytile_span),
+               x3 - x2);
 
       dst += dst_pitch;
    }
-- 
2.5.0.400.gff86faf