[Mesa-dev] [PATCH 1/2] i965/tiled_memcpy: Add alignment assumption decorations
Jason Ekstrand
jason at jlekstrand.net
Tue Apr 5 01:04:48 UTC 2016
This should help GCC choose an aligned version of memcpy when inlining
it. It should also fix potential performance issues in the
next commit.
---
src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 30 +++++++++++++++++++-------
1 file changed, 22 insertions(+), 8 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 3135458..19079d0 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -56,6 +56,8 @@ static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
+#define assume_aligned(x, n) __builtin_assume_aligned(x, n)
+
#ifdef __SSSE3__
static const uint8_t rgba8_permutation[16] =
{ 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
@@ -200,10 +202,12 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
for (xo = x1; xo < x2; xo += xtile_span) {
- mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
+ mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), xtile_span),
+ src + xo, xtile_span);
}
- mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+ mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), xtile_span),
+ src + x2, x3 - x2);
src += src_pitch;
}
@@ -259,12 +263,14 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
* at each step so we don't need to calculate it explicitly.
*/
for (x = x1; x < x2; x += ytile_span) {
- mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
+ mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), ytile_span),
+ src + x, ytile_span);
xo += bytes_per_column;
swizzle ^= swizzle_bit;
}
- mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+ mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), ytile_span),
+ src + x2, x3 - x2);
src += src_pitch;
}
@@ -302,10 +308,14 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
for (xo = x1; xo < x2; xo += xtile_span) {
- mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
+ mem_copy(dst + xo,
+ assume_aligned(src + ((xo + yo) ^ swizzle), xtile_span),
+ xtile_span);
}
- mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+ mem_copy(dst + x2,
+ assume_aligned(src + ((xo + yo) ^ swizzle), xtile_span),
+ x3 - x2);
dst += dst_pitch;
}
@@ -361,12 +371,16 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
* at each step so we don't need to calculate it explicitly.
*/
for (x = x1; x < x2; x += ytile_span) {
- mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
+ mem_copy(dst + x,
+ assume_aligned(src + ((xo + yo) ^ swizzle), ytile_span),
+ ytile_span);
xo += bytes_per_column;
swizzle ^= swizzle_bit;
}
- mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+ mem_copy(dst + x2,
+ assume_aligned(src + ((xo + yo) ^ swizzle), ytile_span),
+ x3 - x2);
dst += dst_pitch;
}
--
2.5.0.400.gff86faf
More information about the mesa-dev
mailing list