[Pixman] [PATCH 2/3] mmx: fix unaligned accesses
Siarhei Siamashka
siarhei.siamashka at gmail.com
Sat Jul 23 19:28:13 PDT 2011
On Wed, Jul 20, 2011 at 10:29 PM, <mattst88 at gmail.com> wrote:
> From: Matt Turner <mattst88 at gmail.com>
>
> Signed-off-by: Matt Turner <mattst88 at gmail.com>
> ---
> pixman/pixman-mmx.c | 109 +++++++++++++++++++++++++++++++++++---------------
> 1 files changed, 76 insertions(+), 33 deletions(-)
>
> diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
> index 71fa18e..b9b09b6 100644
> --- a/pixman/pixman-mmx.c
> +++ b/pixman/pixman-mmx.c
> @@ -298,6 +298,22 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
>
> #endif
>
> +/* Elemental unaligned loads */
> +
> +static __inline__ uint64_t ldq_u(uint64_t *p)
> +{
> + struct __una_u64 { uint64_t x __attribute__((packed)); };
> + const struct __una_u64 *ptr = (const struct __una_u64 *) p;
> + return ptr->x;
> +}
Here it makes sense to make sure that gcc generates efficient code
using the WALIGNR instruction. I also tried the following sample
program with the '_mm_align_si64' intrinsic (totally untested though):
$ cat test_walignr.c
#include <mmintrin.h>
#include <stdint.h>
static inline __m64 ldq_u1 (__m64 *p)
{
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return ptr->x;
}
static inline __m64 ldq_u2 (__m64 *p)
{
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
        return *p;
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return _mm_align_si64 (aligned_p[0], aligned_p[1], align);
}
void test1 (__m64 *dst, char *src)
{
    *dst = _mm_adds_pu8 (ldq_u1 ((__m64 *)(src + 0)),
                         ldq_u1 ((__m64 *)(src + 8)));
}
void test2 (__m64 *dst, char *src)
{
    *dst = _mm_adds_pu8 (ldq_u2 ((__m64 *)(src + 0)),
                         ldq_u2 ((__m64 *)(src + 8)));
}
$ arm-none-linux-gnueabi-gcc -march=iwmmxt -flax-vector-conversions -c -O2 test_walignr.c
$ arm-none-linux-gnueabi-objdump -d -miwmmxt test_walignr.o
test_walignr.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <test1>:
0: e92d0ff0 push {r4, r5, r6, r7, r8, r9, sl, fp}
4: e5d1300d ldrb r3, [r1, #13]
8: e24dd018 sub sp, sp, #24
c: e5d18000 ldrb r8, [r1]
10: e5d17009 ldrb r7, [r1, #9]
14: e5d1c004 ldrb ip, [r1, #4]
18: e5d14005 ldrb r4, [r1, #5]
1c: e5d1200a ldrb r2, [r1, #10]
20: e5d1a001 ldrb sl, [r1, #1]
24: e5d16008 ldrb r6, [r1, #8]
28: e5d15002 ldrb r5, [r1, #2]
2c: e5d1900c ldrb r9, [r1, #12]
30: e58d3004 str r3, [sp, #4]
34: e5d1b003 ldrb fp, [r1, #3]
38: e1866407 orr r6, r6, r7, lsl #8
3c: e18c7404 orr r7, ip, r4, lsl #8
40: e58db008 str fp, [sp, #8]
44: e5d1300b ldrb r3, [r1, #11]
48: e5d1b006 ldrb fp, [r1, #6]
4c: e59d4004 ldr r4, [sp, #4]
50: e58d3010 str r3, [sp, #16]
54: e5d1300e ldrb r3, [r1, #14]
58: e187c80b orr ip, r7, fp, lsl #16
5c: e59db008 ldr fp, [sp, #8]
60: e58d3014 str r3, [sp, #20]
64: e5d13007 ldrb r3, [r1, #7]
68: e1899404 orr r9, r9, r4, lsl #8
6c: e59d4014 ldr r4, [sp, #20]
70: e58d300c str r3, [sp, #12]
74: e188340a orr r3, r8, sl, lsl #8
78: e1835805 orr r5, r3, r5, lsl #16
7c: e1868802 orr r8, r6, r2, lsl #16
80: e5d1100f ldrb r1, [r1, #15]
84: e1856c0b orr r6, r5, fp, lsl #24
88: e59d200c ldr r2, [sp, #12]
8c: e59db010 ldr fp, [sp, #16]
90: e1893804 orr r3, r9, r4, lsl #16
94: e18c7c02 orr r7, ip, r2, lsl #24
98: e1888c0b orr r8, r8, fp, lsl #24
9c: e1839c01 orr r9, r3, r1, lsl #24
a0: ec476001 tmcrr wr1, r6, r7
a4: ec498002 tmcrr wr2, r8, r9
a8: ee110182 waddbus wr0, wr1, wr2
ac: edc00100 wstrd wr0, [r0]
b0: e28dd018 add sp, sp, #24
b4: e8bd0ff0 pop {r4, r5, r6, r7, r8, r9, sl, fp}
b8: e12fff1e bx lr
000000bc <test2>:
bc: e2113007 ands r3, r1, #7
c0: 1a000007 bne e4 <test2+0x28>
c4: e2813008 add r3, r1, #8
c8: e2132007 ands r2, r3, #7
cc: edd11100 wldrd wr1, [r1]
d0: 1a00000b bne 104 <test2+0x48>
d4: edd10102 wldrd wr0, [r1, #8]
d8: ee114180 waddbus wr4, wr1, wr0
dc: edc04100 wstrd wr4, [r0]
e0: e12fff1e bx lr
e4: e3c12007 bic r2, r1, #7
e8: edd21100 wldrd wr1, [r2]
ec: edd20102 wldrd wr0, [r2, #8]
f0: ee083110 tmcr wcgr0, r3
f4: e2813008 add r3, r1, #8
f8: e2132007 ands r2, r3, #7
fc: ee811020 walignr0 wr1, wr1, wr0
100: 0afffff3 beq d4 <test2+0x18>
104: e3c33007 bic r3, r3, #7
108: edd32100 wldrd wr2, [r3]
10c: edd33102 wldrd wr3, [r3, #8]
110: ee092110 tmcr wcgr1, r2
114: ee920023 walignr1 wr0, wr2, wr3
118: ee114180 waddbus wr4, wr1, wr0
11c: edc04100 wstrd wr4, [r0]
120: e12fff1e bx lr
The 'test1' function does not look good because it uses ARM
instructions to read the data one byte at a time and combine the
bytes. Function 'test2' looks a bit better because it now uses
WALIGNR, but this is still not an optimal solution. Ideally, if we
need to read N contiguous unaligned 64-bit values, this should require
only (N + 1) loads via WLDRD instructions and N fixups via WALIGNR;
also, the shift argument for WALIGNR has to be calculated only once.
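Something like the following shows the intended load pattern (again
totally untested, and 'add_saturate_u8' is just a made-up example
function rather than anything from pixman):

#include <mmintrin.h>
#include <stdint.h>

/* Add n unaligned 64-bit values from 'src' into the aligned 'dst'
   buffer with byte saturation, using (n + 1) aligned WLDRD loads and
   n WALIGNR fixups, with the shift amount calculated only once */
static void add_saturate_u8 (__m64 *dst, const char *src, int n)
{
    int align = (uintptr_t)src & 7;
    const __m64 *aligned_src = (const __m64 *)((uintptr_t)src & ~7);
    __m64 prev, next;
    int i;

    if (align == 0)
    {
        /* fast path: the source is already 8-byte aligned */
        for (i = 0; i < n; i++)
            dst[i] = _mm_adds_pu8 (dst[i], aligned_src[i]);
        return;
    }

    /* one extra aligned load before the loop; after that each
       iteration needs just one new load and one WALIGNR fixup */
    prev = aligned_src[0];
    for (i = 0; i < n; i++)
    {
        /* note: the final iteration reads a few bytes past the end
           of the source data, but stays within the same aligned
           8-byte granule, so it cannot cross a page boundary */
        next = aligned_src[i + 1];
        dst[i] = _mm_adds_pu8 (dst[i],
                               _mm_align_si64 (prev, next, align));
        prev = next;
    }
}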
--
Best regards,
Siarhei Siamashka