[Pixman] [PATCH 2/3] mmx: fix unaligned accesses

Sat Jul 23 19:28:13 PDT 2011

On Wed, Jul 20, 2011 at 10:29 PM,  <mattst88 at gmail.com> wrote:
> From: Matt Turner <mattst88 at gmail.com>
>
> Signed-off-by: Matt Turner <mattst88 at gmail.com>
> ---
>  pixman/pixman-mmx.c |  109 +++++++++++++++++++++++++++++++++++---------------
>  1 files changed, 76 insertions(+), 33 deletions(-)
>
> diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
> index 71fa18e..b9b09b6 100644
> --- a/pixman/pixman-mmx.c
> +++ b/pixman/pixman-mmx.c
> @@ -298,6 +298,22 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
>
>  #endif
>
> +/* Elemental unaligned loads */
> +
> +static __inline__ uint64_t ldq_u(uint64_t *p)
> +{
> +    struct __una_u64 { uint64_t x __attribute__((packed)); };
> +    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
> +    return ptr->x;
> +}

Here it makes sense to make sure that gcc generates efficient code
using WALIGNR instructions. I also tried the following sample program
with the '_mm_align_si64' intrinsic (totally untested though):

$ cat test_walignr.c

#include <mmintrin.h>
#include <stdint.h>

/*
 * Load a 64-bit __m64 value from an address that may not be 8-byte aligned.
 *
 * The pointer is reinterpreted as a pointer to a struct whose only member
 * is marked packed, so the compiler is told not to assume natural alignment
 * for the access and has to emit a load sequence that is safe for any
 * address.
 */
static inline __m64 ldq_u1 (__m64 *p)
{
    struct unaligned_m64
    {
        __m64 value __attribute__((packed));
    };

    return ((const struct unaligned_m64 *) p)->value;
}

/*
 * Load a 64-bit __m64 value from an address that may not be 8-byte aligned,
 * using only aligned loads.
 *
 * If p is already 8-byte aligned it is dereferenced directly, so nothing
 * beyond the requested 8 bytes is read. Otherwise the two aligned 64-bit
 * words that straddle the requested bytes are loaded and handed to
 * _mm_align_si64 together with the byte offset (1..7) of p inside the
 * first word, which extracts the wanted 8 bytes from the pair.
 */
static inline __m64 ldq_u2 (__m64 *p)
{
    /* Byte offset of p within its enclosing 8-byte aligned word. */
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
        return *p;
    /* Round p down to the previous 8-byte boundary. */
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return _mm_align_si64 (aligned_p[0], aligned_p[1], align);
}

void test1 (__m64 *dst, char *src)
{
    *dst = _mm_adds_pu8 (ldq_u1 ((__m64 *)(src + 0)),
                         ldq_u1 ((__m64 *)(src + 8)));
}

void test2 (__m64 *dst, char *src)
{
    *dst = _mm_adds_pu8 (ldq_u2 ((__m64 *)(src + 0)),
                         ldq_u2 ((__m64 *)(src + 8)));
}

$ arm-none-linux-gnueabi-gcc -march=iwmmxt -flax-vector-conversions -c
-O2 test_walignr.c
$ arm-none-linux-gnueabi-objdump -d -miwmmxt test_walignr.o

test_walignr.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <test1>:
   0:   e92d0ff0        push    {r4, r5, r6, r7, r8, r9, sl, fp}
   4:   e5d1300d        ldrb    r3, [r1, #13]
   8:   e24dd018        sub     sp, sp, #24
   c:   e5d18000        ldrb    r8, [r1]
  10:   e5d17009        ldrb    r7, [r1, #9]
  14:   e5d1c004        ldrb    ip, [r1, #4]
  18:   e5d14005        ldrb    r4, [r1, #5]
  1c:   e5d1200a        ldrb    r2, [r1, #10]
  20:   e5d1a001        ldrb    sl, [r1, #1]
  24:   e5d16008        ldrb    r6, [r1, #8]
  28:   e5d15002        ldrb    r5, [r1, #2]
  2c:   e5d1900c        ldrb    r9, [r1, #12]
  30:   e58d3004        str     r3, [sp, #4]
  34:   e5d1b003        ldrb    fp, [r1, #3]
  38:   e1866407        orr     r6, r6, r7, lsl #8
  3c:   e18c7404        orr     r7, ip, r4, lsl #8
  40:   e58db008        str     fp, [sp, #8]
  44:   e5d1300b        ldrb    r3, [r1, #11]
  48:   e5d1b006        ldrb    fp, [r1, #6]
  4c:   e59d4004        ldr     r4, [sp, #4]
  50:   e58d3010        str     r3, [sp, #16]
  54:   e5d1300e        ldrb    r3, [r1, #14]
  58:   e187c80b        orr     ip, r7, fp, lsl #16
  5c:   e59db008        ldr     fp, [sp, #8]
  60:   e58d3014        str     r3, [sp, #20]
  64:   e5d13007        ldrb    r3, [r1, #7]
  68:   e1899404        orr     r9, r9, r4, lsl #8
  6c:   e59d4014        ldr     r4, [sp, #20]
  70:   e58d300c        str     r3, [sp, #12]
  74:   e188340a        orr     r3, r8, sl, lsl #8
  78:   e1835805        orr     r5, r3, r5, lsl #16
  7c:   e1868802        orr     r8, r6, r2, lsl #16
  80:   e5d1100f        ldrb    r1, [r1, #15]
  84:   e1856c0b        orr     r6, r5, fp, lsl #24
  88:   e59d200c        ldr     r2, [sp, #12]
  8c:   e59db010        ldr     fp, [sp, #16]
  90:   e1893804        orr     r3, r9, r4, lsl #16
  94:   e18c7c02        orr     r7, ip, r2, lsl #24
  98:   e1888c0b        orr     r8, r8, fp, lsl #24
  9c:   e1839c01        orr     r9, r3, r1, lsl #24
  a0:   ec476001        tmcrr   wr1, r6, r7
  a4:   ec498002        tmcrr   wr2, r8, r9
  a8:   ee110182        waddbus wr0, wr1, wr2
  ac:   edc00100        wstrd   wr0, [r0]
  b0:   e28dd018        add     sp, sp, #24
  b4:   e8bd0ff0        pop     {r4, r5, r6, r7, r8, r9, sl, fp}
  b8:   e12fff1e        bx      lr

000000bc <test2>:
  bc:   e2113007        ands    r3, r1, #7
  c0:   1a000007        bne     e4 <test2+0x28>
  c4:   e2813008        add     r3, r1, #8
  c8:   e2132007        ands    r2, r3, #7
  cc:   edd11100        wldrd   wr1, [r1]
  d0:   1a00000b        bne     104 <test2+0x48>
  d4:   edd10102        wldrd   wr0, [r1, #8]
  d8:   ee114180        waddbus wr4, wr1, wr0
  dc:   edc04100        wstrd   wr4, [r0]
  e0:   e12fff1e        bx      lr
  e4:   e3c12007        bic     r2, r1, #7
  e8:   edd21100        wldrd   wr1, [r2]
  ec:   edd20102        wldrd   wr0, [r2, #8]
  f0:   ee083110        tmcr    wcgr0, r3
  f4:   e2813008        add     r3, r1, #8
  f8:   e2132007        ands    r2, r3, #7
  fc:   ee811020        walignr0        wr1, wr1, wr0
 100:   0afffff3        beq     d4 <test2+0x18>
 104:   e3c33007        bic     r3, r3, #7
 108:   edd32100        wldrd   wr2, [r3]
 10c:   edd33102        wldrd   wr3, [r3, #8]
 110:   ee092110        tmcr    wcgr1, r2
 114:   ee920023        walignr1        wr0, wr2, wr3
 118:   ee114180        waddbus wr4, wr1, wr0
 11c:   edc04100        wstrd   wr4, [r0]
 120:   e12fff1e        bx      lr

The 'test1' function does not look good because it uses ARM
instructions to read data one byte at a time and combine it. Function
'test2' looks a bit better because it now uses WALIGNR, but this is
still not an optimal solution. Ideally, reading N contiguous unaligned
64-bit values should require only (N + 1) loads via WLDRD instructions
and N fixups via WALIGNR, and the shift argument for WALIGNR should be
calculated only once.

-- 
Best regards,
Siarhei Siamashka