[Pixman] [PATCH 26/37] armv6: Add fetcher for a8r8g8b8 nearest-neighbour transformed images
Ben Avison
bavison at riscosopen.org
Sun Sep 14 13:10:12 PDT 2014

This is constrained to support X increments in the positive X direction only,
which means it handles scaled images (except those reflected in the Y axis)
plus parallelogram transformations that preserve the direction of the X axis.
It also doesn't attempt to support any form of image repeat.
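
As a concrete illustration, a source image set up along the following lines
satisfies those constraints (nearest filter, positive X unit, zero Y unit, no
repeat) and so is a candidate for the new fetcher. This is only a hedged
sketch using the public pixman API: the helper name, image size and scale
factor are invented for illustration, and whether the samples additionally
cover the clip depends on the geometry of the composite call.

  #include <pixman.h>

  /* Sketch only: build a 2x nearest-neighbour enlargement of an a8r8g8b8
   * source.  The image transform maps destination coordinates to source
   * coordinates, so an enlargement uses a scale factor below 1; with a pure
   * scale, matrix[0][0] > 0 (positive X unit) and matrix[1][0] == 0
   * (zero Y unit), as the fetcher requires.
   */
  static pixman_image_t *
  make_scaled_source (uint32_t *bits, int width, int height, int stride_bytes)
  {
      pixman_image_t    *src;
      pixman_transform_t t;

      src = pixman_image_create_bits (PIXMAN_a8r8g8b8, width, height,
                                      bits, stride_bytes);

      pixman_transform_init_scale (&t,
                                   pixman_double_to_fixed (0.5),
                                   pixman_double_to_fixed (0.5));
      pixman_image_set_transform (src, &t);

      pixman_image_set_filter (src, PIXMAN_FILTER_NEAREST, NULL, 0);
      pixman_image_set_repeat (src, PIXMAN_REPEAT_NONE);

      return src;
  }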

With this optimisation, some operations constructed from fetcher and combiner
calls using general_composite_rect() now outperform the versions constructed
from the FAST_NEAREST macros in pixman-fast-path.c, but unfortunately the
FAST_NEAREST ones have higher priority in fast-path lookup. Here are some
benchmarks for the in_reverse_8888_8888 operation, which is not affected by
that prioritisation issue:
lowlevel-blt-bench -n in_reverse_8888_8888:

        Before          After
        Mean   StdDev   Mean   StdDev   Confidence  Change
L1      10.2   0.0      27.1   0.2      100.0%      +164.8%
L2      8.2    0.1      23.0   0.4      100.0%      +179.2%
M       8.3    0.0      24.8   0.0      100.0%      +200.3%
HT      5.5    0.0      12.7   0.0      100.0%      +129.9%
VT      5.4    0.0      12.1   0.0      100.0%      +123.2%
R       5.4    0.0      11.9   0.1      100.0%      +122.7%
RT      2.8    0.0      5.4    0.1      100.0%      +91.9%

affine-bench for 5 different scaling factors:

        Before          After
        Mean   StdDev   Mean   StdDev   Confidence  Change
0.5     11.1   0.0      28.3   0.0      100.0%      +155.1%
0.75    10.5   0.0      26.4   0.0      100.0%      +152.2%
1.0     9.9    0.0      24.6   0.0      100.0%      +147.5%
1.5     9.0    0.0      21.8   0.0      100.0%      +141.4%
2.0     8.3    0.0      19.7   0.0      100.0%      +138.4%
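
For reference, the in_reverse_8888_8888 case benchmarked above corresponds to
a composite of roughly the following shape (again a hedged sketch with
invented sizes, reusing the make_scaled_source() helper from the earlier
sketch). Per the commit message, this operation is not affected by the
fast-path priority issue, so it exercises the new fetcher via
general_composite_rect():

  /* Sketch only: composite a nearest-scaled a8r8g8b8 source onto an
   * a8r8g8b8 destination with the IN_REVERSE operator.  Buffer sizes are
   * arbitrary; error handling is omitted.
   */
  static void
  run_in_reverse_once (void)
  {
      static uint32_t src_bits[100 * 100];
      static uint32_t dst_bits[200 * 200];
      pixman_image_t *src, *dst;

      src = make_scaled_source (src_bits, 100, 100, 100 * 4);
      dst = pixman_image_create_bits (PIXMAN_a8r8g8b8, 200, 200,
                                      dst_bits, 200 * 4);

      pixman_image_composite32 (PIXMAN_OP_IN_REVERSE, src, NULL, dst,
                                0, 0, 0, 0, 0, 0, 200, 200);

      pixman_image_unref (src);
      pixman_image_unref (dst);
  }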
---
pixman/pixman-arm-common.h | 14 ++
pixman/pixman-arm-simd-asm-scaled.S | 9 +
pixman/pixman-arm-simd-asm-scaled.h | 367 +++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 70 +++++++
4 files changed, 460 insertions(+), 0 deletions(-)
create mode 100644 pixman/pixman-arm-simd-asm-scaled.h
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index f4632b2..a4d4ea4 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -455,6 +455,20 @@ cputype##_combine_##name##_u (pixman_implementation_t *imp, \
/*****************************************************************************/
+#define PIXMAN_ARM_NEAREST_AFFINE_FLAGS \
+ (FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT | \
+ FAST_PATH_NEAREST_FILTER | \
+ FAST_PATH_HAS_TRANSFORM | \
+ FAST_PATH_AFFINE_TRANSFORM)
+
+#define PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS \
+ (PIXMAN_ARM_NEAREST_AFFINE_FLAGS | \
+ FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | \
+ FAST_PATH_X_UNIT_POSITIVE | \
+ FAST_PATH_Y_UNIT_ZERO)
+
#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name) \
void \
pixman_get_scanline_##name##_asm_##cputype (int32_t w, \
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index e050292..2c7e091 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -38,6 +38,7 @@
.p2align 2
#include "pixman-arm-asm.h"
+#include "pixman-arm-simd-asm-scaled.h"
/*
* Note: This code is only using armv5te instructions (not even armv6),
@@ -154,3 +155,11 @@ generate_nearest_scanline_func \
generate_nearest_scanline_func \
pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
+
+/******************************************************************************/
+
+generate_nearest_scaled_cover_function \
+ pixman_get_scanline_nearest_scaled_cover_a8r8g8b8_asm_armv6, 32, \
+ 3, 3 /* prefetch distances */, nop_macro, nop_macro
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd-asm-scaled.h b/pixman/pixman-arm-simd-asm-scaled.h
new file mode 100644
index 0000000..fb6eb44
--- /dev/null
+++ b/pixman/pixman-arm-simd-asm-scaled.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright © 2014 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison at riscosopen.org)
+ */
+
+.set log2_32, 5
+.set log2_16, 4
+.set log2_8, 3
+.set log2_4, 2
+.set log2_2, 1
+.set log2_1, 0
+
+.macro ldrx bpp, cond, tail
+ .if \bpp == 32
+ ldr\cond \tail
+ .elseif \bpp == 16
+ ldr\cond\()h \tail
+ .elseif \bpp == 8
+ ldr\cond\()b \tail
+ .else
+ .error "Input bits per pixel not supported"
+ .endif
+.endm
+
+.macro branch cond, label1, label2
+ .ifnc "\label1", ""
+ b\cond \label1
+ .else
+ b\cond \label2
+ .endif
+.endm
+
+.macro nearest_scaled_cover_enlarge_mask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store
+ .ifnc \mask_hint, mask_is_0
+ teq VALID, #1
+ .ifc \convert, nop_macro
+ .ifnc \mask_hint, mask_is_non_0
+ ittt ne
+ teqne \reg, #0
+ .else
+ itt ne
+ .endif
+ ldrx \bpp, ne, <PIXEL, [SRC]>
+ movne VALID, #1
+ .else
+ .ifnc \mask_hint, mask_is_non_0
+ it ne
+ teqne \reg, #0
+ .endif
+ beq 1101f
+ ldrx \bpp,, <PIXEL, [SRC]>
+ mov VALID, #1
+ \convert PIXEL, TMP
+1101:
+ .endif
+ .endif
+ adds ACCUM, ACCUM, UX
+ .ifnc \mask_hint, mask_is_0
+ mov \reg, PIXEL
+ .endif
+ \store
+ branch cc, \exit_label, 1103f
+ add SRC, SRC, #\bpp/8
+ mov VALID, #0
+ tst SRC, #31
+ branch ne, \exit_label, 1103f
+ subs PLDS, PLDS, #32
+ branch lt, \exit_label, 1103f
+ pld [SRC, #prefetch_distance_src*32]
+1103:
+.endm
+
+.macro nearest_scaled_cover_enlarge_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store
+ adds ACCUM, ACCUM, UX
+ mov \reg, PIXEL
+ \store
+ branch cc, \exit_label, 1203f
+ .ifnc "\may_be_final",""
+ teq COUNT, #0
+ ldrx \bpp, ne, <PIXEL, [SRC, #\bpp/8]!!>
+ .else
+ ldrx \bpp,, <PIXEL, [SRC, #\bpp/8]!!>
+ .endif
+ tst SRC, #31
+ \convert PIXEL, TMP
+ branch ne, \exit_label, 1203f
+ subs PLDS, PLDS, #32
+ branch lt, \exit_label, 1203f
+ pld [SRC, #prefetch_distance_src*32]
+1203:
+.endm
+
+.macro nearest_scaled_cover_reduce_mask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store
+ add PTR, SRC, XHI, lsl #log2_\bpp - 3
+ mov TMP, XHI
+ adds XLO, XLO, UX, lsl #16
+ adc XHI, XHI, UX, lsr #16
+ .ifc "\mask_hint",""
+ teq \reg, #0
+ .ifnc \convert, nop_macro
+ beq 1301f
+ ldrx \bpp,, <\reg, [PTR]>
+ .else
+ ldrx \bpp, ne, <\reg, [PTR]>
+ .endif
+ eor TMP, TMP, XHI
+ bics TMP, TMP, #255/\bpp
+ \convert \reg, TMP
+ .ifnc \convert, nop_macro
+ b 1302f
+1301: eor TMP, TMP, XHI
+ bics TMP, TMP, #255/\bpp
+1302:
+ .endif
+ .else
+ .ifc \mask_hint, mask_is_non_0
+ ldrx \bpp,, <\reg, [PTR]>
+ .endif
+ eor TMP, TMP, XHI
+ bics TMP, TMP, #255/\bpp
+ .ifc \mask_hint, mask_is_non_0
+ \convert \reg, TMP
+ .endif
+ .endif
+ \store
+ branch eq, \exit_label, 1303f
+ subs PLDS, PLDS, #32
+ branch lt, \exit_label, 1303f
+ bic PTR, PTR, #31 @ base of *previous* cacheline
+ pld [PTR, #(prefetch_distance_src+1)*32]
+1303:
+.endm
+
+.macro nearest_scaled_cover_reduce_nomask_innerloop bpp, reg, convert, mask_hint, may_be_final, exit_label, store
+ add PTR, SRC, XHI, lsl #log2_\bpp - 3
+ mov TMP, XHI
+ adds XLO, XLO, UX, lsl #16
+ adc XHI, XHI, UX, lsr #16
+ ldrx \bpp,, <\reg, [PTR]>
+ eor TMP, TMP, XHI
+ bics TMP, TMP, #255/\bpp
+ \convert \reg, TMP
+ \store
+ branch eq, \exit_label, 1403f
+ subs PLDS, PLDS, #32
+ branch lt, \exit_label, 1403f
+ bic PTR, PTR, #31 @ base of *previous* cacheline
+ pld [PTR, #(prefetch_distance_src+1)*32]
+1403:
+.endm
+
+.macro process1 bpp, has_mask, disable_prefetch, inner_loop, convert
+ .if \has_mask
+ ldr WK0, [MASK], #4
+ .if !\disable_prefetch
+ tst MASK, #31
+ bne 1501f
+ pld [MASK, #prefetch_distance_mask*32]
+1501:
+ .endif
+ teq WK0, #0
+ bne 1502f
+ \inner_loop \bpp, WK0, \convert, mask_is_0, 1, 1503f, <add DST, DST, #4>
+ b 1503f
+ .endif
+1502: \inner_loop \bpp, WK0, \convert, mask_is_non_0, 1,, <str WK0, [DST], #4>
+1503:
+.endm
+
+.macro process4 bpp, has_mask, disable_mask_prefetch, inner_loop, convert
+ .if \has_mask
+ ldmia MASK!, {WK0-WK3}
+ .if !\disable_mask_prefetch
+ bic TMP, MASK, #31
+ pld [TMP, #prefetch_distance_mask*32]
+ .endif
+ orr WK0, WK0, WK1
+ orr WK2, WK2, WK3
+ orrs WK0, WK0, WK2
+ bne 1601f
+ \inner_loop \bpp, WK0, \convert, mask_is_0
+ \inner_loop \bpp, WK1, \convert, mask_is_0
+ \inner_loop \bpp, WK2, \convert, mask_is_0
+ \inner_loop \bpp, WK3, \convert, mask_is_0, 1, 1602f, <add DST, DST, #4*4>
+ b 1602f
+ .endif
+1601: \inner_loop \bpp, WK0, \convert
+ \inner_loop \bpp, WK1, \convert
+ \inner_loop \bpp, WK2, \convert
+ \inner_loop \bpp, WK3, \convert,, 1,, <stmia DST!!, {WK0,WK1,WK2,WK3}>
+1602:
+.endm
+
+.macro process bpp, has_mask, inner_loop, convert
+ cmp COUNT, #2 * 4 - 1 - 1 @ guaranteed at least one aligned half-cacheline output?
+ blo 1706f
+ tst DST, #15
+ beq 1702f
+1701: process1 \bpp, \has_mask, 0, \inner_loop, \convert
+ sub COUNT, COUNT, #1
+ tst DST, #15
+ bne 1701b
+1702: sub COUNT, COUNT, #4 - 1
+ tst MASK, #16
+ beq 1704f
+1703: process4 \bpp, \has_mask, 0, \inner_loop, \convert
+ subs COUNT, COUNT, #4
+ bcc 1705f
+1704: process4 \bpp, \has_mask, 1, \inner_loop, \convert
+ subs COUNT, COUNT, #4
+ bcs 1703b
+1705: adds COUNT, COUNT, #4 - 1
+ bcc 1707f
+ @ drop through...
+1706: process1 \bpp, \has_mask, 1, \inner_loop, \convert
+ subs COUNT, COUNT, #1
+ bcs 1706b
+1707: pop {r4-r11, pc}
+.endm
+
+.macro generate_nearest_scaled_cover_function fname, \
+ bpp, \
+ prefetch_distance_src_, \
+ prefetch_distance_mask_, \
+ init, \
+ convert
+
+/* void fname(uint32_t width,
+ * pixman_fixed_t x,
+ * pixman_fixed_t ux,
+ * uint32_t *dest,
+ * const uint32_t *source,
+ * const uint32_t *mask);
+ */
+pixman_asm_function fname
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set prefetch_distance_src, prefetch_distance_src_
+ .set prefetch_distance_mask, prefetch_distance_mask_
+
+/*
+ * Assign symbolic names to registers
+ */
+COUNT .req a1
+X .req a2
+ACCUM .req a2 @ enlarge only
+XLO .req a2 @ reduce only
+UX .req a3
+DST .req a4
+SRC .req v1
+MASK .req v2
+PLDS .req v3
+PIXEL .req v4 @ enlarge only
+XHI .req v4 @ reduce only
+WK0 .req v5
+WK1 .req v6
+WK2 .req sl
+WK3 .req fp
+VALID .req ip @ enlarge-with-mask only
+PTR .req ip @ reduce only
+TMP .req lr
+
+ mov ip, sp
+ push {r4-r11, lr} /* save all registers */
+ ldmia ip, {SRC, MASK}
+ subs COUNT, COUNT, #1
+ blo 1807f-4
+ \init
+ mla WK2, COUNT, UX, X
+ bics WK0, MASK, #31
+ beq 1801f
+ @ Use a simplified preload process for the mask,
+ @ without a braking distance.
+ .set OFFSET, 0
+ .rept prefetch_distance_mask + 1
+ pld [WK0, #OFFSET]
+ .set OFFSET, OFFSET + 32
+ .endr
+1801:
+ add WK0, SRC, X, lsr #16 - (log2_\bpp - 3)
+ bic WK0, WK0, #31
+ pld [WK0]
+ add WK2, SRC, WK2, lsr #16 - (log2_\bpp - 3)
+ bic WK2, WK2, #31
+ add WK1, WK0, #prefetch_distance_src*32
+ subs PLDS, WK2, WK1
+ movcc WK1, WK2
+1802: add WK0, WK0, #32
+ cmp WK0, WK1
+ bhi 1803f
+ pld [WK0]
+ b 1802b
+1803:
+ cmp UX, #0x10000
+ bhs 1805f
+ @ Enlarge
+ add SRC, X, lsr #16 - (log2_\bpp - 3)
+ mov ACCUM, X, lsl #16
+ mov UX, UX, lsl #16
+ bic SRC, SRC, #(\bpp-1)/8
+ teq MASK, #0
+ beq 1804f
+ mov VALID, #0
+ process \bpp, 1, nearest_scaled_cover_enlarge_mask_innerloop, \convert
+1804:
+ ldrx \bpp,, <PIXEL, [SRC]>
+ \convert PIXEL, TMP
+ process \bpp, 0, nearest_scaled_cover_enlarge_nomask_innerloop, \convert
+
+1805: @ Reduce
+ and TMP, SRC, #31
+ bic SRC, SRC, #31
+ mov XHI, X, lsr #16
+ mov XLO, X, lsl #16
+ add XHI, XHI, TMP, lsr #log2_\bpp - 3
+ teq MASK, #0
+ beq 1806f
+ process \bpp, 1, nearest_scaled_cover_reduce_mask_innerloop, \convert
+1806: process \bpp, 0, nearest_scaled_cover_reduce_nomask_innerloop, \convert
+1807:
+
+ .unreq COUNT
+ .unreq X
+ .unreq ACCUM
+ .unreq XLO
+ .unreq UX
+ .unreq DST
+ .unreq SRC
+ .unreq MASK
+ .unreq PLDS
+ .unreq PIXEL
+ .unreq XHI
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ .unreq VALID
+ .unreq PTR
+ .unreq TMP
+.endfunc
+.endm
+
+.macro nop_macro x:vararg
+.endm
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index f028794..0a4daa7 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -116,6 +116,68 @@ PIXMAN_ARM_BIND_WRITE_BACK (armv6, r5g6b5)
PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a1r5g5b5)
PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a8)
+#define PIXMAN_IMAGE_GET_SCALED(image, unscaled_x, unscaled_y, type, stride, out_bits, scaled_x, scaled_y, uxx, uxy, uyy) \
+ do \
+ { \
+ pixman_image_t *__image__ = (image); \
+ pixman_fixed_t __offset__ = pixman_int_to_fixed (unscaled_x) + pixman_fixed_1 / 2; \
+ pixman_fixed_t __line__ = pixman_int_to_fixed (unscaled_y) + pixman_fixed_1 / 2; \
+ pixman_fixed_t __x__, __y__; \
+ int64_t __x64__, __y64__; \
+ pixman_fixed_t (*__matrix__)[3] = __image__->common.transform->matrix; \
+ \
+ __x64__ = (int64_t) __matrix__[0][0] * (__offset__ & 0xFFFF); \
+ __x64__ += (int64_t) __matrix__[0][1] * (__line__ & 0xFFFF); \
+ __x__ = (__x64__ + 0x8000) >> 16; \
+ __x__ += __matrix__[0][0] * (__offset__ >> 16); \
+ __x__ += __matrix__[0][1] * (__line__ >> 16); \
+ __x__ += __matrix__[0][2]; \
+ __y64__ = (int64_t) __matrix__[1][1] * (__line__ & 0xFFFF); \
+ __y__ = (__y64__ + 0x8000) >> 16; \
+ __y__ += __matrix__[1][1] * (__line__ >> 16); \
+ __y__ += __matrix__[1][2]; \
+ \
+ (stride) = __image__->bits.rowstride * (int) sizeof (uint32_t) / (int) sizeof (type); \
+ (out_bits) = (type *)__image__->bits.bits; \
+ (scaled_x) = __x__; \
+ (scaled_y) = __y__; \
+ (uxx) = __matrix__[0][0]; \
+ (uxy) = __matrix__[0][1]; \
+ (uyy) = __matrix__[1][1]; \
+ } while (0)
+
+#define BIND_GET_SCANLINE_NEAREST_SCALED_COVER(cputype, name, type) \
+void \
+pixman_get_scanline_nearest_scaled_cover_##name##_asm_##cputype ( \
+ uint32_t width, \
+ pixman_fixed_t x, \
+ pixman_fixed_t ux, \
+ uint32_t *dest, \
+ const type *source, \
+ const uint32_t *mask); \
+ \
+static uint32_t * \
+cputype##_get_scanline_nearest_scaled_cover_##name (pixman_iter_t *iter, \
+ const uint32_t *mask) \
+{ \
+ int stride; \
+ type *bits, *src; \
+ pixman_fixed_t x, y, uxx, uxy, uyy; \
+ \
+ PIXMAN_IMAGE_GET_SCALED (iter->image, iter->x, iter->y++, type, \
+ stride, bits, x, y, uxx, uxy, uyy); \
+ \
+ (void) uxy; \
+ (void) uyy; \
+ src = bits + stride * pixman_fixed_to_int (y - pixman_fixed_e); \
+ pixman_get_scanline_nearest_scaled_cover_##name##_asm_##cputype ( \
+ iter->width, x - pixman_fixed_e, uxx, iter->buffer, src, mask); \
+ \
+ return iter->buffer; \
+}
+
+BIND_GET_SCANLINE_NEAREST_SCALED_COVER (armv6, a8r8g8b8, uint32_t)
+
void
pixman_composite_src_n_8888_asm_armv6 (int32_t w,
int32_t h,
@@ -340,6 +402,14 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
static const pixman_iter_info_t arm_simd_iters[] =
{
+ { PIXMAN_a8r8g8b8,
+ PIXMAN_ARM_NEAREST_SCALED_COVER_FLAGS,
+ ITER_NARROW | ITER_SRC,
+ NULL,
+ armv6_get_scanline_nearest_scaled_cover_a8r8g8b8,
+ NULL
+ },
+
{ PIXMAN_r5g6b5,
(FAST_PATH_STANDARD_FLAGS |
FAST_PATH_ID_TRANSFORM |
--
1.7.5.4