[Pixman] [PATCH 22/32] armv6: Add SRC combiner

Thu Aug 7 09:50:18 PDT 2014

The without-mask case reduces, as in the pixman-combine32.c version, to a
trivial memcpy call, but the with-mask case benefits from assembly code.

lowlevel-blt-bench results for src_8888_8_8888:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  15.5   0.1      39.6   0.5     100.0%      +155.1%
L2  11.1   0.2      27.2   0.6     100.0%      +144.9%
M   11.2   0.0      33.3   0.0     100.0%      +196.2%
HT  8.8    0.0      16.3   0.1     100.0%      +85.5%
VT  8.6    0.0      15.5   0.1     100.0%      +81.7%
R   8.3    0.0      14.8   0.1     100.0%      +78.7%
RT  4.3    0.0      5.7    0.1     100.0%      +30.9%
---
 pixman/pixman-arm-simd-asm.S |   86 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |   21 ++++++++++
 2 files changed, 107 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index aeb40dc..f61b715 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -212,6 +212,92 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro src_8888_8888_8888_init /* setup hook for the masked SRC scanline combiner */
+        /* Set GE[3:0] to 0101 so each SEL takes bytes 0 and 2 from its first source operand and bytes 1 and 3 from its second */
+        msr     CPSR_s, #0x50000 /* bits 16 and 18 of the CPSR are GE[0] and GE[2] */
+        /* Point at alpha byte in mask (byte offset 3 of each 32-bit mask pixel) */
+        add     MASK, MASK, #3
+        line_saved_regs STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W /* reused below: STRIDE_D holds the rounding constant, the other three are working registers */
+.endm
+
+.macro src_8888_8888_8888_newline /* passed as the newline hook to generate_composite_function_single_scanline */
+        ldr     STRIDE_D, =0x00800080 /* 0x80 in each halfword; handed to the pixel macros as their `half` (rounding) argument */
+.endm
+
+.macro src_8888_8888_8888_1pixel  s, m, d, tmp, half /* d = every byte of s scaled by m/255 (rounded); callers pass an ldrb-loaded byte as m and 0x00800080 as half; clobbers s and tmp; d may be the same register as tmp */
+        uxtb16  tmp, s /* tmp = bytes 0 and 2 of s, one per halfword */
+        uxtb16  s, s, ror #8 /* s = bytes 1 and 3 of s, one per halfword */
+        mla     tmp, tmp, m, half /* t = x * m + 0x80 in each halfword (fits in 16 bits because x and m are at most 255) */
+        mla     s, s, m, half
+        uxtab16 tmp, tmp, tmp, ror #8 /* t += t >> 8 in each halfword; the high byte of each halfword is now the rounded x * m / 255 */
+        uxtab16 s, s, s, ror #8
+        mov     tmp, tmp, ror #8 /* move the results for bytes 0 and 2 down into byte lanes 0 and 2 */
+        sel     d, tmp, s /* with GE[3:0] = 0101 (set in src_8888_8888_8888_init): bytes 0 and 2 from tmp, bytes 1 and 3 from s */
+.endm
+
+.macro src_8888_8888_8888_2pixels  s0, s1, m0, m1, d0, d1, tmp0, tmp1, half /* same arithmetic as src_8888_8888_8888_1pixel for two pixels, with the two instruction streams interleaved; clobbers s0, s1, tmp0, tmp1 */
+        uxtb16  tmp0, s0 /* tmpN = bytes 0 and 2 of sN, one per halfword */
+        uxtb16  s0, s0, ror #8 /* sN = bytes 1 and 3 of sN, one per halfword */
+        uxtb16  tmp1, s1
+        uxtb16  s1, s1, ror #8
+        mla     tmp0, tmp0, m0, half /* t = x * m + 0x80 in each halfword; last reads of m0/m1 happen in this group, so d0/d1 may alias them */
+        mla     s0, s0, m0, half
+        mla     tmp1, tmp1, m1, half
+        mla     s1, s1, m1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8 /* t += t >> 8 in each halfword; high byte of each halfword = rounded x * m / 255 */
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        mov     tmp0, tmp0, ror #8 /* move the results for bytes 0 and 2 down into byte lanes 0 and 2 */
+        mov     tmp1, tmp1, ror #8
+        sel     d0, tmp0, s0 /* with GE[3:0] = 0101 (set in src_8888_8888_8888_init): bytes 0 and 2 from tmpN, bytes 1 and 3 from sN */
+        sel     d1, tmp1, s1
+.endm
+
+.macro src_8888_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* load stage for numbytes of pixels; only the numbytes argument is used in this body */
+ .if numbytes == 16
+        ldm     SRC!, {STRIDE_S, STRIDE_M, WK0, WK1} /* four source pixels; NOTE(review): LDM fills registers in register-number order, so this assumes the aliases are numbered in list order - confirm in the header */
+        ldrb    WK2, [MASK], #4 /* mask alpha for pixel 0; the post-increment steps to the alpha byte of the next mask pixel */
+        ldrb    WK3, [MASK], #4 /* mask alpha for pixel 1 */
+        src_8888_8888_8888_2pixels STRIDE_S, STRIDE_M, WK2, WK3, WK2, WK3, SCRATCH, ORIG_W, STRIDE_D /* pixels 0 and 1 are completed here; results overwrite their mask bytes in WK2 and WK3 */
+        ldrb    STRIDE_S, [MASK], #4 /* mask alpha for pixel 2 (STRIDE_S and STRIDE_M no longer hold source pixels) */
+        ldrb    STRIDE_M, [MASK], #4 /* mask alpha for pixel 3; pixels 2 and 3 are multiplied in process_tail */
+ .elseif numbytes == 8
+        ldm     SRC!, {STRIDE_S, STRIDE_M} /* two source pixels */
+        ldrb    WK2, [MASK], #4 /* mask alpha for pixel 0 */
+        ldrb    WK3, [MASK], #4 /* mask alpha for pixel 1 */
+ .else // numbytes == 4
+        ldr     STRIDE_S, [SRC], #4 /* one source pixel */
+        ldrb    WK2, [MASK], #4 /* its mask alpha */
+ .endif
+.endm
+
+.macro src_8888_8888_8888_process_tail  cond, numbytes, firstreg /* multiply-and-store stage matching process_head; only the numbytes argument is used in this body */
+ .if numbytes == 16
+        src_8888_8888_8888_2pixels WK0, WK1, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, SCRATCH, ORIG_W, STRIDE_D /* pixels 2 and 3; destinations double as the temporaries */
+        stm     DST!, {WK2, WK3, SCRATCH, ORIG_W} /* pixels 0 and 1 were finished in process_head; NOTE(review): STM stores in register-number order, so this assumes the aliases are numbered in list order - confirm in the header */
+ .elseif numbytes == 8
+        src_8888_8888_8888_2pixels STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, SCRATCH, ORIG_W, STRIDE_D /* both pixels; destinations double as the temporaries */
+        stm     DST!, {SCRATCH, ORIG_W}
+ .else // numbytes == 4
+        src_8888_8888_8888_1pixel STRIDE_S, WK2, SCRATCH, SCRATCH, STRIDE_D /* SCRATCH is both temporary and destination */
+        str     SCRATCH, [DST], #4
+ .endif
+20: /* NOTE(review): nothing visible in this patch branches to this label - confirm whether it is still needed */
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_src_mask_asm_armv6, 32, 32, 32, /* symbol declared in pixman-arm-simd.c, then three bpp values (all 32) */ \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, /* the process macros never load DST and issue the stores themselves */ \
+    2, /* prefetch distance */ \
+    src_8888_8888_8888_init, /* init */ \
+    src_8888_8888_8888_newline, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_8888_8888_8888_process_head, /* process head */ \
+    src_8888_8888_8888_process_tail /* process tail */
+
+/******************************************************************************/
+
 .macro src_0565_8888_init
         /* Hold loop invariants in MASK and STRIDE_M */
         ldr     MASK, =0x07E007E0
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index b17266e..f938342 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -81,6 +81,26 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
+void /* generated in pixman-arm-simd-asm.S by generate_composite_function_single_scanline */
+pixman_composite_scanline_src_mask_asm_armv6 (int32_t         w,     /* number of 32-bit pixels in the scanline */
+                                              uint32_t       *dst,   /* destination pixels (written only) */
+                                              const uint32_t *src,   /* source pixels */
+                                              const uint32_t *mask); /* mask pixels; the assembly reads the byte at offset 3 of each */
+
+/* SRC combiner: plain copy when there is no mask, ARMv6 assembly scanline when there is. */
+static void
+armv6_combine_src_u (pixman_implementation_t *imp, pixman_op_t op,
+                     uint32_t *dest, const uint32_t *src,
+                     const uint32_t *mask, int width)
+{
+    const size_t n_bytes = width * sizeof (uint32_t);
+
+    if (!mask)
+        memcpy (dest, src, n_bytes);
+    else
+        pixman_composite_scanline_src_mask_asm_armv6 (width, dest, src, mask);
+}
+
 PIXMAN_ARM_BIND_COMBINE_U (armv6, over)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, over_reverse)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, in)
@@ -313,6 +333,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 
+    imp->combine_32[PIXMAN_OP_SRC] = armv6_combine_src_u;
     imp->combine_32[PIXMAN_OP_OVER] = armv6_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = armv6_combine_over_reverse_u;
     imp->combine_32[PIXMAN_OP_IN] = armv6_combine_in_u;
-- 
1.7.5.4