[Pixman] [PATCH 18/32] armv6: Add ADD combiner

Ben Avison bavison at riscosopen.org
Thu Aug 7 09:50:14 PDT 2014


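Add ARMv6 scanline routines for the ADD operator
(pixman_composite_scanline_add_asm_armv6 and the masked variant
pixman_composite_scanline_add_mask_asm_armv6) and install
armv6_combine_add_u as the 32-bit combiner for PIXMAN_OP_ADD.

lowlevel-blt-bench results for two example operations, one without a mask
and one with a mask, neither of which has a dedicated fast path at the time
of writing: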

add_n_1555

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  11.6   0.1      13.7   0.1     100.0%      +18.4%
L2  11.6   0.1      13.6   0.2     100.0%      +18.1%
M   10.5   0.0      12.5   0.0     100.0%      +19.2%
HT  9.3    0.0      10.1   0.0     100.0%      +8.1%
VT  9.3    0.0      10.0   0.0     100.0%      +7.9%
R   8.9    0.0      9.6    0.0     100.0%      +7.7%
RT  5.4    0.1      5.4    0.1     96.2%       -0.5%  (insignificant)

add_n_8_1555

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.2    0.0      9.3    0.0     100.0%      +49.2%
L2  6.2    0.0      9.2    0.1     100.0%      +47.9%
M   5.7    0.0      8.8    0.0     100.0%      +52.7%
HT  5.3    0.0      6.8    0.0     100.0%      +27.8%
VT  5.3    0.0      6.7    0.0     100.0%      +26.7%
R   5.1    0.0      6.4    0.0     100.0%      +26.1%
RT  3.1    0.0      3.3    0.0     100.0%      +8.7%
---
 pixman/pixman-arm-simd-asm.S |   95 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    4 ++
 2 files changed, 99 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 37e9f33..ce0edfc 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -426,6 +426,16 @@ generate_composite_function \
     add_8_8_process_head, \
     add_8_8_process_tail
 
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    add_8_8_process_head, \
+    add_8_8_process_tail
+
 /******************************************************************************/
 
 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
@@ -1698,3 +1708,88 @@ generate_composite_function \
     in_n_8888_process_tail
 
 /******************************************************************************/
+
+.macro add_8888_8888_8888_init
+        /* Set GE[3:0] to 0101 so SEL picks bytes 0,2 from Rn and bytes 1,3 from Rm */
+        msr     CPSR_s, #0x50000
+        /* Point at alpha bytes in mask */
+        add     MASK, MASK, #3
+        line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+.endm
+
+.macro add_8888_8888_8888_newline
+        ldr     Y, =0x00800080
+.endm
+
+.macro add_8888_8888_8888_1pixel_head  d, s, m
+        ldr     s, [SRC], #4
+        ldrb    m, [MASK], #4
+        ldr     WK&d, [DST], #4
+.endm
+
+.macro add_8888_8888_8888_1pixel_tail  d, s, m, tmp, half
+        mul_8888_8 s, m, tmp, half
+        uqadd8  WK&d, WK&d, s
+.endm
+
+.macro add_8888_8888_8888_2pixels_head  d0, d1, s0, s1, tmp0, tmp1, half
+        ldm     SRC!, {s0, s1}
+        ldrb    WK&d0, [MASK], #4
+        ldrb    WK&d1, [MASK], #4
+        uxtb16  tmp0, s0
+        uxtb16  s0, s0, ror #8
+        uxtb16  tmp1, s1
+        uxtb16  s1, s1, ror #8
+        mla     tmp0, tmp0, WK&d0, half
+        mla     s0, s0, WK&d0, half
+        mla     tmp1, tmp1, WK&d1, half
+        mla     s1, s1, WK&d1, half
+        ldm     DST!, {WK&d0, WK&d1}
+.endm
+
+.macro add_8888_8888_8888_2pixels_tail  d0, d1, s0, s1, tmp0, tmp1
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        mov     tmp0, tmp0, ror #8
+        mov     tmp1, tmp1, ror #8
+        sel     s0, tmp0, s0
+        sel     s1, tmp1, s1
+        uqadd8  WK&d0, WK&d0, s0
+        uqadd8  WK&d1, WK&d1, s1
+.endm
+
+.macro add_8888_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        add_8888_8888_8888_2pixels_head %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W, Y
+        add_8888_8888_8888_2pixels_tail %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+        add_8888_8888_8888_2pixels_head %(firstreg+2), %(firstreg+3), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W, Y
+ .elseif numbytes == 8
+        add_8888_8888_8888_2pixels_head %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W, Y
+ .else // numbytes == 4
+        add_8888_8888_8888_1pixel_head firstreg, STRIDE_S, STRIDE_M
+ .endif
+.endm
+
+.macro add_8888_8888_8888_process_tail  cond, numbytes, firstreg
+ .if numbytes == 16
+        add_8888_8888_8888_2pixels_tail %(firstreg+2), %(firstreg+3), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+ .elseif numbytes == 8
+        add_8888_8888_8888_2pixels_tail %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+ .else // numbytes == 4
+        add_8888_8888_8888_1pixel_tail firstreg, STRIDE_S, STRIDE_M, ORIG_W, Y
+ .endif
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER, \
+    2, /* prefetch distance */ \
+    add_8888_8888_8888_init, \
+    add_8888_8888_8888_newline, \
+    nop_macro, /* cleanup */ \
+    add_8888_8888_8888_process_head, \
+    add_8888_8888_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 8bdda82..834995a 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -81,6 +81,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
+PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
+
 void
 pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
                                        int32_t   h,
@@ -305,6 +307,8 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 
+    imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
+
     imp->blt = arm_simd_blt;
     imp->fill = arm_simd_fill;
 
-- 
1.7.5.4



More information about the Pixman mailing list