[Pixman] [PATCH 20/32] armv6: Add IN, IN_REVERSE, OUT and OUT_REVERSE combiners

Ben Avison bavison at riscosopen.org
Thu Aug 7 09:50:16 PDT 2014


lowlevel-blt-bench results for example operations, with and without masks,
none of which has a dedicated fast path at the time of writing:

in_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  11.0   0.0      16.1   0.1     100.0%      +46.6%
L2  10.7   0.1      15.6   0.2     100.0%      +46.0%
M   9.6    0.0      14.1   0.0     100.0%      +46.6%
HT  8.8    0.0      11.2   0.0     100.0%      +26.3%
VT  8.8    0.0      11.0   0.0     100.0%      +26.1%
R   8.5    0.0      10.6   0.0     100.0%      +25.3%
RT  5.4    0.0      6.0    0.1     100.0%      +11.0%

in_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.3    0.0      10.5   0.0     100.0%      +67.0%
L2  6.4    0.0      10.4   0.1     100.0%      +63.9%
M   5.9    0.0      9.8    0.0     100.0%      +65.9%
HT  5.4    0.0      7.3    0.0     100.0%      +36.0%
VT  5.3    0.0      7.2    0.0     100.0%      +35.7%
R   5.1    0.0      6.9    0.0     100.0%      +33.9%
RT  3.1    0.0      3.6    0.0     100.0%      +14.5%

inrev_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  11.0   0.0      15.5   0.1     100.0%      +40.9%
L2  10.7   0.1      15.0   0.2     100.0%      +40.6%
M   9.6    0.0      13.6   0.0     100.0%      +41.9%
HT  8.8    0.0      11.0   0.0     100.0%      +24.4%
VT  8.8    0.0      10.8   0.0     100.0%      +23.9%
R   8.5    0.0      10.4   0.0     100.0%      +23.0%
RT  5.4    0.1      6.0    0.1     100.0%      +10.9%

inrev_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.8    0.0      11.0   0.1     100.0%      +61.5%
L2  6.9    0.1      10.9   0.1     100.0%      +58.0%
M   6.3    0.0      10.2   0.0     100.0%      +61.2%
HT  5.7    0.0      7.6    0.0     100.0%      +32.2%
VT  5.7    0.0      7.5    0.0     100.0%      +32.5%
R   5.5    0.0      7.2    0.0     100.0%      +30.5%
RT  3.2    0.0      3.6    0.0     100.0%      +12.6%

out_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  10.8   0.0      15.6   0.1     100.0%      +44.6%
L2  10.4   0.1      15.3   0.1     100.0%      +46.7%
M   9.4    0.0      13.8   0.0     100.0%      +46.2%
HT  8.7    0.0      11.0   0.0     100.0%      +25.7%
VT  8.6    0.0      10.8   0.1     100.0%      +25.4%
R   8.4    0.0      10.4   0.0     100.0%      +24.3%
RT  5.4    0.0      5.7    0.1     100.0%      +6.4%

out_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.2    0.0      10.3   0.0     100.0%      +66.1%
L2  6.2    0.1      10.3   0.1     100.0%      +65.7%
M   5.6    0.2      9.7    0.0     100.0%      +72.9%
HT  5.3    0.0      7.2    0.0     100.0%      +36.1%
VT  5.3    0.0      7.1    0.0     100.0%      +35.7%
R   5.1    0.0      6.8    0.0     100.0%      +33.9%
RT  3.1    0.0      3.5    0.0     100.0%      +13.7%

outrev_n_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  9.8    0.0      15.3   0.1     100.0%      +55.4%
L2  9.5    0.1      15.0   0.2     100.0%      +57.4%
M   8.6    0.0      13.6   0.0     100.0%      +57.8%
HT  8.1    0.0      10.9   0.0     100.0%      +34.5%
VT  8.0    0.0      10.8   0.0     100.0%      +34.0%
R   7.8    0.0      10.3   0.0     100.0%      +32.6%
RT  5.1    0.1      5.8    0.1     100.0%      +13.3%

outrev_n_8_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  6.3    0.0      11.0   0.1     100.0%      +73.8%
L2  6.4    0.0      11.0   0.1     100.0%      +73.1%
M   5.4    0.2      10.2   0.0     100.0%      +87.7%
HT  5.4    0.0      7.6    0.0     100.0%      +40.4%
VT  5.4    0.0      7.5    0.0     100.0%      +40.0%
R   5.2    0.0      7.2    0.0     100.0%      +38.3%
RT  3.1    0.0      3.6    0.1     100.0%      +15.9%
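For reference, since the patch itself does not spell out the per-pixel arithmetic: below is a minimal C sketch (not part of the patch; the helper and function names are illustrative only) of what the unified IN combiner computes on premultiplied a8r8g8b8 pixels. It uses the same "+0x80, add high byte, shift" rounding that the assembly performs with MLA/UXTAB16 and the 0x00800080 constant held in Y. OUT, IN_REVERSE and OUT_REVERSE differ only in which alpha (destination or source, complemented or not) supplies the multiplier.

    #include <stdint.h>

    /* Hypothetical helper: multiply each 8-bit channel of a packed
     * a8r8g8b8 pixel x by the 8-bit factor a, rounding as x*a/255. */
    static uint32_t
    mul_8888_by_a (uint32_t x, uint32_t a)
    {
        uint32_t lo = (x & 0x00ff00ffu) * a + 0x00800080u;
        uint32_t hi = ((x >> 8) & 0x00ff00ffu) * a + 0x00800080u;

        lo = ((lo + ((lo >> 8) & 0x00ff00ffu)) >> 8) & 0x00ff00ffu;
        hi = ((hi + ((hi >> 8) & 0x00ff00ffu)) >> 8) & 0x00ff00ffu;

        return lo | (hi << 8);
    }

    /* Reference behaviour of the four combiners added here, per pixel
     * (mask == NULL for the unmasked variants):
     *
     *   IN:          dest = src  *        alpha(dest)
     *   IN_REVERSE:  dest = dest *        alpha(src)
     *   OUT:         dest = src  * (255 - alpha(dest))
     *   OUT_REVERSE: dest = dest * (255 - alpha(src))
     */
    static void
    combine_in_u_ref (uint32_t *dest, const uint32_t *src,
                      const uint32_t *mask, int width)
    {
        int i;

        for (i = 0; i < width; i++)
        {
            uint32_t s = src[i];

            /* Unified combiner: the mask contributes only its alpha. */
            if (mask)
                s = mul_8888_by_a (s, mask[i] >> 24);

            dest[i] = mul_8888_by_a (s, dest[i] >> 24);
        }
    }
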
---
 pixman/pixman-arm-simd-asm.S |  376 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    8 +
 2 files changed, 384 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 0c13a73..15eabe2 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -2048,3 +2048,379 @@ generate_composite_function_single_scanline \
     add_8888_8888_8888_process_tail
 
 /******************************************************************************/
+
+.macro inout_init
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        msr     CPSR_s, #0x50000
+        /* Point at alpha byte in source / mask / dest */
+ .if REVERSE
+        add     SRC, SRC, #3
+ .endif
+ .if WITH_MASK
+        add     MASK, MASK, #3
+ .endif
+ .if !REVERSE
+        add     DST, DST, #3
+ .endif
+        line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+.endm
+
+.macro inout_newline
+        ldr     Y, =0x00800080
+        mov     STRIDE_D, #0xff
+.endm
+
+.macro inout_1pixel  s, m, d, tmp, half, ff, offset
+        uxtb16  tmp, s
+ .if !IN_NOT_OUT
+        bic     d, ff, d
+ .endif
+        uxtb16  s, s, ror #8
+ .if WITH_MASK
+        mla     tmp, tmp, m, half
+        mla     s, s, m, half
+        uxtab16 tmp, tmp, tmp, ror #8
+        uxtab16 s, s, s, ror #8
+        uxtb16  tmp, tmp, ror #8
+        uxtb16  s, s, ror #8
+ .endif
+        mla     tmp, tmp, d, half
+        mla     s, s, d, half
+        uxtab16 tmp, tmp, tmp, ror #8
+        uxtab16 s, s, s, ror #8
+        mov     tmp, tmp, ror #8
+        sel     s, tmp, s
+        str     s, [DST, #offset]
+.endm
+
+.macro inout_2pixels  s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset
+        uxtb16  tmp0, s0
+        uxtb16  s0, s0, ror #8
+ .if !IN_NOT_OUT
+        bic     d0, ff, d0
+        bic     d1, ff, d1
+ .endif
+        uxtb16  tmp1, s1
+        uxtb16  s1, s1, ror #8
+ .if WITH_MASK
+        mla     tmp0, tmp0, m0, half
+        mla     s0, s0, m0, half
+        mla     tmp1, tmp1, m1, half
+        mla     s1, s1, m1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        uxtb16  tmp0, tmp0, ror #8
+        uxtb16  s0, s0, ror #8
+        uxtb16  tmp1, tmp1, ror #8
+        uxtb16  s1, s1, ror #8
+ .endif
+        mla     tmp0, tmp0, d0, half
+        mla     s0, s0, d0, half
+        mla     tmp1, tmp1, d1, half
+        mla     s1, s1, d1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 s0, s0, s0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 s1, s1, s1, ror #8
+        mov     tmp0, tmp0, ror #8
+        mov     tmp1, tmp1, ror #8
+        sel     s0, tmp0, s0
+        sel     s1, tmp1, s1
+        strd    s0, s1, [DST, #offset]
+.endm
+
+.macro inout_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes >= 8
+        pixld   , 8, 0, SRC, unaligned_src
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+        ldrb    STRIDE_M, [MASK], #4
+  .endif
+        ldrb    WK2, [DST], #4
+        ldrb    WK3, [DST], #4
+  .if numbytes == 16
+        inout_2pixels  WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -11
+        pixld   , 8, 0, SRC, unaligned_src
+   .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+        ldrb    STRIDE_M, [MASK], #4
+   .endif
+        ldrb    WK2, [DST], #4
+        ldrb    WK3, [DST], #4
+  .endif
+ .else // numbytes == 4
+        pixld   , 4, 0, SRC, unaligned_src
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+  .endif
+        ldrb    WK2, [DST], #4
+ .endif
+.endm
+
+.macro inout_process_tail  cond, numbytes, firstreg
+ .if numbytes >= 8
+        inout_2pixels  WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -11
+ .else // numbytes == 4
+        inout_1pixel  WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -7
+ .endif
+.endm
+
+.macro inout_reverse_1pixel  s, m, d, tmp, half, ff, offset
+ .if WITH_MASK
+        smlabb  s, s, m, half
+        uxtb16  tmp, d
+        uxtb16  d, d, ror #8
+        uxtab   s, s, s, ror #8
+  .if IN_NOT_OUT
+        and     s, ff, s, lsr #8
+  .else
+        bic     s, ff, s, lsr #8
+  .endif
+ .else
+  .if !IN_NOT_OUT
+        bic     s, ff, s
+  .endif
+        uxtb16  tmp, d
+        uxtb16  d, d, ror #8
+ .endif
+        mla     tmp, tmp, s, half
+        mla     d, d, s, half
+        uxtab16 tmp, tmp, tmp, ror #8
+        uxtab16 d, d, d, ror #8
+        mov     tmp, tmp, ror #8
+        sel     d, tmp, d
+        str     d, [DST, #offset]
+.endm
+
+.macro inout_reverse_2pixels  s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset
+ .if WITH_MASK
+        smlabb  s0, s0, m0, half
+        smlabb  s1, s1, m1, half
+        uxtb16  tmp0, d0
+        uxtb16  d0, d0, ror #8
+        uxtab   s0, s0, s0, ror #8
+        uxtab   s1, s1, s1, ror #8
+  .if IN_NOT_OUT
+        and     s0, ff, s0, lsr #8
+        and     s1, ff, s1, lsr #8
+  .else
+        bic     s0, ff, s0, lsr #8
+        bic     s1, ff, s1, lsr #8
+  .endif
+ .else
+  .if !IN_NOT_OUT
+        bic     s0, ff, s0
+        bic     s1, ff, s1
+  .endif
+        uxtb16  tmp0, d0
+        uxtb16  d0, d0, ror #8
+ .endif
+        uxtb16  tmp1, d1
+        uxtb16  d1, d1, ror #8
+        mla     tmp0, tmp0, s0, half
+        mla     d0, d0, s0, half
+        mla     tmp1, tmp1, s1, half
+        mla     d1, d1, s1, half
+        uxtab16 tmp0, tmp0, tmp0, ror #8
+        uxtab16 d0, d0, d0, ror #8
+        uxtab16 tmp1, tmp1, tmp1, ror #8
+        uxtab16 d1, d1, d1, ror #8
+        mov     tmp0, tmp0, ror #8
+        mov     tmp1, tmp1, ror #8
+        sel     d0, tmp0, d0
+        sel     d1, tmp1, d1
+        strd    d0, d1, [DST, #offset]
+.endm
+
+.macro inout_reverse_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        ldrb    WK0, [SRC], #4
+        ldrb    WK1, [SRC], #4
+        ldrb    SCRATCH, [SRC], #4
+        ldrb    ORIG_W, [SRC], #4
+  .if IN_NOT_OUT
+        and     WK2, WK0, WK1
+        and     WK2, WK2, SCRATCH
+        and     WK2, WK2, ORIG_W
+        bics    WK2, STRIDE_D, WK2
+  .else
+        orr     WK2, WK0, WK1
+        orr     WK2, WK2, SCRATCH
+        orrs    WK2, WK2, ORIG_W
+  .endif
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+        ldrb    STRIDE_M, [MASK], #4
+        ldrb    SCRATCH, [MASK], #4
+        ldrb    ORIG_W, [MASK], #4
+  .endif
+        ldrd    WK2, WK3, [DST], #16
+  .if WITH_MASK
+        bne     10f
+   .if IN_NOT_OUT
+        and     SCRATCH, SCRATCH, ORIG_W
+        and     SCRATCH, SCRATCH, STRIDE_S
+        and     SCRATCH, SCRATCH, STRIDE_M
+        bics    SCRATCH, STRIDE_D, SCRATCH
+   .else
+        orr     SCRATCH, SCRATCH, ORIG_W
+        orr     SCRATCH, SCRATCH, STRIDE_S
+        orrs    SCRATCH, SCRATCH, STRIDE_M
+   .endif
+  .endif
+        beq     20f
+10:
+        inout_reverse_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -16
+  .if IN_NOT_OUT && !WITH_MASK
+        ldrd    WK2, WK3, [DST, #-8]
+  .endif
+        ldrb    WK0, [SRC, #-8]
+        ldrb    WK1, [SRC, #-4]
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK, #-8]
+        ldrb    STRIDE_M, [MASK, #-4]
+  .endif
+  .if !(IN_NOT_OUT && !WITH_MASK)
+        ldrd    WK2, WK3, [DST, #-8]
+  .endif
+ .elseif numbytes == 8
+  .if IN_NOT_OUT && !WITH_MASK
+        ldrd    WK2, WK3, [DST], #8
+  .endif
+        ldrb    WK0, [SRC], #4
+        ldrb    WK1, [SRC], #4
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+        ldrb    STRIDE_M, [MASK], #4
+  .endif
+  .if !(IN_NOT_OUT && !WITH_MASK)
+        ldrd    WK2, WK3, [DST], #8
+  .endif
+ .else // numbytes == 4
+  .if IN_NOT_OUT && !WITH_MASK
+        ldr     WK2, [DST], #4
+  .endif
+        ldrb    WK0, [SRC], #4
+  .if WITH_MASK
+        ldrb    STRIDE_S, [MASK], #4
+  .endif
+  .if !(IN_NOT_OUT && !WITH_MASK)
+        ldr     WK2, [DST], #4
+  .endif
+ .endif
+.endm
+
+.macro inout_reverse_process_tail  cond, numbytes, firstreg
+ .if numbytes >= 8
+        inout_reverse_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -8
+ .else // numbytes == 4
+        inout_reverse_1pixel WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -4
+ .endif
+20:
+.endm
+
+.set REVERSE, 0
+.set IN_NOT_OUT, 0
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set IN_NOT_OUT, 1
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_process_head, \
+    inout_process_tail
+
+.set REVERSE, 1
+.set IN_NOT_OUT, 0
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+.set IN_NOT_OUT, 1
+.set WITH_MASK, 0
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_reverse_asm_armv6, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+.set WITH_MASK, 1
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_in_reverse_mask_asm_armv6, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    inout_init, \
+    inout_newline, \
+    nop_macro, /* cleanup */ \
+    inout_reverse_process_head, \
+    inout_reverse_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 69c46c5..7f7d8c0 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -82,6 +82,10 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                         uint32_t, uint32_t)
 
 PIXMAN_ARM_BIND_COMBINE_U (armv6, over_reverse)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, in)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, in_reverse)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, out)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
 
 void
@@ -309,6 +313,10 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
 
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = armv6_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = armv6_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = armv6_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = armv6_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = armv6_combine_out_reverse_u;
     imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
 
     imp->blt = arm_simd_blt;
-- 
1.7.5.4


