[Pixman] [PATCH 12/32] armv6: Add over_8888_8_0565 fast path

Ben Avison bavison at riscosopen.org
Thu Aug 7 09:50:08 PDT 2014


lowlevel-blt-bench results:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  5.2    0.0      20.0   0.2     100.0%      +281.7%
L2  4.5    0.0      16.2   0.2     100.0%      +256.9%
M   4.5    0.0      18.8   0.0     100.0%      +321.1%
HT  3.9    0.0      10.9   0.0     100.0%      +177.6%
VT  3.9    0.0      10.6   0.0     100.0%      +171.5%
R   3.8    0.0      10.0   0.0     100.0%      +165.1%
RT  2.3    0.0      4.9    0.1     100.0%      +107.7%
---
 pixman/pixman-arm-simd-asm.S |  181 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    5 +
 2 files changed, 186 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 6c77fd3..43d3c63 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1402,3 +1402,184 @@ generate_composite_function \
     in_8888_8_process_tail
 
 /******************************************************************************/
+
+@ Once-per-operation setup.  This blend needs nine work registers
+@ (WK0-WK8), so the framework's stride/scratch registers are re-aliased
+@ as WK2-WK8; their live values are saved/restored around each line via
+@ line_saved_regs.  Y is re-used as HALF, holding the 0x00800080 rounding
+@ constant loaded by the _newline macro.  BITMSK5/BITMSK6 are further
+@ aliases of WK7/WK8 for the r5g6b5 repack constants.
+.macro over_8888_8_0565_init
+        .unreq  WK2
+        .unreq  WK3
+        HALF    .req    Y
+        WK2     .req    STRIDE_D
+        WK3     .req    STRIDE_S
+        WK4     .req    STRIDE_M
+        WK5     .req    SCRATCH
+        WK6     .req    ORIG_W
+        WK7     .req    r10
+        WK8     .req    r11
+        BITMSK5 .req    WK7
+        BITMSK6 .req    WK8
+        line_saved_regs  Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+.endm
+
+@ Start-of-scanline setup: (re)load the register-resident constants.
+@ BITMSK5/BITMSK6 are also reloaded mid-line by _2pixels_tail, which
+@ borrows their registers as scratch.
+@   HALF    = 0x00800080  per-halfword +0x80 rounding term for the
+@                         mla/uxtab16 approximate division by 255
+@   BITMSK5 = 0x001f001f  two 5-bit field masks (r5g6b5 red/blue repack)
+@   BITMSK6 = 0xfc00fc00  shifted 6-bit field mask (r5g6b5 green)
+.macro over_8888_8_0565_newline
+        ldr     HALF, =0x00800080
+        ldr     BITMSK5, =0x001f001f
+        ldr     BITMSK6, =0xfc00fc00
+.endm
+
+@ Once-per-operation teardown: drop every alias created by _init and
+@ re-point WK2/WK3 at r10/r11 — presumably the framework's default
+@ assignment that _init released with .unreq.
+.macro over_8888_8_0565_cleanup
+        .unreq  HALF
+        .unreq  WK2
+        .unreq  WK3
+        .unreq  WK4
+        .unreq  WK5
+        .unreq  WK6
+        .unreq  WK7
+        .unreq  WK8
+        .unreq  BITMSK5
+        .unreq  BITMSK6
+        WK2     .req    r10
+        WK3     .req    r11
+.endm
+
+@ Load half of the 2-pixel pipeline: fetch two 32-bit source pixels
+@ (WK0/WK1), two mask bytes (WK2/WK3) and one word = two r5g6b5
+@ destination pixels (WK4), advancing SRC by 8, MASK by 2 and DST by 4.
+@ The matching _2pixels_tail consumes these registers.
+.macro over_8888_8_0565_2pixels_head
+        ldmia   SRC!, {WK0, WK1}
+        ldrb    WK2, [MASK], #1
+        ldrb    WK3, [MASK], #1
+        @ Because we'll be writing the destination in sub-cacheline
+        @ chunks either way, it needs to be preloaded, so there's no
+        @ penalty in loading its existing value even if it's unused
+        ldr     WK4, [DST], #4
+.endm
+
+@ COUNTER counts expansions of the _2pixels_tail macro below so a literal
+@ pool can be emitted periodically (see the .ltorg handling inside it).
+.set COUNTER, 0
+
+@ Arithmetic half of the 2-pixel pipeline.  On entry (set up by the
+@ matching _2pixels_head): WK0/WK1 = two a8r8g8b8 source pixels,
+@ WK2/WK3 = two a8 mask bytes, WK4 = two r5g6b5 destination pixels.
+@ Applies the mask to the source; unless both masked pixels are fully
+@ opaque, blends them OVER the destination pixels; repacks to r5g6b5 and
+@ stores the pair back at DST-4.
+.macro over_8888_8_0565_2pixels_tail
+        uxtb16  WK5, WK0                    @ 00000000rrrrrrrr00000000bbbbbbbb
+        uxtb16  WK0, WK0, ror #8            @ 00000000aaaaaaaa00000000gggggggg
+        uxtb16  WK6, WK1                    @ 00000000RRRRRRRR00000000BBBBBBBB
+        uxtb16  WK1, WK1, ror #8            @ 00000000AAAAAAAA00000000GGGGGGGG
+        @ multiply each widened channel pair by its mask byte; HALF adds
+        @ the +0x80 rounding term, and the uxtab16 ...,ror #8 instructions
+        @ below complete the approximate division by 255
+        mla     WK5, WK5, WK2, HALF         @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+        mla     WK0, WK0, WK2, HALF         @ aaaaaaaaaaaaaaaagggggggggggggggg
+        mla     WK6, WK6, WK3, HALF         @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
+        mla     WK1, WK1, WK3, HALF         @ AAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+        uxtab16 WK5, WK5, WK5, ror #8
+        uxtab16 WK0, WK0, WK0, ror #8
+        uxtab16 WK6, WK6, WK6, ror #8
+        uxtab16 WK1, WK1, WK1, ror #8
+        @ WK2/WK3 := 255 - alpha of each masked pixel; if both are zero
+        @ the destination contributes nothing, so branch to the repack at 1:
+        mov     WK3, #0xFF
+        eor     WK2, WK3, WK0, lsr #24      @ 000000000000000000000000aaaaaaaa
+        eors    WK3, WK3, WK1, lsr #24      @ 000000000000000000000000AAAAAAAA
+        pkhbt   WK0, WK0, WK1, lsl #16      @ GGGGGGGGGGGGGGGGgggggggggggggggg
+        @ NOTE(review): the IT block makes the conditional teqeq valid when
+        @ assembled as Thumb; it should emit nothing in ARM state — confirm
+        it      eq
+        teqeq   WK2, #0
+        beq     1f  @ skip processing of existing dest buffer if both pixels opaque
+        bic     WK1, WK4, BITMSK6, lsr #5   @ RRRRR000000BBBBBrrrrr000000bbbbb
+        and     WK7, BITMSK6, WK4, lsl #5   @ GGGGGG0000000000gggggg0000000000
+        mov     WK4, WK1, lsl #16           @ rrrrr000000bbbbb0000000000000000
+        orr     WK7, WK7, WK7, lsr #6       @ GGGGGGGGGGGG0000gggggggggggg0000
+        bic     WK1, WK1, WK4, lsr #16      @ RRRRR000000BBBBB0000000000000000
+        orr     WK4, WK4, WK4, lsr #5       @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        orr     WK1, WK1, WK1, lsr #5       @ RRRRRRRRRR0BBBBBBBBBB00000000000
+        pkhtb   WK4, WK4, WK4, asr #5       @ rrrrrrrrrr0xxxxxbbbbbbbbbb000000
+        pkhtb   WK1, WK1, WK1, asr #5       @ RRRRRRRRRR0xxxxxBBBBBBBBBB000000
+        uxtb16  WK7, WK7, ror #8            @ 00000000GGGGGGGG00000000gggggggg
+        uxtb16  WK4, WK4, ror #8            @ 00000000rrrrrrrr00000000bbbbbbbb
+        uxtb16  WK1, WK1, ror #8            @ 00000000RRRRRRRR00000000BBBBBBBB
+        smlabb  WK8, WK7, WK2, HALF         @ 00000000x0000000gggggggggggggggg
+        smlatb  WK7, WK7, WK3, HALF         @ 00000000x0000000GGGGGGGGGGGGGGGG
+        mla     WK4, WK4, WK2, HALF         @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+        mla     WK1, WK1, WK3, HALF         @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
+        pkhbt   WK3, WK8, WK7, lsl #16      @ GGGGGGGGGGGGGGGGgggggggggggggggg
+        @ BITMSK5/BITMSK6 alias WK7/WK8, which were used as scratch just
+        @ above, so reload them for the final repack
+        ldr     BITMSK5, =0x001f001f
+        ldr     BITMSK6, =0xfc00fc00
+        @ the ldr =constant forms need a literal pool within range; since
+        @ this macro can be expanded many times back-to-back, force a pool
+        @ (branched over, as we are mid-routine) every 16th expansion
+ .if COUNTER == 16
+  .set COUNTER, 0
+        b       0f
+        .ltorg
+0:
+ .else
+  .set COUNTER, COUNTER + 1
+ .endif
+        uxtab16 WK4, WK4, WK4, ror #8
+        uxtab16 WK1, WK1, WK1, ror #8
+        uxtab16 WK3, WK3, WK3, ror #8
+        @ We could have used 16-bit saturating adds here for greater
+        @ precision, but then it wouldn't be binary identical to the C version
+        uqadd8  WK5, WK4, WK5               @ rrrrrrrrxxxxxxxxbbbbbbbbxxxxxxxx
+        uqadd8  WK6, WK1, WK6               @ RRRRRRRRxxxxxxxxBBBBBBBBxxxxxxxx
+        uqadd8  WK0, WK3, WK0               @ GGGGGGGGxxxxxxxxggggggggxxxxxxxx
+1:      and     WK5, BITMSK5, WK5, lsr #11  @ 00000000000rrrrr00000000000bbbbb
+        and     WK6, BITMSK5, WK6, lsr #11  @ 00000000000RRRRR00000000000BBBBB
+        orr     WK5, WK5, WK5, lsr #5       @ 00000000000xxxxxrrrrr000000bbbbb
+        orr     WK6, WK6, WK6, lsr #5       @ 00000000000xxxxxRRRRR000000BBBBB
+        and     WK0, BITMSK6, WK0           @ GGGGGG0000000000gggggg0000000000
+        pkhbt   WK3, WK5, WK6, lsl #16      @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     WK0, WK3, WK0, lsr #5       @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+        str     WK0, [DST, #-4]
+.endm
+
+@ Framework hook: load "numbytes" (2, 4, 8 or 16) bytes of output worth of
+@ source/mask/destination data.  Each _2pixels_head/_tail pair covers 4
+@ destination bytes (two 565 pixels).  The final _2pixels_head is
+@ deliberately left without its _tail here — process_tail supplies it,
+@ presumably so the last pair's loads can overlap preceding arithmetic.
+@ The cond/unaligned_src/unaligned_mask/preload arguments are unused by
+@ this implementation.
+.macro over_8888_8_0565_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 2
+        @ single trailing pixel: one source word, one mask byte, one
+        @ destination halfword (consumed by process_tail's scalar path)
+        ldr     WK0, [SRC], #4
+        ldrb    WK1, [MASK], #1
+        ldrh    WK2, [DST], #2
+ .else
+  .if numbytes >= 8
+   .if numbytes == 16
+        over_8888_8_0565_2pixels_head
+        over_8888_8_0565_2pixels_tail
+        over_8888_8_0565_2pixels_head
+        over_8888_8_0565_2pixels_tail
+   .endif
+        over_8888_8_0565_2pixels_head
+        over_8888_8_0565_2pixels_tail
+  .endif
+        over_8888_8_0565_2pixels_head
+ .endif
+.endm
+
+@ Framework hook: finish the pixels fetched by process_head.  Every size
+@ except 2 bytes simply runs the deferred _2pixels_tail.  The 2-byte case
+@ is a scalar version of the same blend for one pixel: WK0 = source,
+@ WK1 = mask byte, WK2 = 565 destination value; the result is stored with
+@ a halfword write at DST-2.
+.macro over_8888_8_0565_process_tail  cond, numbytes, firstreg
+ .if numbytes == 2
+        uxtb16  WK3, WK0                    @ 00000000rrrrrrrr00000000bbbbbbbb
+        uxtb16  WK0, WK0, ror #8            @ 00000000aaaaaaaa00000000gggggggg
+        mla     WK3, WK3, WK1, HALF         @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+        mla     WK0, WK0, WK1, HALF         @ aaaaaaaaaaaaaaaagggggggggggggggg
+        mov     WK1, #0xFF
+        uxtab16 WK3, WK3, WK3, ror #8
+        uxtab16 WK0, WK0, WK0, ror #8
+        @ WK1 := 255 - alpha; Z set means the masked pixel is fully opaque
+        eors    WK1, WK1, WK0, lsr #24      @ 000000000000000000000000aaaaaaaa
+        beq     1f  @ skip processing of existing dest buffer if pixel opaque
+        bic     WK4, WK2, BITMSK6, lsr #5   @ 0000000000000000rrrrr000000bbbbb
+        and     WK2, BITMSK6, WK2, lsl #5   @ 0000000000000000gggggg0000000000
+        mov     WK4, WK4, lsl #16           @ rrrrr000000bbbbb0000000000000000
+        orr     WK4, WK4, WK4, lsr #5       @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        pkhtb   WK4, WK4, WK4, asr #5       @ rrrrrrrrrr0xxxxxbbbbbbbbbb000000
+        orr     WK2, WK2, WK2, lsr #6       @ 0000000000000000gggggggggggg0000
+        uxtb16  WK4, WK4, ror #8            @ 00000000rrrrrrrr00000000bbbbbbbb
+        uxtb    WK2, WK2, ror #8            @ 000000000000000000000000gggggggg
+        mla     WK4, WK4, WK1, HALF         @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
+        smlabb  WK2, WK2, WK1, HALF         @ 00000000x0000000gggggggggggggggg
+        uxtab16 WK4, WK4, WK4, ror #8
+        uxtab   WK2, WK2, WK2, ror #8
+        uqadd8  WK3, WK4, WK3               @ rrrrrrrrxxxxxxxxbbbbbbbbxxxxxxxx
+        uqadd8  WK0, WK2, WK0               @ xxxxxxxxxxxxxxxxggggggggxxxxxxxx
+1:      and     WK3, BITMSK5, WK3, lsr #11  @ 00000000000rrrrr00000000000bbbbb
+        and     WK0, BITMSK6, WK0           @ xxxxxx0000000000gggggg0000000000
+        orr     WK3, WK3, WK3, lsr #5       @ 00000000000xxxxxrrrrr000000bbbbb
+        orr     WK0, WK3, WK0, lsr #5       @ 000000xxxxxxxxxxrrrrrggggggbbbbb
+        strh    WK0, [DST, #-2]
+ .else
+        over_8888_8_0565_2pixels_tail
+ .endif
+.endm
+
+@ Instantiate the scanline function via the common armv6 framework:
+@ 32bpp source, 8bpp mask, 16bpp destination, prefetch distance 2.
+@ The flags reflect what the macros above actually do: the destination is
+@ read as well as written; the PSR is corrupted (eors/teqeq); the process
+@ macros issue their own stores and clobber WK0; and the line variables
+@ named in line_saved_regs must be spilled because _init re-aliases them.
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_armv6, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+    2, /* prefetch distance */ \
+    over_8888_8_0565_init, \
+    over_8888_8_0565_newline, \
+    over_8888_8_0565_cleanup, \
+    over_8888_8_0565_process_head, \
+    over_8888_8_0565_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 76770bc..63bf320 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -69,6 +69,9 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca,
                                       uint32_t, 1, uint32_t, 1)
 
+/* Generate the C wrapper (armv6_composite_over_8888_8_0565) around the
+ * assembly routine: 32bpp source, 8bpp mask, 16bpp destination. */
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(armv6, over_8888_8_0565,
+                                       uint32_t, 1, uint8_t, 1, uint16_t, 1)
+
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
                                         uint16_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
@@ -245,6 +248,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+    /* OVER with 8888 source, a8 mask, 565 dest — both RGB and BGR channel
+     * orders route to the same armv6 implementation */
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, r5g6b5, armv6_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, armv6_composite_over_8888_8_0565),
 
     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, armv6_composite_over_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
-- 
1.7.5.4



More information about the Pixman mailing list