[Pixman] [PATCH 11/32] armv6: Add in_8888_8 fast path

Ben Avison bavison at riscosopen.org
Thu Aug 7 09:50:07 PDT 2014


This is used instead of the equivalent C fast path.
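
For reference, the operation this accelerates is a PIXMAN_OP_IN composite
from an a8r8g8b8 (or a8b8g8r8) source to an a8 destination: each destination
byte is multiplied by the corresponding source pixel's alpha and divided by
255 with correct rounding. A minimal scalar C model of one pixel
(illustrative only, not part of the patch), mirroring the rounding the
assembly performs with HALF = 0x80 and the two final shifts:

    #include <stdint.h>

    /* dest = src.alpha * dest / 255, rounded exactly as the assembly
     * does it: t = a*d + 0x80; t += t >> 8; result = t >> 8. */
    static uint8_t
    in_8888_8_pixel (uint32_t src, uint8_t dest)
    {
        uint32_t a = src >> 24;        /* alpha byte, cf. ldrb from SRC+3 */
        uint32_t t = a * dest + 0x80;  /* cf. smlabb ..., HALF */
        t += t >> 8;                   /* cf. add ..., lsr #8 */
        return (uint8_t) (t >> 8);     /* cf. mov ..., lsr #8 */
    }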

lowlevel-blt-bench results, compared to no fast path at all:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  12.4   0.1      117.5  2.3     100.0%      +851.2%
L2  9.5    0.1      46.9   2.4     100.0%      +393.8%
M   9.6    0.0      61.9   0.9     100.0%      +544.0%
HT  7.9    0.0      26.6   0.5     100.0%      +238.6%
VT  7.7    0.0      24.2   0.4     100.0%      +212.5%
R   7.4    0.0      22.4   0.4     100.0%      +204.5%
RT  4.1    0.0      8.7    0.2     100.0%      +109.4%
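
These figures come from pixman's lowlevel-blt-bench harness (results should
be in megapixels per second). Assuming a built tree, something like the
following ought to reproduce them, the argument being the fast path's
pattern name:

    $ make -C test lowlevel-blt-bench
    $ test/lowlevel-blt-bench in_8888_8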
---
 pixman/pixman-arm-simd-asm.S |  111 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    4 ++
 2 files changed, 115 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 08d6709..6c77fd3 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1291,3 +1291,114 @@ generate_composite_function \
     over_n_0565_process_tail
 
 /******************************************************************************/
+
+.macro in_8888_8_init
+        SRC0    .req   Y
+        SRC1    .req   STRIDE_D
+        SRC2    .req   STRIDE_S
+        SRC3    .req   MASK
+        HALF    .req   STRIDE_M
+        TMP     .req   ORIG_W
+        line_saved_regs  Y, STRIDE_D, STRIDE_S, ORIG_W
+        ldr     SCRATCH, =0x00800080
+        mov     HALF, #0x80
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        /* Offset the source pointer: we only need the alpha bytes */
+        add     SRC, SRC, #3
+.endm
+
+.macro in_8888_8_cleanup
+        .unreq  SRC0
+        .unreq  SRC1
+        .unreq  SRC2
+        .unreq  SRC3
+        .unreq  HALF
+        .unreq  TMP
+.endm
+
+.macro in_8888_8_4pixels_head dst
+        ldr     TMP, [DST], #4
+        ldrb    SRC0, [SRC], #12
+        ldrb    SRC3, [SRC], #-4
+        ldrb    SRC2, [SRC], #-4
+        uxtb16  WK&dst, TMP
+        uxtb16  TMP, TMP, ror #8
+        ldrb    SRC1, [SRC], #12
+        smlabb  SRC0, SRC0, WK&dst, HALF
+        smlabt  SRC3, SRC3, TMP, HALF
+        smlabt  SRC2, SRC2, WK&dst, HALF
+        smlabb  SRC1, SRC1, TMP, HALF
+        orr     WK&dst, SRC0, SRC2, lsl #16
+        /* There'd be a stall here if immediately followed by orr, so
+         * fill it with something like a preload if possible */
+.endm
+
+.macro in_8888_8_4pixels_tail dst
+        orr     TMP, SRC1, SRC3, lsl #16
+        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        uxtab16 TMP, TMP, TMP, ror #8
+        mov     WK&dst, WK&dst, ror #8
+        sel     WK&dst, WK&dst, TMP
+.endm
+
+.macro in_8888_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 1
+        ldrb    WK3, [DST], #1
+        ldrb    SRC0, [SRC], #4
+ .elseif numbytes == 2
+        ldrb    WK3, [DST], #1
+        ldrb    SRC0, [SRC], #4
+        ldrb    TMP, [DST], #1
+        ldrb    SRC1, [SRC], #4
+ .else
+  .if numbytes >= 8
+   .if numbytes == 16
+        in_8888_8_4pixels_head 0
+        in_8888_8_4pixels_tail 0
+        in_8888_8_4pixels_head 1
+    .if preload
+        PF  bic,    SCRATCH, SRC, #31
+        PF  pld,    [SCRATCH, #32*prefetch_distance]
+    .endif
+        in_8888_8_4pixels_tail 1
+   .endif
+        in_8888_8_4pixels_head 2
+        in_8888_8_4pixels_tail 2
+  .endif
+        in_8888_8_4pixels_head 3
+ .endif
+.endm
+
+.macro in_8888_8_process_tail  cond, numbytes, firstreg
+ .if numbytes == 1
+        smlabb  WK3, SRC0, WK3, HALF
+        add     WK3, WK3, WK3, lsr #8
+        mov     WK3, WK3, lsr #8
+        strb    WK3, [DST, #-1]
+ .elseif numbytes == 2
+        smlabb  WK3, SRC0, WK3, HALF
+        smlabb  TMP, SRC1, TMP, HALF
+        add     WK3, WK3, WK3, lsr #8
+        add     TMP, TMP, TMP, lsr #8
+        mov     WK3, WK3, lsr #8
+        mov     TMP, TMP, lsr #8
+        strb    WK3, [DST, #-2]
+        strb    TMP, [DST, #-1]
+ .else
+        in_8888_8_4pixels_tail 3
+        pixst   , numbytes, (4-numbytes/4), DST
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_in_8888_8_asm_armv6, 32, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
+    3, /* prefetch distance */ \
+    in_8888_8_init, \
+    nop_macro, /* newline */ \
+    in_8888_8_cleanup, \
+    in_8888_8_process_head, \
+    in_8888_8_process_tail
+
+/******************************************************************************/
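
For reviewers, a rough C model of the four-pixel kernel above (illustrative
only, not part of the patch) may help. It mirrors the same steps: uxtb16
splits the four destination bytes into even/odd 16-bit lanes, the
smlabb/smlabt multiplies form a*d + 0x80 per pixel, uxtab16 with a ror #8
operand adds t >> 8 into each lane to complete the rounded division by 255,
and sel merges the even and odd results. The init macro's uadd8 of
0x00800080 with itself makes 0x80 + 0x80 carry out of bytes 0 and 2, setting
GE[3:0] to 0101, which is why sel takes bytes 0 and 2 from its first operand
and bytes 1 and 3 from its second.

    #include <stdint.h>

    /* a0..a3: source alpha bytes; dst_word: four a8 destination pixels,
     * least significant byte first.  Returns the four results packed the
     * same way. */
    static uint32_t
    in_8888_8_four_pixels (uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3,
                           uint32_t dst_word)
    {
        uint32_t even = dst_word & 0x00ff00ff;        /* uxtb16: d0, d2 */
        uint32_t odd  = (dst_word >> 8) & 0x00ff00ff; /* uxtb16 ror #8: d1, d3 */

        uint32_t t0 = a0 * (even & 0xffff) + 0x80;    /* smlabb */
        uint32_t t2 = a2 * (even >> 16)    + 0x80;    /* smlabt */
        uint32_t t1 = a1 * (odd & 0xffff)  + 0x80;    /* smlabb */
        uint32_t t3 = a3 * (odd >> 16)     + 0x80;    /* smlabt */

        uint32_t even_t = t0 | (t2 << 16);            /* orr in _head */
        uint32_t odd_t  = t1 | (t3 << 16);            /* orr in _tail */

        /* uxtab16 x, x, x, ror #8: per-lane t += t >> 8 (no cross-lane
         * carry, since t <= 255*255 + 0x80) */
        even_t += (even_t >> 8) & 0x00ff00ff;
        odd_t  += (odd_t  >> 8) & 0x00ff00ff;

        /* ror #8 leaves the even results in bytes 0 and 2; the odd
         * results already sit in bytes 1 and 3 of odd_t */
        even_t = (even_t >> 8) | (even_t << 24);

        /* sel with GE[3:0] = 0101 */
        return (even_t & 0x00ff00ff) | (odd_t & 0xff00ff00);
    }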
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 31f960d..76770bc 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -48,6 +48,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_8888_8,
+                                   uint32_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
 
@@ -260,6 +262,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
+    PIXMAN_STD_FAST_PATH (IN, a8r8g8b8, null, a8, armv6_composite_in_8888_8),
+    PIXMAN_STD_FAST_PATH (IN, a8b8g8r8, null, a8, armv6_composite_in_8888_8),
     PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888),
     PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888),
     PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888),
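
For context, once these table entries are registered, an OP_IN composite
like the following sketch (illustrative, not part of the patch) is
dispatched to pixman_composite_in_8888_8_asm_armv6 on ARMv6-capable CPUs:

    #include <stdint.h>
    #include <pixman.h>

    /* src_bits: a8r8g8b8 pixels; dst_bits: a8 pixels (pixman takes the
     * bits pointer as uint32_t * for all formats).  Strides in bytes. */
    void
    composite_in_example (uint32_t *src_bits, int src_stride,
                          uint32_t *dst_bits, int dst_stride,
                          int width, int height)
    {
        pixman_image_t *src = pixman_image_create_bits (PIXMAN_a8r8g8b8,
                                                        width, height,
                                                        src_bits, src_stride);
        pixman_image_t *dst = pixman_image_create_bits (PIXMAN_a8,
                                                        width, height,
                                                        dst_bits, dst_stride);

        pixman_image_composite32 (PIXMAN_OP_IN, src, NULL, dst,
                                  0, 0, 0, 0, 0, 0, width, height);

        pixman_image_unref (src);
        pixman_image_unref (dst);
    }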
-- 
1.7.5.4