[Pixman] [PATCH 20/32] armv6: Add IN, IN_REVERSE, OUT and OUT_REVERSE combiners
Ben Avison
bavison at riscosopen.org
Thu Aug 7 09:50:16 PDT 2014
lowlevel-blt-bench results for example operations, with and without masks,
none of which has a dedicated fast path at the time of writing:
in_n_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 11.0 0.0 16.1 0.1 100.0% +46.6%
L2 10.7 0.1 15.6 0.2 100.0% +46.0%
M 9.6 0.0 14.1 0.0 100.0% +46.6%
HT 8.8 0.0 11.2 0.0 100.0% +26.3%
VT 8.8 0.0 11.0 0.0 100.0% +26.1%
R 8.5 0.0 10.6 0.0 100.0% +25.3%
RT 5.4 0.0 6.0 0.1 100.0% +11.0%
in_n_8_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 6.3 0.0 10.5 0.0 100.0% +67.0%
L2 6.4 0.0 10.4 0.1 100.0% +63.9%
M 5.9 0.0 9.8 0.0 100.0% +65.9%
HT 5.4 0.0 7.3 0.0 100.0% +36.0%
VT 5.3 0.0 7.2 0.0 100.0% +35.7%
R 5.1 0.0 6.9 0.0 100.0% +33.9%
RT 3.1 0.0 3.6 0.0 100.0% +14.5%
inrev_n_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 11.0 0.0 15.5 0.1 100.0% +40.9%
L2 10.7 0.1 15.0 0.2 100.0% +40.6%
M 9.6 0.0 13.6 0.0 100.0% +41.9%
HT 8.8 0.0 11.0 0.0 100.0% +24.4%
VT 8.8 0.0 10.8 0.0 100.0% +23.9%
R 8.5 0.0 10.4 0.0 100.0% +23.0%
RT 5.4 0.1 6.0 0.1 100.0% +10.9%
inrev_n_8_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 6.8 0.0 11.0 0.1 100.0% +61.5%
L2 6.9 0.1 10.9 0.1 100.0% +58.0%
M 6.3 0.0 10.2 0.0 100.0% +61.2%
HT 5.7 0.0 7.6 0.0 100.0% +32.2%
VT 5.7 0.0 7.5 0.0 100.0% +32.5%
R 5.5 0.0 7.2 0.0 100.0% +30.5%
RT 3.2 0.0 3.6 0.0 100.0% +12.6%
out_n_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 10.8 0.0 15.6 0.1 100.0% +44.6%
L2 10.4 0.1 15.3 0.1 100.0% +46.7%
M 9.4 0.0 13.8 0.0 100.0% +46.2%
HT 8.7 0.0 11.0 0.0 100.0% +25.7%
VT 8.6 0.0 10.8 0.1 100.0% +25.4%
R 8.4 0.0 10.4 0.0 100.0% +24.3%
RT 5.4 0.0 5.7 0.1 100.0% +6.4%
out_n_8_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 6.2 0.0 10.3 0.0 100.0% +66.1%
L2 6.2 0.1 10.3 0.1 100.0% +65.7%
M 5.6 0.2 9.7 0.0 100.0% +72.9%
HT 5.3 0.0 7.2 0.0 100.0% +36.1%
VT 5.3 0.0 7.1 0.0 100.0% +35.7%
R 5.1 0.0 6.8 0.0 100.0% +33.9%
RT 3.1 0.0 3.5 0.0 100.0% +13.7%
outrev_n_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 9.8 0.0 15.3 0.1 100.0% +55.4%
L2 9.5 0.1 15.0 0.2 100.0% +57.4%
M 8.6 0.0 13.6 0.0 100.0% +57.8%
HT 8.1 0.0 10.9 0.0 100.0% +34.5%
VT 8.0 0.0 10.8 0.0 100.0% +34.0%
R 7.8 0.0 10.3 0.0 100.0% +32.6%
RT 5.1 0.1 5.8 0.1 100.0% +13.3%
outrev_n_8_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 6.3 0.0 11.0 0.1 100.0% +73.8%
L2 6.4 0.0 11.0 0.1 100.0% +73.1%
M 5.4 0.2 10.2 0.0 100.0% +87.7%
HT 5.4 0.0 7.6 0.0 100.0% +40.4%
VT 5.4 0.0 7.5 0.0 100.0% +40.0%
R 5.2 0.0 7.2 0.0 100.0% +38.3%
RT 3.1 0.0 3.6 0.1 100.0% +15.9%
---
pixman/pixman-arm-simd-asm.S | 376 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 8 +
2 files changed, 384 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 0c13a73..15eabe2 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -2048,3 +2048,379 @@ generate_composite_function_single_scanline \
add_8888_8888_8888_process_tail
/******************************************************************************/
+
+.macro inout_init /* init hook given to generate_composite_function_single_scanline: sets the GE flags and biases pointers onto alpha bytes */
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ msr CPSR_s, #0x50000 /* 0x50000 = bits 16 and 18, i.e. GE0 and GE2 set, GE1 and GE3 clear */
+ /* Point at alpha byte in source / mask / dest */
+ .if REVERSE
+ add SRC, SRC, #3 /* reverse variants only byte-load the source alpha (see inout_reverse_process_head) */
+ .endif
+ .if WITH_MASK
+ add MASK, MASK, #3 /* the mask is only ever read with byte loads of its alpha */
+ .endif
+ .if !REVERSE
+ add DST, DST, #3 /* forward variants byte-load the destination alpha; their stores undo this bias with negative offsets */
+ .endif
+ line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W /* these registers are reused below as constants, mask alphas and a temporary; presumably this asks the framework to preserve them per line - confirm against line_saved_regs */
+.endm
+
+.macro inout_newline /* newline hook given to generate_composite_function_single_scanline: loads the two constants used by the pixel macros */
+ ldr Y, =0x00800080 /* 0x80 in each 16-bit lane; passed as 'half' (rounding term) to the pixel macros */
+ mov STRIDE_D, #0xff /* passed as 'ff' to the pixel macros and used for the all-0xff alpha test */
+.endm
+
+.macro inout_1pixel s, m, d, tmp, half, ff, offset /* one forward IN/OUT pixel: every byte of s is scaled by m (if WITH_MASK) and by d (complemented for OUT), then stored at [DST, #offset]; clobbers s and tmp, and d for OUT */
+ uxtb16 tmp, s /* tmp = bytes 0 and 2 of s, one per 16-bit lane */
+ .if !IN_NOT_OUT
+ bic d, ff, d /* OUT: d = ff AND NOT d, i.e. the complemented alpha byte when ff is 0xff */
+ .endif
+ uxtb16 s, s, ror #8 /* s = bytes 1 and 3 of s, one per 16-bit lane */
+ .if WITH_MASK
+ mla tmp, tmp, m, half /* each lane = byte * m + rounding term (m is a single byte loaded with ldrb by the callers) */
+ mla s, s, m, half
+ uxtab16 tmp, tmp, tmp, ror #8 /* each lane: t += t >> 8 ... */
+ uxtab16 s, s, s, ror #8
+ uxtb16 tmp, tmp, ror #8 /* ... then keep only the high byte of each lane, approximating t / 255 */
+ uxtb16 s, s, ror #8
+ .endif
+ mla tmp, tmp, d, half /* same multiply-and-round sequence, this time by the alpha in d */
+ mla s, s, d, half
+ uxtab16 tmp, tmp, tmp, ror #8
+ uxtab16 s, s, s, ror #8
+ mov tmp, tmp, ror #8 /* move the results for bytes 0/2 down into byte positions 0 and 2 */
+ sel s, tmp, s /* with GE[3:0] = 0101 (set in inout_init): bytes 0/2 come from tmp, bytes 1/3 from s */
+ str s, [DST, #offset]
+.endm
+
+.macro inout_2pixels s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset /* two-pixel version of inout_1pixel with the two computations interleaved; stores both results with one strd at [DST, #offset] */
+ uxtb16 tmp0, s0 /* tmpN = bytes 0 and 2 of sN, one per 16-bit lane */
+ uxtb16 s0, s0, ror #8 /* sN = bytes 1 and 3 of sN, one per 16-bit lane */
+ .if !IN_NOT_OUT
+ bic d0, ff, d0 /* OUT: complement the alpha bytes (ff AND NOT d) */
+ bic d1, ff, d1
+ .endif
+ uxtb16 tmp1, s1
+ uxtb16 s1, s1, ror #8
+ .if WITH_MASK
+ mla tmp0, tmp0, m0, half /* each lane = byte * mask alpha + rounding term */
+ mla s0, s0, m0, half
+ mla tmp1, tmp1, m1, half
+ mla s1, s1, m1, half
+ uxtab16 tmp0, tmp0, tmp0, ror #8 /* each lane: t += t >> 8 ... */
+ uxtab16 s0, s0, s0, ror #8
+ uxtab16 tmp1, tmp1, tmp1, ror #8
+ uxtab16 s1, s1, s1, ror #8
+ uxtb16 tmp0, tmp0, ror #8 /* ... then keep only the high byte of each lane, approximating t / 255 */
+ uxtb16 s0, s0, ror #8
+ uxtb16 tmp1, tmp1, ror #8
+ uxtb16 s1, s1, ror #8
+ .endif
+ mla tmp0, tmp0, d0, half /* same multiply-and-round sequence, by the alphas in d0/d1 */
+ mla s0, s0, d0, half
+ mla tmp1, tmp1, d1, half
+ mla s1, s1, d1, half
+ uxtab16 tmp0, tmp0, tmp0, ror #8
+ uxtab16 s0, s0, s0, ror #8
+ uxtab16 tmp1, tmp1, tmp1, ror #8
+ uxtab16 s1, s1, s1, ror #8
+ mov tmp0, tmp0, ror #8 /* move the results for bytes 0/2 down into byte positions 0 and 2 */
+ mov tmp1, tmp1, ror #8
+ sel s0, tmp0, s0 /* with GE[3:0] = 0101 (set in inout_init): bytes 0/2 from tmpN, bytes 1/3 from sN */
+ sel s1, tmp1, s1
+ strd s0, s1, [DST, #offset]
+.endm
+
+.macro inout_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* process_head hook of the forward (REVERSE = 0) variants: loads source pixels and mask / destination alpha bytes; for 16 bytes it also computes and stores the first pixel pair */
+ .if numbytes >= 8
+ pixld , 8, 0, SRC, unaligned_src /* presumably loads two source pixels into WK0/WK1, which inout_2pixels then consumes - confirm against pixld */
+ .if WITH_MASK
+ ldrb STRIDE_S, [MASK], #4 /* mask alpha byte (MASK was biased by 3 in inout_init), then step one pixel */
+ ldrb STRIDE_M, [MASK], #4
+ .endif
+ ldrb WK2, [DST], #4 /* destination alpha byte (DST was biased by 3 in inout_init), then step one pixel */
+ ldrb WK3, [DST], #4
+ .if numbytes == 16
+ inout_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -11 /* DST is now 3 (bias) + 8 (two post-increments) past the start of this pair, hence -11 */
+ pixld , 8, 0, SRC, unaligned_src /* second pair: same loads again; inout_process_tail does the arithmetic */
+ .if WITH_MASK
+ ldrb STRIDE_S, [MASK], #4
+ ldrb STRIDE_M, [MASK], #4
+ .endif
+ ldrb WK2, [DST], #4
+ ldrb WK3, [DST], #4
+ .endif
+ .else // numbytes == 4
+ pixld , 4, 0, SRC, unaligned_src /* presumably loads one source pixel into WK0 - confirm against pixld */
+ .if WITH_MASK
+ ldrb STRIDE_S, [MASK], #4
+ .endif
+ ldrb WK2, [DST], #4
+ .endif
+.endm
+
+.macro inout_process_tail cond, numbytes, firstreg /* process_tail hook of the forward variants: combines and stores what inout_process_head loaded last */
+ .if numbytes >= 8
+ inout_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -11 /* -11 = -(3 bias + 8 bytes of post-increment): start of the pixel pair */
+ .else // numbytes == 4
+ inout_1pixel WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -7 /* -7 = -(3 bias + 4 bytes of post-increment): start of the pixel */
+ .endif
+.endm
+
+.macro inout_reverse_1pixel s, m, d, tmp, half, ff, offset /* one reverse IN/OUT pixel: every byte of d is scaled by the alpha in s (first scaled by m if WITH_MASK, complemented for OUT), then stored at [DST, #offset]; clobbers s, d and tmp */
+ .if WITH_MASK
+ smlabb s, s, m, half /* s = s * m + half (32-bit accumulate); only bits 8-15 of the result are kept below */
+ uxtb16 tmp, d /* tmp = bytes 0 and 2 of d, one per 16-bit lane */
+ uxtb16 d, d, ror #8 /* d = bytes 1 and 3 of d, one per 16-bit lane */
+ uxtab s, s, s, ror #8 /* s += (s >> 8) & 0xff */
+ .if IN_NOT_OUT
+ and s, ff, s, lsr #8 /* IN: masked alpha = ff AND (s >> 8) */
+ .else
+ bic s, ff, s, lsr #8 /* OUT: complemented masked alpha = ff AND NOT (s >> 8) */
+ .endif
+ .else
+ .if !IN_NOT_OUT
+ bic s, ff, s /* OUT: complement the alpha byte (ff AND NOT s) */
+ .endif
+ uxtb16 tmp, d
+ uxtb16 d, d, ror #8
+ .endif
+ mla tmp, tmp, s, half /* each lane = byte * alpha + rounding term */
+ mla d, d, s, half
+ uxtab16 tmp, tmp, tmp, ror #8 /* each lane: t += t >> 8; the high byte of the lane then approximates t / 255 */
+ uxtab16 d, d, d, ror #8
+ mov tmp, tmp, ror #8 /* move the results for bytes 0/2 down into byte positions 0 and 2 */
+ sel d, tmp, d /* with GE[3:0] = 0101 (set in inout_init): bytes 0/2 from tmp, bytes 1/3 from d */
+ str d, [DST, #offset]
+.endm
+
+.macro inout_reverse_2pixels s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset /* two-pixel version of inout_reverse_1pixel with the computations interleaved; stores both results with one strd at [DST, #offset] */
+ .if WITH_MASK
+ smlabb s0, s0, m0, half /* sN = sN * mN + half (32-bit accumulate); only bits 8-15 of the result are kept below */
+ smlabb s1, s1, m1, half
+ uxtb16 tmp0, d0 /* tmp0 = bytes 0 and 2 of d0, one per 16-bit lane */
+ uxtb16 d0, d0, ror #8 /* d0 = bytes 1 and 3 of d0, one per 16-bit lane */
+ uxtab s0, s0, s0, ror #8 /* sN += (sN >> 8) & 0xff */
+ uxtab s1, s1, s1, ror #8
+ .if IN_NOT_OUT
+ and s0, ff, s0, lsr #8 /* IN: masked alpha = ff AND (sN >> 8) */
+ and s1, ff, s1, lsr #8
+ .else
+ bic s0, ff, s0, lsr #8 /* OUT: complemented masked alpha = ff AND NOT (sN >> 8) */
+ bic s1, ff, s1, lsr #8
+ .endif
+ .else
+ .if !IN_NOT_OUT
+ bic s0, ff, s0 /* OUT: complement the alpha bytes (ff AND NOT sN) */
+ bic s1, ff, s1
+ .endif
+ uxtb16 tmp0, d0
+ uxtb16 d0, d0, ror #8
+ .endif
+ uxtb16 tmp1, d1 /* same byte split for the second destination pixel */
+ uxtb16 d1, d1, ror #8
+ mla tmp0, tmp0, s0, half /* each lane = byte * alpha + rounding term */
+ mla d0, d0, s0, half
+ mla tmp1, tmp1, s1, half
+ mla d1, d1, s1, half
+ uxtab16 tmp0, tmp0, tmp0, ror #8 /* each lane: t += t >> 8; the high byte of the lane then approximates t / 255 */
+ uxtab16 d0, d0, d0, ror #8
+ uxtab16 tmp1, tmp1, tmp1, ror #8
+ uxtab16 d1, d1, d1, ror #8
+ mov tmp0, tmp0, ror #8 /* move the results for bytes 0/2 down into byte positions 0 and 2 */
+ mov tmp1, tmp1, ror #8
+ sel d0, tmp0, d0 /* with GE[3:0] = 0101 (set in inout_init): bytes 0/2 from tmpN, bytes 1/3 from dN */
+ sel d1, tmp1, d1
+ strd d0, d1, [DST, #offset]
+.endm
+
+.macro inout_reverse_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* process_head hook of the REVERSE variants: loads source / mask alpha bytes and whole destination pixels; the 16-byte case can skip the block entirely and otherwise processes its first pixel pair here */
+ .if numbytes == 16
+ ldrb WK0, [SRC], #4 /* four source alpha bytes (SRC was biased by 3 in inout_init) */
+ ldrb WK1, [SRC], #4
+ ldrb SCRATCH, [SRC], #4
+ ldrb ORIG_W, [SRC], #4
+ .if IN_NOT_OUT
+ and WK2, WK0, WK1
+ and WK2, WK2, SCRATCH
+ and WK2, WK2, ORIG_W
+ bics WK2, STRIDE_D, WK2 /* STRIDE_D is 0xff (inout_newline): Z is set only if all four source alphas are 0xff */
+ .else
+ orr WK2, WK0, WK1
+ orr WK2, WK2, SCRATCH
+ orrs WK2, WK2, ORIG_W /* Z is set only if all four source alphas are 0 */
+ .endif
+ .if WITH_MASK
+ ldrb STRIDE_S, [MASK], #4 /* four mask alpha bytes; the loads leave the flags from the test above intact */
+ ldrb STRIDE_M, [MASK], #4
+ ldrb SCRATCH, [MASK], #4
+ ldrb ORIG_W, [MASK], #4
+ .endif
+ ldrd WK2, WK3, [DST], #16 /* first two destination pixels; DST advances past all four */
+ .if WITH_MASK
+ bne 10f /* source alphas fail the test: the pixels must be processed */
+ .if IN_NOT_OUT
+ and SCRATCH, SCRATCH, ORIG_W
+ and SCRATCH, SCRATCH, STRIDE_S
+ and SCRATCH, SCRATCH, STRIDE_M
+ bics SCRATCH, STRIDE_D, SCRATCH /* Z is set only if all four mask alphas are 0xff */
+ .else
+ orr SCRATCH, SCRATCH, ORIG_W
+ orr SCRATCH, SCRATCH, STRIDE_S
+ orrs SCRATCH, SCRATCH, STRIDE_M /* Z is set only if all four mask alphas are 0 */
+ .endif
+ .endif
+ beq 20f /* every multiplier would be 0xff, leaving the destination unchanged: jump to label 20 at the end of inout_reverse_process_tail */
+10:
+ inout_reverse_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -16 /* pixels 0-1 at DST - 16; SCRATCH and ORIG_W serve as temporaries, so the values loaded into them above are lost */
+ .if IN_NOT_OUT && !WITH_MASK
+ ldrd WK2, WK3, [DST, #-8] /* destination pixels 2-3; the load is placed first for this variant and last for the others, presumably for scheduling - confirm */
+ .endif
+ ldrb WK0, [SRC, #-8] /* reload source alphas of pixels 2-3 (SRC has already advanced by 16) */
+ ldrb WK1, [SRC, #-4]
+ .if WITH_MASK
+ ldrb STRIDE_S, [MASK, #-8] /* reload mask alphas of pixels 2-3 */
+ ldrb STRIDE_M, [MASK, #-4]
+ .endif
+ .if !(IN_NOT_OUT && !WITH_MASK)
+ ldrd WK2, WK3, [DST, #-8]
+ .endif
+ .elseif numbytes == 8
+ .if IN_NOT_OUT && !WITH_MASK
+ ldrd WK2, WK3, [DST], #8 /* two destination pixels; same variant-dependent placement as in the 16-byte case */
+ .endif
+ ldrb WK0, [SRC], #4 /* two source alpha bytes */
+ ldrb WK1, [SRC], #4
+ .if WITH_MASK
+ ldrb STRIDE_S, [MASK], #4 /* two mask alpha bytes */
+ ldrb STRIDE_M, [MASK], #4
+ .endif
+ .if !(IN_NOT_OUT && !WITH_MASK)
+ ldrd WK2, WK3, [DST], #8
+ .endif
+ .else // numbytes == 4
+ .if IN_NOT_OUT && !WITH_MASK
+ ldr WK2, [DST], #4 /* one destination pixel; same variant-dependent placement as above */
+ .endif
+ ldrb WK0, [SRC], #4 /* one source alpha byte */
+ .if WITH_MASK
+ ldrb STRIDE_S, [MASK], #4 /* one mask alpha byte */
+ .endif
+ .if !(IN_NOT_OUT && !WITH_MASK)
+ ldr WK2, [DST], #4
+ .endif
+ .endif
+.endm
+
+.macro inout_reverse_process_tail cond, numbytes, firstreg /* process_tail hook of the REVERSE variants: combines and stores what inout_reverse_process_head loaded last */
+ .if numbytes >= 8
+ inout_reverse_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -8 /* DST is unbiased here and already past the pair, hence -8 */
+ .else // numbytes == 4
+ inout_reverse_1pixel WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -4 /* DST is already past the pixel, hence -4 */
+ .endif
+20: /* target of the 'beq 20f' in the 16-byte case of inout_reverse_process_head (skips an unchanged block) */
+.endm
+
+.set REVERSE, 0 /* forward variants: the inout_* macros scale the source by the destination alpha */
+.set IN_NOT_OUT, 0 /* OUT: the alpha is complemented (bic) before the multiply */
+.set WITH_MASK, 0 /* no mask: the mask loads and mask multiplies are assembled out */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_asm_armv6, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_process_head, \
+ inout_process_tail
+
+.set WITH_MASK, 1 /* same OUT operation, now with the mask alpha multiplied in */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_mask_asm_armv6, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_process_head, \
+ inout_process_tail
+
+.set IN_NOT_OUT, 1 /* IN: the alpha is used as loaded (no bic); REVERSE is still 0 from the earlier .set */
+.set WITH_MASK, 0 /* no mask: the mask loads and mask multiplies are assembled out */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_in_asm_armv6, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_process_head, \
+ inout_process_tail
+
+.set WITH_MASK, 1 /* same IN operation, now with the mask alpha multiplied in */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_in_mask_asm_armv6, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_process_head, \
+ inout_process_tail
+
+.set REVERSE, 1 /* reverse variants: the inout_reverse_* macros scale the destination by the source alpha */
+.set IN_NOT_OUT, 0 /* OUT: the alpha is complemented (bic) before the multiply */
+.set WITH_MASK, 0 /* no mask: the mask loads and mask multiplies are assembled out */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_armv6, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_reverse_process_head, \
+ inout_reverse_process_tail
+
+.set WITH_MASK, 1 /* same OUT_REVERSE operation, now with the mask alpha multiplied in */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_mask_asm_armv6, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_reverse_process_head, \
+ inout_reverse_process_tail
+
+.set IN_NOT_OUT, 1 /* IN: the alpha is used as loaded (no bic); REVERSE is still 1 from the earlier .set */
+.set WITH_MASK, 0 /* no mask: the mask loads and mask multiplies are assembled out */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_in_reverse_asm_armv6, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_reverse_process_head, \
+ inout_reverse_process_tail
+
+.set WITH_MASK, 1 /* same IN_REVERSE operation, now with the mask alpha multiplied in */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_in_reverse_mask_asm_armv6, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ inout_init, \
+ inout_newline, \
+ nop_macro, /* cleanup */ \
+ inout_reverse_process_head, \
+ inout_reverse_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 69c46c5..7f7d8c0 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -82,6 +82,10 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
uint32_t, uint32_t)
PIXMAN_ARM_BIND_COMBINE_U (armv6, over_reverse)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, in)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, in_reverse)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, out)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse)
PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
void
@@ -309,6 +313,10 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
imp->combine_32[PIXMAN_OP_OVER_REVERSE] = armv6_combine_over_reverse_u;
+ imp->combine_32[PIXMAN_OP_IN] = armv6_combine_in_u;
+ imp->combine_32[PIXMAN_OP_IN_REVERSE] = armv6_combine_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_OUT] = armv6_combine_out_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = armv6_combine_out_reverse_u;
imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
imp->blt = arm_simd_blt;
--
1.7.5.4
More information about the Pixman
mailing list