[Pixman] [PATCH 21/32] armv6: Add OVER combiner
Ben Avison
bavison at riscosopen.org
Thu Aug 7 09:50:17 PDT 2014
lowlevel-blt-bench results for two example operations, with and without masks,
neither of which has a dedicated fast path at the time of writing:
over_8888_1555
Before After
Mean StdDev Mean StdDev Confidence Change
L1 7.1 0.0 11.4 0.1 100.0% +60.2%
L2 5.7 0.0 9.9 0.1 100.0% +73.4%
M 5.6 0.0 10.0 0.0 100.0% +79.2%
HT 4.9 0.0 7.6 0.0 100.0% +53.7%
VT 4.9 0.0 7.4 0.0 100.0% +52.3%
R 4.7 0.0 7.2 0.0 100.0% +51.9%
RT 3.1 0.0 4.1 0.0 100.0% +29.8%
over_8888_n_1555
Before After
Mean StdDev Mean StdDev Confidence Change
L1 4.9 0.0 8.6 0.0 100.0% +75.5%
L2 4.4 0.0 8.3 0.1 100.0% +88.7%
M 4.3 0.0 8.3 0.0 100.0% +93.0%
HT 4.0 0.0 6.6 0.0 100.0% +62.9%
VT 4.0 0.0 6.5 0.0 100.0% +61.9%
R 3.9 0.0 6.3 0.0 100.0% +60.9%
RT 2.7 0.0 3.6 0.0 100.0% +33.5%
---
pixman/pixman-arm-simd-asm.S | 140 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 2 +
2 files changed, 142 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 15eabe2..aeb40dc 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -543,6 +543,146 @@ generate_composite_function \
over_8888_8888_process_head, \
over_8888_8888_process_tail
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_asm_armv6, 32, 0, 32, /* NOTE(review): presumably src, mask, dst bits per pixel; 0 = no mask - confirm */ \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ over_8888_8888_init, /* init */ \
+ over_8888_8888_newline, /* newline; trailing comma added to match every other argument line */ \
+ nop_macro, /* cleanup */ \
+ over_8888_8888_process_head, \
+ over_8888_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_8888_init /* Setup macro passed as the init argument of the masked OVER scanline function */
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ msr CPSR_s, #0x50000 /* 0x50000 = bits 16 and 18, i.e. GE[0] and GE[2]: SEL takes bytes 0,2 from its first source and bytes 1,3 from its second */
+ /* Point at alpha byte in mask */
+ add MASK, MASK, #3 /* so each later LDRB [MASK], #4 fetches byte 3 of one mask word */
+ line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W /* NOTE(review): framework macro not visible here; these are the registers the head/tail macros reuse as constants, mask bytes and temporaries - presumably listed so they get spilled */
+.endm
+
+.macro over_8888_8888_8888_newline /* Loads the two constants the pixel macros receive as their half and ff arguments */
+ ldr Y, =0x00800080 /* 0x80 in each 16-bit lane: rounding term added by the MLA instructions (passed as half) */
+ mov STRIDE_D, #0xff /* 255, used to form 255 - alpha (passed as ff) */
+.endm
+
+.macro over_8888_8888_8888_1pixel s, m, d, tmp, half, ff, offset /* d = sat(d * (255 - a) / 255 + s * m / 255) per byte, a = byte 3 of scaled s; stores to [DST, #offset]; clobbers s, m, d, tmp */
+ uxtb16 tmp, s /* tmp = bytes 0 and 2 of s, one per 16-bit lane */
+ uxtb16 s, s, ror #8 /* s = bytes 1 and 3 of s, one per 16-bit lane */
+ mla tmp, tmp, m, half /* both lanes * m + rounding term; m is a byte value at the call sites, so lanes cannot carry into each other */
+ mla s, s, m, half /* same for bytes 1 and 3 */
+ uxtab16 tmp, tmp, tmp, ror #8 /* lane += lane >> 8: second step of the divide-by-255 approximation */
+ uxtab16 s, s, s, ror #8 /* results for bytes 1,3 now sit in bytes 1 and 3 of s */
+ mov tmp, tmp, ror #8 /* move results for bytes 0,2 down into bytes 0 and 2 */
+ sub m, ff, s, lsr #24 /* m = ff - byte 3 of scaled source, i.e. 255 - alpha when ff is 0xff */
+ sel s, tmp, s /* with GE = 0101: bytes 0,2 from tmp, bytes 1,3 from s -> recombined source * mask */
+ uxtb16 tmp, d /* repeat the split / scale / recombine on the destination, scaling by m = 255 - alpha */
+ uxtb16 d, d, ror #8 /* d = bytes 1 and 3 of destination */
+ mla tmp, tmp, m, half /* destination bytes 0,2 * (255 - alpha) + rounding */
+ mla d, d, m, half /* destination bytes 1,3 * (255 - alpha) + rounding */
+ uxtab16 tmp, tmp, tmp, ror #8 /* lane += lane >> 8 */
+ uxtab16 d, d, d, ror #8 /* lane += lane >> 8 */
+ mov tmp, tmp, ror #8 /* results for bytes 0,2 into bytes 0 and 2 */
+ sel d, tmp, d /* recombine scaled destination */
+ uqadd8 d, d, s /* per-byte unsigned saturating add of scaled destination and scaled source */
+ str d, [DST, #offset] /* write the result pixel; DST itself is not modified here */
+.endm
+
+.macro over_8888_8888_8888_2pixels s0, s1, m0, m1, d0, d1, tmp0, tmp1, half, ff, offset /* Two-pixel masked OVER with the two pixels interleaved; stores both results with one STRD; clobbers all s, m, d and tmp registers */
+ uxtb16 tmp0, s0 /* pixel 0: source bytes 0,2 into 16-bit lanes */
+ uxtb16 s0, s0, ror #8 /* pixel 0: source bytes 1,3 into 16-bit lanes */
+ uxtb16 tmp1, s1 /* pixel 1: source bytes 0,2 */
+ uxtb16 s1, s1, ror #8 /* pixel 1: source bytes 1,3 */
+ mla tmp0, tmp0, m0, half /* scale each lane by the mask byte and add the rounding term */
+ mla s0, s0, m0, half
+ mla tmp1, tmp1, m1, half
+ mla s1, s1, m1, half
+ uxtab16 tmp0, tmp0, tmp0, ror #8 /* lane += lane >> 8: second step of the divide-by-255 approximation */
+ uxtab16 s0, s0, s0, ror #8
+ uxtab16 tmp1, tmp1, tmp1, ror #8
+ uxtab16 s1, s1, s1, ror #8
+ mov tmp0, tmp0, ror #8 /* results for bytes 0,2 moved into bytes 0 and 2 */
+ mov tmp1, tmp1, ror #8
+ sub m0, ff, s0, lsr #24 /* m0 = ff - byte 3 of scaled source 0 (255 - alpha when ff is 0xff) */
+ sub m1, ff, s1, lsr #24 /* m1 = ff - byte 3 of scaled source 1 */
+ sel s0, tmp0, s0 /* with GE = 0101: bytes 0,2 from tmp, bytes 1,3 from s */
+ sel s1, tmp1, s1
+ uxtb16 tmp0, d0 /* same split / scale / recombine on the destinations, scaling by m0 and m1 */
+ uxtb16 d0, d0, ror #8
+ uxtb16 tmp1, d1
+ uxtb16 d1, d1, ror #8
+ mla tmp0, tmp0, m0, half
+ mla d0, d0, m0, half
+ mla tmp1, tmp1, m1, half
+ mla d1, d1, m1, half
+ uxtab16 tmp0, tmp0, tmp0, ror #8
+ uxtab16 d0, d0, d0, ror #8
+ uxtab16 tmp1, tmp1, tmp1, ror #8
+ uxtab16 d1, d1, d1, ror #8
+ mov tmp0, tmp0, ror #8
+ mov tmp1, tmp1, ror #8
+ sel d0, tmp0, d0 /* recombined scaled destination 0 */
+ sel d1, tmp1, d1 /* recombined scaled destination 1 */
+ uqadd8 d0, d0, s0 /* per-byte unsigned saturating add: scaled destination + scaled source */
+ uqadd8 d1, d1, s1
+ strd d0, d1, [DST, #offset] /* store both result pixels; DST itself is not modified here */
+.endm
+
+.macro over_8888_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* Loads source, mask alpha and destination for numbytes of pixels; for 16 bytes it also blends and stores the first pair. Only numbytes is used in this body */
+ .if numbytes == 16
+ ldm SRC!, {WK0, WK1, SCRATCH, ORIG_W} /* four source words; SRC advances by 16 */
+ ldrb STRIDE_S, [MASK], #4 /* mask byte for pixel 0; MASK steps one word */
+ ldrb STRIDE_M, [MASK], #4 /* mask byte for pixel 1 */
+ orr WK2, WK0, WK1 /* OR the four source words together ... */
+ orr WK2, WK2, SCRATCH
+ orrs WK2, WK2, ORIG_W /* ... Z set if all four source words are zero */
+ ldrb SCRATCH, [MASK], #4 /* mask byte for pixel 2, overwriting a source word that is reloaded below */
+ ldrb ORIG_W, [MASK], #4 /* mask byte for pixel 3, likewise */
+ ldrd WK2, WK3, [DST], #16 /* destination pixels 0,1; DST advances by 16; flags from ORRS are unchanged */
+ bne 10f /* some source word is non-zero: go and blend */
+ orr SCRATCH, SCRATCH, STRIDE_S /* sources all zero: OR the four mask bytes together ... */
+ orr SCRATCH, SCRATCH, STRIDE_M
+ orrs SCRATCH, SCRATCH, ORIG_W /* ... Z set if all four mask bytes are zero */
+ beq 20f /* sources and masks all zero: skip to label 20 at the end of process_tail, storing nothing. NOTE(review): all-zero sources with a non-zero mask still fall through and are blended - confirm that is intended */
+10:
+ over_8888_8888_8888_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -16 /* blend pixels 0,1 and store at DST-16; SCRATCH and ORIG_W serve as temporaries */
+ ldrd WK0, WK1, [SRC, #-8] /* reload source pixels 2,3 */
+ ldrb STRIDE_S, [MASK, #-8] /* reload mask byte for pixel 2 */
+ ldrb STRIDE_M, [MASK, #-4] /* reload mask byte for pixel 3 */
+ ldrd WK2, WK3, [DST, #-8] /* destination pixels 2,3; this pair is blended by process_tail */
+ .elseif numbytes == 8
+ ldrd WK0, WK1, [SRC], #8 /* two source pixels */
+ ldrb STRIDE_S, [MASK], #4 /* their two mask bytes */
+ ldrb STRIDE_M, [MASK], #4
+ ldrd WK2, WK3, [DST], #8 /* two destination pixels; DST advances by 8 */
+ .else // numbytes == 4
+ ldr WK0, [SRC], #4 /* one source pixel */
+ ldrb STRIDE_S, [MASK], #4 /* its mask byte */
+ ldr WK2, [DST], #4 /* one destination pixel; DST advances by 4 */
+ .endif
+.endm
+
+.macro over_8888_8888_8888_process_tail cond, numbytes, firstreg /* Blends and stores the pixels left in registers by process_head; only numbytes is used in this body */
+ .if numbytes >= 8
+ over_8888_8888_8888_2pixels WK0, WK1, STRIDE_S, STRIDE_M, WK2, WK3, SCRATCH, ORIG_W, Y, STRIDE_D, -8 /* last pair; DST was already advanced, so the store goes to DST-8 */
+ .else // numbytes == 4
+ over_8888_8888_8888_1pixel WK0, STRIDE_S, WK2, SCRATCH, Y, STRIDE_D, -4 /* single pixel, stored at DST-4 */
+ .endif
+20: /* target of the beq 20f early-out in the 16-byte path of process_head */
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_mask_asm_armv6, 32, 32, 32, /* NOTE(review): presumably src, mask, dst bits per pixel - confirm */ \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, /* NOTE(review): unlike the unmasked variant, FLAG_PROCESS_CORRUPTS_PSR is not set although the 16-byte head path executes ORRS - confirm the framework never relies on flags across that path */ \
+ 2, /* prefetch distance */ \
+ over_8888_8888_8888_init, /* init: sets GE bits and points MASK at byte 3 */ \
+ over_8888_8888_8888_newline, /* newline: loads the 0x00800080 and 0xff constants */ \
+ nop_macro, /* cleanup */ \
+ over_8888_8888_8888_process_head, /* loads pixels; blends the first pair of a 16-byte block */ \
+ over_8888_8888_8888_process_tail /* blends and stores the remaining pixels */
+
/******************************************************************************/
.macro over_reverse_8888_8888_init
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 7f7d8c0..b17266e 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -81,6 +81,7 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
uint32_t, uint32_t)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, over)
PIXMAN_ARM_BIND_COMBINE_U (armv6, over_reverse)
PIXMAN_ARM_BIND_COMBINE_U (armv6, in)
PIXMAN_ARM_BIND_COMBINE_U (armv6, in_reverse)
@@ -312,6 +313,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+ imp->combine_32[PIXMAN_OP_OVER] = armv6_combine_over_u;
imp->combine_32[PIXMAN_OP_OVER_REVERSE] = armv6_combine_over_reverse_u;
imp->combine_32[PIXMAN_OP_IN] = armv6_combine_in_u;
imp->combine_32[PIXMAN_OP_IN_REVERSE] = armv6_combine_in_reverse_u;
--
1.7.5.4
More information about the Pixman
mailing list