[Pixman] [PATCH 09/12] ARMv6: Add fast path for in_reverse_8888_8888
Tomeu Vizoso
tomeu at tomeuvizoso.net
Wed Mar 12 01:25:04 PDT 2014
From: Ben Avison <bavison at riscosopen.org>
lowlevel-blt-bench results:

        Before            After
        Mean   StdDev     Mean   StdDev    Confidence   Change
L1      21.3   0.1        32.5   0.2       100.0%       +52.1%
L2      12.1   0.2        19.5   0.5       100.0%       +61.2%
M       11.0   0.0        17.1   0.0       100.0%       +54.6%
HT       8.7   0.0        12.8   0.1       100.0%       +46.9%
VT       8.6   0.0        12.5   0.1       100.0%       +46.0%
R        8.6   0.0        12.0   0.1       100.0%       +40.6%
RT       5.1   0.1         6.6   0.1       100.0%       +28.8%

Trimmed cairo-perf-trace results:

                       Before         After
                       Mean  StdDev   Mean  StdDev   Confidence   Change
t-firefox-paintball    17.7  0.1      14.2  0.1      100.0%       +24.5%
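
The operation being accelerated is Porter-Duff IN reverse: each destination
pixel is scaled by the source pixel's alpha, and the source colour channels
are never read (which is why the init macro below advances SRC by 3 so that
only alpha bytes are fetched). As a reference for reviewers, here is a
minimal C model of the per-pixel arithmetic; the helper name is illustrative
and the code is not part of the patch:

#include <stdint.h>

/* dest = dest * src_alpha, applied to all four channels of an
 * a8r8g8b8 pixel.  Rounding is t = x * a + 0x80;
 * result = (t + (t >> 8)) >> 8, which is what the MLA/UXTAB16 pairs
 * below compute two channels at a time, with MASK = 0x00800080 held
 * as the loop-invariant rounding constant. */
static uint32_t
in_reverse_pixel (uint32_t src, uint32_t dst)
{
    uint32_t a  = src >> 24;                /* only source alpha is used */
    uint32_t rb = dst & 0x00ff00ffu;        /* red/blue lanes (UXTB16) */
    uint32_t ag = (dst >> 8) & 0x00ff00ffu; /* alpha/green lanes */

    rb = rb * a + 0x00800080u;
    rb = ((rb + ((rb >> 8) & 0x00ff00ffu)) >> 8) & 0x00ff00ffu;
    ag = ag * a + 0x00800080u;
    ag = ((ag + ((ag >> 8) & 0x00ff00ffu)) >> 8) & 0x00ff00ffu;

    /* Re-interleave the lanes; the assembly does this with SEL after
     * priming GE[3:0] = 0101 in the init macro. */
    return (ag << 8) | rb;
}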
---
pixman/pixman-arm-simd-asm.S | 104 +++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 8 ++++
2 files changed, 112 insertions(+)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index ac084c4..20ad05a 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -875,6 +875,110 @@ generate_composite_function \
/******************************************************************************/
+.macro in_reverse_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        /* Offset the source pointer: we only need the alpha bytes */
+        add     SRC, SRC, #3
+        line_saved_regs  ORIG_W
+.endm
+
+.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
+        ldrb    ORIG_W, [SRC], #4
+ .if numbytes >= 8
+        ldrb    WK&reg1, [SRC], #4
+  .if numbytes == 16
+        ldrb    WK&reg2, [SRC], #4
+        ldrb    WK&reg3, [SRC], #4
+  .endif
+ .endif
+        add     DST, DST, #numbytes
+.endm
+
+.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+.endm
+
+.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
+ .if is_only != 1
+        movs    s, ORIG_W
+  .if offset != 0
+        ldrb    ORIG_W, [SRC, #offset]
+  .endif
+        beq     01f
+        teq     STRIDE_M, #0xFF
+        beq     02f
+ .endif
+        uxtb16  SCRATCH, d                 /* rb_dest */
+        uxtb16  d, d, ror #8               /* ag_dest */
+        mla     SCRATCH, SCRATCH, s, MASK
+        mla     d, d, s, MASK
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 d, d, d, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     d, SCRATCH, d
+        b       02f
+ .if offset == 0
+48:     /* Last mov d,#0 of the set - used as part of shortcut for
+         * source values all 0 */
+ .endif
+01:     mov     d, #0
+02:
+.endm
+
+.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+        teq     ORIG_W, ORIG_W, asr #32
+        ldrne   WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+        teq     ORIG_W, WK&reg1
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg2}
+ .else
+        teq     ORIG_W, WK&reg1
+        teqeq   ORIG_W, WK&reg2
+        teqeq   ORIG_W, WK&reg3
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg4}
+ .endif
+        cmnne   DST, #0  /* clear C if NE */
+        bcs     49f  /* no writes to dest if source all -1 */
+        beq     48f  /* set dest to all 0 if source all 0 */
+ .if numbytes == 4
+        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1
+        str     WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0
+        stmdb   DST, {WK&reg1-WK&reg2}
+ .else
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0
+        stmdb   DST, {WK&reg1-WK&reg4}
+ .endif
+49:
+.endm
+
+.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
+        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+generate_composite_function \
+    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
+    2, /* prefetch distance */ \
+    in_reverse_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    in_reverse_8888_8888_process_head, \
+    in_reverse_8888_8888_process_tail
+
+/******************************************************************************/
+
#ifdef PROFILING
.p2align 9
#endif
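
The tail macros above also short-circuit two whole-block cases, as noted in
their comments: if every source alpha in the block is 0xFF the destination
is left untouched (no destination loads either, hence FLAG_NO_PRELOAD_DST),
and if every alpha is 0 the block is zero-filled without being read. A C
sketch of that intent for a 16-byte (4-pixel) block, reusing the
in_reverse_pixel model above; the flag juggling in the assembly achieves the
same effect without per-pixel branches, and the names here are illustrative
only:

uint32_t in_reverse_pixel (uint32_t src, uint32_t dst); /* model above */

static void
in_reverse_block4 (const uint32_t *src, uint32_t *dst)
{
    uint32_t a0 = src[0] >> 24, a1 = src[1] >> 24;
    uint32_t a2 = src[2] >> 24, a3 = src[3] >> 24;

    if ((a0 & a1 & a2 & a3) == 0xff)
        return;                     /* source all -1: no writes to dest */

    if ((a0 | a1 | a2 | a3) == 0)
    {
        /* source all 0: dest becomes 0 without being read */
        dst[0] = dst[1] = dst[2] = dst[3] = 0;
        return;
    }

    dst[0] = in_reverse_pixel (src[0], dst[0]);  /* general case */
    dst[1] = in_reverse_pixel (src[1], dst[1]);
    dst[2] = in_reverse_pixel (src[2], dst[2]);
    dst[3] = in_reverse_pixel (src[3], dst[3]);
}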
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index b97c7ec..5a50098 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -47,6 +47,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -235,6 +238,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, x8b8g8r8, armv6_composite_in_reverse_8888_8888),
+
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
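
With the entries above in arm_simd_fast_paths, an ordinary composite request
that matches them (IN_REVERSE, {a,x}8r8g8b8 or {a,x}8b8g8r8, no mask) is
routed to the new assembly automatically on an ARMv6 build with this patch
applied; on other builds the same call takes the general path. A minimal
caller through the public API, with sizes and formats illustrative:

#include <pixman.h>

int
main (void)
{
    /* NULL bits / 0 stride lets pixman allocate the buffers */
    pixman_image_t *src = pixman_image_create_bits (PIXMAN_a8r8g8b8,
                                                    64, 64, NULL, 0);
    pixman_image_t *dst = pixman_image_create_bits (PIXMAN_a8r8g8b8,
                                                    64, 64, NULL, 0);

    /* dest = dest IN src: destination scaled by source alpha */
    pixman_image_composite32 (PIXMAN_OP_IN_REVERSE, src, NULL, dst,
                              0, 0, 0, 0, 0, 0, 64, 64);

    pixman_image_unref (src);
    pixman_image_unref (dst);
    return 0;
}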
--
1.8.5.3