[Pixman] [PATCH v5 2/4] ARMv6: Add fast path for over_n_8888_8888_ca
Pekka Paalanen
ppaalanen at gmail.com
Wed Apr 9 06:25:30 PDT 2014
From: Ben Avison <bavison at riscosopen.org>
Benchmark results, "before" is
* upstream/master 4b76bbfda670f9ede67d0449f3640605e1fc4df0
"after" contains the additional patches on top:
+ ARMv6: Support for very variable-hungry composite operations
+ ARMv6: Add fast path for over_n_8888_8888_ca (this patch)
lowlevel-blt-bench, over_n_8888_8888_ca, 100 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
L1 2.7 0.00 16.1 0.06 100.00% +500.7%
L2 2.4 0.01 14.1 0.15 100.00% +489.9%
M 2.3 0.00 14.3 0.01 100.00% +510.2%
HT 2.2 0.00 9.7 0.03 100.00% +345.0%
VT 2.2 0.00 9.4 0.02 100.00% +333.4%
R 2.2 0.01 9.5 0.03 100.00% +331.6%
RT 1.9 0.01 5.5 0.07 100.00% +192.7%
At most 1 outliers rejected per test per set.
cairo-perf-trace with trimmed traces, 30 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
t-firefox-talos-gfx.trace 33.1 0.42 25.8 0.44 100.00% +28.6%
t-firefox-scrolling.trace 31.4 0.11 24.8 0.12 100.00% +26.3%
t-gnome-terminal-vim.trace 22.4 0.10 19.9 0.14 100.00% +12.5%
t-evolution.trace 13.9 0.07 13.0 0.05 100.00% +6.5%
t-firefox-planet-gnome.trace 11.6 0.02 10.9 0.02 100.00% +6.5%
t-gvim.trace 34.0 0.21 33.2 0.21 100.00% +2.4%
t-chromium-tabs.trace 4.9 0.02 4.9 0.02 100.00% +1.0%
t-poppler.trace 9.8 0.05 9.8 0.06 100.00% +0.7%
t-firefox-canvas-swscroll.trace 32.3 0.10 32.2 0.09 100.00% +0.4%
t-firefox-paintball.trace 18.1 0.01 18.0 0.01 100.00% +0.3%
t-poppler-reseau.trace 22.5 0.09 22.4 0.11 99.29% +0.3%
t-firefox-canvas.trace 18.1 0.06 18.0 0.05 99.29% +0.2%
t-xfce4-terminal-a1.trace 4.8 0.01 4.8 0.01 99.77% +0.2%
t-firefox-fishbowl.trace 21.2 0.03 21.2 0.04 100.00% +0.2%
t-gnome-system-monitor.trace 17.3 0.03 17.3 0.03 99.54% +0.1%
t-firefox-asteroids.trace 11.1 0.01 11.1 0.01 100.00% +0.1%
t-midori-zoomed.trace 8.0 0.01 8.0 0.01 99.98% +0.1%
t-grads-heat-map.trace 4.4 0.04 4.4 0.04 34.08% +0.1% (insignificant)
t-firefox-talos-svg.trace 20.6 0.03 20.6 0.04 54.06% +0.0% (insignificant)
t-firefox-fishtank.trace 13.2 0.01 13.2 0.01 52.81% -0.0% (insignificant)
t-swfdec-giant-steps.trace 14.9 0.02 14.9 0.03 85.50% -0.1% (insignificant)
t-firefox-chalkboard.trace 36.6 0.02 36.7 0.03 100.00% -0.2%
t-firefox-canvas-alpha.trace 20.7 0.32 20.7 0.22 55.76% -0.3% (insignificant)
t-swfdec-youtube.trace 7.8 0.02 7.8 0.03 100.00% -0.5%
t-firefox-particles.trace 27.4 0.16 27.5 0.18 99.94% -0.6%
At most 4 outliers rejected per test per set.
Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).
Confidence is based on Welch's t-test. Absolute changes less than 1%
can be accounted as measurement errors, even if statistically
significant.
v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Use pixman_asm_function instead of startfunc.
Rebased. Re-benchmarked on Raspberry Pi.
Commit message.
v5, Ben Avison <bavison at riscosopen.org> :
Fixed the bug exposed in blitters-test 4928372.
15 hours of testing, compared to the 45 minutes to hit
the bug originally.
Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Squash the fix, re-benchmark on Raspberry Pi.
---
pixman/pixman-arm-simd-asm.S | 267 +++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd-asm.h | 10 +-
pixman/pixman-arm-simd.c | 8 ++
3 files changed, 283 insertions(+), 2 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index dd6f788..b6f9a39 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -37,6 +37,7 @@
.altmacro
.p2align 2
+#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
/* A head macro should do all processing which results in an output of up to
@@ -689,3 +690,269 @@ generate_composite_function \
/******************************************************************************/
+.macro over_white_8888_8888_ca_init
+ HALF .req SRC
+ TMP0 .req STRIDE_D
+ TMP1 .req STRIDE_S
+ TMP2 .req STRIDE_M
+ TMP3 .req ORIG_W
+ WK4 .req SCRATCH
+ line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
+ ldr SCRATCH, =0x800080
+ mov HALF, #0x80
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+ .set DST_PRELOAD_BIAS, 8
+.endm
+
+.macro over_white_8888_8888_ca_cleanup
+ .set DST_PRELOAD_BIAS, 0
+ .unreq HALF
+ .unreq TMP0
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq WK4
+.endm
+
+.macro over_white_8888_8888_ca_combine m, d
+ uxtb16 TMP1, TMP0 /* rb_notmask */
+ uxtb16 TMP2, d /* rb_dest; 1 stall follows */
+ smlatt TMP3, TMP2, TMP1, HALF /* red */
+ smlabb TMP2, TMP2, TMP1, HALF /* blue */
+ uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
+ uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
+ smlatt d, TMP1, TMP0, HALF /* alpha */
+ smlabb TMP1, TMP1, TMP0, HALF /* green */
+ pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+ pkhbt TMP1, TMP1, d, lsl #16 /* ag */
+ uxtab16 TMP0, TMP0, TMP0, ror #8
+ uxtab16 TMP1, TMP1, TMP1, ror #8
+ mov TMP0, TMP0, ror #8
+ sel d, TMP0, TMP1
+ uqadd8 d, d, m /* d is a late result */
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_head
+ pixld , 4, 1, MASK, 0
+ pixld , 4, 3, DST, 0
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_tail
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ bcc 03f
+ mov WK3, WK1
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: pixst , 4, 3, DST
+03:
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_head
+ pixld , 8, 1, MASK, 0
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_tail
+ pixld , 8, 3, DST
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ movcs WK3, WK1
+ bcs 02f
+ teq WK2, #0
+ beq 05f
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: mvn TMP0, WK2
+ teq WK2, WK2, asr #32
+ bne 03f
+ movcs WK4, WK2
+ b 04f
+03: over_white_8888_8888_ca_combine WK2, WK4
+04: pixst , 8, 3, DST
+05:
+.endm
+
+.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_head
+ .else
+ .if numbytes == 16
+ over_white_8888_8888_ca_2pixels_head
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+ over_white_8888_8888_ca_2pixels_head
+ .endif
+.endm
+
+.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_tail
+ .else
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
+ 2, /* prefetch distance */ \
+ over_white_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_white_8888_8888_ca_cleanup, \
+ over_white_8888_8888_ca_process_head, \
+ over_white_8888_8888_ca_process_tail
+
+
+.macro over_n_8888_8888_ca_init
+ /* Set up constants. RB_SRC and AG_SRC are in registers;
+ * RB_FLDS, A_SRC, and the two HALF values need to go on the
+ * stack (and the ful SRC value is already there) */
+ ldr SCRATCH, [sp, #ARGS_STACK_OFFSET]
+ mov WK0, #0x00FF0000
+ orr WK0, WK0, #0xFF /* RB_FLDS (0x00FF00FF) */
+ mov WK1, #0x80 /* HALF default value */
+ mov WK2, SCRATCH, lsr #24 /* A_SRC */
+ orr WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
+ push {WK0-WK3}
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
+ uxtb16 SRC, SCRATCH
+ uxtb16 STRIDE_S, SCRATCH, ror #8
+
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, WK3, WK3
+
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ WK0 .req Y
+ WK1 .req STRIDE_D
+ RB_SRC .req SRC
+ AG_SRC .req STRIDE_S
+ WK2 .req STRIDE_M
+ RB_FLDS .req r8 /* the reloaded constants have to be at consecutive registers starting at an even one */
+ A_SRC .req r8
+ HALF .req r9
+ WK3 .req r10
+ WK4 .req r11
+ WK5 .req SCRATCH
+ WK6 .req ORIG_W
+
+ line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8888_8888_ca_cleanup
+ add sp, sp, #16
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
+
+ .unreq WK0
+ .unreq WK1
+ .unreq RB_SRC
+ .unreq AG_SRC
+ .unreq WK2
+ .unreq RB_FLDS
+ .unreq A_SRC
+ .unreq HALF
+ .unreq WK3
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ WK0 .req r8
+ WK1 .req r9
+ WK2 .req r10
+ WK3 .req r11
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_head
+ pixld , 4, 6, MASK, 0
+ pixld , 4, 0, DST, 0
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_tail
+ ldrd A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
+ uxtb16 WK1, WK6 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
+ teq WK6, WK6, asr #32 /* Zc if transparent, ZC if opaque */
+ bne 20f
+ bcc 40f
+ /* Mask is fully opaque (all channels) */
+ ldr WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
+ eors A_SRC, A_SRC, #0xFF
+ bne 10f
+ /* Source is also opaque - same as src_8888_8888 */
+ mov WK0, WK6
+ b 30f
+10: /* Same as over_8888_8888 */
+ mul_8888_8 WK0, A_SRC, WK5, HALF
+ uqadd8 WK0, WK0, WK6
+ b 30f
+20: /* No simplifications possible - do it the hard way */
+ uxtb16 WK2, WK6, ror #8 /* ag_mask */
+ mla WK3, WK1, A_SRC, HALF /* rb_mul; 2 cycles */
+ mla WK4, WK2, A_SRC, HALF /* ag_mul; 2 cycles */
+ ldrd RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
+ uxtb16 WK5, WK0 /* rb_dest */
+ uxtab16 WK3, WK3, WK3, ror #8
+ uxtb16 WK6, WK0, ror #8 /* ag_dest */
+ uxtab16 WK4, WK4, WK4, ror #8
+ smlatt WK0, RB_SRC, WK1, HALF /* red1 */
+ smlabb WK1, RB_SRC, WK1, HALF /* blue1 */
+ bic WK3, RB_FLDS, WK3, lsr #8
+ bic WK4, RB_FLDS, WK4, lsr #8
+ pkhbt WK1, WK1, WK0, lsl #16 /* rb1 */
+ smlatt WK0, WK5, WK3, HALF /* red2 */
+ smlabb WK3, WK5, WK3, HALF /* blue2 */
+ uxtab16 WK1, WK1, WK1, ror #8
+ smlatt WK5, AG_SRC, WK2, HALF /* alpha1 */
+ pkhbt WK3, WK3, WK0, lsl #16 /* rb2 */
+ smlabb WK0, AG_SRC, WK2, HALF /* green1 */
+ smlatt WK2, WK6, WK4, HALF /* alpha2 */
+ smlabb WK4, WK6, WK4, HALF /* green2 */
+ pkhbt WK0, WK0, WK5, lsl #16 /* ag1 */
+ uxtab16 WK3, WK3, WK3, ror #8
+ pkhbt WK4, WK4, WK2, lsl #16 /* ag2 */
+ uxtab16 WK0, WK0, WK0, ror #8
+ uxtab16 WK4, WK4, WK4, ror #8
+ mov WK1, WK1, ror #8
+ mov WK3, WK3, ror #8
+ sel WK2, WK1, WK0 /* recombine source*mask */
+ sel WK1, WK3, WK4 /* recombine dest*(1-source_alpha*mask) */
+ uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
+30: /* The destination buffer is already in the L1 cache, so
+ * there's little point in amalgamating writes */
+ pixst , 4, 0, DST
+40:
+.endm
+
+.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .rept (numbytes / 4) - 1
+ over_n_8888_8888_ca_1pixel_head
+ over_n_8888_8888_ca_1pixel_tail
+ .endr
+ over_n_8888_8888_ca_1pixel_head
+.endm
+
+.macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
+ over_n_8888_8888_ca_1pixel_tail
+.endm
+
+pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
+ ldr ip, [sp]
+ cmp ip, #-1
+ beq pixman_composite_over_white_8888_8888_ca_asm_armv6
+ /* else drop through... */
+ .endfunc
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
+ 2, /* prefetch distance */ \
+ over_n_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_n_8888_8888_ca_cleanup, \
+ over_n_8888_8888_ca_process_head, \
+ over_n_8888_8888_ca_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 852a113..1bb8b45 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -80,6 +80,12 @@
.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
/*
+ * Number of bytes by which to adjust preload offset of destination
+ * buffer (allows preload instruction to be moved before the load(s))
+ */
+.set DST_PRELOAD_BIAS, 0
+
+/*
* Offset into stack where mask and source pointer/stride can be accessed.
*/
#ifdef DEBUG_PARAMS
@@ -462,11 +468,11 @@
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 8fbc439..dd6b907 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -56,6 +56,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca,
+ uint32_t, 1, uint32_t, 1)
+
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
uint16_t, uint16_t)
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
@@ -238,6 +241,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
--
1.8.3.2
More information about the Pixman
mailing list