[Pixman] [PATCH 3/3] ARMv6: Add fast path for over_n_8888_8888_ca
Pekka Paalanen
ppaalanen at gmail.com
Mon Mar 31 05:03:45 PDT 2014
From: Ben Avison <bavison at riscosopen.org>
Benchmark results, "before" is the patch
- ARMv6: Add fast path for over_reverse_n_8888,
"after" contains the additional patches:
- ARM: share pixman_asm_function definition
- ARMv6: Support for very variable-hungry composite operations
- ARMv6: Add fast path for over_n_8888_8888_ca (this patch)
lowlevel-blt-bench, over_n_8888_8888_ca, 100 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
L1 2.7 0.0 16.0 0.0 100.00% +495.0%
L2 2.4 0.0 14.3 0.2 100.00% +497.7%
M 2.3 0.0 14.8 0.0 100.00% +528.6%
HT 2.2 0.0 9.6 0.0 100.00% +341.4%
VT 2.2 0.0 9.4 0.0 100.00% +331.7%
R 2.2 0.0 9.4 0.0 100.00% +327.3%
RT 1.9 0.0 5.3 0.1 100.00% +181.5%
At most 3 outliers rejected per case per set.
cairo-perf-trace with trimmed traces, 30 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
t-firefox-talos-gfx.trace 32.9 0.4 25.4 0.4 100.00% +29.6%
t-firefox-scrolling.trace 31.2 0.1 24.6 0.1 100.00% +26.7%
t-gnome-terminal-vim.trace 22.2 0.1 19.8 0.2 100.00% +11.7%
t-firefox-planet-gnome.trace 11.5 0.0 10.9 0.0 100.00% +6.4%
t-evolution.trace 13.8 0.1 13.0 0.1 100.00% +5.9%
t-gvim.trace 33.5 0.2 33.0 0.2 100.00% +1.3%
t-xfce4-terminal-a1.trace 4.8 0.0 4.8 0.0 100.00% +1.1%
t-poppler-reseau.trace 22.4 0.1 22.1 0.1 100.00% +1.0%
t-firefox-talos-svg.trace 20.5 0.1 20.4 0.0 100.00% +0.7%
t-gnome-system-monitor.trace 17.2 0.0 17.1 0.0 100.00% +0.6%
t-swfdec-giant-steps.trace 14.9 0.0 14.8 0.0 100.00% +0.6%
t-midori-zoomed.trace 8.0 0.0 8.0 0.0 100.00% +0.5%
t-firefox-paintball.trace 18.0 0.0 17.9 0.0 100.00% +0.5%
t-firefox-canvas.trace 18.0 0.0 17.9 0.0 100.00% +0.3%
t-firefox-asteroids.trace 11.1 0.0 11.1 0.0 100.00% +0.3%
t-firefox-fishbowl.trace 21.2 0.0 21.1 0.0 100.00% +0.3%
t-chromium-tabs.trace 4.9 0.0 4.9 0.0 95.59% +0.3% (insignificant)
t-poppler.trace 9.7 0.0 9.7 0.1 92.48% +0.2% (insignificant)
t-firefox-canvas-swscroll.trace 32.1 0.1 32.1 0.1 76.28% +0.1% (insignificant)
t-firefox-fishtank.trace 13.2 0.0 13.2 0.0 82.91% +0.0% (insignificant)
t-swfdec-youtube.trace 7.8 0.0 7.8 0.0 16.82% +0.0% (insignificant)
t-firefox-chalkboard.trace 36.6 0.0 36.6 0.0 100.00% -0.1%
t-grads-heat-map.trace 4.4 0.0 4.4 0.0 99.95% -0.6%
t-firefox-particles.trace 27.3 0.2 27.5 0.1 100.00% -0.6%
t-firefox-canvas-alpha.trace 20.5 0.3 20.7 0.3 97.72% -0.8% (insignificant)
At most 6 outliers rejected per case per set.
Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).
Confidence is based on Welch's t-test. Absolute changes less than 1%
can be accounted as measurement errors, even if statistically
significant.
v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Use pixman_asm_function instead of startfunc.
Rebased. Re-benchmarked on Raspberry Pi.
Commit message.
---
pixman/pixman-arm-simd-asm.S | 265 +++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 8 ++
2 files changed, 273 insertions(+)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index dd6f788..7bb18cb 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -37,6 +37,7 @@
.altmacro
.p2align 2
+#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
/* A head macro should do all processing which results in an output of up to
@@ -689,3 +690,267 @@ generate_composite_function \
/******************************************************************************/
+.macro over_white_8888_8888_ca_init
+ HALF .req SRC
+ TMP0 .req STRIDE_D
+ TMP1 .req STRIDE_S
+ TMP2 .req STRIDE_M
+ TMP3 .req ORIG_W
+ WK4 .req SCRATCH
+ line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
+ ldr SCRATCH, =0x800080
+ mov HALF, #0x80
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro over_white_8888_8888_ca_cleanup
+ .unreq HALF
+ .unreq TMP0
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq WK4
+.endm
+
+.macro over_white_8888_8888_ca_combine m, d
+ uxtb16 TMP1, TMP0 /* rb_notmask */
+ uxtb16 TMP2, d /* rb_dest; 1 stall follows */
+ smlatt TMP3, TMP2, TMP1, HALF /* red */
+ smlabb TMP2, TMP2, TMP1, HALF /* blue */
+ uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
+ uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
+ smlatt d, TMP1, TMP0, HALF /* alpha */
+ smlabb TMP1, TMP1, TMP0, HALF /* green */
+ pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+ pkhbt TMP1, TMP1, d, lsl #16 /* ag */
+ uxtab16 TMP0, TMP0, TMP0, ror #8
+ uxtab16 TMP1, TMP1, TMP1, ror #8
+ mov TMP0, TMP0, ror #8
+ sel d, TMP0, TMP1
+ uqadd8 d, d, m /* d is a late result */
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_head
+ pixld , 4, 1, MASK, 0
+ pixld , 4, 3, DST, 0
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_tail
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ bcc 03f
+ mov WK3, WK1
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: pixst , 4, 3, DST
+03:
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_head
+ pixld , 8, 1, MASK, 0
+ pixld , 8, 3, DST
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_tail
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ movcs WK3, WK1
+ bcs 02f
+ teq WK2, #0
+ beq 05f
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: mvn TMP0, WK2
+ teq WK2, WK2, asr #32
+ bne 03f
+ movcs WK4, WK2
+ b 04f
+03: over_white_8888_8888_ca_combine WK2, WK4
+04: pixst , 8, 3, DST
+05:
+.endm
+
+.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_head
+ .else
+ .if numbytes == 16
+ over_white_8888_8888_ca_2pixels_head
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+ over_white_8888_8888_ca_2pixels_head
+ .endif
+.endm
+
+.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_tail
+ .else
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
+ 2, /* prefetch distance */ \
+ over_white_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_white_8888_8888_ca_cleanup, \
+ over_white_8888_8888_ca_process_head, \
+ over_white_8888_8888_ca_process_tail
+
+
+.macro over_n_8888_8888_ca_init
+ /* Set up constants. RB_SRC and AG_SRC are in registers;
+ * RB_FLDS, A_SRC, and the two HALF values need to go on the
+ * stack (and the ful SRC value is already there) */
+ ldr SCRATCH, [sp, #ARGS_STACK_OFFSET]
+ mov WK0, #0x00FF0000
+ orr WK0, WK0, #0xFF /* RB_FLDS (0x00FF00FF) */
+ mov WK1, #0x80 /* HALF default value */
+ mov WK2, SCRATCH, lsr #24 /* A_SRC */
+ orr WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
+ push {WK0-WK3}
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
+ uxtb16 SRC, SCRATCH
+ uxtb16 STRIDE_S, SCRATCH, ror #8
+
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, WK3, WK3
+
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ WK0 .req Y
+ WK1 .req STRIDE_D
+ RB_SRC .req SRC
+ AG_SRC .req STRIDE_S
+ WK2 .req STRIDE_M
+ RB_FLDS .req r8 /* the reloaded constants have to be at consecutive registers starting at an even one */
+ A_SRC .req r8
+ HALF .req r9
+ WK3 .req r10
+ WK4 .req r11
+ WK5 .req SCRATCH
+ WK6 .req ORIG_W
+
+ line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8888_8888_ca_cleanup
+ add sp, sp, #16
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
+
+ .unreq WK0
+ .unreq WK1
+ .unreq RB_SRC
+ .unreq AG_SRC
+ .unreq WK2
+ .unreq RB_FLDS
+ .unreq A_SRC
+ .unreq HALF
+ .unreq WK3
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ WK0 .req r8
+ WK1 .req r9
+ WK2 .req r10
+ WK3 .req r11
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_head
+ pixld , 4, 6, MASK, 0
+ pixld , 4, 0, DST, 0
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_tail
+ ldrd A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
+ uxtb16 WK1, WK6 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
+ teq WK6, WK6, asr #32 /* Zc if transparent, ZC if opaque */
+ bne 20f
+ bcc 40f
+ /* Mask is fully opaque (all channels) */
+ ldr WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
+ eors A_SRC, A_SRC, #0xFF
+ bne 10f
+ /* Source is also opaque - same as src_8888_8888 */
+ mov WK0, WK6
+ b 30f
+10: /* Same as over_8888_8888 */
+ mul_8888_8 WK0, A_SRC, WK5, HALF
+ uqadd8 WK0, WK0, WK6
+ b 30f
+20: /* No simplifications possible - do it the hard way */
+ uxtb16 WK2, WK6, ror #8 /* ag_mask */
+ mla WK3, WK1, A_SRC, HALF /* rb_mul; 2 cycles */
+ mla WK4, WK2, A_SRC, HALF /* ag_mul; 2 cycles */
+ ldrd RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
+ uxtb16 WK5, WK0 /* rb_dest */
+ uxtab16 WK3, WK3, WK3, ror #8
+ uxtb16 WK6, WK0, ror #8 /* ag_dest */
+ uxtab16 WK4, WK4, WK4, ror #8
+ smlatt WK0, RB_SRC, WK1, HALF /* red1 */
+ smlabb WK1, RB_SRC, WK1, HALF /* blue1 */
+ bic WK3, RB_FLDS, WK3, lsr #8
+ bic WK4, RB_FLDS, WK4, lsr #8
+ pkhbt WK1, WK1, WK0, lsl #16 /* rb1 */
+ smlatt WK0, WK5, WK3, HALF /* red2 */
+ smlabb WK3, WK5, WK3, HALF /* blue2 */
+ uxtab16 WK1, WK1, WK1, ror #8
+ smlatt WK5, AG_SRC, WK2, HALF /* alpha1 */
+ pkhbt WK3, WK3, WK0, lsl #16 /* rb2 */
+ smlabb WK0, AG_SRC, WK2, HALF /* green1 */
+ smlatt WK2, WK6, WK4, HALF /* alpha2 */
+ smlabb WK4, WK6, WK4, HALF /* green2 */
+ pkhbt WK0, WK0, WK5, lsl #16 /* ag1 */
+ uxtab16 WK3, WK3, WK3, ror #8
+ pkhbt WK4, WK4, WK2, lsl #16 /* ag2 */
+ uxtab16 WK0, WK0, WK0, ror #8
+ uxtab16 WK4, WK4, WK4, ror #8
+ mov WK1, WK1, ror #8
+ mov WK3, WK3, ror #8
+ sel WK2, WK1, WK0 /* recombine source*mask */
+ sel WK1, WK3, WK4 /* recombine dest*(1-source_alpha*mask) */
+ uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
+30: /* The destination buffer is already in the L1 cache, so
+ * there's little point in amalgamating writes */
+ pixst , 4, 0, DST
+40:
+.endm
+
+.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .rept (numbytes / 4) - 1
+ over_n_8888_8888_ca_1pixel_head
+ over_n_8888_8888_ca_1pixel_tail
+ .endr
+ over_n_8888_8888_ca_1pixel_head
+.endm
+
+.macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
+ over_n_8888_8888_ca_1pixel_tail
+.endm
+
+pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
+ ldr ip, [sp]
+ cmp ip, #-1
+ beq pixman_composite_over_white_8888_8888_ca_asm_armv6
+ /* else drop through... */
+ .endfunc
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
+ 2, /* prefetch distance */ \
+ over_n_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_n_8888_8888_ca_cleanup, \
+ over_n_8888_8888_ca_process_head, \
+ over_n_8888_8888_ca_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 8fbc439..dd6b907 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -56,6 +56,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca,
+ uint32_t, 1, uint32_t, 1)
+
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
uint16_t, uint16_t)
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
@@ -238,6 +241,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
--
1.8.3.2
More information about the Pixman
mailing list