[Pixman] [PATCH] ARMv6: Add fast path for over_reverse_n_8888
Pekka Paalanen
ppaalanen at gmail.com
Fri Mar 28 02:13:21 PDT 2014
From: Ben Avison <bavison at riscosopen.org>
Benchmark results, "before" is upstream commit
c343846 lowlevel-blt-bench: add in_reverse_8888_8888 test
and "after" is with this patch only added on top.
lowlevel-blt-bench, over_reverse_n_8888, 100 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
L1 15.1 0.1 274.5 2.3 100.00% +1718.9%
L2 12.8 0.3 181.8 0.7 100.00% +1315.5%
M 10.8 0.0 77.9 0.0 100.00% +621.2%
HT 9.7 0.0 29.4 0.2 100.00% +204.9%
VT 9.5 0.0 26.7 0.1 100.00% +179.3%
R 9.3 0.0 25.3 0.1 100.00% +173.6%
RT 6.0 0.1 11.0 0.2 100.00% +82.9%
At most 16 outliers rejected per case per set.
cairo-perf-trace with trimmed traces, 30 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
t-poppler.trace 12.9 0.1 9.7 0.0 100.00% +32.6%
t-firefox-talos-gfx.trace 33.2 0.7 32.9 0.4 95.23% +0.9% (insignificant)
t-firefox-particles.trace 27.4 0.1 27.3 0.2 99.65% +0.4%
t-firefox-canvas-alpha.trace 20.5 0.3 20.5 0.3 57.51% +0.3% (insignificant)
t-poppler-reseau.trace 22.4 0.1 22.4 0.1 95.69% +0.3% (insignificant)
t-firefox-fishtank.trace 13.2 0.0 13.2 0.0 99.84% +0.1%
t-swfdec-giant-steps.trace 14.9 0.0 14.9 0.0 87.68% +0.1% (insignificant)
t-swfdec-youtube.trace 7.8 0.0 7.8 0.0 35.22% +0.1% (insignificant)
t-firefox-planet-gnome.trace 11.5 0.0 11.5 0.0 29.37% +0.0% (insignificant)
t-firefox-fishbowl.trace 21.2 0.0 21.2 0.0 18.09% +0.0% (insignificant)
t-grads-heat-map.trace 4.4 0.0 4.4 0.0 1.84% +0.0% (insignificant)
t-firefox-paintball.trace 18.0 0.0 18.0 0.0 33.43% -0.0% (insignificant)
t-firefox-talos-svg.trace 20.5 0.0 20.5 0.1 68.56% -0.1% (insignificant)
t-midori-zoomed.trace 8.0 0.0 8.0 0.0 99.98% -0.1%
t-firefox-canvas-swscroll.trace 32.1 0.1 32.1 0.1 85.27% -0.1% (insignificant)
t-gnome-system-monitor.trace 17.2 0.0 17.2 0.0 99.97% -0.2%
t-firefox-chalkboard.trace 36.5 0.0 36.6 0.0 100.00% -0.2%
t-firefox-asteroids.trace 11.1 0.0 11.1 0.0 100.00% -0.2%
t-firefox-canvas.trace 17.9 0.0 18.0 0.0 100.00% -0.3%
t-chromium-tabs.trace 4.9 0.0 4.9 0.0 97.95% -0.3% (insignificant)
t-xfce4-terminal-a1.trace 4.8 0.0 4.8 0.0 100.00% -0.4%
t-firefox-scrolling.trace 31.1 0.1 31.2 0.1 100.00% -0.5%
t-evolution.trace 13.7 0.1 13.8 0.1 99.99% -0.6%
t-gnome-terminal-vim.trace 22.0 0.2 22.2 0.1 99.99% -0.7%
t-gvim.trace 33.2 0.2 33.5 0.2 100.00% -0.8%
At most 6 outliers rejected per case per set.
Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).
Changes on the order of +/- 1% can be attributed to measurement error,
even if they are deemed to be statistically significant. This claim is
based on comparing two 30-iteration identical "before" runs using the
exact same binaries, and observing changes from -0.4% to +0.5% with
>=99% confidence.
Confidence is based on Welch's t-test.
v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Rebased, re-benchmarked on Raspberry Pi, commit message.
---
pixman/pixman-arm-simd-asm.S | 78 ++++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 6 ++++
2 files changed, 84 insertions(+)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index c209688..dd6f788 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -611,3 +611,81 @@ generate_composite_function \
/******************************************************************************/
+.macro over_reverse_n_8888_init
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ ldr MASK, =0x00800080
+ /* Split source pixel into RB/AG parts */
+ uxtb16 STRIDE_S, SRC
+ uxtb16 STRIDE_M, SRC, ror #8
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+ line_saved_regs STRIDE_D, ORIG_W
+.endm
+
+.macro over_reverse_n_8888_newline
+ mov STRIDE_D, #0xFF
+.endm
+
+.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld , numbytes, firstreg, DST, 0
+.endm
+
+.macro over_reverse_n_8888_1pixel d, is_only
+ teq WK&d, #0
+ beq 8f /* replace with source */
+ bics ORIG_W, STRIDE_D, WK&d, lsr #24
+ .if is_only == 1
+ beq 49f /* skip store */
+ .else
+ beq 9f /* write same value back */
+ .endif
+ mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
+ mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+ sel ORIG_W, SCRATCH, ORIG_W
+ uqadd8 WK&d, WK&d, ORIG_W
+ b 9f
+8: mov WK&d, SRC
+9:
+.endm
+
+.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+ over_reverse_n_8888_1pixel reg1, 1
+ .else
+ and SCRATCH, WK&reg1, WK&reg2
+ .if numbytes == 16
+ and SCRATCH, SCRATCH, WK&reg3
+ and SCRATCH, SCRATCH, WK&reg4
+ .endif
+ mvns SCRATCH, SCRATCH, asr #24
+ beq 49f /* skip store if all opaque */
+ over_reverse_n_8888_1pixel reg1, 0
+ over_reverse_n_8888_1pixel reg2, 0
+ .if numbytes == 16
+ over_reverse_n_8888_1pixel reg3, 0
+ over_reverse_n_8888_1pixel reg4, 0
+ .endif
+ .endif
+ pixst , numbytes, reg1, DST
+49:
+.endm
+
+.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
+ over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+ 3, /* prefetch distance */ \
+ over_reverse_n_8888_init, \
+ over_reverse_n_8888_newline, \
+ nop_macro, /* cleanup */ \
+ over_reverse_n_8888_process_head, \
+ over_reverse_n_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index af062e1..8fbc439 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -47,6 +47,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
+ uint32_t, 1)
+
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -225,6 +228,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),
+
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
--
1.8.3.2
More information about the Pixman
mailing list