[Pixman] [PATCH 10/10] ARM: optimization for scaled src_0565_0565 with nearest filter
Siarhei Siamashka
siarhei.siamashka at gmail.com
Wed Nov 3 16:22:25 PDT 2010
From: Siarhei Siamashka <siarhei.siamashka at nokia.com>
The performance improvement is only about 5% when
compared against C code built with a reasonably good compiler
(gcc 4.5.1). However, gcc 4.4 produces approximately 30% slower code
here, so the assembly optimization makes sense: it avoids depending
on compiler quality and/or optimization options.
Benchmark from ARM11:
== before ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=34.86 MPix/s
== after ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=36.62 MPix/s
Benchmark from ARM Cortex-A8:
== before ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s
== after ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=94.91 MPix/s
---
pixman/pixman-arm-simd-asm.S | 70 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 7 ++++
2 files changed, 77 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a3d2d40..7567700 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1,5 +1,6 @@
/*
* Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -328,3 +329,72 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
pop {r4, r5, r6, r7, r8, r9, r10, r11}
bx lr
.endfunc
+
+/*
+ * Note: This function is only using armv4t instructions (not even armv6),
+ * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ * be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ */
+pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+ W .req r0 /* number of pixels still to write */
+ DST .req r1 /* destination pointer, advanced by 2 bytes per stored pixel */
+ SRC .req r2 /* source base pointer, never modified in this function */
+ VX .req r3 /* source position; the source pixel index is VX >> 16 */
+ UNIT_X .req ip /* step added to VX after each pixel */
+ TMP1 .req r4 /* holds a byte offset into SRC, then the pixel loaded from it */
+ TMP2 .req r5 /* same role as TMP1, used for every second pixel */
+ VXMASK .req r6 /* constant ~1, clears bit 0 of a computed byte offset */
+ /* Copies W 16-bit pixels to DST, picking source pixel (VX >> 16) and stepping VX by UNIT_X each time */
+ ldr UNIT_X, [sp] /* UNIT_X comes from the stack; read it before the push below moves sp */
+ push {r4, r5, r6, r7} /* r4-r6 back TMP1/TMP2/VXMASK; r7 is saved but unused here (presumably stack alignment -- confirm) */
+ mvn VXMASK, #1 /* VXMASK = ~1 = 0xfffffffe */
+
+ /* define helper macro: copies 2 pixels; needs TMP1 = byte offset of the next source pixel on entry, and leaves it so on exit */
+ .macro scale_2_pixels
+ ldrh TMP1, [SRC, TMP1] /* load the pixel whose offset was prepared earlier */
+ and TMP2, VXMASK, VX, lsr #15 /* TMP2 = (VX >> 15) & ~1 = (VX >> 16) * 2, offset of the following pixel */
+ add VX, VX, UNIT_X
+ strh TMP1, [DST], #2 /* store the pixel, post-incrementing DST by 2 */
+
+ ldrh TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, lsr #15 /* prepare the offset consumed by the next load through TMP1 */
+ add VX, VX, UNIT_X
+ strh TMP2, [DST], #2
+ .endm
+
+ /* now do the scaling */
+ and TMP1, VXMASK, VX, lsr #15 /* prime TMP1 with the byte offset of the first pixel */
+ add VX, VX, UNIT_X
+ subs W, #4 /* W is kept biased by -4 from here on */
+ blt 2f /* fewer than 4 pixels: skip the main loop */
+1: /* main loop, process 4 pixels per iteration */
+ scale_2_pixels
+ scale_2_pixels
+ subs W, W, #4
+ bge 1b /* repeat while at least 4 more pixels remain */
+2: /* tail: for W >= 0 on entry, W is now in -4..-1 and its two low bits equal the 0..3 pixels left */
+ tst W, #2
+ beq 2f /* bit 1 clear: no pair of pixels left */
+ scale_2_pixels
+2:
+ tst W, #1 /* bit 0 set means one final pixel is left */
+ ldrneh TMP1, [SRC, TMP1] /* executed only when the tst above gave a non-zero result */
+ strneh TMP1, [DST], #2
+ /* cleanup helper macro */
+ .purgem scale_2_pixels
+ .unreq DST
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq VXMASK
+ /* return */
+ pop {r4, r5, r6, r7} /* restore the registers saved on entry */
+ bx lr
+.endfunc
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index d466a31..3b05007 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -29,6 +29,7 @@
#include "pixman-private.h"
#include "pixman-arm-common.h"
+#include "pixman-fast-path.h"
#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
@@ -386,6 +387,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+ uint16_t, uint16_t)
+
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
@@ -404,6 +408,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
{ PIXMAN_OP_NONE },
};
--
1.7.2.2
More information about the Pixman
mailing list