[Pixman] [PATCH 10/10] ARM: optimization for scaled src_0565_0565 with nearest filter

Siarhei Siamashka siarhei.siamashka at gmail.com
Wed Nov 3 16:22:25 PDT 2010


From: Siarhei Siamashka <siarhei.siamashka at nokia.com>

The performance improvement is only in the ballpark of 5% when
compared against C code built with a reasonably good compiler
(gcc 4.5.1). But gcc 4.4 produces approximately 30% slower code
here, so assembly optimization makes sense to avoid dependency
on the compiler quality and/or optimization options.

Benchmark from ARM11:
    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=34.86 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=36.62 MPix/s

Benchmark from ARM Cortex-A8:
    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=94.91 MPix/s
---
 pixman/pixman-arm-simd-asm.S |   70 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    7 ++++
 2 files changed, 77 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a3d2d40..7567700 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -328,3 +329,72 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
 	bx	lr
 .endfunc
+
+/*
+ * Note: This function uses only armv4t instructions (not even armv6),
+ *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ *       be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), this should be changed to perform
+ * 16-byte aligned writes via the STM instruction.
+ */
+pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+	W	.req	r0
+	DST	.req	r1
+	SRC	.req	r2
+	VX	.req	r3
+	UNIT_X	.req	ip
+	TMP1	.req	r4
+	TMP2	.req	r5
+	VXMASK	.req	r6
+
+	ldr	UNIT_X, [sp]
+	push	{r4, r5, r6, r7}
+	mvn	VXMASK, #1
+
+	/* define helper macro */
+	.macro	scale_2_pixels
+		ldrh	TMP1, [SRC, TMP1]
+		and	TMP2, VXMASK, VX, lsr #15
+		add	VX, VX, UNIT_X
+		strh	TMP1, [DST], #2
+
+		ldrh	TMP2, [SRC, TMP2]
+		and	TMP1, VXMASK, VX, lsr #15
+		add	VX, VX, UNIT_X
+		strh	TMP2, [DST], #2
+	.endm
+
+	/* now do the scaling */
+	and	TMP1, VXMASK, VX, lsr #15
+	add	VX, VX, UNIT_X
+	subs	W, #4
+	blt	2f
+1: /* main loop, process 4 pixels per iteration */
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #4
+	bge	1b
+2:
+	tst	W, #2
+	beq	2f
+	scale_2_pixels
+2:
+	tst	W, #1
+	ldrneh	TMP1, [SRC, TMP1]
+	strneh	TMP1, [DST], #2
+	/* cleanup helper macro */
+	.purgem	scale_2_pixels
+	.unreq	DST
+	.unreq	SRC
+	.unreq	W
+	.unreq	VX
+	.unreq	UNIT_X
+	.unreq	TMP1
+	.unreq	TMP2
+	.unreq	VXMASK
+	/* return */
+	pop	{r4, r5, r6, r7}
+	bx	lr
+.endfunc
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index d466a31..3b05007 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -29,6 +29,7 @@
 
 #include "pixman-private.h"
 #include "pixman-arm-common.h"
+#include "pixman-fast-path.h"
 
 #if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
 
@@ -386,6 +387,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
 
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t)
+
 static const pixman_fast_path_t arm_simd_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
@@ -404,6 +408,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
     { PIXMAN_OP_NONE },
 };
 
-- 
1.7.2.2



More information about the Pixman mailing list