[Pixman] [PATCH 7/7] ARM: NEON optimization for bilinear scaled 'src_8888_8888'
Siarhei Siamashka
siarhei.siamashka at gmail.com
Tue Feb 22 13:23:48 PST 2011
From: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Initial NEON optimization for bilinear scaling. It can probably be
improved further.
Benchmark on ARM Cortex-A8:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=10.72 MPix/s
after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s
---
pixman/pixman-arm-neon-asm.S | 197 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 45 ++++++++++
2 files changed, 242 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 47daf45..c168e10 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2391,3 +2391,200 @@ generate_composite_function_nearest_scanline \
10, /* dst_r_basereg */ \
8, /* src_basereg */ \
15 /* mask_basereg */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes.
+ * Emits .func and .global for fname and, when __ELF__ is defined, also
+ * .hidden and .type %function, then defines the fname label itself.
+ * The matching .endfunc has to be written by the user of the macro. */
+.macro pixman_asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+fname:
+.endm
+
+.macro bilinear_interpolate_last_pixel /* one output pixel; X and q12 are not advanced */
+ mov TMP1, X, asr #16 /* source pixel index = X >> 16 */
+ mov TMP2, X, asr #16
+ add TMP1, TOP, TMP1, asl #2 /* byte address in the TOP row (4 bytes per pixel) */
+ add TMP2, BOTTOM, TMP2, asl #2 /* same index in the BOTTOM row */
+ vld1.32 {d0}, [TMP1] /* 8 bytes: left and right pixel from TOP */
+ vshr.u16 d30, d24, #8 /* horizontal weight = bits 8-15 of each lane of d24 */
+ vld1.32 {d1}, [TMP2] /* 8 bytes: left and right pixel from BOTTOM */
+ vmull.u8 q1, d0, d28 /* q1 = top bytes * d28 (wt) */
+ vmlal.u8 q1, d1, d29 /* q1 += bottom bytes * d29 (wb); d2 = left, d3 = right */
+ /* 5 cycles bubble */
+ vshll.u16 q0, d2, #8 /* q0 = left * 256 */
+ vmlsl.u16 q0, d2, d30 /* q0 -= left * weight */
+ vmlal.u16 q0, d3, d30 /* q0 += right * weight */
+ /* 5 cycles bubble */
+ vshrn.u32 d0, q0, #16 /* shift right by 16 and narrow to 16-bit lanes */
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0 /* narrow to 8-bit lanes; the pixel is in the low 4 bytes */
+ /* 1 cycle bubble */
+ vst1.32 {d0[0]}, [OUT, :32]! /* store one 32-bit pixel and post-increment OUT */
+.endm
+
+.macro bilinear_interpolate_two_pixels /* two output pixels; X advances by 2 * UX */
+ mov TMP1, X, asr #16 /* first pixel: source index = X >> 16 */
+ mov TMP2, X, asr #16
+ add X, X, UX /* step X to the next output pixel */
+ add TMP1, TOP, TMP1, asl #2 /* byte address in the TOP row (4 bytes per pixel) */
+ add TMP2, BOTTOM, TMP2, asl #2 /* same index in the BOTTOM row */
+ vld1.32 {d0}, [TMP1] /* left and right pixel from TOP */
+ vld1.32 {d1}, [TMP2] /* left and right pixel from BOTTOM */
+ vmull.u8 q1, d0, d28 /* q1 = top bytes * d28 (wt) */
+ vmlal.u8 q1, d1, d29 /* q1 += bottom bytes * d29 (wb); d2 = left, d3 = right */
+ mov TMP1, X, asr #16 /* second pixel: same sequence into q10/q11 */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d20}, [TMP1]
+ vld1.32 {d21}, [TMP2]
+ vmull.u8 q11, d20, d28
+ vmlal.u8 q11, d21, d29 /* d22 = left, d23 = right */
+ vshr.u16 q15, q12, #8 /* horizontal weights: d30 for the first pixel, d31 for the second */
+ vadd.u16 q12, q12, q13 /* advance the lane copies of x by q13 */
+ vshll.u16 q0, d2, #8 /* first pixel: q0 = left * 256 */
+ vmlsl.u16 q0, d2, d30 /* q0 -= left * weight */
+ vmlal.u16 q0, d3, d30 /* q0 += right * weight */
+ vshll.u16 q10, d22, #8 /* second pixel: same blend with weight d31 */
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d30, q0, #16 /* shift right by 16 and narrow; q15 is reused for the results */
+ vshrn.u32 d31, q10, #16
+ vmovn.u16 d0, q15 /* narrow both pixels to 8-bit lanes */
+ vst1.32 {d0}, [OUT]! /* store two pixels (8 bytes) and post-increment OUT */
+.endm
+
+.macro bilinear_interpolate_four_pixels /* four output pixels; X advances by 4 * UX */
+ mov TMP1, X, asr #16 /* pixel 0: source index = X >> 16 */
+ mov TMP2, X, asr #16
+ add X, X, UX /* step X to the next output pixel */
+ add TMP1, TOP, TMP1, asl #2 /* byte address in the TOP row (4 bytes per pixel) */
+ add TMP2, BOTTOM, TMP2, asl #2 /* same index in the BOTTOM row */
+ vld1.32 {d0}, [TMP1] /* left and right pixel from TOP */
+ vld1.32 {d1}, [TMP2] /* left and right pixel from BOTTOM */
+ vmull.u8 q1, d0, d28 /* q1 = top bytes * d28 (wt) */
+ vmlal.u8 q1, d1, d29 /* q1 += bottom bytes * d29 (wb); d2 = left, d3 = right */
+ mov TMP1, X, asr #16 /* pixel 1: same sequence into q10/q11 */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d20}, [TMP1]
+ vld1.32 {d21}, [TMP2]
+ vmull.u8 q11, d20, d28
+ vmlal.u8 q11, d21, d29 /* d22 = left, d23 = right */
+ vshr.u16 q15, q12, #8 /* horizontal weights for pixels 0 (d30) and 1 (d31) */
+ vadd.u16 q12, q12, q13 /* advance the lane copies of x by q13 */
+ vshll.u16 q0, d2, #8 /* pixel 0: q0 = left * 256 */
+ vmlsl.u16 q0, d2, d30 /* q0 -= left * weight */
+ vmlal.u16 q0, d3, d30 /* q0 += right * weight */
+ vshll.u16 q10, d22, #8 /* pixel 1: same blend with weight d31 */
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ mov TMP1, X, asr #16 /* pixel 2: same sequence into q2/q3 */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d4}, [TMP1]
+ vld1.32 {d5}, [TMP2]
+ vmull.u8 q3, d4, d28
+ vmlal.u8 q3, d5, d29 /* d6 = left, d7 = right */
+ mov TMP1, X, asr #16 /* pixel 3: same sequence into q8/q9 */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d16}, [TMP1]
+ vld1.32 {d17}, [TMP2]
+ vmull.u8 q9, d16, d28
+ vmlal.u8 q9, d17, d29 /* d18 = left, d19 = right */
+ vshr.u16 q15, q12, #8 /* horizontal weights for pixels 2 (d30) and 3 (d31) */
+ vadd.u16 q12, q12, q13 /* advance the lane copies of x again */
+ vshll.u16 q2, d6, #8 /* pixel 2: blend with weight d30 */
+ vmlsl.u16 q2, d6, d30
+ vmlal.u16 q2, d7, d30
+ vshll.u16 q8, d18, #8 /* pixel 3: blend with weight d31 */
+ vmlsl.u16 q8, d18, d31
+ vmlal.u16 q8, d19, d31
+ vshrn.u32 d0, q0, #16 /* shift all four sums right by 16 and narrow to 16-bit lanes */
+ vshrn.u32 d1, q10, #16
+ vshrn.u32 d4, q2, #16
+ vshrn.u32 d5, q8, #16
+ vmovn.u16 d0, q0 /* pixels 0 and 1 narrowed to 8-bit lanes */
+ vmovn.u16 d1, q2 /* pixels 2 and 3 narrowed to 8-bit lanes */
+ vst1.32 {d0, d1}, [OUT]! /* store four pixels (16 bytes) and post-increment OUT */
+.endm
+
+
+/*
+ * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out,
+ * const uint32_t * top,
+ * const uint32_t * bottom,
+ * int wt,
+ * int wb,
+ * pixman_fixed_t x,
+ * pixman_fixed_t ux,
+ * int width)
+ *
+ * Writes width pixels to out. Each output pixel blends the two adjacent
+ * source pixels at index (x >> 16) in the top and bottom rows: the rows
+ * are weighted by wt and wb, and the horizontal weight is taken from
+ * bits 8-15 of x. x advances by ux after every pixel. Nothing is written
+ * when width <= 0.
+ *
+ * Register state shared with the bilinear_interpolate_* macros:
+ * d28 / d29 - wt / wb replicated into every byte lane
+ * q12 - low 16 bits of x for a pair of pixels (d24: x, d25: x + ux)
+ * q13 - 2 * ux in every 16-bit lane
+ */
+
+pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3 /* r3 is reused as TMP1 once wt has been copied to d28 */
+ WB .req r4 /* r4 is reused as TMP2 once wb has been copied to d29 */
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3
+ TMP2 .req r4
+
+ mov ip, sp /* remember sp before the push; the next four arguments are read from here */
+ push {r4, r5, r6, r7} /* NOTE(review): r7 is not referenced here; presumably saved to keep sp 8-byte aligned - confirm */
+ ldmia ip, {WB, X, UX, WIDTH} /* load wb, x, ux, width from the saved sp */
+
+ cmp WIDTH, #0
+ ble 3f /* nothing to do for width <= 0 */
+ vdup.u16 q12, X /* low 16 bits of x in all eight lanes */
+ vdup.u16 q13, UX /* low 16 bits of ux in all eight lanes */
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26 /* upper half of q12 becomes x + ux (second pixel of a pair) */
+ vadd.u16 q13, q13, q13 /* q13 = 2 * ux, the step per pair of pixels */
+
+ subs WIDTH, WIDTH, #4
+ blt 1f /* fewer than four pixels: skip the main loop */
+0:
+ bilinear_interpolate_four_pixels
+ subs WIDTH, WIDTH, #4
+ bge 0b
+1:
+ tst WIDTH, #2 /* WIDTH is remaining - 4 here; its low two bits equal those of the remaining count */
+ beq 2f
+ bilinear_interpolate_two_pixels
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_interpolate_last_pixel
+3:
+ pop {r4, r5, r6, r7}
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq BOTTOM
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+.endfunc
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3e0c0d1..c7c0254 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -232,6 +232,47 @@ pixman_blt_neon (uint32_t *src_bits,
}
}
+void
+pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out,
+ const uint32_t * top,
+ const uint32_t * bottom,
+ int wt,
+ int wb,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ int width); /* prototype of the NEON routine this patch adds to pixman-arm-neon-asm.S */
+
+static force_inline void
+scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask, /* not used by this implementation */
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx, /* not used by this implementation */
+ pixman_bool_t zero_src) /* not used by this implementation */
+{
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, /* forward the scanline to the NEON assembly routine; note that w moves to the last argument */
+ src_bottom, wt, wb,
+ vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC, /* three instantiations that differ only in the name and in the COVER / PAD / NONE argument */
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FALSE, FALSE) /* NOTE(review): the macro is defined outside this patch; the meaning of the two FALSE flags is not visible here - confirm */
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FALSE, FALSE)
+
static const pixman_fast_path_t arm_neon_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565),
@@ -343,6 +384,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
{ PIXMAN_OP_NONE },
};
--
1.7.3.4
More information about the Pixman
mailing list