[Pixman] [PATCH] ARM: NEON: optimization for bilinear scaled 'over 8888 8888'

Tue Mar 15 02:02:59 PDT 2011

Hi everyone,

I wrote some NEON code for bilinear scaled 'over 8888 8888', extending the
previous patches from Siarhei Siamashka.
I added the combining operations just before the interpolated pixels are
stored into the destination buffer.
It has passed several of my naive test cases, but I would very much appreciate
it if anyone could review my implementation.
Also, please let me know where I can find the microbenchmark for performance
measurement.

Thanks in advance.

---
 pixman/pixman-arm-neon-asm.S |   96
+++++++++++++++++++++++++++++++++++++-----
 pixman/pixman-arm-neon.c     |    4 ++
 2 files changed, 89 insertions(+), 11 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 71b30ac..e178a75 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2554,7 +2554,74 @@ fname:
 .endif
 .endm

-.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+/*
+ * Combine functions are called just before writing results to memory.
+ * Assume that source pixels are located in d0, d1 registers
+ * in a8r8g8b8 format.
+ * Combine functions may use registers d2~d31 and overwrite result
+ * on d0, d1 registers.
+ * TODO: 0565 format, optimization for 2 and 1 pixel case
+ * TODO: Fix hard-coded prefetch distance
+ */
+
+/* No-op combine for operator SRC: expands to nothing, d0/d1 are stored as is */
+.macro bilinear_combine_src dst_fmt, numpix
+.endm
+
+/* Destination loaders for bilinear_combine_XXXX: read numpix pixels at OUT */
+.macro bilinear_load_dst_8888 numpix
+.if numpix == 4
+ vld1.32 {d2, d3}, [OUT] /* four 32-bit pixels -> d2, d3; OUT not advanced */
+ pld [OUT, #16] /* prefetch 16 bytes past OUT (hard-coded distance) */
+.elseif numpix == 2
+ vld1.32 {d2}, [OUT] /* two pixels -> d2; d3 is left untouched */
+.elseif numpix == 1
+ vld1.32 {d2[0]}, [OUT] /* one pixel -> low 32-bit lane of d2 only */
+.else
+ .error bilinear_load_dst_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_load_dst_0565 numpix
+.if numpix == 4 /* accepted, but emits no instructions (0565 is still a TODO) */
+.elseif numpix == 2 /* likewise empty: nothing is loaded */
+.elseif numpix == 1 /* likewise empty: nothing is loaded */
+.else
+ .error bilinear_load_dst_0565 numpix is unsupported
+.endif
+.endm
+
+/* Combine function for operator OVER: d0,d1 = src + dst*(255 - srcA)/255 */
+.macro bilinear_combine_over dst_fmt, numpix
+ bilinear_load_dst_&dst_fmt numpix
+ /* Deinterleave source (d0, d1) & destination (d2, d3) into channel planes */
+ vuzp.8 d0, d1
+ vuzp.8 d0, d1
+ vuzp.8 d2, d3
+ vuzp.8 d2, d3
+ /* Scratch is q8-q11 only: d8-d15 are callee-saved and not saved here */
+ /* invert source alpha: d4 = 255 - srcA, replicated for both halves */
+ vdup.32 d4, d1[1]
+ vmvn.8 d4, d4
+
+ /* result = dst*(255 - srcA) */
+ vmull.u8 q3, d2, d4
+ vmull.u8 q8, d3, d4
+ /* divide by 255 with rounding: (x + ((x + 128) >> 8) + 128) >> 8 */
+ vrshr.u16 q9, q3, #8
+ vrshr.u16 q10, q8, #8
+ vraddhn.u16 d22, q9, q3
+ vraddhn.u16 d23, q10, q8
+
+ /* result += src (premultiplied), saturating per byte */
+ vqadd.u8 q0, q11, q0
+
+ /* Interleave (rrrr, gggg, bbbb, aaaa) into (rgba, rgba, rgba, rgba) */
+ vuzp.8 d0, d1
+ vuzp.8 d0, d1
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt, op
     bilinear_load_&src_fmt d0, d1, d2
     vmull.u8  q1, d0, d28
     vmlal.u8  q1, d1, d29
@@ -2568,10 +2635,11 @@ fname:
     /* 3 cycles bubble */
     vmovn.u16 d0, q0
     /* 1 cycle bubble */
+ bilinear_combine_&op dst_fmt, 1
     bilinear_store_&dst_fmt 1, q2, q3
 .endm

-.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt, op
     bilinear_load_and_vertical_interpolate_two_&src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23
     vshr.u16  q15, q12, #8
@@ -2585,10 +2653,11 @@ fname:
     vshrn.u32 d30, q0, #16
     vshrn.u32 d31, q10, #16
     vmovn.u16 d0, q15
+ bilinear_combine_&op dst_fmt, 2
     bilinear_store_&dst_fmt 2, q2, q3
 .endm

-.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt, op
     bilinear_load_and_vertical_interpolate_four_&src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23 \
                 q3, q9,  d4, d5, d16, d17, d18, d19
@@ -2616,6 +2685,7 @@ fname:
     vshrn.u32 d5, q8, #16
     vmovn.u16 d0, q0
     vmovn.u16 d1, q2
+ bilinear_combine_&op dst_fmt, 4
     bilinear_store_&dst_fmt 4, q2, q3
 .endm

@@ -2635,7 +2705,7 @@ fname:
  *                      pixels ahead
  */

-.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, op, \
                                        bpp_shift, prefetch_distance

 pixman_asm_function fname
@@ -2673,17 +2743,17 @@ pixman_asm_function fname
     blt       1f
     mov       PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
 0:
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt, op
     subs      WIDTH, WIDTH, #4
     bge       0b
 1:
     tst       WIDTH, #2
     beq       2f
-    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt, op
 2:
     tst       WIDTH, #1
     beq       3f
-    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt, op
 3:
     pop       {r4, r5, r6, r7, r8, r9}
     bx        lr
@@ -2706,13 +2776,17 @@ pixman_asm_function fname
 .endm

 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2,
28
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888,
src, 2, 28

 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2,
28
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565,
src, 2, 28

 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1,
28
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888,
src, 1, 28

 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1,
28
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565,
src, 1, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, 8888, 8888, over,
2, 28
+
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 0a10ca1..7042ce1 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -136,6 +136,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon,
0565_x888, SRC,
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
                                          uint16_t, uint16_t)

+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(0, neon, 8888_8888, OVER,
+ uint32_t, uint32_t)
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
                                    int32_t   h,
@@ -362,6 +364,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),

+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+
     { PIXMAN_OP_NONE },
 };


-- 
Best Regards,
Taekyun Kim
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/pixman/attachments/20110315/23a818ea/attachment.html>