[Pixman] [PATCH 2/7] ARM: use aligned memory writes in NEON bilinear scaling code

Mon Apr 4 20:41:55 PDT 2011

From: Siarhei Siamashka <siarhei.siamashka at nokia.com>

---
 pixman/pixman-arm-neon-asm.S |   49 ++++++++++++++++++++++++++++++------------
 1 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 8788e95..a4d6a9a 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2527,9 +2527,9 @@ fname:
 
 .macro bilinear_store_8888 numpix, tmp1, tmp2
 .if numpix == 4
-    vst1.32   {d0, d1}, [OUT]!
+    vst1.32   {d0, d1}, [OUT, :128]!
 .elseif numpix == 2
-    vst1.32   {d0}, [OUT]!
+    vst1.32   {d0}, [OUT, :64]!
 .elseif numpix == 1
     vst1.32   {d0[0]}, [OUT, :32]!
 .else
@@ -2544,11 +2544,11 @@ fname:
     vuzp.u8 d0, d2
     convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
 .if numpix == 4
-    vst1.16   {d2}, [OUT]!
+    vst1.16   {d2}, [OUT, :64]!
 .elseif numpix == 2
-    vst1.32   {d2[0]}, [OUT]!
+    vst1.32   {d2[0]}, [OUT, :32]!
 .elseif numpix == 1
-    vst1.16   {d2[0]}, [OUT]!
+    vst1.16   {d2[0]}, [OUT, :16]!
 .else
     .error bilinear_store_0565 numpix is unsupported
 .endif
@@ -2622,8 +2622,7 @@ fname:
  * Main template macro for generating NEON optimized bilinear scanline
  * functions.
  *
- * TODO: use software pipelining and aligned writes to the destination buffer
- *       in order to improve performance
+ * TODO: use software pipelining in order to improve performance
  *
  * Bilinear scanline scaler macro template uses the following arguments:
  *  fname             - name of the function to generate
@@ -2635,7 +2634,8 @@ fname:
  */
 
 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
-                                       bpp_shift, prefetch_distance
+                                       src_bpp_shift, dst_bpp_shift, \
+                                       prefetch_distance
 
 pixman_asm_function fname
     OUT       .req      r0
@@ -2666,19 +2666,40 @@ pixman_asm_function fname
     vdup.u8   d28, WT
     vdup.u8   d29, WB
     vadd.u16  d25, d25, d26
-    vadd.u16  q13, q13, q13
 
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)
+    beq       0f
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13
     vshr.u16  q15, q12, #8
     vadd.u16  q12, q12, q13
 
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    beq       0f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #2
+0:
+
+    /* start the main loop */
     subs      WIDTH, WIDTH, #4
     blt       1f
-    mov       PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
 0:
     bilinear_interpolate_four_pixels src_fmt, dst_fmt
     subs      WIDTH, WIDTH, #4
     bge       0b
 1:
+
+    /* handle the remaining trailing pixels */
     tst       WIDTH, #2
     beq       2f
     bilinear_interpolate_two_pixels src_fmt, dst_fmt
@@ -2708,13 +2729,13 @@ pixman_asm_function fname
 .endm
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 2, 28
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 1, 28
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 2, 28
 
 generate_bilinear_scanline_func \
-    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 1, 28
-- 
1.7.3.4