[Pixman] [PATCH] pixman: Add support for aarch64 neon optimization (ver.3.1)

Thu Apr 14 13:20:24 UTC 2016

Since aarch64 uses a different NEON syntax from aarch32 and has no
support for the (older) arm-simd instructions,
there is currently no SIMD acceleration for pixman on aarch64.

We need new implementations.

This patch contains only the FAST_PATH code, not the bilinear optimization code.
Once the optimization work in this patch is complete, the bilinear-related code should be done next.


This patch adds a further optimization on top of my previous patch:
it removes unnecessary register moves by folding each separate "lsl"
into the shifted-register operand of the following "add"/"sub".

Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
Signed-off-by: Mizuki Asakura <ed6e117f at gmail.com>
---
diff -ruNp a/pixman/pixman/pixman-arma64-neon-asm.S
b/pixman/pixman/pixman-arma64-neon-asm.S

--- a/pixman/pixman/pixman-arma64-neon-asm.S    2016-04-14
22:09:47.120752451 +0900
+++ b/pixman/pixman/pixman-arma64-neon-asm.S    2016-04-14
22:06:45.092222137 +0900
@@ -3132,8 +3132,7 @@ generate_composite_function_nearest_scan
 .macro bilinear_load_8888 reg1, reg2, tmp
     asr       TMP1, X, #16
     add       X, X, UX
-    lsl       TMP2, TMP1, #2
-    add       TMP1, TOP, TMP2
+    add       TMP1, TOP, TMP1, lsl #2
     ld1       {&reg1&.2s}, [TMP1], STRIDE
     ld1       {&reg2&.2s}, [TMP1]
 .endm
@@ -3141,8 +3140,7 @@ generate_composite_function_nearest_scan
 .macro bilinear_load_0565 reg1, reg2, tmp
     asr       TMP1, X, #16
     add       X, X, UX
-    lsl       TMP2, TMP1, #1
-    add       TMP1, TOP, TMP2
+    add       TMP1, TOP, TMP1, lsl #1
     ld1       {&reg2&.s}[0], [TMP1], STRIDE
     ld1       {&reg2&.s}[1], [TMP1]
     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
diff -ruNp 160407/pixman/pixman/pixman-arma64-neon-asm.h
160408/pixman/pixman/pixman-arma64-neon-asm.h
--- a/pixman/pixman/pixman-arma64-neon-asm.h    2016-04-14
22:09:47.080752305 +0900
+++ b/pixman/pixman/pixman-arma64-neon-asm.h    2016-04-14
22:06:45.044222036 +0900
@@ -231,16 +231,14 @@
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP1, #1
-    add     TMP1, mem_operand, DUMMY
+    add     TMP1, mem_operand, TMP1, lsl #1
     asr     TMP2, VX, #16
     adds    VX, VX, UNIT_X
     bmi     55f
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP2, #1
-    add     TMP2, mem_operand, DUMMY
+    add     TMP2, mem_operand, TMP2, lsl #1
     ld1     {v&reg1&.h}[0], [TMP1]
     asr     TMP1, VX, #16
     adds    VX, VX, UNIT_X
@@ -248,8 +246,7 @@
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP1, #1
-    add     TMP1, mem_operand, DUMMY
+    add     TMP1, mem_operand, TMP1, lsl #1
     ld1     {v&reg1&.h}[1], [TMP2]
     asr     TMP2, VX, #16
     adds    VX, VX, UNIT_X
@@ -257,8 +254,7 @@
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP2, #1
-    add     TMP2, mem_operand, DUMMY
+    add     TMP2, mem_operand, TMP2, lsl #1
     ld1     {v&reg1&.h}[2], [TMP1]
     ld1     {v&reg1&.h}[3], [TMP2]
 .elseif elem_size == 32
@@ -268,16 +264,14 @@
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP1, #2
-    add     TMP1, mem_operand, DUMMY
+    add     TMP1, mem_operand, TMP1, lsl #2
     asr     TMP2, VX, #16
     adds    VX, VX, UNIT_X
     bmi     55f
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP2, #2
-    add     TMP2, mem_operand, DUMMY
+    add     TMP2, mem_operand, TMP2, lsl #2
     ld1     {v&reg1&.s}[0], [TMP1]
     ld1     {v&reg1&.s}[1], [TMP2]
 .else
@@ -317,8 +311,7 @@
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP1, #1
-    add     TMP1, mem_operand, DUMMY
+    add     TMP1, mem_operand, TMP1, lsl #1
     ld1     {v&reg1&.h}[idx], [TMP1]
 .elseif elem_size == 32
     asr     DUMMY, VX, #16
@@ -328,8 +321,7 @@
 5:  subs    VX, VX, SRC_WIDTH_FIXED
     bpl     5b
 55:
-    lsl     DUMMY, TMP1, #2
-    add     TMP1, mem_operand, DUMMY
+    add     TMP1, mem_operand, TMP1, lsl #2
     ld1     {v&reg1&.s}[idx], [TMP1]
 .endif
 .endm
@@ -638,27 +630,21 @@ local skip1
  */
 .macro advance_to_next_scanline start_of_loop_label
     mov         W, ORIG_W
-    lsl         DUMMY, DST_STRIDE, #dst_bpp_shift
-    add         DST_W, DST_W, DUMMY
+    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
 .if src_bpp != 0
-    lsl         DUMMY, SRC_STRIDE, #src_bpp_shift
-    add         SRC, SRC, DUMMY
+    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
 .endif
 .if mask_bpp != 0
-    lsl         DUMMY, MASK_STRIDE, #mask_bpp_shift
-    add         MASK, MASK, DUMMY
+    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
 .endif
 .if (dst_w_bpp != 24)
-    lsl         DUMMY, W, #dst_bpp_shift
-    sub         DST_W, DST_W, DUMMY
+    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
 .endif
 .if (src_bpp != 24) && (src_bpp != 0)
-    lsl         DUMMY, W, #src_bpp_shift
-    sub         SRC, SRC, DUMMY
+    sub         SRC, SRC, W, lsl #src_bpp_shift
 .endif
 .if (mask_bpp != 24) && (mask_bpp != 0)
-    lsl         DUMMY, W, #mask_bpp_shift
-    sub         MASK, MASK, DUMMY
+    sub         MASK, MASK, W, lsl #mask_bpp_shift
 .endif
     subs        H, H, #1
     mov         DST_R, DST_W

-- 
2.7.4