[Pixman] [PATCH] pixman: Add support for aarch64 neon optimization (ver.3.1)
Mizuki Asakura
ed6e117f at gmail.com
Thu Apr 14 13:20:24 UTC 2016
Since aarch64 has a different NEON syntax from aarch32 and does not support
the (older) ARM SIMD instructions, pixman currently has no SIMD acceleration
on aarch64. New implementations are needed.

This patch contains only the FAST_PATH code, not the bilinear optimization
code. Once the optimization in this patch is complete, the bilinear-related
code should follow.

This patch adds a further optimization over my previous patch: it removes
unnecessary register moves.
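As a minimal illustration of that change (taken from the first hunk of the
patch below), an lsl into a scratch register followed by an add:

    lsl     TMP2, TMP1, #2      /* scratch register TMP2 holds the shifted index */
    add     TMP1, TOP, TMP2

becomes a single add using AArch64's shifted-register operand, so the
scratch register is no longer needed:

    add     TMP1, TOP, TMP1, lsl #2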
Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
Signed-off-by: Mizuki Asakura <ed6e117f at gmail.com>
---
diff -ruNp a/pixman/pixman/pixman-arma64-neon-asm.S b/pixman/pixman/pixman-arma64-neon-asm.S
--- a/pixman/pixman/pixman-arma64-neon-asm.S 2016-04-14 22:09:47.120752451 +0900
+++ b/pixman/pixman/pixman-arma64-neon-asm.S 2016-04-14 22:06:45.092222137 +0900
@@ -3132,8 +3132,7 @@ generate_composite_function_nearest_scan
.macro bilinear_load_8888 reg1, reg2, tmp
asr TMP1, X, #16
add X, X, UX
- lsl TMP2, TMP1, #2
- add TMP1, TOP, TMP2
+ add TMP1, TOP, TMP1, lsl #2
ld1 {&reg1&.2s}, [TMP1], STRIDE
ld1 {&reg2&.2s}, [TMP1]
.endm
@@ -3141,8 +3140,7 @@ generate_composite_function_nearest_scan
.macro bilinear_load_0565 reg1, reg2, tmp
asr TMP1, X, #16
add X, X, UX
- lsl TMP2, TMP1, #1
- add TMP1, TOP, TMP2
+ add TMP1, TOP, TMP1, lsl #1
ld1 {&reg2&.s}[0], [TMP1], STRIDE
ld1 {&reg2&.s}[1], [TMP1]
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
diff -ruNp a/pixman/pixman/pixman-arma64-neon-asm.h b/pixman/pixman/pixman-arma64-neon-asm.h
--- a/pixman/pixman/pixman-arma64-neon-asm.h 2016-04-14 22:09:47.080752305 +0900
+++ b/pixman/pixman/pixman-arma64-neon-asm.h 2016-04-14 22:06:45.044222036 +0900
@@ -231,16 +231,14 @@
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP1, #1
- add TMP1, mem_operand, DUMMY
+ add TMP1, mem_operand, TMP1, lsl #1
asr TMP2, VX, #16
adds VX, VX, UNIT_X
bmi 55f
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP2, #1
- add TMP2, mem_operand, DUMMY
+ add TMP2, mem_operand, TMP2, lsl #1
ld1 {v&reg1&.h}[0], [TMP1]
asr TMP1, VX, #16
adds VX, VX, UNIT_X
@@ -248,8 +246,7 @@
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP1, #1
- add TMP1, mem_operand, DUMMY
+ add TMP1, mem_operand, TMP1, lsl #1
ld1 {v&reg1&.h}[1], [TMP2]
asr TMP2, VX, #16
adds VX, VX, UNIT_X
@@ -257,8 +254,7 @@
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP2, #1
- add TMP2, mem_operand, DUMMY
+ add TMP2, mem_operand, TMP2, lsl #1
ld1 {v&reg1&.h}[2], [TMP1]
ld1 {v&reg1&.h}[3], [TMP2]
.elseif elem_size == 32
@@ -268,16 +264,14 @@
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP1, #2
- add TMP1, mem_operand, DUMMY
+ add TMP1, mem_operand, TMP1, lsl #2
asr TMP2, VX, #16
adds VX, VX, UNIT_X
bmi 55f
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP2, #2
- add TMP2, mem_operand, DUMMY
+ add TMP2, mem_operand, TMP2, lsl #2
ld1 {v&reg1&.s}[0], [TMP1]
ld1 {v&reg1&.s}[1], [TMP2]
.else
@@ -317,8 +311,7 @@
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP1, #1
- add TMP1, mem_operand, DUMMY
+ add TMP1, mem_operand, TMP1, lsl #1
ld1 {v&reg1&.h}[idx], [TMP1]
.elseif elem_size == 32
asr DUMMY, VX, #16
@@ -328,8 +321,7 @@
5: subs VX, VX, SRC_WIDTH_FIXED
bpl 5b
55:
- lsl DUMMY, TMP1, #2
- add TMP1, mem_operand, DUMMY
+ add TMP1, mem_operand, TMP1, lsl #2
ld1 {v&reg1&.s}[idx], [TMP1]
.endif
.endm
@@ -638,27 +630,21 @@ local skip1
*/
.macro advance_to_next_scanline start_of_loop_label
mov W, ORIG_W
- lsl DUMMY, DST_STRIDE, #dst_bpp_shift
- add DST_W, DST_W, DUMMY
+ add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
- lsl DUMMY, SRC_STRIDE, #src_bpp_shift
- add SRC, SRC, DUMMY
+ add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
- lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
- add MASK, MASK, DUMMY
+ add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
- lsl DUMMY, W, #dst_bpp_shift
- sub DST_W, DST_W, DUMMY
+ sub DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
- lsl DUMMY, W, #src_bpp_shift
- sub SRC, SRC, DUMMY
+ sub SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
- lsl DUMMY, W, #mask_bpp_shift
- sub MASK, MASK, DUMMY
+ sub MASK, MASK, W, lsl #mask_bpp_shift
.endif
subs H, H, #1
mov DST_R, DST_W
--
2.7.4