[Pixman] [PATCH] pixman: Add support for aarch64 neon optimization (ver.3.1)
Siarhei Siamashka
siarhei.siamashka at gmail.com
Mon Apr 18 09:58:30 UTC 2016
On Thu, 14 Apr 2016 22:20:24 +0900
Mizuki Asakura <ed6e117f at gmail.com> wrote:
> Since aarch64 has a different NEON syntax from aarch32 and has no
> support for the (older) arm-simd code, there is currently no SIMD
> acceleration for pixman on aarch64.
>
> We need new implementations.
>
> This patch contains only the FAST_PATH code, not the bilinear
> optimization code. After this patch is finalized, the bilinear
> related code should follow.
>
>
> This patch contains an additional optimization over my previous
> patch: it omits unnecessary register moves.
If I understand it correctly, your patch removes the differences
between the 32-bit (*) and the 64-bit assembly code variants for
the instructions where the barrel shifter argument is in use.
(*) Assuming that the assembly syntax differences are addressed via
https://lists.freedesktop.org/archives/pixman/2016-April/004489.html
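To illustrate with a minimal before/after sketch (taken from the
hunks quoted below): the A64 'add'/'sub' instructions accept an
optional shifted register operand, so the two-instruction sequence
that goes through a scratch register

    lsl   DUMMY, TMP1, #1             /* DUMMY = TMP1 * 2 */
    add   TMP1, mem_operand, DUMMY    /* TMP1 = base + DUMMY */

folds into a single instruction using the shifted-register form

    add   TMP1, mem_operand, TMP1, lsl #1

which is the same address calculation that the 32-bit code
expresses with the barrel shifter.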
It's hard to review incremental patches like this. For example,
we can't see whether all occurrences of these instructions are
really fixed and none are left out. A side-by-side comparison of
the final 64-bit assembly code with the existing 32-bit assembly
code is still needed.
> Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
> Signed-off-by: Mizuki Asakura <ed6e117f at gmail.com>
> ---
> diff -ruNp a/pixman/pixman/pixman-arma64-neon-asm.S b/pixman/pixman/pixman-arma64-neon-asm.S
> --- a/pixman/pixman/pixman-arma64-neon-asm.S 2016-04-14 22:09:47.120752451 +0900
> +++ b/pixman/pixman/pixman-arma64-neon-asm.S 2016-04-14 22:06:45.092222137 +0900
> @@ -3132,8 +3132,7 @@ generate_composite_function_nearest_scan
> .macro bilinear_load_8888 reg1, reg2, tmp
> asr TMP1, X, #16
> add X, X, UX
> - lsl TMP2, TMP1, #2
> - add TMP1, TOP, TMP2
> + add TMP1, TOP, TMP1, lsl #2
> ld1 {&reg1&.2s}, [TMP1], STRIDE
> ld1 {&reg2&.2s}, [TMP1]
> .endm
> @@ -3141,8 +3140,7 @@ generate_composite_function_nearest_scan
> .macro bilinear_load_0565 reg1, reg2, tmp
> asr TMP1, X, #16
> add X, X, UX
> - lsl TMP2, TMP1, #1
> - add TMP1, TOP, TMP2
> + add TMP1, TOP, TMP1, lsl #1
> ld1 {&reg2&.s}[0], [TMP1], STRIDE
> ld1 {&reg2&.s}[1], [TMP1]
> convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
> diff -ruNp 160407/pixman/pixman/pixman-arma64-neon-asm.h 160408/pixman/pixman/pixman-arma64-neon-asm.h
> --- a/pixman/pixman/pixman-arma64-neon-asm.h 2016-04-14 22:09:47.080752305 +0900
> +++ b/pixman/pixman/pixman-arma64-neon-asm.h 2016-04-14 22:06:45.044222036 +0900
> @@ -231,16 +231,14 @@
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP1, #1
> - add TMP1, mem_operand, DUMMY
> + add TMP1, mem_operand, TMP1, lsl #1
> asr TMP2, VX, #16
> adds VX, VX, UNIT_X
> bmi 55f
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP2, #1
> - add TMP2, mem_operand, DUMMY
> + add TMP2, mem_operand, TMP2, lsl #1
> ld1 {v&reg1&.h}[0], [TMP1]
> asr TMP1, VX, #16
> adds VX, VX, UNIT_X
> @@ -248,8 +246,7 @@
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP1, #1
> - add TMP1, mem_operand, DUMMY
> + add TMP1, mem_operand, TMP1, lsl #1
> ld1 {v&reg1&.h}[1], [TMP2]
> asr TMP2, VX, #16
> adds VX, VX, UNIT_X
> @@ -257,8 +254,7 @@
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP2, #1
> - add TMP2, mem_operand, DUMMY
> + add TMP2, mem_operand, TMP2, lsl #1
> ld1 {v&reg1&.h}[2], [TMP1]
> ld1 {v&reg1&.h}[3], [TMP2]
> .elseif elem_size == 32
> @@ -268,16 +264,14 @@
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP1, #2
> - add TMP1, mem_operand, DUMMY
> + add TMP1, mem_operand, TMP1, lsl #2
> asr TMP2, VX, #16
> adds VX, VX, UNIT_X
> bmi 55f
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP2, #2
> - add TMP2, mem_operand, DUMMY
> + add TMP2, mem_operand, TMP2, lsl #2
> ld1 {v&reg1&.s}[0], [TMP1]
> ld1 {v&reg1&.s}[1], [TMP2]
> .else
> @@ -317,8 +311,7 @@
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP1, #1
> - add TMP1, mem_operand, DUMMY
> + add TMP1, mem_operand, TMP1, lsl #1
> ld1 {v&reg1&.h}[idx], [TMP1]
> .elseif elem_size == 32
> asr DUMMY, VX, #16
> @@ -328,8 +321,7 @@
> 5: subs VX, VX, SRC_WIDTH_FIXED
> bpl 5b
> 55:
> - lsl DUMMY, TMP1, #2
> - add TMP1, mem_operand, DUMMY
> + add TMP1, mem_operand, TMP1, lsl #2
> ld1 {v&reg1&.s}[idx], [TMP1]
> .endif
> .endm
> @@ -638,27 +630,21 @@ local skip1
> */
> .macro advance_to_next_scanline start_of_loop_label
> mov W, ORIG_W
> - lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> - add DST_W, DST_W, DUMMY
> + add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
> .if src_bpp != 0
> - lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> - add SRC, SRC, DUMMY
> + add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
> .endif
> .if mask_bpp != 0
> - lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> - add MASK, MASK, DUMMY
> + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
> .endif
> .if (dst_w_bpp != 24)
> - lsl DUMMY, W, #dst_bpp_shift
> - sub DST_W, DST_W, DUMMY
> + sub DST_W, DST_W, W, lsl #dst_bpp_shift
> .endif
> .if (src_bpp != 24) && (src_bpp != 0)
> - lsl DUMMY, W, #src_bpp_shift
> - sub SRC, SRC, DUMMY
> + sub SRC, SRC, W, lsl #src_bpp_shift
> .endif
> .if (mask_bpp != 24) && (mask_bpp != 0)
> - lsl DUMMY, W, #mask_bpp_shift
> - sub MASK, MASK, DUMMY
> + sub MASK, MASK, W, lsl #mask_bpp_shift
> .endif
> subs H, H, #1
> mov DST_R, DST_W
>
--
Best regards,
Siarhei Siamashka