[Pixman] [PATCH] pixman: Add support for aarch64 neon optimization (ver.3.1)
Mizuki Asakura
ed6e117f at gmail.com
Mon Apr 18 12:16:17 UTC 2016
Sorry for the confusion, but I thought the "tiny" patch should be sent before
my completed (ver.4) patch so that any regression introduced by the larger
patch could be detected.
If the (ver.4) patch has no problems, please ignore this (v3.1) patch.
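
For context, the optimization in question folds the separate "lsl" into the
"add" instruction's shifted-register (barrel shifter) operand, so the scratch
register (TMP2 / DUMMY) is no longer needed. A minimal stand-alone sketch
(hypothetical, not part of the patch) that computes the address of pixel X
in a 32bpp scanline, i.e. base + (x << 2):

        .text
        .global pixel_addr_8888
        /* hypothetical helper: x0 = scanline base, x1 = pixel index */
pixel_addr_8888:
        /* unoptimized form needs a scratch register:
         *     lsl     x2, x1, #2
         *     add     x0, x0, x2
         */
        add     x0, x0, x1, lsl #2    /* shift folded into the add */
        ret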
On 18 April 2016 at 18:58, Siarhei Siamashka
<siarhei.siamashka at gmail.com> wrote:
> On Thu, 14 Apr 2016 22:20:24 +0900
> Mizuki Asakura <ed6e117f at gmail.com> wrote:
>
>> Since aarch64 has a different neon syntax from aarch32 and has no
>> support for the (older) arm-simd,
>> there is no SIMD acceleration for pixman on aarch64.
>>
>> We need new implementations.
>>
>> This patch only contains the FAST_PATH code, not the bilinear optimization code.
>> After the optimization in this patch is complete, the bilinear-related code should follow.
>>
>>
>> This patch contains an additional optimization over my previous patch
>> to avoid unnecessary register moves.
>
> If I understand it correctly, your patch removes the differences
> between the 32-bit (*) and the 64-bit assembly code variants for
> the instructions where the barrel shifter argument is in use.
>
> (*) Assuming that the assembly syntax differences are addressed via
> https://lists.freedesktop.org/archives/pixman/2016-April/004489.html
>
> It's hard to review incremental patches like this. For example,
> we can't see if all of the occurrences of these instructions are
> really fixed and none are left out. A side-by-side comparison of
> the final 64-bit assembly code with the existing 32-bit assembly
> code still needs to be done.
>
>> Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
>> Signed-off-by: Mizuki Asakura <ed6e117f at gmail.com>
>> ---
>> diff -ruNp a/pixman/pixman/pixman-arma64-neon-asm.S b/pixman/pixman/pixman-arma64-neon-asm.S
>> --- a/pixman/pixman/pixman-arma64-neon-asm.S 2016-04-14 22:09:47.120752451 +0900
>> +++ b/pixman/pixman/pixman-arma64-neon-asm.S 2016-04-14 22:06:45.092222137 +0900
>> @@ -3132,8 +3132,7 @@ generate_composite_function_nearest_scan
>> .macro bilinear_load_8888 reg1, reg2, tmp
>> asr TMP1, X, #16
>> add X, X, UX
>> - lsl TMP2, TMP1, #2
>> - add TMP1, TOP, TMP2
>> + add TMP1, TOP, TMP1, lsl #2
>> ld1 {&reg1&.2s}, [TMP1], STRIDE
>> ld1 {&reg2&.2s}, [TMP1]
>> .endm
>> @@ -3141,8 +3140,7 @@ generate_composite_function_nearest_scan
>> .macro bilinear_load_0565 reg1, reg2, tmp
>> asr TMP1, X, #16
>> add X, X, UX
>> - lsl TMP2, TMP1, #1
>> - add TMP1, TOP, TMP2
>> + add TMP1, TOP, TMP1, lsl #1
>> ld1 {&reg2&.s}[0], [TMP1], STRIDE
>> ld1 {&reg2&.s}[1], [TMP1]
>> convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
>> diff -ruNp 160407/pixman/pixman/pixman-arma64-neon-asm.h 160408/pixman/pixman/pixman-arma64-neon-asm.h
>> --- a/pixman/pixman/pixman-arma64-neon-asm.h 2016-04-14 22:09:47.080752305 +0900
>> +++ b/pixman/pixman/pixman-arma64-neon-asm.h 2016-04-14 22:06:45.044222036 +0900
>> @@ -231,16 +231,14 @@
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP1, #1
>> - add TMP1, mem_operand, DUMMY
>> + add TMP1, mem_operand, TMP1, lsl #1
>> asr TMP2, VX, #16
>> adds VX, VX, UNIT_X
>> bmi 55f
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP2, #1
>> - add TMP2, mem_operand, DUMMY
>> + add TMP2, mem_operand, TMP2, lsl #1
>> ld1 {v&reg1&.h}[0], [TMP1]
>> asr TMP1, VX, #16
>> adds VX, VX, UNIT_X
>> @@ -248,8 +246,7 @@
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP1, #1
>> - add TMP1, mem_operand, DUMMY
>> + add TMP1, mem_operand, TMP1, lsl #1
>> ld1 {v&reg1&.h}[1], [TMP2]
>> asr TMP2, VX, #16
>> adds VX, VX, UNIT_X
>> @@ -257,8 +254,7 @@
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP2, #1
>> - add TMP2, mem_operand, DUMMY
>> + add TMP2, mem_operand, TMP2, lsl #1
>> ld1 {v&reg1&.h}[2], [TMP1]
>> ld1 {v&reg1&.h}[3], [TMP2]
>> .elseif elem_size == 32
>> @@ -268,16 +264,14 @@
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP1, #2
>> - add TMP1, mem_operand, DUMMY
>> + add TMP1, mem_operand, TMP1, lsl #2
>> asr TMP2, VX, #16
>> adds VX, VX, UNIT_X
>> bmi 55f
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP2, #2
>> - add TMP2, mem_operand, DUMMY
>> + add TMP2, mem_operand, TMP2, lsl #2
>> ld1 {v&reg1&.s}[0], [TMP1]
>> ld1 {v&reg1&.s}[1], [TMP2]
>> .else
>> @@ -317,8 +311,7 @@
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP1, #1
>> - add TMP1, mem_operand, DUMMY
>> + add TMP1, mem_operand, TMP1, lsl #1
>> ld1 {v&reg1&.h}[idx], [TMP1]
>> .elseif elem_size == 32
>> asr DUMMY, VX, #16
>> @@ -328,8 +321,7 @@
>> 5: subs VX, VX, SRC_WIDTH_FIXED
>> bpl 5b
>> 55:
>> - lsl DUMMY, TMP1, #2
>> - add TMP1, mem_operand, DUMMY
>> + add TMP1, mem_operand, TMP1, lsl #2
>> ld1 {v&reg1&.s}[idx], [TMP1]
>> .endif
>> .endm
>> @@ -638,27 +630,21 @@ local skip1
>> */
>> .macro advance_to_next_scanline start_of_loop_label
>> mov W, ORIG_W
>> - lsl DUMMY, DST_STRIDE, #dst_bpp_shift
>> - add DST_W, DST_W, DUMMY
>> + add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
>> .if src_bpp != 0
>> - lsl DUMMY, SRC_STRIDE, #src_bpp_shift
>> - add SRC, SRC, DUMMY
>> + add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
>> .endif
>> .if mask_bpp != 0
>> - lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
>> - add MASK, MASK, DUMMY
>> + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
>> .endif
>> .if (dst_w_bpp != 24)
>> - lsl DUMMY, W, #dst_bpp_shift
>> - sub DST_W, DST_W, DUMMY
>> + sub DST_W, DST_W, W, lsl #dst_bpp_shift
>> .endif
>> .if (src_bpp != 24) && (src_bpp != 0)
>> - lsl DUMMY, W, #src_bpp_shift
>> - sub SRC, SRC, DUMMY
>> + sub SRC, SRC, W, lsl #src_bpp_shift
>> .endif
>> .if (mask_bpp != 24) && (mask_bpp != 0)
>> - lsl DUMMY, W, #mask_bpp_shift
>> - sub MASK, MASK, DUMMY
>> + sub MASK, MASK, W, lsl #mask_bpp_shift
>> .endif
>> subs H, H, #1
>> mov DST_R, DST_W
>>
>
>
>
> --
> Best regards,
> Siarhei Siamashka