[Pixman] [PATCH] Add support for aarch64 neon optimization
Mizuki Asakura
ed6e117f at gmail.com
Sun Apr 3 11:17:45 UTC 2016
> The 'advanced' prefetch type is implemented by having some branchless ARM code
If the prefetch code relies on being branch-less, it cannot be ported to aarch64
as-is, since aarch64 doesn't support conditional arithmetic such as subge / subges.
If so, we could / should remove all prefetch-related code, because the added
branches might cause a performance regression that outweighs the benefit of
prefetching.
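
(Just to illustrate the difference -- a rough sketch, not code from this patch;
PF_CTL is the prefetch-control alias used by the template, TMP is only a
placeholder name:)

    /* aarch32 'advanced' prefetch: conditionally executed arithmetic, no branch */
        subges  PF_CTL, PF_CTL, #0x10

    /* aarch64: the straightforward port needs a branch around the subs,
       which is what this patch already does ... */
        b.lt    0f
        subs    PF_CTL, PF_CTL, #0x10
    0:

    /* ... or a branchless csel sequence, which however does not update the
       flags the way subges does, so the surrounding logic would still have
       to change */
        sub     TMP, PF_CTL, #0x10
        csel    PF_CTL, TMP, PF_CTL, ge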
We could also remove all the "tail-head" optimizations, which exist only to make
full use of prefetching.
The "tail-head" code is very complicated, hard to understand and hard to maintain.
If we could remove it, the asm code would be slimmer and easier to maintain.
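
(For reference, the structural difference, in the same schematic style the .S
comments use -- only a sketch, not actual code:)

    /* current tail-head software pipelining */
        head                    /* start block 0                           */
    0:  tail_head               /* finish block N, start block N+1, with
                                   loads/stores/prefetch interleaved        */
        /* branch back to 0b while pixels remain */
        tail                    /* finish the last block                   */

    /* plain structure if tail-head scheduling is dropped */
    0:  load pixblock
        head
        tail
        store pixblock
        /* branch back to 0b while pixels remain */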
Of course, the modification shouldn't be applied to the original aarch32-neon
code, since it may cause a performance regression on some architectures.
But for aarch64, would it be a change worth considering?
On 2 April 2016 at 21:30, Mizuki Asakura <ed6e117f at gmail.com> wrote:
> Since aarch64 has different neon syntax from aarch32 and has no
> support for (older) arm-simd,
> there are no SIMD accelerations for pixman on aarch64.
>
> We need new implementations.
>
>
> This patch only contains STD_FAST_PATH code, not the scaling (nearest,
> bilinear) code.
> Once the optimizations in this patch are settled, the scaling-related code should follow.
>
>
> This is a first step towards optimizations for aarch64-neon.
>
>
> Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
> Signed-off-by: Mizuki Asakura <ed6e117f at gmail.com>
> ---
> configure.ac | 34 +
> pixman/Makefile.am | 14 +
> pixman/pixman-arm-neon.c | 10 +-
> pixman/pixman-arm.c | 6 +
> pixman/pixman-arma64-neon-asm.S | 3771 +++++++++++++++++++++++++++++++++++++++
> pixman/pixman-arma64-neon-asm.h | 1288 +++++++++++++
> pixman/pixman-private.h | 5 +
> 7 files changed, 5127 insertions(+), 1 deletion(-)
> create mode 100644 pixman/pixman-arma64-neon-asm.S
> create mode 100644 pixman/pixman-arma64-neon-asm.h
>
> diff --git a/configure.ac b/configure.ac
> old mode 100644
> new mode 100755
> index 6b2134e..bb0192a
> --- a/configure.ac
> +++ b/configure.ac
> @@ -667,6 +667,40 @@ if test $enable_arm_neon = yes && test
> $have_arm_neon = no ; then
> AC_MSG_ERROR([ARM NEON intrinsics not detected])
> fi
>
> +dnl ==========================================================================
> +dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions
> +have_arm_a64_neon=no
> +AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
> +xserver_save_CFLAGS=$CFLAGS
> +CFLAGS="-x assembler-with-cpp $CFLAGS"
> +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
> +.text
> +.arch armv8-a
> +.altmacro
> +prfm pldl2strm, [x0]
> +xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
> +CFLAGS=$xserver_save_CFLAGS
> +
> +AC_ARG_ENABLE(arm-a64-neon,
> + [AC_HELP_STRING([--disable-arm-a64-neon],
> + [disable ARM A64 NEON fast paths])],
> + [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
> +
> +if test $enable_arm_a64_neon = no ; then
> + have_arm_a64_neon=disabled
> +fi
> +
> +if test $have_arm_a64_neon = yes ; then
> + AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations])
> +fi
> +
> +AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
> +
> +AC_MSG_RESULT($have_arm_a64_neon)
> +if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon = no ; then
> + AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
> +fi
> +
> dnl ===========================================================================
> dnl Check for IWMMXT
>
> diff --git a/pixman/Makefile.am b/pixman/Makefile.am
> old mode 100644
> new mode 100755
> index 581b6f6..1b1a8ac
> --- a/pixman/Makefile.am
> +++ b/pixman/Makefile.am
> @@ -94,6 +94,20 @@ libpixman_1_la_LIBADD += libpixman-arm-neon.la
> ASM_CFLAGS_arm_neon=
> endif
>
> +# arm a64 neon code
> +if USE_ARM_A64_NEON
> +noinst_LTLIBRARIES += libpixman-arma64-neon.la
> +libpixman_arma64_neon_la_SOURCES = \
> + pixman-arm-neon.c \
> + pixman-arm-common.h \
> + pixman-arma64-neon-asm.S \
> + pixman-arm-asm.h \
> + pixman-arma64-neon-asm.h
> +libpixman_1_la_LIBADD += libpixman-arma64-neon.la
> +
> +ASM_CFLAGS_arm_neon=
> +endif
> +
> # iwmmxt code
> if USE_ARM_IWMMXT
> libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
> diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
> old mode 100644
> new mode 100755
> index be761c9..cef8c90
> --- a/pixman/pixman-arm-neon.c
> +++ b/pixman/pixman-arm-neon.c
> @@ -121,6 +121,7 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon,
> over_8888_8_0565,
> PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
> uint16_t, 1, uint8_t, 1, uint16_t, 1)
>
> +#ifndef __aarch64__
> PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
> uint32_t, uint32_t)
> PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
> @@ -160,6 +161,7 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST
> (SKIP_ZERO_SRC, neon, 8888_8_8888, OV
> uint32_t, uint32_t)
> PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon,
> 8888_8_8888, ADD,
> uint32_t, uint32_t)
> +#endif
>
> void
> pixman_composite_src_n_8_asm_neon (int32_t w,
> @@ -194,7 +196,7 @@ arm_neon_fill (pixman_implementation_t *imp,
> uint32_t _xor)
> {
> /* stride is always multiple of 32bit units in pixman */
> - uint32_t byte_stride = stride * sizeof(uint32_t);
> + int32_t byte_stride = stride * sizeof(uint32_t);
>
> switch (bpp)
> {
> @@ -362,6 +364,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
> PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8r8g8b8,
> neon_composite_out_reverse_8_8888),
> PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8b8g8r8,
> neon_composite_out_reverse_8_8888),
>
> +#ifndef __aarch64__
> SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
> SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
> SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
> @@ -420,10 +423,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
>
> SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8,
> neon_8888_8_8888),
> SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8,
> neon_8888_8_8888),
> +#endif
>
> { PIXMAN_OP_NONE },
> };
>
> +#ifndef __aarch64__
> #define BIND_COMBINE_U(name) \
> void \
> pixman_composite_scanline_##name##_mask_asm_neon (int32_t w, \
> @@ -454,6 +459,7 @@ neon_combine_##name##_u (pixman_implementation_t
> *imp, \
> BIND_COMBINE_U (over)
> BIND_COMBINE_U (add)
> BIND_COMBINE_U (out_reverse)
> +#endif
>
> pixman_implementation_t *
> _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
> @@ -461,9 +467,11 @@ _pixman_implementation_create_arm_neon
> (pixman_implementation_t *fallback)
> pixman_implementation_t *imp =
> _pixman_implementation_create (fallback, arm_neon_fast_paths);
>
> +#ifndef __aarch64__
> imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
> imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
> imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
> +#endif
>
> imp->blt = arm_neon_blt;
> imp->fill = arm_neon_fill;
> diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
> old mode 100644
> new mode 100755
> index 23374e4..734cbea
> --- a/pixman/pixman-arm.c
> +++ b/pixman/pixman-arm.c
> @@ -221,5 +221,11 @@ _pixman_arm_get_implementations
> (pixman_implementation_t *imp)
> imp = _pixman_implementation_create_arm_neon (imp);
> #endif
>
> +#ifdef USE_ARM_A64_NEON
> + /* neon is a part of aarch64 */
> + if (!_pixman_disabled ("arm-neon"))
> + imp = _pixman_implementation_create_arm_neon (imp);
> +#endif
> +
> return imp;
> }
> diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
> new file mode 100644
> index 0000000..f60d1b4
> --- /dev/null
> +++ b/pixman/pixman-arma64-neon-asm.S
> @@ -0,0 +1,3771 @@
> +/*
> + * Copyright © 2009 Nokia Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + *
> + * Author: Siarhei Siamashka (siarhei.siamashka at nokia.com)
> + */
> +
> +/*
> + * This file contains implementations of NEON optimized pixel processing
> + * functions. There is no full and detailed tutorial, but some functions
> + * (those which are exposing some new or interesting features) are
> + * extensively commented and can be used as examples.
> + *
> + * You may want to have a look at the comments for following functions:
> + * - pixman_composite_over_8888_0565_asm_neon
> + * - pixman_composite_over_n_8_0565_asm_neon
> + */
> +
> +/* Prevent the stack from becoming executable for no reason... */
> +#if defined(__linux__) && defined(__ELF__)
> +.section .note.GNU-stack,"",%progbits
> +#endif
> +
> +.text
> +.arch armv8-a
> +
> +.altmacro
> +.p2align 2
> +
> +#include "pixman-private.h"
> +#include "pixman-arm-asm.h"
> +#include "pixman-arma64-neon-asm.h"
> +
> +/* Global configuration options and preferences */
> +
> +/*
> + * The code can optionally make use of unaligned memory accesses to improve
> + * performance of handling leading/trailing pixels for each scanline.
> + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
> + * example in linux if unaligned memory accesses are not configured to
> + * generate exceptions.
> + */
> +.set RESPECT_STRICT_ALIGNMENT, 1
> +
> +/*
> + * Set default prefetch type. There is a choice between the following options:
> + *
> + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
> + * as NOP to workaround some HW bugs or for whatever other reason)
> + *
> + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
> + * advanced prefetch introduces heavy overhead)
> + *
> + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
> + * which can run ARM and NEON instructions simultaneously so that extra ARM
> + * instructions do not add (many) extra cycles, but improve prefetch efficiency)
> + *
> + * Note: some types of function can't support advanced prefetch and fall back
> + * to the simple one (those which handle 24bpp pixels)
> + */
> +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
> +
> +/* Prefetch distance in pixels for simple prefetch */
> +.set PREFETCH_DISTANCE_SIMPLE, 64
> +
> +/*
> + * Implementation of pixman_composite_over_8888_0565_asm_neon
> + *
> + * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
> + * performs OVER compositing operation. Function fast_composite_over_8888_0565
> + * from pixman-fast-path.c does the same in C and can be used as a reference.
> + *
> + * First we need to have some NEON assembly code which can do the actual
> + * operation on the pixels and provide it to the template macro.
> + *
> + * Template macro quite conveniently takes care of emitting all the necessary
> + * code for memory reading and writing (including quite tricky cases of
> + * handling unaligned leading/trailing pixels), so we only need to deal with
> + * the data in NEON registers.
> + *
> + * NEON register allocation in general is recommended to be the following:
> + * v0, v1, v2, v3 - contain loaded source pixel data
> + * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed)
> + * v24, v25, v26, v27 - contain loaded mask pixel data (if mask is used)
> + * v28, v29, v30, v31 - place for storing the result (destination pixels)
> + *
> + * As can be seen above, four 64-bit NEON registers are used for keeping
> + * intermediate pixel data and up to 8 pixels can be processed in one step
> + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
> + *
> + * This particular function uses the following registers allocation:
> + * v0, v1, v2, v3 - contain loaded source pixel data
> + * v4, v5 - contain loaded destination pixels (they are needed)
> + * v28, v29 - place for storing the result (destination pixels)
> + */
> +
> +/*
> + * Step one. We need to have some code to do some arithmetic on pixel data.
> + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
> + * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
> + * perform all the needed calculations and write the result to {v28, v29}.
> + * The rationale for having two macros and not just one will be explained
> + * later. In practice, any single monolithic function which does the work can
> + * be split into two parts in any arbitrary way without affecting correctness.
> + *
> + * There is one special trick here too. Common template macro can optionally
> + * make our life a bit easier by doing R, G, B, A color components
> + * deinterleaving for 32bpp pixel formats (and this feature is used in
> + * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
> + * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
> + * actually use v0 register for blue channel (a vector of eight 8-bit
> + * values), v1 register for green, v2 for red and v3 for alpha. This
> + * simple conversion can be also done with a few NEON instructions:
> + *
> + * Packed to planar conversion: // vuzp8 is a wrapper macro
> + * vuzp8 v0, v1
> + * vuzp8 v2, v3
> + * vuzp8 v1, v3
> + * vuzp8 v0, v2
> + *
> + * Planar to packed conversion: // vzip8 is a wrapper macro
> + * vzip8 v0, v2
> + * vzip8 v1, v3
> + * vzip8 v2, v3
> + * vzip8 v0, v1
> + *
> + * But pixels can also be loaded directly in planar format using the LD4 (.8b)
> + * NEON instruction. It is 1 cycle slower than LD1 (.2s), so this is not always
> + * desirable; that's why deinterleaving is optional.
> + *
> + * But anyway, here is the code:
> + */
> +
> +.macro pixman_composite_over_8888_0565_process_pixblock_head
> + /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
> + and put data into v6 - red, v7 - green, v30 - blue */
> + mov v4.d[1], v5.d[0]
> + shrn v6.8b, v4.8h, #8
> + shrn v7.8b, v4.8h, #3
> + sli v4.8h, v4.8h, #5
> + sri v6.8b, v6.8b, #5
> + mvn v3.8b, v3.8b /* invert source alpha */
> + sri v7.8b, v7.8b, #6
> + shrn v30.8b, v4.8h, #2
> + /* now do alpha blending, storing results in 8-bit planar format
> + into v20 - red, v23 - green, v22 - blue */
> + umull v10.8h, v3.8b, v6.8b
> + umull v11.8h, v3.8b, v7.8b
> + umull v12.8h, v3.8b, v30.8b
> + urshr v17.8h, v10.8h, #8
> + urshr v18.8h, v11.8h, #8
> + urshr v19.8h, v12.8h, #8
> + raddhn v20.8b, v10.8h, v17.8h
> + raddhn v23.8b, v11.8h, v18.8h
> + raddhn v22.8b, v12.8h, v19.8h
> +.endm
> +
> +.macro pixman_composite_over_8888_0565_process_pixblock_tail
> + /* ... continue alpha blending */
> + uqadd v17.8b, v2.8b, v20.8b
> + uqadd v18.8b, v0.8b, v22.8b
> + uqadd v19.8b, v1.8b, v23.8b
> + /* convert the result to r5g6b5 and store it into {v14} */
> + ushll v14.8h, v17.8b, #7
> + sli v14.8h, v14.8h, #1
> + ushll v8.8h, v19.8b, #7
> + sli v8.8h, v8.8h, #1
> + ushll v9.8h, v18.8b, #7
> + sli v9.8h, v9.8h, #1
> + sri v14.8h, v8.8h, #5
> + sri v14.8h, v9.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +/*
> + * OK, now we have almost everything that we need. Using the above two
> + * macros, the work can already be done correctly. But now we want to
> + * optimize it a bit. ARM Cortex-A8 is an in-order core, and benefits a
> + * lot from good code scheduling and software pipelining.
> + *
> + * Let's construct some code, which will run in the core main loop.
> + * Some pseudo-code of the main loop will look like this:
> + * head
> + * while (...) {
> + * tail
> + * head
> + * }
> + * tail
> + *
> + * It may look a bit weird, but this setup hides instruction
> + * latencies better and also utilizes the dual-issue capability more
> + * efficiently (pairing load-store and ALU instructions).
> + *
> + * So what we need now is a '*_tail_head' macro, which will be used
> + * in the core main loop. A trivial straightforward implementation
> + * of this macro would look like this:
> + *
> + * pixman_composite_over_8888_0565_process_pixblock_tail
> + * st1 {v28.4h, v29.4h}, [DST_W], #16
> + * ld1 {v4.4h, v5.4h}, [DST_R], #16
> + * ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
> + * pixman_composite_over_8888_0565_process_pixblock_head
> + * cache_preload 8, 8
> + *
> + * Now it has also gained some load/store instructions. We simply can't move
> + * from processing one block of pixels to the next with just arithmetic.
> + * The previously processed data needs to be written to memory and new
> + * data needs to be fetched. Fortunately, this main loop does not deal
> + * with partial leading/trailing pixels and can load/store a full block
> + * of pixels in a bulk. Additionally, destination buffer is already
> + * 16 bytes aligned here (which is good for performance).
> + *
> + * New things here are DST_R, DST_W, SRC and MASK identifiers. These
> + * are the aliases for ARM registers which are used as pointers for
> + * accessing data. We maintain separate pointers for reading and writing
> + * destination buffer (DST_R and DST_W).
> + *
> + * Another new thing is the 'cache_preload' macro. It is used for prefetching
> + * data into the CPU L2 cache and improves performance when dealing with
> + * images which are far larger than the cache size. It uses one argument
> + * (actually two, but they need to be the same here) - number of pixels
> + * in a block. Looking into 'pixman-arma64-neon-asm.h' can provide some
> + * details about this macro. Moreover, if good performance is needed
> + * the code from this macro needs to be copied into '*_tail_head' macro
> + * and mixed with the rest of code for optimal instructions scheduling.
> + * We are actually doing it below.
> + *
> + * Now after all the explanations, here is the optimized code.
> + * Different instruction streams (originating from '*_head', '*_tail'
> + * and 'cache_preload' macro) use different indentation levels for
> + * better readability. Actually taking the code from one of these
> + * indentation levels and ignoring a few LD/ST instructions would
> + * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
> + * macro!
> + */
> +
> +#if 1
> +
> +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
> + uqadd v17.8b, v2.8b, v20.8b
> + ld1 {v4.4h, v5.4h}, [DST_R], #16
> + mov v4.d[1], v5.d[0]
> + uqadd v18.8b, v0.8b, v22.8b
> + uqadd v19.8b, v1.8b, v23.8b
> + shrn v6.8b, v4.8h, #8
> + fetch_src_pixblock
> + shrn v7.8b, v4.8h, #3
> + sli v4.8h, v4.8h, #5
> + ushll v14.8h, v17.8b, #7
> + sli v14.8h, v14.8h, #1
> + PF add PF_X, PF_X, #8
> + ushll v8.8h, v19.8b, #7
> + sli v8.8h, v8.8h, #1
> + PF tst PF_CTL, #0xF
> + sri v6.8b, v6.8b, #5
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> +10:
> + mvn v3.8b, v3.8b
> + PF beq 10f
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + sri v7.8b, v7.8b, #6
> + shrn v30.8b, v4.8h, #2
> + umull v10.8h, v3.8b, v6.8b
> + PF lsl DUMMY, PF_X, #src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + umull v11.8h, v3.8b, v7.8b
> + umull v12.8h, v3.8b, v30.8b
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + sri v14.8h, v8.8h, #5
> + PF cmp PF_X, ORIG_W
> + ushll v9.8h, v18.8b, #7
> + sli v9.8h, v9.8h, #1
> + urshr v17.8h, v10.8h, #8
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + urshr v19.8h, v11.8h, #8
> + urshr v18.8h, v12.8h, #8
> + PF ble 10f
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + sri v14.8h, v9.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> +10:
> + raddhn v20.8b, v10.8h, v17.8h
> + raddhn v23.8b, v11.8h, v19.8h
> + PF ble 10f
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + raddhn v22.8b, v12.8h, v18.8h
> + st1 {v14.8h}, [DST_W], #16
> +.endm
> +
> +#else
> +
> +/* If we did not care much about the performance, we would just use this... */
> +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
> + pixman_composite_over_8888_0565_process_pixblock_tail
> + st1 {v14.8h}, [DST_W], #16
> + ld1 {v4.4h, v5.4h}, [DST_R], #16
> + fetch_src_pixblock
> + pixman_composite_over_8888_0565_process_pixblock_head
> + cache_preload 8, 8
> +.endm
> +
> +#endif
> +
> +/*
> + * And now the final part. We are using 'generate_composite_function' macro
> + * to put all the stuff together. We are specifying the name of the function
> + * which we want to get, number of bits per pixel for the source, mask and
> + * destination (0 if unused, like mask in this case). Next come some bit
> + * flags:
> + * FLAG_DST_READWRITE - tells that the destination buffer is both read
> + * and written, for write-only buffer we would use
> + * FLAG_DST_WRITEONLY flag instead
> + * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
> + * and separate color channels for 32bpp format.
> + * The next things are:
> + * - the number of pixels processed per iteration (8 in this case, because
> + * that's the maximum that can fit into four 64-bit NEON registers).
> + * - prefetch distance, measured in pixel blocks. In this case it is 5 blocks
> + * of 8 pixels, i.e. 40 pixels or up to 160 bytes. The optimal
> + * prefetch distance can be selected by running some benchmarks.
> + *
> + * After that we specify some macros: here these are 'default_init' and
> + * 'default_cleanup', which are empty (but it is possible to have custom
> + * init/cleanup macros to be able to save/restore some extra NEON registers
> + * like d8-d15 or do anything else), followed by
> + * 'pixman_composite_over_8888_0565_process_pixblock_head',
> + * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
> + * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
> + * which we got implemented above.
> + *
> + * The last part is the NEON registers allocation scheme.
> + */
> +generate_composite_function \
> + pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_over_8888_0565_process_pixblock_head, \
> + pixman_composite_over_8888_0565_process_pixblock_tail, \
> + pixman_composite_over_8888_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_0565_process_pixblock_head
> + /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
> + and put data into v6 - red, v7 - green, v30 - blue */
> + mov v4.d[1], v5.d[0]
> + shrn v6.8b, v4.8h, #8
> + shrn v7.8b, v4.8h, #3
> + sli v4.8h, v4.8h, #5
> + sri v6.8b, v6.8b, #5
> + sri v7.8b, v7.8b, #6
> + shrn v30.8b, v4.8h, #2
> + /* now do alpha blending, storing results in 8-bit planar format
> + into v20 - red, v23 - green, v22 - blue */
> + umull v10.8h, v3.8b, v6.8b
> + umull v11.8h, v3.8b, v7.8b
> + umull v12.8h, v3.8b, v30.8b
> + urshr v13.8h, v10.8h, #8
> + urshr v14.8h, v11.8h, #8
> + urshr v15.8h, v12.8h, #8
> + raddhn v20.8b, v10.8h, v13.8h
> + raddhn v23.8b, v11.8h, v14.8h
> + raddhn v22.8b, v12.8h, v15.8h
> +.endm
> +
> +.macro pixman_composite_over_n_0565_process_pixblock_tail
> + /* ... continue alpha blending */
> + uqadd v17.8b, v2.8b, v20.8b
> + uqadd v18.8b, v0.8b, v22.8b
> + uqadd v19.8b, v1.8b, v23.8b
> + /* convert the result to r5g6b5 and store it into {v14} */
> + ushll v14.8h, v17.8b, #7
> + sli v14.8h, v14.8h, #1
> + ushll v8.8h, v19.8b, #7
> + sli v8.8h, v8.8h, #1
> + ushll v9.8h, v18.8b, #7
> + sli v9.8h, v9.8h, #1
> + sri v14.8h, v8.8h, #5
> + sri v14.8h, v9.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_over_n_0565_process_pixblock_tail_head
> + pixman_composite_over_n_0565_process_pixblock_tail
> + ld1 {v4.4h, v5.4h}, [DST_R], #16
> + st1 {v14.8h}, [DST_W], #16
> + pixman_composite_over_n_0565_process_pixblock_head
> + cache_preload 8, 8
> +.endm
> +
> +.macro pixman_composite_over_n_0565_init
> + mov v3.s[0], w4
> + dup v0.8b, v3.b[0]
> + dup v1.8b, v3.b[1]
> + dup v2.8b, v3.b[2]
> + dup v3.8b, v3.b[3]
> + mvn v3.8b, v3.8b /* invert source alpha */
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_n_0565_init, \
> + default_cleanup, \
> + pixman_composite_over_n_0565_process_pixblock_head, \
> + pixman_composite_over_n_0565_process_pixblock_tail, \
> + pixman_composite_over_n_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_8888_0565_process_pixblock_head
> + ushll v8.8h, v1.8b, #7
> + sli v8.8h, v8.8h, #1
> + ushll v14.8h, v2.8b, #7
> + sli v14.8h, v14.8h, #1
> + ushll v9.8h, v0.8b, #7
> + sli v9.8h, v9.8h, #1
> +.endm
> +
> +.macro pixman_composite_src_8888_0565_process_pixblock_tail
> + sri v14.8h, v8.8h, #5
> + sri v14.8h, v9.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
> + sri v14.8h, v8.8h, #5
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0xF
> + fetch_src_pixblock
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + sri v14.8h, v9.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> + PF cmp PF_X, ORIG_W
> + PF lsl DUMMY, PF_X, #src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + ushll v8.8h, v1.8b, #7
> + sli v8.8h, v8.8h, #1
> + st1 {v14.8h}, [DST_W], #16
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + ushll v14.8h, v2.8b, #7
> + sli v14.8h, v14.8h, #1
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> +10:
> + ushll v9.8h, v0.8b, #7
> + sli v9.8h, v9.8h, #1
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_8888_0565_process_pixblock_head, \
> + pixman_composite_src_8888_0565_process_pixblock_tail, \
> + pixman_composite_src_8888_0565_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0565_8888_process_pixblock_head
> + mov v0.d[1], v1.d[0]
> + shrn v30.8b, v0.8h, #8
> + shrn v29.8b, v0.8h, #3
> + sli v0.8h, v0.8h, #5
> + movi v31.8b, #255
> + sri v30.8b, v30.8b, #5
> + sri v29.8b, v29.8b, #6
> + shrn v28.8b, v0.8h, #2
> +.endm
> +
> +.macro pixman_composite_src_0565_8888_process_pixblock_tail
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
> + pixman_composite_src_0565_8888_process_pixblock_tail
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + fetch_src_pixblock
> + pixman_composite_src_0565_8888_process_pixblock_head
> + cache_preload 8, 8
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_0565_8888_process_pixblock_head, \
> + pixman_composite_src_0565_8888_process_pixblock_tail, \
> + pixman_composite_src_0565_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8_8_process_pixblock_head
> + uqadd v28.8b, v0.8b, v4.8b
> + uqadd v29.8b, v1.8b, v5.8b
> + uqadd v30.8b, v2.8b, v6.8b
> + uqadd v31.8b, v3.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_add_8_8_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_add_8_8_process_pixblock_tail_head
> + fetch_src_pixblock
> + PF add PF_X, PF_X, #32
> + PF tst PF_CTL, #0xF
> + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + PF beq 10f
> + PF add PF_X, PF_X, #32
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + PF cmp PF_X, ORIG_W
> + PF lsl DUMMY, PF_X, #src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + uqadd v28.8b, v0.8b, v4.8b
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + uqadd v29.8b, v1.8b, v5.8b
> + uqadd v30.8b, v2.8b, v6.8b
> + uqadd v31.8b, v3.8b, v7.8b
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
> + FLAG_DST_READWRITE, \
> + 32, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_add_8_8_process_pixblock_head, \
> + pixman_composite_add_8_8_process_pixblock_tail, \
> + pixman_composite_add_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
> + fetch_src_pixblock
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0xF
> + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + PF cmp PF_X, ORIG_W
> + PF lsl DUMMY, PF_X, #src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + uqadd v28.8b, v0.8b, v4.8b
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + uqadd v29.8b, v1.8b, v5.8b
> + uqadd v30.8b, v2.8b, v6.8b
> + uqadd v31.8b, v3.8b, v7.8b
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_add_8_8_process_pixblock_head, \
> + pixman_composite_add_8_8_process_pixblock_tail, \
> + pixman_composite_add_8888_8888_process_pixblock_tail_head
> +
> +generate_composite_function_single_scanline \
> + pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_add_8_8_process_pixblock_head, \
> + pixman_composite_add_8_8_process_pixblock_tail, \
> + pixman_composite_add_8888_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
> + mvn v24.8b, v3.8b /* get inverted alpha */
> + /* do alpha blending */
> + umull v8.8h, v24.8b, v4.8b
> + umull v9.8h, v24.8b, v5.8b
> + umull v10.8h, v24.8b, v6.8b
> + umull v11.8h, v24.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
> + urshr v14.8h, v8.8h, #8
> + urshr v15.8h, v9.8h, #8
> + urshr v16.8h, v10.8h, #8
> + urshr v17.8h, v11.8h, #8
> + raddhn v28.8b, v14.8h, v8.8h
> + raddhn v29.8b, v15.8h, v9.8h
> + raddhn v30.8b, v16.8h, v10.8h
> + raddhn v31.8b, v17.8h, v11.8h
> +.endm
> +
> +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + urshr v14.8h, v8.8h, #8
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0xF
> + urshr v15.8h, v9.8h, #8
> + urshr v16.8h, v10.8h, #8
> + urshr v17.8h, v11.8h, #8
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + raddhn v28.8b, v14.8h, v8.8h
> + raddhn v29.8b, v15.8h, v9.8h
> + PF cmp PF_X, ORIG_W
> + raddhn v30.8b, v16.8h, v10.8h
> + raddhn v31.8b, v17.8h, v11.8h
> + fetch_src_pixblock
> + PF lsl DUMMY, PF_X, #src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + mvn v22.8b, v3.8b
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + umull v8.8h, v22.8b, v4.8b
> + PF ble 10f
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + umull v9.8h, v22.8b, v5.8b
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> +10:
> + umull v10.8h, v22.8b, v6.8b
> + PF ble 10f
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + umull v11.8h, v22.8b, v7.8b
> +.endm
> +
> +generate_composite_function_single_scanline \
> + pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
> + pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
> + pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_8888_process_pixblock_head
> + pixman_composite_out_reverse_8888_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_over_8888_8888_process_pixblock_tail
> + pixman_composite_out_reverse_8888_8888_process_pixblock_tail
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + urshr v14.8h, v8.8h, #8
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0xF
> + urshr v15.8h, v9.8h, #8
> + urshr v16.8h, v10.8h, #8
> + urshr v17.8h, v11.8h, #8
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + raddhn v28.8b, v14.8h, v8.8h
> + raddhn v29.8b, v15.8h, v9.8h
> + PF cmp PF_X, ORIG_W
> + raddhn v30.8b, v16.8h, v10.8h
> + raddhn v31.8b, v17.8h, v11.8h
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> + fetch_src_pixblock
> + PF lsl DUMMY, PF_X, #src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + mvn v22.8b, v3.8b
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + umull v8.8h, v22.8b, v4.8b
> + PF ble 10f
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + umull v9.8h, v22.8b, v5.8b
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> +10:
> + umull v10.8h, v22.8b, v6.8b
> + PF ble 10f
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + umull v11.8h, v22.8b, v7.8b
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_over_8888_8888_process_pixblock_head, \
> + pixman_composite_over_8888_8888_process_pixblock_tail, \
> + pixman_composite_over_8888_8888_process_pixblock_tail_head
> +
> +generate_composite_function_single_scanline \
> + pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_over_8888_8888_process_pixblock_head, \
> + pixman_composite_over_8888_8888_process_pixblock_tail, \
> + pixman_composite_over_8888_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8888_process_pixblock_head
> + /* deinterleaved source pixels in {v0, v1, v2, v3} */
> + /* inverted alpha in {v24} */
> + /* destination pixels in {v4, v5, v6, v7} */
> + umull v8.8h, v24.8b, v4.8b
> + umull v9.8h, v24.8b, v5.8b
> + umull v10.8h, v24.8b, v6.8b
> + umull v11.8h, v24.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_process_pixblock_tail
> + urshr v14.8h, v8.8h, #8
> + urshr v15.8h, v9.8h, #8
> + urshr v16.8h, v10.8h, #8
> + urshr v17.8h, v11.8h, #8
> + raddhn v28.8b, v14.8h, v8.8h
> + raddhn v29.8b, v15.8h, v9.8h
> + raddhn v30.8b, v16.8h, v10.8h
> + raddhn v31.8b, v17.8h, v11.8h
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_process_pixblock_tail_head
> + urshr v14.8h, v8.8h, #8
> + urshr v15.8h, v9.8h, #8
> + urshr v16.8h, v10.8h, #8
> + urshr v17.8h, v11.8h, #8
> + raddhn v28.8b, v14.8h, v8.8h
> + raddhn v29.8b, v15.8h, v9.8h
> + raddhn v30.8b, v16.8h, v10.8h
> + raddhn v31.8b, v17.8h, v11.8h
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + uqadd v28.8b, v0.8b, v28.8b
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0x0F
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> + PF cmp PF_X, ORIG_W
> + umull v8.8h, v24.8b, v4.8b
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + umull v9.8h, v24.8b, v5.8b
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + umull v10.8h, v24.8b, v6.8b
> + PF subs PF_CTL, PF_CTL, #0x10
> + umull v11.8h, v24.8b, v7.8b
> + PF ble 10f
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_over_n_8888_init
> + mov v3.s[0], w4
> + dup v0.8b, v3.b[0]
> + dup v1.8b, v3.b[1]
> + dup v2.8b, v3.b[2]
> + dup v3.8b, v3.b[3]
> + mvn v24.8b, v3.8b /* get inverted alpha */
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_n_8888_init, \
> + default_cleanup, \
> + pixman_composite_over_8888_8888_process_pixblock_head, \
> + pixman_composite_over_8888_8888_process_pixblock_tail, \
> + pixman_composite_over_n_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
> + urshr v14.8h, v8.8h, #8
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0xF
> + urshr v15.8h, v9.8h, #8
> + urshr v12.8h, v10.8h, #8
> + urshr v13.8h, v11.8h, #8
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + raddhn v28.8b, v14.8h, v8.8h
> + raddhn v29.8b, v15.8h, v9.8h
> + PF cmp PF_X, ORIG_W
> + raddhn v30.8b, v12.8h, v10.8h
> + raddhn v31.8b, v13.8h, v11.8h
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> + ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
> + mvn v22.8b, v3.8b
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + PF blt 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + umull v8.8h, v22.8b, v4.8b
> + PF blt 10f
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + umull v9.8h, v22.8b, v5.8b
> + umull v10.8h, v22.8b, v6.8b
> + PF blt 10f
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + umull v11.8h, v22.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_reverse_n_8888_init
> + mov v7.s[0], w4
> + dup v4.8b, v7.b[0]
> + dup v5.8b, v7.b[1]
> + dup v6.8b, v7.b[2]
> + dup v7.8b, v7.b[3]
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_reverse_n_8888_init, \
> + default_cleanup, \
> + pixman_composite_over_8888_8888_process_pixblock_head, \
> + pixman_composite_over_8888_8888_process_pixblock_tail, \
> + pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 4, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_8_0565_process_pixblock_head
> + umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */
> + umull v1.8h, v24.8b, v9.8b
> + umull v2.8h, v24.8b, v10.8b
> + umull v3.8h, v24.8b, v11.8b
> + mov v4.d[1], v5.d[0]
> + shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
> + shrn v26.8b, v4.8h, #3
> + sli v4.8h, v4.8h, #5
> + urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */
> + urshr v18.8h, v1.8h, #8
> + urshr v19.8h, v2.8h, #8
> + urshr v20.8h, v3.8h, #8
> + raddhn v0.8b, v0.8h, v17.8h
> + raddhn v1.8b, v1.8h, v18.8h
> + raddhn v2.8b, v2.8h, v19.8h
> + raddhn v3.8b, v3.8h, v20.8h
> + sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
> + sri v26.8b, v26.8b, #6
> + mvn v3.8b, v3.8b
> + shrn v30.8b, v4.8h, #2
> + umull v18.8h, v3.8b, v25.8b /* now do alpha blending */
> + umull v19.8h, v3.8b, v26.8b
> + umull v20.8h, v3.8b, v30.8b
> +.endm
> +
> +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
> + /* 3 cycle bubble (after vmull.u8) */
> + urshr v5.8h, v18.8h, #8
> + urshr v6.8h, v19.8h, #8
> + urshr v7.8h, v20.8h, #8
> + raddhn v17.8b, v18.8h, v5.8h
> + raddhn v19.8b, v19.8h, v6.8h
> + raddhn v18.8b, v20.8h, v7.8h
> + uqadd v5.8b, v2.8b, v17.8b
> + /* 1 cycle bubble */
> + uqadd v6.8b, v0.8b, v18.8b
> + uqadd v7.8b, v1.8b, v19.8b
> + ushll v14.8h, v5.8b, #7 /* convert to 16bpp */
> + sli v14.8h, v14.8h, #1
> + ushll v18.8h, v7.8b, #7
> + sli v18.8h, v18.8h, #1
> + ushll v19.8h, v6.8b, #7
> + sli v19.8h, v19.8h, #1
> + sri v14.8h, v18.8h, #5
> + /* 1 cycle bubble */
> + sri v14.8h, v19.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
> +#if 0
> + ld1 {v4.8h}, [DST_R], #16
> + shrn v25.8b, v4.8h, #8
> + fetch_mask_pixblock
> + shrn v26.8b, v4.8h, #3
> + fetch_src_pixblock
> + umull v22.8h, v24.8b, v10.8b
> + urshr v13.8h, v18.8h, #8
> + urshr v11.8h, v19.8h, #8
> + urshr v15.8h, v20.8h, #8
> + raddhn v17.8b, v18.8h, v13.8h
> + raddhn v19.8b, v19.8h, v11.8h
> + raddhn v18.8b, v20.8h, v15.8h
> + uqadd v17.8b, v2.8b, v17.8b
> + umull v21.8h, v24.8b, v9.8b
> + uqadd v18.8b, v0.8b, v18.8b
> + uqadd v19.8b, v1.8b, v19.8b
> + ushll v14.8h, v17.8b, #7
> + sli v14.8h, v14.8h, #1
> + umull v20.8h, v24.8b, v8.8b
> + ushll v18.8h, v18.8b, #7
> + sli v18.8h, v18.8h, #1
> + ushll v19.8h, v19.8b, #7
> + sli v19.8h, v19.8h, #1
> + sri v14.8h, v18.8h, #5
> + umull v23.8h, v24.8b, v11.8b
> + sri v14.8h, v19.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +
> + cache_preload 8, 8
> +
> + sli v4.8h, v4.8h, #5
> + urshr v16.8h, v20.8h, #8
> + urshr v17.8h, v21.8h, #8
> + urshr v18.8h, v22.8h, #8
> + urshr v19.8h, v23.8h, #8
> + raddhn v0.8b, v20.8h, v16.8h
> + raddhn v1.8b, v21.8h, v17.8h
> + raddhn v2.8b, v22.8h, v18.8h
> + raddhn v3.8b, v23.8h, v19.8h
> + sri v25.8b, v25.8b, #5
> + sri v26.8b, v26.8b, #6
> + mvn v3.8b, v3.8b
> + shrn v30.8b, v4.8h, #2
> + st1 {v14.8h}, [DST_W], #16
> + umull v18.8h, v3.8b, v25.8b
> + umull v19.8h, v3.8b, v26.8b
> + umull v20.8h, v3.8b, v30.8b
> +#else
> + pixman_composite_over_8888_8_0565_process_pixblock_tail
> + st1 {v28.4h, v29.4h}, [DST_W], #16
> + ld1 {v4.4h, v5.4h}, [DST_R], #16
> + fetch_mask_pixblock
> + fetch_src_pixblock
> + pixman_composite_over_8888_8_0565_process_pixblock_head
> +#endif
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_over_8888_8_0565_process_pixblock_head, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +/*
> + * This function needs a special initialization of solid mask.
> + * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
> + * offset, split into color components and replicated in d8-d11
> + * registers. Additionally, this function needs all the NEON registers,
> + * so it has to save d8-d15 registers which are callee saved according
> + * to ABI. These registers are restored from 'cleanup' macro. All the
> + * other NEON registers are caller saved, so can be clobbered freely
> + * without introducing any problems.
> + */
> +.macro pixman_composite_over_n_8_0565_init
> + mov v11.s[0], w4
> + dup v8.8b, v11.b[0]
> + dup v9.8b, v11.b[1]
> + dup v10.8b, v11.b[2]
> + dup v11.8b, v11.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_n_8_0565_init, \
> + pixman_composite_over_n_8_0565_cleanup, \
> + pixman_composite_over_8888_8_0565_process_pixblock_head, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_n_0565_init
> + mov v24.s[0], w6
> + dup v24.8b, v24.b[3]
> +.endm
> +
> +.macro pixman_composite_over_8888_n_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_8888_n_0565_init, \
> + pixman_composite_over_8888_n_0565_cleanup, \
> + pixman_composite_over_8888_8_0565_process_pixblock_head, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0565_0565_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_0565_0565_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
> + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
> + fetch_src_pixblock
> + cache_preload 16, 16
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
> + FLAG_DST_WRITEONLY, \
> + 16, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_0565_0565_process_pixblock_head, \
> + pixman_composite_src_0565_0565_process_pixblock_tail, \
> + pixman_composite_src_0565_0565_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_n_8_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_n_8_process_pixblock_tail_head
> + st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_src_n_8_init
> + mov v0.s[0], w4
> + dup v3.8b, v0.b[0]
> + dup v2.8b, v0.b[0]
> + dup v1.8b, v0.b[0]
> + dup v0.8b, v0.b[0]
> +.endm
> +
> +.macro pixman_composite_src_n_8_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
> + FLAG_DST_WRITEONLY, \
> + 32, /* number of pixels, processed in a single block */ \
> + 0, /* prefetch distance */ \
> + pixman_composite_src_n_8_init, \
> + pixman_composite_src_n_8_cleanup, \
> + pixman_composite_src_n_8_process_pixblock_head, \
> + pixman_composite_src_n_8_process_pixblock_tail, \
> + pixman_composite_src_n_8_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_0565_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_n_0565_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_n_0565_process_pixblock_tail_head
> + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_src_n_0565_init
> + mov v0.s[0], w4
> + dup v3.4h, v0.h[0]
> + dup v2.4h, v0.h[0]
> + dup v1.4h, v0.h[0]
> + dup v0.4h, v0.h[0]
> +.endm
> +
> +.macro pixman_composite_src_n_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
> + FLAG_DST_WRITEONLY, \
> + 16, /* number of pixels, processed in a single block */ \
> + 0, /* prefetch distance */ \
> + pixman_composite_src_n_0565_init, \
> + pixman_composite_src_n_0565_cleanup, \
> + pixman_composite_src_n_0565_process_pixblock_head, \
> + pixman_composite_src_n_0565_process_pixblock_tail, \
> + pixman_composite_src_n_0565_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_n_8888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_n_8888_process_pixblock_tail_head
> + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_src_n_8888_init
> + mov v0.s[0], w4
> + dup v3.2s, v0.s[0]
> + dup v2.2s, v0.s[0]
> + dup v1.2s, v0.s[0]
> + dup v0.2s, v0.s[0]
> +.endm
> +
> +.macro pixman_composite_src_n_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
> + FLAG_DST_WRITEONLY, \
> + 8, /* number of pixels, processed in a single block */ \
> + 0, /* prefetch distance */ \
> + pixman_composite_src_n_8888_init, \
> + pixman_composite_src_n_8888_cleanup, \
> + pixman_composite_src_n_8888_process_pixblock_head, \
> + pixman_composite_src_n_8888_process_pixblock_tail, \
> + pixman_composite_src_n_8888_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_8888_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_8888_8888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
> + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
> + fetch_src_pixblock
> + cache_preload 8, 8
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_WRITEONLY, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_8888_8888_process_pixblock_head, \
> + pixman_composite_src_8888_8888_process_pixblock_tail, \
> + pixman_composite_src_8888_8888_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_x888_8888_process_pixblock_head
> + orr v0.8b, v0.8b, v4.8b
> + orr v1.8b, v1.8b, v4.8b
> + orr v2.8b, v2.8b, v4.8b
> + orr v3.8b, v3.8b, v4.8b
> +.endm
> +
> +.macro pixman_composite_src_x888_8888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
> + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
> + fetch_src_pixblock
> + orr v0.8b, v0.8b, v4.8b
> + orr v1.8b, v1.8b, v4.8b
> + orr v2.8b, v2.8b, v4.8b
> + orr v3.8b, v3.8b, v4.8b
> + cache_preload 8, 8
> +.endm
> +
> +.macro pixman_composite_src_x888_8888_init
> + mov w20, #0xFF
> + dup v4.8b, w20
> + shl v4.2s, v4.2s, #24
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_WRITEONLY, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + pixman_composite_src_x888_8888_init, \
> + default_cleanup, \
> + pixman_composite_src_x888_8888_process_pixblock_head, \
> + pixman_composite_src_x888_8888_process_pixblock_tail, \
> + pixman_composite_src_x888_8888_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8_8888_process_pixblock_head
> + /* expecting solid source in {v0, v1, v2, v3} */
> + /* mask is in v24 (v25, v26, v27 are unused) */
> +
> + /* in */
> + umull v8.8h, v24.8b, v0.8b
> + umull v9.8h, v24.8b, v1.8b
> + umull v10.8h, v24.8b, v2.8b
> + umull v11.8h, v24.8b, v3.8b
> + ursra v8.8h, v8.8h, #8
> + ursra v9.8h, v9.8h, #8
> + ursra v10.8h, v10.8h, #8
> + ursra v11.8h, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_process_pixblock_tail
> + rshrn v28.8b, v8.8h, #8
> + rshrn v29.8b, v9.8h, #8
> + rshrn v30.8b, v10.8h, #8
> + rshrn v31.8b, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
> + fetch_mask_pixblock
> + PF add PF_X, PF_X, #8
> + rshrn v28.8b, v8.8h, #8
> + PF tst PF_CTL, #0x0F
> + rshrn v29.8b, v9.8h, #8
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> +10:
> + rshrn v30.8b, v10.8h, #8
> + PF beq 10f
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + rshrn v31.8b, v11.8h, #8
> + PF cmp PF_X, ORIG_W
> + umull v8.8h, v24.8b, v0.8b
> + PF lsl DUMMY, PF_X, #mask_bpp_shift
> + PF prfm pldl2strm, [PF_MASK, DUMMY]
> + umull v9.8h, v24.8b, v1.8b
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + umull v10.8h, v24.8b, v2.8b
> + PF ble 10f
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + umull v11.8h, v24.8b, v3.8b
> + PF ble 10f
> + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> + PF ldrsb DUMMY, [PF_MASK, DUMMY]
> + PF add PF_MASK, PF_MASK, #1
> +10:
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + ursra v8.8h, v8.8h, #8
> + ursra v9.8h, v9.8h, #8
> + ursra v10.8h, v10.8h, #8
> + ursra v11.8h, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_init
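> + /* w4 carries the solid source colour: move it into v3.s[0] and */
> + /* broadcast each of its four bytes into a per-channel vector */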
> + mov v3.s[0], w4
> + dup v0.8b, v3.b[0]
> + dup v1.8b, v3.b[1]
> + dup v2.8b, v3.b[2]
> + dup v3.8b, v3.b[3]
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_src_n_8_8888_init, \
> + pixman_composite_src_n_8_8888_cleanup, \
> + pixman_composite_src_n_8_8888_process_pixblock_head, \
> + pixman_composite_src_n_8_8888_process_pixblock_tail, \
> + pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8_8_process_pixblock_head
> + umull v0.8h, v24.8b, v16.8b
> + umull v1.8h, v25.8b, v16.8b
> + umull v2.8h, v26.8b, v16.8b
> + umull v3.8h, v27.8b, v16.8b
> + ursra v0.8h, v0.8h, #8
> + ursra v1.8h, v1.8h, #8
> + ursra v2.8h, v2.8h, #8
> + ursra v3.8h, v3.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_process_pixblock_tail
> + rshrn v28.8b, v0.8h, #8
> + rshrn v29.8b, v1.8h, #8
> + rshrn v30.8b, v2.8h, #8
> + rshrn v31.8b, v3.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
> + fetch_mask_pixblock
> + PF add PF_X, PF_X, #8
> + rshrn v28.8b, v0.8h, #8
> + PF tst PF_CTL, #0x0F
> + rshrn v29.8b, v1.8h, #8
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> +10:
> + rshrn v30.8b, v2.8h, #8
> + PF beq 10f
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + rshrn v31.8b, v3.8h, #8
> + PF cmp PF_X, ORIG_W
> + umull v0.8h, v24.8b, v16.8b
> + PF lsl DUMMY, PF_X, mask_bpp_shift
> + PF prfm pldl2strm, [PF_MASK, DUMMY]
> + umull v1.8h, v25.8b, v16.8b
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + umull v2.8h, v26.8b, v16.8b
> + PF ble 10f
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + umull v3.8h, v27.8b, v16.8b
> + PF ble 10f
> + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> + PF ldrsb DUMMY, [PF_MASK, DUMMY]
> + PF add PF_MASK, PF_MASK, #1
> +10:
> + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + ursra v0.8h, v0.8h, #8
> + ursra v1.8h, v1.8h, #8
> + ursra v2.8h, v2.8h, #8
> + ursra v3.8h, v3.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_init
> + mov v16.s[0], w4
> + dup v16.8b, v16.b[3]
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
> + FLAG_DST_WRITEONLY, \
> + 32, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_src_n_8_8_init, \
> + pixman_composite_src_n_8_8_cleanup, \
> + pixman_composite_src_n_8_8_process_pixblock_head, \
> + pixman_composite_src_n_8_8_process_pixblock_tail, \
> + pixman_composite_src_n_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8_8888_process_pixblock_head
> + /* expecting deinterleaved source data in {v8, v9, v10, v11} */
> + /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
> + /* and destination data in {v4, v5, v6, v7} */
> + /* mask is in v24 (v25, v26, v27 are unused) */
> +
> + /* in */
> + umull v12.8h, v24.8b, v8.8b
> + umull v13.8h, v24.8b, v9.8b
> + umull v14.8h, v24.8b, v10.8b
> + umull v15.8h, v24.8b, v11.8b
> + urshr v16.8h, v12.8h, #8
> + urshr v17.8h, v13.8h, #8
> + urshr v18.8h, v14.8h, #8
> + urshr v19.8h, v15.8h, #8
> + raddhn v0.8b, v12.8h, v16.8h
> + raddhn v1.8b, v13.8h, v17.8h
> + raddhn v2.8b, v14.8h, v18.8h
> + raddhn v3.8b, v15.8h, v19.8h
> + mvn v25.8b, v3.8b /* get inverted alpha */
> + /* source: v0 - blue, v1 - green, v2 - red, v3 - alpha */
> + /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
> + /* now do alpha blending */
> + umull v12.8h, v25.8b, v4.8b
> + umull v13.8h, v25.8b, v5.8b
> + umull v14.8h, v25.8b, v6.8b
> + umull v15.8h, v25.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_process_pixblock_tail
> + urshr v16.8h, v12.8h, #8
> + urshr v17.8h, v13.8h, #8
> + urshr v18.8h, v14.8h, #8
> + urshr v19.8h, v15.8h, #8
> + raddhn v28.8b, v16.8h, v12.8h
> + raddhn v29.8b, v17.8h, v13.8h
> + raddhn v30.8b, v18.8h, v14.8h
> + raddhn v31.8b, v19.8h, v15.8h
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
> + urshr v16.8h, v12.8h, #8
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + urshr v17.8h, v13.8h, #8
> + fetch_mask_pixblock
> + urshr v18.8h, v14.8h, #8
> + PF add PF_X, PF_X, #8
> + urshr v19.8h, v15.8h, #8
> + PF tst PF_CTL, #0x0F
> + raddhn v28.8b, v16.8h, v12.8h
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> +10:
> + raddhn v29.8b, v17.8h, v13.8h
> + PF beq 10f
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + raddhn v30.8b, v18.8h, v14.8h
> + PF cmp PF_X, ORIG_W
> + raddhn v31.8b, v19.8h, v15.8h
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> + umull v16.8h, v24.8b, v8.8b
> + PF lsl DUMMY, PF_X, #mask_bpp_shift
> + PF prfm pldl2strm, [PF_MASK, DUMMY]
> + umull v17.8h, v24.8b, v9.8b
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> +10:
> + umull v18.8h, v24.8b, v10.8b
> + PF ble 10f
> + PF subs PF_CTL, PF_CTL, #0x10
> +10:
> + umull v19.8h, v24.8b, v11.8b
> + PF ble 10f
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +10:
> + uqadd v28.8b, v0.8b, v28.8b
> + PF ble 10f
> + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> + PF ldrsb DUMMY, [PF_MASK, DUMMY]
> + PF add PF_MASK, PF_MASK, #1
> +10:
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> + urshr v12.8h, v16.8h, #8
> + urshr v13.8h, v17.8h, #8
> + urshr v14.8h, v18.8h, #8
> + urshr v15.8h, v19.8h, #8
> + raddhn v0.8b, v16.8h, v12.8h
> + raddhn v1.8b, v17.8h, v13.8h
> + raddhn v2.8b, v18.8h, v14.8h
> + raddhn v3.8b, v19.8h, v15.8h
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + mvn v25.8b, v3.8b
> + umull v12.8h, v25.8b, v4.8b
> + umull v13.8h, v25.8b, v5.8b
> + umull v14.8h, v25.8b, v6.8b
> + umull v15.8h, v25.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_init
> + mov v11.s[0], w4
> + dup v8.8b, v11.b[0]
> + dup v9.8b, v11.b[1]
> + dup v10.8b, v11.b[2]
> + dup v11.8b, v11.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_n_8_8888_init, \
> + pixman_composite_over_n_8_8888_cleanup, \
> + pixman_composite_over_n_8_8888_process_pixblock_head, \
> + pixman_composite_over_n_8_8888_process_pixblock_tail, \
> + pixman_composite_over_n_8_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8_8_process_pixblock_head
> + umull v0.8h, v24.8b, v8.8b
> + umull v1.8h, v25.8b, v8.8b
> + umull v2.8h, v26.8b, v8.8b
> + umull v3.8h, v27.8b, v8.8b
> + urshr v10.8h, v0.8h, #8
> + urshr v11.8h, v1.8h, #8
> + urshr v12.8h, v2.8h, #8
> + urshr v13.8h, v3.8h, #8
> + raddhn v0.8b, v0.8h, v10.8h
> + raddhn v1.8b, v1.8h, v11.8h
> + raddhn v2.8b, v2.8h, v12.8h
> + raddhn v3.8b, v3.8h, v13.8h
> + mvn v24.8b, v0.8b
> + mvn v25.8b, v1.8b
> + mvn v26.8b, v2.8b
> + mvn v27.8b, v3.8b
> + umull v10.8h, v24.8b, v4.8b
> + umull v11.8h, v25.8b, v5.8b
> + umull v12.8h, v26.8b, v6.8b
> + umull v13.8h, v27.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8_process_pixblock_tail
> + urshr v14.8h, v10.8h, #8
> + urshr v15.8h, v11.8h, #8
> + urshr v16.8h, v12.8h, #8
> + urshr v17.8h, v13.8h, #8
> + raddhn v28.8b, v14.8h, v10.8h
> + raddhn v29.8b, v15.8h, v11.8h
> + raddhn v30.8b, v16.8h, v12.8h
> + raddhn v31.8b, v17.8h, v13.8h
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
> + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + pixman_composite_over_n_8_8_process_pixblock_tail
> + fetch_mask_pixblock
> + cache_preload 32, 32
> + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + pixman_composite_over_n_8_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_over_n_8_8_init
> + mov v8.s[0], w4
> + dup v8.8b, v8.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
> + FLAG_DST_READWRITE, \
> + 32, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_n_8_8_init, \
> + pixman_composite_over_n_8_8_cleanup, \
> + pixman_composite_over_n_8_8_process_pixblock_head, \
> + pixman_composite_over_n_8_8_process_pixblock_tail, \
> + pixman_composite_over_n_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
> + /*
> + * 'combine_mask_ca' replacement
> + *
> + * input: solid src (n) in {v8, v9, v10, v11}
> + * dest in {v4, v5, v6, v7 }
> + * mask in {v24, v25, v26, v27}
> + * output: updated src in {v0, v1, v2, v3 }
> + * updated mask in {v24, v25, v26, v3 }
> + */
> + umull v0.8h, v24.8b, v8.8b
> + umull v1.8h, v25.8b, v9.8b
> + umull v2.8h, v26.8b, v10.8b
> + umull v3.8h, v27.8b, v11.8b
> + umull v12.8h, v11.8b, v25.8b
> + umull v13.8h, v11.8b, v24.8b
> + umull v14.8h, v11.8b, v26.8b
> + urshr v15.8h, v0.8h, #8
> + urshr v16.8h, v1.8h, #8
> + urshr v17.8h, v2.8h, #8
> + raddhn v0.8b, v0.8h, v15.8h
> + raddhn v1.8b, v1.8h, v16.8h
> + raddhn v2.8b, v2.8h, v17.8h
> + urshr v15.8h, v13.8h, #8
> + urshr v16.8h, v12.8h, #8
> + urshr v17.8h, v14.8h, #8
> + urshr v18.8h, v3.8h, #8
> + raddhn v24.8b, v13.8h, v15.8h
> + raddhn v25.8b, v12.8h, v16.8h
> + raddhn v26.8b, v14.8h, v17.8h
> + raddhn v3.8b, v3.8h, v18.8h
> + /*
> + * 'combine_over_ca' replacement
> + *
> + * output: updated dest in {v28, v29, v30, v31}
> + */
> + mvn v24.8b, v24.8b
> + mvn v25.8b, v25.8b
> + mvn v26.8b, v26.8b
> + mvn v27.8b, v3.8b
> + umull v12.8h, v24.8b, v4.8b
> + umull v13.8h, v25.8b, v5.8b
> + umull v14.8h, v26.8b, v6.8b
> + umull v15.8h, v27.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
> + /* ... continue 'combine_over_ca' replacement */
> + urshr v16.8h, v12.8h, #8
> + urshr v17.8h, v13.8h, #8
> + urshr v18.8h, v14.8h, #8
> + urshr v19.8h, v15.8h, #8
> + raddhn v28.8b, v16.8h, v12.8h
> + raddhn v29.8b, v17.8h, v13.8h
> + raddhn v30.8b, v18.8h, v14.8h
> + raddhn v31.8b, v19.8h, v15.8h
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
> + urshr v16.8h, v12.8h, #8
> + urshr v17.8h, v13.8h, #8
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + urshr v18.8h, v14.8h, #8
> + urshr v19.8h, v15.8h, #8
> + raddhn v28.8b, v16.8h, v12.8h
> + raddhn v29.8b, v17.8h, v13.8h
> + raddhn v30.8b, v18.8h, v14.8h
> + raddhn v31.8b, v19.8h, v15.8h
> + fetch_mask_pixblock
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> + cache_preload 8, 8
> + pixman_composite_over_n_8888_8888_ca_process_pixblock_head
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_init
> + mov v13.s[0], w4
> + dup v8.8b, v13.b[0]
> + dup v9.8b, v13.b[1]
> + dup v10.8b, v13.b[2]
> + dup v11.8b, v13.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_n_8888_8888_ca_init, \
> + pixman_composite_over_n_8888_8888_ca_cleanup, \
> + pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
> + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
> + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
> + /*
> + * 'combine_mask_ca' replacement
> + *
> + * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
> + * mask in {v24, v25, v26} [B, G, R]
> + * output: updated src in {v0, v1, v2 } [B, G, R]
> + * updated mask in {v24, v25, v26} [B, G, R]
> + */
> + umull v0.8h, v24.8b, v8.8b
> + umull v1.8h, v25.8b, v9.8b
> + umull v2.8h, v26.8b, v10.8b
> + umull v12.8h, v11.8b, v24.8b
> + umull v13.8h, v11.8b, v25.8b
> + umull v14.8h, v11.8b, v26.8b
> + urshr v15.8h, v0.8h, #8
> + urshr v16.8h, v1.8h, #8
> + urshr v17.8h, v2.8h, #8
> + raddhn v0.8b, v0.8h, v15.8h
> + raddhn v1.8b, v1.8h, v16.8h
> + raddhn v2.8b, v2.8h, v17.8h
> + urshr v19.8h, v12.8h, #8
> + urshr v20.8h, v13.8h, #8
> + urshr v21.8h, v14.8h, #8
> + raddhn v24.8b, v12.8h, v19.8h
> + raddhn v25.8b, v13.8h, v20.8h
> + /*
> + * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
> + * and put data into v16 - blue, v17 - green, v18 - red
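> + * (shrn #3/#8/#2 move each colour field to the top of a byte; the sri
> + * instructions, together with the sli #5 before the blue shrn, then
> + * replicate each field's high bits into its low bits, so a maximal
> + * 5- or 6-bit value expands to 0xff rather than 0xf8/0xfc)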
> + */
> + mov v4.d[1], v5.d[0]
> + shrn v17.8b, v4.8h, #3
> + shrn v18.8b, v4.8h, #8
> + raddhn v26.8b, v14.8h, v21.8h
> + sli v4.8h, v4.8h, #5
> + sri v18.8b, v18.8b, #5
> + sri v17.8b, v17.8b, #6
> + /*
> + * 'combine_over_ca' replacement
> + *
> + * output: updated dest in v16 - blue, v17 - green, v18 - red
> + */
> + mvn v24.8b, v24.8b
> + mvn v25.8b, v25.8b
> + shrn v16.8b, v4.8h, #2
> + mvn v26.8b, v26.8b
> + umull v5.8h, v16.8b, v24.8b
> + umull v6.8h, v17.8b, v25.8b
> + umull v7.8h, v18.8b, v26.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
> + /* ... continue 'combine_over_ca' replacement */
> + urshr v13.8h, v5.8h, #8
> + urshr v14.8h, v6.8h, #8
> + urshr v15.8h, v7.8h, #8
> + raddhn v16.8b, v13.8h, v5.8h
> + raddhn v17.8b, v14.8h, v6.8h
> + raddhn v18.8b, v15.8h, v7.8h
> + uqadd v16.8b, v0.8b, v16.8b
> + uqadd v17.8b, v1.8b, v17.8b
> + uqadd v18.8b, v2.8b, v18.8b
> + /*
> + * convert the results in v16, v17, v18 to r5g6b5 and store
> + * them into {v14}
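> + * (ushll #7 plus sli #1 place each 8-bit channel at bits 15:8 of a
> + * halfword; sri #5 then inserts green below red and sri #11 inserts
> + * blue below green, keeping only the top 5/6/5 bits of each channel)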
> + */
> + ushll v14.8h, v18.8b, #7
> + sli v14.8h, v14.8h, #1
> + ushll v12.8h, v17.8b, #7
> + sli v12.8h, v12.8h, #1
> + ushll v13.8h, v16.8b, #7
> + sli v13.8h, v13.8h, #1
> + sri v14.8h, v12.8h, #5
> + sri v14.8h, v13.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
> + fetch_mask_pixblock
> + urshr v13.8h, v5.8h, #8
> + urshr v14.8h, v6.8h, #8
> + ld1 {v4.8h}, [DST_R], #16
> + urshr v15.8h, v7.8h, #8
> + raddhn v16.8b, v13.8h, v5.8h
> + raddhn v17.8b, v14.8h, v6.8h
> + raddhn v18.8b, v15.8h, v7.8h
> + mov v5.d[0], v4.d[1]
> + /* process_pixblock_head */
> + /*
> + * 'combine_mask_ca' replacement
> + *
> + * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
> + * mask in {v24, v25, v26} [B, G, R]
> + * output: updated src in {v0, v1, v2 } [B, G, R]
> + * updated mask in {v24, v25, v26} [B, G, R]
> + */
> + uqadd v16.8b, v0.8b, v16.8b
> + uqadd v17.8b, v1.8b, v17.8b
> + uqadd v18.8b, v2.8b, v18.8b
> + umull v0.8h, v24.8b, v8.8b
> + umull v1.8h, v25.8b, v9.8b
> + umull v2.8h, v26.8b, v10.8b
> + /*
> + * convert the result in v16, v17, v18 to r5g6b5 and store
> + * it into {v14}
> + */
> + ushll v14.8h, v18.8b, #7
> + sli v14.8h, v14.8h, #1
> + ushll v18.8h, v16.8b, #7
> + sli v18.8h, v18.8h, #1
> + ushll v19.8h, v17.8b, #7
> + sli v19.8h, v19.8h, #1
> + umull v12.8h, v11.8b, v24.8b
> + sri v14.8h, v19.8h, #5
> + umull v13.8h, v11.8b, v25.8b
> + umull v15.8h, v11.8b, v26.8b
> + sri v14.8h, v18.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> + cache_preload 8, 8
> + urshr v16.8h, v0.8h, #8
> + urshr v17.8h, v1.8h, #8
> + urshr v18.8h, v2.8h, #8
> + raddhn v0.8b, v0.8h, v16.8h
> + raddhn v1.8b, v1.8h, v17.8h
> + raddhn v2.8b, v2.8h, v18.8h
> + urshr v19.8h, v12.8h, #8
> + urshr v20.8h, v13.8h, #8
> + urshr v21.8h, v15.8h, #8
> + raddhn v24.8b, v12.8h, v19.8h
> + raddhn v25.8b, v13.8h, v20.8h
> + /*
> + * convert 8 r5g6b5 pixel data from {v4, v5} to planar
> + * 8-bit format and put data into v16 - blue, v17 - green,
> + * v18 - red
> + */
> + mov v4.d[1], v5.d[0]
> + shrn v17.8b, v4.8h, #3
> + shrn v18.8b, v4.8h, #8
> + raddhn v26.8b, v15.8h, v21.8h
> + sli v4.8h, v4.8h, #5
> + sri v17.8b, v17.8b, #6
> + sri v18.8b, v18.8b, #5
> + /*
> + * 'combine_over_ca' replacement
> + *
> + * output: updated dest in v16 - blue, v17 - green, v18 - red
> + */
> + mvn v24.8b, v24.8b
> + mvn v25.8b, v25.8b
> + shrn v16.8b, v4.8h, #2
> + mvn v26.8b, v26.8b
> + umull v5.8h, v16.8b, v24.8b
> + umull v6.8h, v17.8b, v25.8b
> + umull v7.8h, v18.8b, v26.8b
> + st1 {v14.8h}, [DST_W], #16
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_init
> + mov v13.s[0], w4
> + dup v8.8b, v13.b[0]
> + dup v9.8b, v13.b[1]
> + dup v10.8b, v13.b[2]
> + dup v11.8b, v13.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_n_8888_0565_ca_init, \
> + pixman_composite_over_n_8888_0565_ca_cleanup, \
> + pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
> + pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
> + pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_in_n_8_process_pixblock_head
> + /* expecting source data in {v0, v1, v2, v3} */
> + /* and destination data in {v4, v5, v6, v7} */
> + umull v8.8h, v4.8b, v3.8b
> + umull v9.8h, v5.8b, v3.8b
> + umull v10.8h, v6.8b, v3.8b
> + umull v11.8h, v7.8b, v3.8b
> +.endm
> +
> +.macro pixman_composite_in_n_8_process_pixblock_tail
> + urshr v14.8h, v8.8h, #8
> + urshr v15.8h, v9.8h, #8
> + urshr v12.8h, v10.8h, #8
> + urshr v13.8h, v11.8h, #8
> + raddhn v28.8b, v8.8h, v14.8h
> + raddhn v29.8b, v9.8h, v15.8h
> + raddhn v30.8b, v10.8h, v12.8h
> + raddhn v31.8b, v11.8h, v13.8h
> +.endm
> +
> +.macro pixman_composite_in_n_8_process_pixblock_tail_head
> + pixman_composite_in_n_8_process_pixblock_tail
> + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + cache_preload 32, 32
> + pixman_composite_in_n_8_process_pixblock_head
> + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_in_n_8_init
> + mov v3.s[0], w4
> + dup v3.8b, v3.b[3]
> +.endm
> +
> +.macro pixman_composite_in_n_8_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
> + FLAG_DST_READWRITE, \
> + 32, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_in_n_8_init, \
> + pixman_composite_in_n_8_cleanup, \
> + pixman_composite_in_n_8_process_pixblock_head, \
> + pixman_composite_in_n_8_process_pixblock_tail, \
> + pixman_composite_in_n_8_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +.macro pixman_composite_add_n_8_8_process_pixblock_head
> + /* expecting source data in {v8, v9, v10, v11} */
> + /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
> + /* and destination data in {v4, v5, v6, v7} */
> + /* mask is in v24, v25, v26, v27 */
> + umull v0.8h, v24.8b, v11.8b
> + umull v1.8h, v25.8b, v11.8b
> + umull v2.8h, v26.8b, v11.8b
> + umull v3.8h, v27.8b, v11.8b
> + urshr v12.8h, v0.8h, #8
> + urshr v13.8h, v1.8h, #8
> + urshr v14.8h, v2.8h, #8
> + urshr v15.8h, v3.8h, #8
> + raddhn v0.8b, v0.8h, v12.8h
> + raddhn v1.8b, v1.8h, v13.8h
> + raddhn v2.8b, v2.8h, v14.8h
> + raddhn v3.8b, v3.8h, v15.8h
> + uqadd v28.8b, v0.8b, v4.8b
> + uqadd v29.8b, v1.8b, v5.8b
> + uqadd v30.8b, v2.8b, v6.8b
> + uqadd v31.8b, v3.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_add_n_8_8_process_pixblock_tail
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
> + pixman_composite_add_n_8_8_process_pixblock_tail
> + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + fetch_mask_pixblock
> + cache_preload 32, 32
> + pixman_composite_add_n_8_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_add_n_8_8_init
> + mov v11.s[0], w4
> + dup v11.8b, v11.b[3]
> +.endm
> +
> +.macro pixman_composite_add_n_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
> + FLAG_DST_READWRITE, \
> + 32, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_add_n_8_8_init, \
> + pixman_composite_add_n_8_8_cleanup, \
> + pixman_composite_add_n_8_8_process_pixblock_head, \
> + pixman_composite_add_n_8_8_process_pixblock_tail, \
> + pixman_composite_add_n_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8_8_8_process_pixblock_head
> + /* expecting source data in {v0, v1, v2, v3} */
> + /* destination data in {v4, v5, v6, v7} */
> + /* mask in {v24, v25, v26, v27} */
> + umull v8.8h, v24.8b, v0.8b
> + umull v9.8h, v25.8b, v1.8b
> + umull v10.8h, v26.8b, v2.8b
> + umull v11.8h, v27.8b, v3.8b
> + urshr v0.8h, v8.8h, #8
> + urshr v1.8h, v9.8h, #8
> + urshr v12.8h, v10.8h, #8
> + urshr v13.8h, v11.8h, #8
> + raddhn v0.8b, v0.8h, v8.8h
> + raddhn v1.8b, v1.8h, v9.8h
> + raddhn v2.8b, v12.8h, v10.8h
> + raddhn v3.8b, v13.8h, v11.8h
> + uqadd v28.8b, v0.8b, v4.8b
> + uqadd v29.8b, v1.8b, v5.8b
> + uqadd v30.8b, v2.8b, v6.8b
> + uqadd v31.8b, v3.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_add_8_8_8_process_pixblock_tail
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
> + pixman_composite_add_8_8_8_process_pixblock_tail
> + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + fetch_mask_pixblock
> + fetch_src_pixblock
> + cache_preload 32, 32
> + pixman_composite_add_8_8_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_add_8_8_8_init
> +.endm
> +
> +.macro pixman_composite_add_8_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
> + FLAG_DST_READWRITE, \
> + 32, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_add_8_8_8_init, \
> + pixman_composite_add_8_8_8_cleanup, \
> + pixman_composite_add_8_8_8_process_pixblock_head, \
> + pixman_composite_add_8_8_8_process_pixblock_tail, \
> + pixman_composite_add_8_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
> + /* expecting source data in {v0, v1, v2, v3} */
> + /* destination data in {v4, v5, v6, v7} */
> + /* mask in {v24, v25, v26, v27} */
> + umull v8.8h, v27.8b, v0.8b
> + umull v9.8h, v27.8b, v1.8b
> + umull v10.8h, v27.8b, v2.8b
> + umull v11.8h, v27.8b, v3.8b
> + /* 1 cycle bubble */
> + ursra v8.8h, v8.8h, #8
> + ursra v9.8h, v9.8h, #8
> + ursra v10.8h, v10.8h, #8
> + ursra v11.8h, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
> + /* 2 cycle bubble */
> + rshrn v28.8b, v8.8h, #8
> + rshrn v29.8b, v9.8h, #8
> + rshrn v30.8b, v10.8h, #8
> + rshrn v31.8b, v11.8h, #8
> + uqadd v28.8b, v4.8b, v28.8b
> + uqadd v29.8b, v5.8b, v29.8b
> + uqadd v30.8b, v6.8b, v30.8b
> + uqadd v31.8b, v7.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
> + fetch_src_pixblock
> + rshrn v28.8b, v8.8h, #8
> + fetch_mask_pixblock
> + rshrn v29.8b, v9.8h, #8
> + umull v8.8h, v27.8b, v0.8b
> + rshrn v30.8b, v10.8h, #8
> + umull v9.8h, v27.8b, v1.8b
> + rshrn v31.8b, v11.8h, #8
> + umull v10.8h, v27.8b, v2.8b
> + umull v11.8h, v27.8b, v3.8b
> + uqadd v28.8b, v4.8b, v28.8b
> + uqadd v29.8b, v5.8b, v29.8b
> + uqadd v30.8b, v6.8b, v30.8b
> + uqadd v31.8b, v7.8b, v31.8b
> + ursra v8.8h, v8.8h, #8
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + ursra v9.8h, v9.8h, #8
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + ursra v10.8h, v10.8h, #8
> +
> + cache_preload 8, 8
> +
> + ursra v11.8h, v11.8h, #8
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +generate_composite_function_single_scanline \
> + pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +generate_composite_function \
> + pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 27 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_n_8_8888_init
> + mov v3.s[0], w4
> + dup v0.8b, v3.b[0]
> + dup v1.8b, v3.b[1]
> + dup v2.8b, v3.b[2]
> + dup v3.8b, v3.b[3]
> +.endm
> +
> +.macro pixman_composite_add_n_8_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_add_n_8_8888_init, \
> + pixman_composite_add_n_8_8888_cleanup, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 27 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8888_n_8888_init
> + mov v27.s[0], w6
> + dup v27.8b, v27.b[3]
> +.endm
> +
> +.macro pixman_composite_add_8888_n_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_add_8888_n_8888_init, \
> + pixman_composite_add_8888_n_8888_cleanup, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 27 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
> + /* expecting source data in {v0, v1, v2, v3} */
> + /* destination data in {v4, v5, v6, v7} */
> + /* solid mask is in v15 */
> +
> + /* 'in' */
> + umull v11.8h, v15.8b, v3.8b
> + umull v10.8h, v15.8b, v2.8b
> + umull v9.8h, v15.8b, v1.8b
> + umull v8.8h, v15.8b, v0.8b
> + urshr v16.8h, v11.8h, #8
> + urshr v14.8h, v10.8h, #8
> + urshr v13.8h, v9.8h, #8
> + urshr v12.8h, v8.8h, #8
> + raddhn v3.8b, v11.8h, v16.8h
> + raddhn v2.8b, v10.8h, v14.8h
> + raddhn v1.8b, v9.8h, v13.8h
> + raddhn v0.8b, v8.8h, v12.8h
> + mvn v24.8b, v3.8b /* get inverted alpha */
> + /* now do alpha blending */
> + umull v8.8h, v24.8b, v4.8b
> + umull v9.8h, v24.8b, v5.8b
> + umull v10.8h, v24.8b, v6.8b
> + umull v11.8h, v24.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
> + urshr v16.8h, v8.8h, #8
> + urshr v17.8h, v9.8h, #8
> + urshr v18.8h, v10.8h, #8
> + urshr v19.8h, v11.8h, #8
> + raddhn v28.8b, v16.8h, v8.8h
> + raddhn v29.8b, v17.8h, v9.8h
> + raddhn v30.8b, v18.8h, v10.8h
> + raddhn v31.8b, v19.8h, v11.8h
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
> + fetch_src_pixblock
> + cache_preload 8, 8
> + fetch_mask_pixblock
> + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function_single_scanline \
> + pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
> + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
> + pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 12 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_n_8888_process_pixblock_head
> + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
> + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
> + uqadd v28.8b, v0.8b, v28.8b
> + uqadd v29.8b, v1.8b, v29.8b
> + uqadd v30.8b, v2.8b, v30.8b
> + uqadd v31.8b, v3.8b, v31.8b
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + pixman_composite_over_8888_n_8888_process_pixblock_tail
> + fetch_src_pixblock
> + cache_preload 8, 8
> + pixman_composite_over_8888_n_8888_process_pixblock_head
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_over_8888_n_8888_init
> + mov v15.s[0], w6
> + dup v15.8b, v15.b[3]
> +.endm
> +
> +.macro pixman_composite_over_8888_n_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_8888_n_8888_init, \
> + pixman_composite_over_8888_n_8888_cleanup, \
> + pixman_composite_over_8888_n_8888_process_pixblock_head, \
> + pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> + pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 12 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + pixman_composite_over_8888_n_8888_process_pixblock_tail
> + fetch_src_pixblock
> + cache_preload 8, 8
> + fetch_mask_pixblock
> + pixman_composite_over_8888_n_8888_process_pixblock_head
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_over_8888_n_8888_process_pixblock_head, \
> + pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 12 /* mask_basereg */
> +
> +generate_composite_function_single_scanline \
> + pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_over_8888_n_8888_process_pixblock_head, \
> + pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 12 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + pixman_composite_over_8888_n_8888_process_pixblock_tail
> + fetch_src_pixblock
> + cache_preload 8, 8
> + fetch_mask_pixblock
> + pixman_composite_over_8888_n_8888_process_pixblock_head
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_over_8888_n_8888_process_pixblock_head, \
> + pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> + pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 15 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0888_0888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_0888_0888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
> + st3 {v0.8b, v1.8b, v2.8b}, [DST_W], #24
> + fetch_src_pixblock
> + cache_preload 8, 8
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
> + FLAG_DST_WRITEONLY, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_0888_0888_process_pixblock_head, \
> + pixman_composite_src_0888_0888_process_pixblock_tail, \
> + pixman_composite_src_0888_0888_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
> + mov v31.8b, v2.8b
> + mov v2.8b, v0.8b
> + mov v0.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
> + st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
> + fetch_src_pixblock
> + mov v31.8b, v2.8b
> + mov v2.8b, v0.8b
> + mov v0.8b, v31.8b
> + cache_preload 8, 8
> +.endm
> +
> +.macro pixman_composite_src_0888_8888_rev_init
> + eor v3.8b, v3.8b, v3.8b
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + pixman_composite_src_0888_8888_rev_init, \
> + default_cleanup, \
> + pixman_composite_src_0888_8888_rev_process_pixblock_head, \
> + pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
> + pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
> + 0, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
> + ushll v8.8h, v1.8b, #7
> + sli v8.8h, v8.8h, #1
> + ushll v9.8h, v2.8b, #7
> + sli v9.8h, v9.8h, #1
> +.endm
> +
> +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
> + ushll v14.8h, v0.8b, #7
> + sli v14.8h, v14.8h, #1
> + sri v14.8h, v8.8h, #5
> + sri v14.8h, v9.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
> + ushll v14.8h, v0.8b, #7
> + sli v14.8h, v14.8h, #1
> + fetch_src_pixblock
> + sri v14.8h, v8.8h, #5
> + sri v14.8h, v9.8h, #11
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> + ushll v8.8h, v1.8b, #7
> + sli v8.8h, v8.8h, #1
> + st1 {v14.8h}, [DST_W], #16
> + ushll v9.8h, v2.8b, #7
> + sli v9.8h, v9.8h, #1
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
> + FLAG_DST_WRITEONLY, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_0888_0565_rev_process_pixblock_head, \
> + pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
> + pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
> + umull v8.8h, v3.8b, v0.8b
> + umull v9.8h, v3.8b, v1.8b
> + umull v10.8h, v3.8b, v2.8b
> +.endm
> +
> +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
> + urshr v11.8h, v8.8h, #8
> + mov v30.8b, v31.8b
> + mov v31.8b, v3.8b
> + mov v3.8b, v30.8b
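> + /* the three movs above swap v3 and v31 through v30 (no vswp on AArch64) */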
> + urshr v12.8h, v9.8h, #8
> + urshr v13.8h, v10.8h, #8
> + raddhn v30.8b, v11.8h, v8.8h
> + raddhn v29.8b, v12.8h, v9.8h
> + raddhn v28.8b, v13.8h, v10.8h
> +.endm
> +
> +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
> + urshr v11.8h, v8.8h, #8
> + mov v30.8b, v31.8b
> + mov v31.8b, v3.8b
> + mov v3.8b, v30.8b
> + urshr v12.8h, v9.8h, #8
> + urshr v13.8h, v10.8h, #8
> + fetch_src_pixblock
> + raddhn v30.8b, v11.8h, v8.8h
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0xF
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + raddhn v29.8b, v12.8h, v9.8h
> + raddhn v28.8b, v13.8h, v10.8h
> + umull v8.8h, v3.8b, v0.8b
> + umull v9.8h, v3.8b, v1.8b
> + umull v10.8h, v3.8b, v2.8b
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + PF cmp PF_X, ORIG_W
> + PF lsl DUMMY, PF_X, src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> + PF subs PF_CTL, PF_CTL, #0x10
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> +10:
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_pixbuf_8888_process_pixblock_head, \
> + pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
> + pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
> + umull v8.8h, v3.8b, v0.8b
> + umull v9.8h, v3.8b, v1.8b
> + umull v10.8h, v3.8b, v2.8b
> +.endm
> +
> +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
> + urshr v11.8h, v8.8h, #8
> + mov v30.8b, v31.8b
> + mov v31.8b, v3.8b
> + mov v3.8b, v30.8b
> + urshr v12.8h, v9.8h, #8
> + urshr v13.8h, v10.8h, #8
> + raddhn v28.8b, v11.8h, v8.8h
> + raddhn v29.8b, v12.8h, v9.8h
> + raddhn v30.8b, v13.8h, v10.8h
> +.endm
> +
> +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
> + urshr v11.8h, v8.8h, #8
> + mov v30.8b, v31.8b
> + mov v31.8b, v3.8b
> + mov v3.8b, v30.8b
> + urshr v12.8h, v9.8h, #8
> + urshr v13.8h, v10.8h, #8
> + fetch_src_pixblock
> + raddhn v28.8b, v11.8h, v8.8h
> + PF add PF_X, PF_X, #8
> + PF tst PF_CTL, #0xF
> + PF beq 10f
> + PF add PF_X, PF_X, #8
> + PF sub PF_CTL, PF_CTL, #1
> +10:
> + raddhn v29.8b, v12.8h, v9.8h
> + raddhn v30.8b, v13.8h, v10.8h
> + umull v8.8h, v3.8b, v0.8b
> + umull v9.8h, v3.8b, v1.8b
> + umull v10.8h, v3.8b, v2.8b
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> + PF cmp PF_X, ORIG_W
> + PF lsl DUMMY, PF_X, src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> + PF ble 10f
> + PF sub PF_X, PF_X, ORIG_W
> + PF subs PF_CTL, PF_CTL, #0x10
> + PF ble 10f
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> +10:
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 10, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
> + pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
> + pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 0, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_0565_8_0565_process_pixblock_head
> + /* mask is in v15 */
> + mov v4.d[0], v8.d[0]
> + mov v4.d[1], v9.d[0]
> + mov v13.d[0], v10.d[0]
> + mov v13.d[1], v11.d[0]
> + convert_0565_to_x888 v4, v2, v1, v0
> + convert_0565_to_x888 v13, v6, v5, v4
> + /* source pixel data is in {v0, v1, v2, XX} */
> + /* destination pixel data is in {v4, v5, v6, XX} */
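> + /* the r5g6b5 source has no alpha channel, so OVER reduces to a plain */
> + /* lerp: dst' = (mask * src + (255 - mask) * dst) / 255, computed below */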
> + mvn v7.8b, v15.8b
> + umull v10.8h, v15.8b, v2.8b
> + umull v9.8h, v15.8b, v1.8b
> + umull v8.8h, v15.8b, v0.8b
> + umull v11.8h, v7.8b, v4.8b
> + umull v12.8h, v7.8b, v5.8b
> + umull v13.8h, v7.8b, v6.8b
> + urshr v19.8h, v10.8h, #8
> + urshr v18.8h, v9.8h, #8
> + urshr v17.8h, v8.8h, #8
> + raddhn v2.8b, v10.8h, v19.8h
> + raddhn v1.8b, v9.8h, v18.8h
> + raddhn v0.8b, v8.8h, v17.8h
> +.endm
> +
> +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
> + urshr v17.8h, v11.8h, #8
> + urshr v18.8h, v12.8h, #8
> + urshr v19.8h, v13.8h, #8
> + raddhn v28.8b, v17.8h, v11.8h
> + raddhn v29.8b, v18.8h, v12.8h
> + raddhn v30.8b, v19.8h, v13.8h
> + uqadd v0.8b, v0.8b, v28.8b
> + uqadd v1.8b, v1.8b, v29.8b
> + uqadd v2.8b, v2.8b, v30.8b
> + /* 32bpp result is in {v0, v1, v2, XX} */
> + convert_8888_to_0565 v2, v1, v0, v14, v30, v13
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
> + fetch_mask_pixblock
> + pixman_composite_over_0565_8_0565_process_pixblock_tail
> + fetch_src_pixblock
> + ld1 {v10.4h, v11.4h}, [DST_R], #16
> + cache_preload 8, 8
> + pixman_composite_over_0565_8_0565_process_pixblock_head
> + st1 {v14.8h}, [DST_W], #16
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_over_0565_8_0565_process_pixblock_head, \
> + pixman_composite_over_0565_8_0565_process_pixblock_tail, \
> + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 10, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 15 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_0565_n_0565_init
> + mov v15.s[0], w6
> + dup v15.8b, v15.b[3]
> +.endm
> +
> +.macro pixman_composite_over_0565_n_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + pixman_composite_over_0565_n_0565_init, \
> + pixman_composite_over_0565_n_0565_cleanup, \
> + pixman_composite_over_0565_8_0565_process_pixblock_head, \
> + pixman_composite_over_0565_8_0565_process_pixblock_tail, \
> + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 10, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 15 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_0565_8_0565_process_pixblock_head
> + /* mask is in v15 */
> + mov v4.d[0], v8.d[0]
> + mov v4.d[1], v9.d[0]
> + mov v13.d[0], v10.d[0]
> + mov v13.d[1], v11.d[0]
> + convert_0565_to_x888 v4, v2, v1, v0
> + convert_0565_to_x888 v13, v6, v5, v4
> + /* source pixel data is in {v0, v1, v2, XX} */
> + /* destination pixel data is in {v4, v5, v6, XX} */
> + umull v9.8h, v15.8b, v2.8b
> + umull v8.8h, v15.8b, v1.8b
> + umull v7.8h, v15.8b, v0.8b
> + urshr v12.8h, v9.8h, #8
> + urshr v11.8h, v8.8h, #8
> + urshr v10.8h, v7.8h, #8
> + raddhn v2.8b, v9.8h, v12.8h
> + raddhn v1.8b, v8.8h, v11.8h
> + raddhn v0.8b, v7.8h, v10.8h
> +.endm
> +
> +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
> + uqadd v0.8b, v0.8b, v4.8b
> + uqadd v1.8b, v1.8b, v5.8b
> + uqadd v2.8b, v2.8b, v6.8b
> + /* 32bpp result is in {v0, v1, v2, XX} */
> + convert_8888_to_0565 v2, v1, v0, v14, v30, v13
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
> + fetch_mask_pixblock
> + pixman_composite_add_0565_8_0565_process_pixblock_tail
> + fetch_src_pixblock
> + ld1 {v10.4h, v11.4h}, [DST_R], #16
> + cache_preload 8, 8
> + pixman_composite_add_0565_8_0565_process_pixblock_head
> + st1 {v14.8h}, [DST_W], #16
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_add_0565_8_0565_process_pixblock_head, \
> + pixman_composite_add_0565_8_0565_process_pixblock_tail, \
> + pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 10, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 15 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
> + /* mask is in v15 */
> + mov v12.d[0], v10.d[0]
> + mov v12.d[1], v11.d[0]
> + convert_0565_to_x888 v12, v6, v5, v4
> + /* destination pixel data is in {v4, v5, v6, xx} */
> + mvn v24.8b, v15.8b /* get inverted alpha */
> + /* now do alpha blending */
> + umull v8.8h, v24.8b, v4.8b
> + umull v9.8h, v24.8b, v5.8b
> + umull v10.8h, v24.8b, v6.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
> + urshr v11.8h, v8.8h, #8
> + urshr v12.8h, v9.8h, #8
> + urshr v13.8h, v10.8h, #8
> + raddhn v0.8b, v11.8h, v8.8h
> + raddhn v1.8b, v12.8h, v9.8h
> + raddhn v2.8b, v13.8h, v10.8h
> + /* 32bpp result is in {v0, v1, v2, XX} */
> + convert_8888_to_0565 v2, v1, v0, v14, v12, v3
> + mov v28.d[0], v14.d[0]
> + mov v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
> + fetch_src_pixblock
> + pixman_composite_out_reverse_8_0565_process_pixblock_tail
> + ld1 {v10.4h, v11.4h}, [DST_R], #16
> + cache_preload 8, 8
> + pixman_composite_out_reverse_8_0565_process_pixblock_head
> + st1 {v14.8h}, [DST_W], #16
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_out_reverse_8_0565_process_pixblock_head, \
> + pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
> + pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 10, /* dst_r_basereg */ \
> + 15, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
> + /* src is in v0 */
> + /* destination pixel data is in {v4, v5, v6, v7} */
> + mvn v1.8b, v0.8b /* get inverted alpha */
> + /* now do alpha blending */
> + umull v8.8h, v1.8b, v4.8b
> + umull v9.8h, v1.8b, v5.8b
> + umull v10.8h, v1.8b, v6.8b
> + umull v11.8h, v1.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
> + urshr v14.8h, v8.8h, #8
> + urshr v15.8h, v9.8h, #8
> + urshr v12.8h, v10.8h, #8
> + urshr v13.8h, v11.8h, #8
> + raddhn v28.8b, v14.8h, v8.8h
> + raddhn v29.8b, v15.8h, v9.8h
> + raddhn v30.8b, v12.8h, v10.8h
> + raddhn v31.8b, v13.8h, v11.8h
> + /* 32bpp result is in {v28, v29, v30, v31} */
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
> + fetch_src_pixblock
> + pixman_composite_out_reverse_8_8888_process_pixblock_tail
> + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> + cache_preload 8, 8
> + pixman_composite_out_reverse_8_8888_process_pixblock_head
> + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + 5, /* prefetch distance */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_out_reverse_8_8888_process_pixblock_head, \
> + pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
> + pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 0 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +generate_composite_function_nearest_scanline \
> + pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_over_8888_8888_process_pixblock_head, \
> + pixman_composite_over_8888_8888_process_pixblock_tail, \
> + pixman_composite_over_8888_8888_process_pixblock_tail_head
> +
> +generate_composite_function_nearest_scanline \
> + pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_over_8888_0565_process_pixblock_head, \
> + pixman_composite_over_8888_0565_process_pixblock_tail, \
> + pixman_composite_over_8888_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 0, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +generate_composite_function_nearest_scanline \
> + pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_8888_0565_process_pixblock_head, \
> + pixman_composite_src_8888_0565_process_pixblock_tail, \
> + pixman_composite_src_8888_0565_process_pixblock_tail_head, \
> +
> +generate_composite_function_nearest_scanline \
> + pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
> + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init, \
> + default_cleanup, \
> + pixman_composite_src_0565_8888_process_pixblock_head, \
> + pixman_composite_src_0565_8888_process_pixblock_tail, \
> + pixman_composite_src_0565_8888_process_pixblock_tail_head
> +
> +generate_composite_function_nearest_scanline \
> + pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
> + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_over_8888_8_0565_process_pixblock_head, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> + pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 4, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 24 /* mask_basereg */
> +
> +generate_composite_function_nearest_scanline \
> + pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
> + FLAG_DST_READWRITE, \
> + 8, /* number of pixels, processed in a single block */ \
> + default_init_need_all_regs, \
> + default_cleanup_need_all_regs, \
> + pixman_composite_over_0565_8_0565_process_pixblock_head, \
> + pixman_composite_over_0565_8_0565_process_pixblock_tail, \
> + pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
> + 28, /* dst_w_basereg */ \
> + 10, /* dst_r_basereg */ \
> + 8, /* src_basereg */ \
> + 15 /* mask_basereg */
> +
> +/******************************************************************************/
> +
> +/*
> + * Bilinear scaling support code which tries to provide pixel fetching, color
> + * format conversion, and interpolation as separate macros which can be used
> + * as the basic building blocks for constructing bilinear scanline functions.
> + */
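
In rough C terms, these building blocks compute the following per pixel.
This is only an illustration and not part of the patch; BILINEAR_INTERPOLATION_BITS = 7
and the convention that wt + wb == (1 << BILINEAR_INTERPOLATION_BITS) are
assumptions taken from pixman's generic C path.

    #include <stdint.h>

    #define BILINEAR_INTERPOLATION_BITS 7  /* assumed, as in pixman-private.h */

    /* One a8r8g8b8 output pixel from two source scanlines (top/bottom) at
     * 16.16 fixed point position x; wt/wb are the vertical weights. */
    static uint32_t
    bilinear_pixel_sketch (const uint32_t *top, const uint32_t *bottom,
                           int wt, int wb, int32_t x)
    {
        int i = x >> 16;
        int distx = (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
                    ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
        uint32_t result = 0;
        int c;

        for (c = 0; c < 32; c += 8)
        {
            uint32_t tl = (top[i]        >> c) & 0xff;
            uint32_t tr = (top[i + 1]    >> c) & 0xff;
            uint32_t bl = (bottom[i]     >> c) & 0xff;
            uint32_t br = (bottom[i + 1] >> c) & 0xff;
            /* vertical pass: the umull/umlal with v28 (wt) and v29 (wb) */
            uint32_t l = tl * wt + bl * wb;
            uint32_t r = tr * wt + br * wb;
            /* horizontal pass: the ushll/umlsl/umlal with v30/v31 (distx) */
            uint32_t v = l * ((1 << BILINEAR_INTERPOLATION_BITS) - distx) +
                         r * distx;
            result |= ((v >> (2 * BILINEAR_INTERPOLATION_BITS)) & 0xff) << c;
        }
        return result;  /* right-edge clamping is omitted in this sketch */
    }
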
> +
> +.macro bilinear_load_8888 reg1, reg2, tmp
> + asr TMP1, X, #16
> + add X, X, UX
> + lsl TMP2, TMP1, #2
> + add TMP1, TOP, TMP2
> + ld1 {&reg1&.2s}, [TMP1], STRIDE
> + ld1 {&reg2&.2s}, [TMP1]
> +.endm
> +
> +.macro bilinear_load_0565 reg1, reg2, tmp
> + asr TMP1, X, #16
> + add X, X, UX
> + lsl TMP2, TMP1, #1
> + add TMP1, TOP, TMP2
> + ld1 {&reg2&.s}[0], [TMP1], STRIDE
> + ld1 {&reg2&.s}[1], [TMP1]
> + convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
> +.endm
> +
> +.macro bilinear_load_and_vertical_interpolate_two_8888 \
> + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
> +
> + bilinear_load_8888 reg1, reg2, tmp1
> + umull &acc1&.8h, &reg1&.8b, v28.8b
> + umlal &acc1&.8h, &reg2&.8b, v29.8b
> + bilinear_load_8888 reg3, reg4, tmp2
> + umull &acc2&.8h, &reg3&.8b, v28.8b
> + umlal &acc2&.8h, &reg4&.8b, v29.8b
> +.endm
> +
> +.macro bilinear_load_and_vertical_interpolate_four_8888 \
> + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
> + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
> +
> + bilinear_load_and_vertical_interpolate_two_8888 \
> + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
> + bilinear_load_and_vertical_interpolate_two_8888 \
> + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
> +.endm
> +
> +.macro vzip reg1, reg2
> + umov TMP4, v31.d[0]
> + zip1 v31.8b, reg1, reg2
> + zip2 reg2, reg1, reg2
> + mov reg1, v31.8b
> + mov v31.d[0], TMP4
> +.endm
> +
> +.macro vuzp reg1, reg2
> + umov TMP4, v31.d[0]
> + uzp1 v31.8b, reg1, reg2
> + uzp2 reg2, reg1, reg2
> + mov reg1, v31.8b
> + mov v31.d[0], TMP4
> +.endm
> +
> +.macro bilinear_load_and_vertical_interpolate_two_0565 \
> + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
> + asr TMP1, X, #16
> + add X, X, UX
> + lsl TMP2, TMP1, #1
> + add TMP1, TOP, TMP2
> + asr TMP2, X, #16
> + add X, X, UX
> + lsl TMP3, TMP2, #1
> + add TMP2, TOP, TMP3
> + ld1 {&acc2&.s}[0], [TMP1], STRIDE
> + ld1 {&acc2&.s}[2], [TMP2], STRIDE
> + ld1 {&acc2&.s}[1], [TMP1]
> + ld1 {&acc2&.s}[3], [TMP2]
> + convert_0565_to_x888 acc2, reg3, reg2, reg1
> + vzip &reg1&.8b, &reg3&.8b
> + vzip &reg2&.8b, &reg4&.8b
> + vzip &reg3&.8b, &reg4&.8b
> + vzip &reg1&.8b, &reg2&.8b
> + umull &acc1&.8h, &reg1&.8b, v28.8b
> + umlal &acc1&.8h, &reg2&.8b, v29.8b
> + umull &acc2&.8h, &reg3&.8b, v28.8b
> + umlal &acc2&.8h, &reg4&.8b, v29.8b
> +.endm
> +
> +.macro bilinear_load_and_vertical_interpolate_four_0565 \
> + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
> + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
> + asr TMP1, X, #16
> + add X, X, UX
> + lsl TMP2, TMP1, #1
> + add TMP1, TOP, TMP2
> + asr TMP2, X, #16
> + add X, X, UX
> + lsl TMP3, TMP2, #1
> + add TMP2, TOP, TMP3
> + ld1 {&xacc2&.s}[0], [TMP1], STRIDE
> + ld1 {&xacc2&.s}[2], [TMP2], STRIDE
> + ld1 {&xacc2&.s}[1], [TMP1]
> + ld1 {&xacc2&.s}[3], [TMP2]
> + convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
> + asr TMP1, X, #16
> + add X, X, UX
> + lsl TMP2, TMP1, #1
> + add TMP1, TOP, TMP2
> + asr TMP2, X, #16
> + add X, X, UX
> + lsl TMP3, TMP2, #1
> + add TMP2, TOP, TMP3
> + ld1 {&yacc2&.s}[0], [TMP1], STRIDE
> + vzip &xreg1&.8b, &xreg3&.8b
> + ld1 {&yacc2&.s}[2], [TMP2], STRIDE
> + vzip &xreg2&.8b, &xreg4&.8b
> + ld1 {&yacc2&.s}[1], [TMP1]
> + vzip &xreg3&.8b, &xreg4&.8b
> + ld1 {&yacc2&.s}[3], [TMP2]
> + vzip &xreg1&.8b, &xreg2&.8b
> + convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
> + umull &xacc1&.8h, &xreg1&.8b, v28.8b
> + vzip &yreg1&.8b, &yreg3&.8b
> + umlal &xacc1&.8h, &xreg2&.8b, v29.8b
> + vzip &yreg2&.8b, &yreg4&.8b
> + umull &xacc2&.8h, &xreg3&.8b, v28.8b
> + vzip &yreg3&.8b, &yreg4&.8b
> + umlal &xacc2&.8h, &xreg4&.8b, v29.8b
> + vzip &yreg1&.8b, &yreg2&.8b
> + umull &yacc1&.8h, &yreg1&.8b, v28.8b
> + umlal &yacc1&.8h, &yreg2&.8b, v29.8b
> + umull &yacc2&.8h, &yreg3&.8b, v28.8b
> + umlal &yacc2&.8h, &yreg4&.8b, v29.8b
> +.endm
> +
> +.macro bilinear_store_8888 numpix, tmp1, tmp2
> +.if numpix == 4
> + st1 {v0.2s, v1.2s}, [OUT], #16
> +.elseif numpix == 2
> + st1 {v0.2s}, [OUT], #8
> +.elseif numpix == 1
> + st1 {v0.s}[0], [OUT], #4
> +.else
> + .error bilinear_store_8888 numpix is unsupported
> +.endif
> +.endm
> +
> +.macro bilinear_store_0565 numpix, tmp1, tmp2
> + vuzp v0.8b, v1.8b
> + vuzp v2.8b, v3.8b
> + vuzp v1.8b, v3.8b
> + vuzp v0.8b, v2.8b
> + convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
> +.if numpix == 4
> + st1 {v1.4h}, [OUT], #8
> +.elseif numpix == 2
> + st1 {v1.s}[0], [OUT], #4
> +.elseif numpix == 1
> + st1 {v1.h}[0], [OUT], #2
> +.else
> + .error bilinear_store_0565 numpix is unsupported
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
> + bilinear_load_&src_fmt v0, v1, v2
> + umull v2.8h, v0.8b, v28.8b
> + umlal v2.8h, v1.8b, v29.8b
> + /* 5 cycles bubble */
> + mov v3.d[0], v2.d[1]
> + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v0.4s, v2.4h, v30.4h
> + umlal v0.4s, v3.4h, v30.4h
> + /* 5 cycles bubble */
> + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + /* 3 cycles bubble */
> + xtn v0.8b, v0.8h
> + /* 1 cycle bubble */
> + bilinear_store_&dst_fmt 1, v3, v4
> +.endm
> +
> +.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
> + bilinear_load_and_vertical_interpolate_two_&src_fmt \
> + v1, v11, v2, v3, v20, v21, v22, v23
> + mov v2.d[0], v1.d[0]
> + mov v3.d[0], v1.d[1]
> + mov v22.d[0], v11.d[0]
> + mov v23.d[0], v11.d[1]
> + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v0.4s, v2.4h, v30.4h
> + umlal v0.4s, v3.4h, v30.4h
> + ushll v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v10.4s, v22.4h, v31.4h
> + umlal v10.4s, v23.4h, v31.4h
> + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + shrn v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + mov v0.d[1], v1.d[0]
> + ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v31.d[0], v30.d[1]
> + add v12.8h, v12.8h, v13.8h
> + xtn v0.8b, v0.8h
> + bilinear_store_&dst_fmt 2, v3, v4
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
> + bilinear_load_and_vertical_interpolate_four_&src_fmt \
> + v1, v11, v14, v15, v16, v17, v22, v23 \
> + v3, v9, v24, v25, v26, v27, v18, v19
> + prfm pldl2strm, [TMP1, PF_OFFS]
> + sub TMP1, TMP1, STRIDE
> + mov v6.d[0], v3.d[0]
> + mov v7.d[0], v3.d[1]
> + mov v18.d[0], v9.d[0]
> + mov v19.d[0], v9.d[1]
> + mov v2.d[0], v1.d[0]
> + mov v3.d[0], v1.d[1]
> + mov v22.d[0], v11.d[0]
> + mov v23.d[0], v11.d[1]
> + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v0.4s, v2.4h, v30.4h
> + umlal v0.4s, v3.4h, v30.4h
> + ushll v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v10.4s, v22.4h, v31.4h
> + umlal v10.4s, v23.4h, v31.4h
> + ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v31.d[0], v30.d[1]
> + ushll v2.4s, v6.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v2.4s, v6.4h, v30.4h
> + umlal v2.4s, v7.4h, v30.4h
> + ushll v8.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
> + prfm pldl2strm, [TMP2, PF_OFFS]
> + umlsl v8.4s, v18.4h, v31.4h
> + umlal v8.4s, v19.4h, v31.4h
> + add v12.8h, v12.8h, v13.8h
> + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + shrn v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + mov v0.d[1], v1.d[0]
> + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + shrn v5.4h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + mov v2.d[1], v5.d[0]
> + ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v31.d[0], v30.d[1]
> + xtn v0.8b, v0.8h
> + xtn v1.8b, v2.8h
> + add v12.8h, v12.8h, v13.8h
> + bilinear_store_&dst_fmt 4, v3, v4
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
> + bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
> +.else
> + bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
> + bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
> + bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
> +.else
> + bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
> + bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
> +.else
> + bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
> + bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
> + bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
> +.else
> + bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
> + bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
> +.else
> + bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> + bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.set BILINEAR_FLAG_UNROLL_4, 0
> +.set BILINEAR_FLAG_UNROLL_8, 1
> +.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
> +
> +/*
> + * Main template macro for generating NEON optimized bilinear scanline
> + * functions.
> + *
> + * Bilinear scanline scaler macro template uses the following arguments:
> + * fname - name of the function to generate
> + * src_fmt - source color format (8888 or 0565)
> + * dst_fmt - destination color format (8888 or 0565)
> + * src_bpp_shift - (1 << src_bpp_shift) is the size of a source pixel in bytes
> + * dst_bpp_shift - (1 << dst_bpp_shift) is the size of a destination pixel in bytes
> + * prefetch_distance - prefetch in the source image by that many
> + * pixels ahead
> + * flags - BILINEAR_FLAG_* bit flags selecting the unrolling factor
> + */
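
Judging from the register aliases assigned below (OUT .. WIDTH in x0..x7),
the generated functions are called from C roughly with the prototype shown
here. The exact parameter types are an assumption on my side (taken from how
the aarch32 bindings in pixman-arm-common.h use these scanline functions),
not something defined by this patch:

    #include <stdint.h>

    typedef int32_t pixman_fixed_t;  /* 16.16 fixed point, as in pixman.h */

    /* Assumed C-side view of one generated function (8888 -> 8888 SRC). */
    void
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (
        uint32_t       *out,      /* x0: destination scanline          */
        const uint32_t *top,      /* x1: upper source scanline         */
        const uint32_t *bottom,   /* x2: lower source scanline         */
        int             wt,       /* x3: vertical weight of 'top'      */
        int             wb,       /* x4: vertical weight of 'bottom'   */
        pixman_fixed_t  x,        /* x5: initial horizontal position   */
        pixman_fixed_t  ux,       /* x6: horizontal step per pixel     */
        int             width);   /* x7: number of pixels to produce   */
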
> +
> +.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
> + src_bpp_shift, dst_bpp_shift, \
> + prefetch_distance, flags
> +
> +pixman_asm_function fname
> + OUT .req x0
> + TOP .req x1
> + BOTTOM .req x2
> + WT .req x3
> + WB .req x4
> + X .req x5
> + UX .req x6
> + WIDTH .req x7
> + TMP1 .req x8
> + TMP2 .req x9
> + PF_OFFS .req x10
> + TMP3 .req x11
> + TMP4 .req x12
> + STRIDE .req x13
> +
> + sxtw x3, w3
> + sxtw x4, w4
> + sxtw x5, w5
> + sxtw x6, w6
> + sxtw x7, w7
> +
> + stp x29, x30, [sp, -16]!
> + mov x29, sp
> + sub sp, sp, 112 /* push all registers */
> + sub x29, x29, 64
> + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
> + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
> + stp x8, x9, [x29, -80]
> + stp x10, x11, [x29, -96]
> + stp x12, x13, [x29, -112]
> +
> + mov PF_OFFS, #prefetch_distance
> + mul PF_OFFS, PF_OFFS, UX
> +
> + subs STRIDE, BOTTOM, TOP
> + .unreq BOTTOM
> +
> + cmp WIDTH, #0
> + ble 300f
> +
> + dup v12.8h, w5
> + dup v13.8h, w6
> + dup v28.8b, w3
> + dup v29.8b, w4
> + mov v25.d[0], v12.d[1]
> + mov v26.d[0], v13.d[0]
> + add v25.4h, v25.4h, v26.4h
> + mov v12.d[1], v25.d[0]
> +
> + /* ensure good destination alignment */
> + cmp WIDTH, #1
> + blt 100f
> + tst OUT, #(1 << dst_bpp_shift)
> + beq 100f
> + ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v31.d[0], v30.d[1]
> + add v12.8h, v12.8h, v13.8h
> + bilinear_interpolate_last_pixel src_fmt, dst_fmt
> + sub WIDTH, WIDTH, #1
> +100:
> + add v13.8h, v13.8h, v13.8h
> + ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v31.d[0], v30.d[1]
> + add v12.8h, v12.8h, v13.8h
> +
> + cmp WIDTH, #2
> + blt 100f
> + tst OUT, #(1 << (dst_bpp_shift + 1))
> + beq 100f
> + bilinear_interpolate_two_pixels src_fmt, dst_fmt
> + sub WIDTH, WIDTH, #2
> +100:
> +.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
> +/*********** 8 pixels per iteration *****************/
> + cmp WIDTH, #4
> + blt 100f
> + tst OUT, #(1 << (dst_bpp_shift + 2))
> + beq 100f
> + bilinear_interpolate_four_pixels src_fmt, dst_fmt
> + sub WIDTH, WIDTH, #4
> +100:
> + subs WIDTH, WIDTH, #8
> + blt 100f
> + asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
> + bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
> + subs WIDTH, WIDTH, #8
> + blt 500f
> +1000:
> + bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
> + subs WIDTH, WIDTH, #8
> + bge 1000b
> +500:
> + bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
> +100:
> + tst WIDTH, #4
> + beq 200f
> + bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +200:
> +.else
> +/*********** 4 pixels per iteration *****************/
> + subs WIDTH, WIDTH, #4
> + blt 100f
> + asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
> + bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
> + subs WIDTH, WIDTH, #4
> + blt 500f
> +1000:
> + bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> + subs WIDTH, WIDTH, #4
> + bge 1000b
> +500:
> + bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
> +100:
> +/****************************************************/
> +.endif
> + /* handle the remaining trailing pixels */
> + tst WIDTH, #2
> + beq 200f
> + bilinear_interpolate_two_pixels src_fmt, dst_fmt
> +200:
> + tst WIDTH, #1
> + beq 300f
> + bilinear_interpolate_last_pixel src_fmt, dst_fmt
> +300:
> + sub x29, x29, 64
> + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
> + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
> + ldp x8, x9, [x29, -80]
> + ldp x10, x11, [x29, -96]
> + ldp x12, x13, [x29, -112]
> + mov sp, x29
> + ldp x29, x30, [sp], 16
> + ret
> +
> + .unreq OUT
> + .unreq TOP
> + .unreq WT
> + .unreq WB
> + .unreq X
> + .unreq UX
> + .unreq WIDTH
> + .unreq TMP1
> + .unreq TMP2
> + .unreq PF_OFFS
> + .unreq TMP3
> + .unreq TMP4
> + .unreq STRIDE
> +.endfunc
> +
> +.endm
> +
> +/*****************************************************************************/
> +
> +.set have_bilinear_interpolate_four_pixels_8888_8888, 1
> +
> +.macro bilinear_interpolate_four_pixels_8888_8888_head
> + asr TMP1, X, #16
> + add X, X, UX
> + lsl TMP4, TMP1, #2
> + add TMP1, TOP, TMP4
> + asr TMP2, X, #16
> + add X, X, UX
> + lsl TMP4, TMP2, #2
> + add TMP2, TOP, TMP4
> +
> + ld1 {v22.2s}, [TMP1], STRIDE
> + ld1 {v23.2s}, [TMP1]
> + asr TMP3, X, #16
> + add X, X, UX
> + lsl TMP4, TMP3, #2
> + add TMP3, TOP, TMP4
> + umull v8.8h, v22.8b, v28.8b
> + umlal v8.8h, v23.8b, v29.8b
> + mov v16.d[0], v8.d[0]
> + mov v17.d[0], v8.d[1]
> +
> + ld1 {v22.2s}, [TMP2], STRIDE
> + ld1 {v23.2s}, [TMP2]
> + asr TMP4, X, #16
> + add X, X, UX
> + lsl TMP1, TMP4, #2
> + add TMP4, TOP, TMP1
> + umull v9.8h, v22.8b, v28.8b
> + umlal v9.8h, v23.8b, v29.8b
> + mov v18.d[0], v9.d[0]
> + mov v19.d[0], v9.d[1]
> +
> + ld1 {v22.2s}, [TMP3], STRIDE
> + ld1 {v23.2s}, [TMP3]
> + umull v10.8h, v22.8b, v28.8b
> + umlal v10.8h, v23.8b, v29.8b
> + mov v20.d[0], v10.d[0]
> + mov v21.d[0], v10.d[1]
> +
> + ushll v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v0.4s, v16.4h, v30.4h
> + umlal v0.4s, v17.4h, v30.4h
> +
> + prfm pldl2strm, [TMP4, PF_OFFS]
> + ld1 {v16.2s}, [TMP4], STRIDE
> + ld1 {v17.2s}, [TMP4]
> + prfm pldl2strm, [TMP4, PF_OFFS]
> + umull v11.8h, v16.8b, v28.8b
> + umlal v11.8h, v17.8b, v29.8b
> + mov v22.d[0], v11.d[0]
> + mov v23.d[0], v11.d[1]
> +
> + ushll v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v1.4s, v18.4h, v31.4h
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_8888_8888_tail
> + umlal v1.4s, v19.4h, v31.4h
> + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v30.d[0], v15.d[0]
> + mov v31.d[0], v15.d[1]
> + ushll v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v2.4s, v20.4h, v30.4h
> + umlal v2.4s, v21.4h, v30.4h
> + ushll v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v3.4s, v22.4h, v31.4h
> + umlal v3.4s, v23.4h, v31.4h
> + add v12.8h, v12.8h, v13.8h
> + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + shrn v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + mov v0.d[1], v1.d[0]
> + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v31.d[0], v30.d[1]
> + shrn v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + mov v2.d[1], v5.d[0]
> + xtn v6.8b, v0.8h
> + xtn v7.8b, v2.8h
> + add v12.8h, v12.8h, v13.8h
> + st1 {v6.2s, v7.2s}, [OUT], #16
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
> + asr TMP1, X, #16
> + add X, X, UX
> + lsl TMP2, TMP1, #2
> + add TMP1, TOP, TMP2
> + asr TMP2, X, #16
> + add X, X, UX
> + lsl TMP3, TMP2, #2
> + add TMP2, TOP, TMP3
> + umlal v1.4s, v19.4h, v31.4h
> + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v30.d[0], v15.d[0]
> + mov v31.d[0], v15.d[1]
> + ushll v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v2.4s, v20.4h, v30.4h
> + umlal v2.4s, v21.4h, v30.4h
> + ushll v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> + ld1 {v20.2s}, [TMP1], STRIDE
> + umlsl v3.4s, v22.4h, v31.4h
> + umlal v3.4s, v23.4h, v31.4h
> + ld1 {v21.2s}, [TMP1]
> + umull v8.8h, v20.8b, v28.8b
> + umlal v8.8h, v21.8b, v29.8b
> + mov v16.d[0], v8.d[0]
> + mov v17.d[0], v8.d[1]
> + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + shrn v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + mov v0.d[1], v1.d[0]
> + shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + ld1 {v22.2s}, [TMP2], STRIDE
> + shrn v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> + mov v2.d[0], v4.d[0]
> + mov v2.d[1], v5.d[0]
> + add v12.8h, v12.8h, v13.8h
> + ld1 {v23.2s}, [TMP2]
> + umull v9.8h, v22.8b, v28.8b
> + asr TMP3, X, #16
> + add X, X, UX
> + lsl TMP4, TMP3, #2
> + add TMP3, TOP, TMP4
> + asr TMP4, X, #16
> + add X, X, UX
> + lsl TMP1, TMP4, #2
> + add TMP4, TOP, TMP1
> + umlal v9.8h, v23.8b, v29.8b
> + mov v18.d[0], v9.d[0]
> + mov v19.d[0], v9.d[1]
> + ld1 {v22.2s}, [TMP3], STRIDE
> + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> + mov v30.d[0], v15.d[0]
> + mov v31.d[0], v15.d[1]
> + ld1 {v23.2s}, [TMP3]
> + umull v10.8h, v22.8b, v28.8b
> + umlal v10.8h, v23.8b, v29.8b
> + mov v20.d[0], v10.d[0]
> + mov v21.d[0], v10.d[1]
> + xtn v6.8b, v0.8h
> + ushll v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
> + xtn v7.8b, v2.8h
> + umlsl v0.4s, v16.4h, v30.4h
> + umlal v0.4s, v17.4h, v30.4h
> + prfm pldl2strm, [TMP4, PF_OFFS]
> + ld1 {v16.2s}, [TMP4], STRIDE
> + add v12.8h, v12.8h, v13.8h
> + ld1 {v17.2s}, [TMP4]
> + prfm pldl2strm, [TMP4, PF_OFFS]
> + umull v11.8h, v16.8b, v28.8b
> + umlal v11.8h, v17.8b, v29.8b
> + mov v22.d[0], v11.d[0]
> + mov v23.d[0], v11.d[1]
> + st1 {v6.2s, v7.2s}, [OUT], #16
> + ushll v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
> + umlsl v1.4s, v18.4h, v31.4h
> +.endm
> +
> +/*****************************************************************************/
> +
> +generate_bilinear_scanline_func \
> + pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
> + 2, 2, 28, BILINEAR_FLAG_UNROLL_4
> +
> +generate_bilinear_scanline_func \
> + pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
> + 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
> +
> +generate_bilinear_scanline_func \
> + pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
> + 1, 2, 28, BILINEAR_FLAG_UNROLL_4
> +
> +generate_bilinear_scanline_func \
> + pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
> + 1, 1, 28, BILINEAR_FLAG_UNROLL_4
> diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
> new file mode 100644
> index 0000000..97cde5d
> --- /dev/null
> +++ b/pixman/pixman-arma64-neon-asm.h
> @@ -0,0 +1,1288 @@
> +/*
> + * Copyright © 2009 Nokia Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + *
> + * Author: Siarhei Siamashka (siarhei.siamashka at nokia.com)
> + */
> +
> +/*
> + * This file contains a macro ('generate_composite_function') which can
> + * construct 2D image processing functions, based on a common template.
> + * Any combinations of source, destination and mask images with 8bpp,
> + * 16bpp, 24bpp, 32bpp color formats are supported.
> + *
> + * This macro takes care of:
> + * - handling of leading and trailing unaligned pixels
> + * - doing most of the work related to L2 cache preload
> + * - encourages the use of software pipelining for better instruction
> + * scheduling
> + *
> + * The user of this macro has to provide some configuration parameters
> + * (bit depths for the images, prefetch distance, etc.) and a set of
> + * macros, which should implement basic code chunks responsible for
> + * pixels processing. See 'pixman-arma64-neon-asm.S' file for the usage
> + * examples.
> + *
> + * TODO:
> + * - try overlapped pixel method (from Ian Rickards) when processing
> + * exactly two blocks of pixels
> + * - maybe add an option to do reverse scanline processing
> + */
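
The "head / tail / tail_head" decomposition mentioned above is plain software
pipelining. A minimal C sketch of the loop shape the template emits (the
function names are made up for illustration, this is not pixman code):

    #include <stdio.h>

    /* Stand-ins for the user supplied process_pixblock_* macros. */
    static void head (void)      { puts ("head:      load block N, start ALU work"); }
    static void tail (void)      { puts ("tail:      finish block N, store it"); }
    static void tail_head (void) { puts ("tail_head: finish block N, store, start N+1"); }

    /* Rough shape of the main loop for one scanline of w pixels,
     * assuming w >= 2 * pixblock_size. */
    static void
    scanline_sketch (int w, int pixblock_size)
    {
        head ();
        w -= 2 * pixblock_size;
        while (w >= 0)
        {
            tail_head ();
            w -= pixblock_size;
        }
        tail ();
        /* the remaining (w & (pixblock_size - 1)) pixels go through
         * process_trailing_pixels instead */
    }

    int main (void) { scanline_sketch (32, 8); return 0; }
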
> +
> +/*
> + * Bit flags for 'generate_composite_function' macro which are used
> + * to tune generated functions behavior.
> + */
> +.set FLAG_DST_WRITEONLY, 0
> +.set FLAG_DST_READWRITE, 1
> +.set FLAG_DEINTERLEAVE_32BPP, 2
> +
> +/*
> + * Constants for selecting preferable prefetch type.
> + */
> +.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
> +.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */
> +.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
> +
> +/*
> + * Definitions of supplementary pixld/pixst macros (for partial load/store of
> + * pixel data).
> + */
> +
> +.macro pixldst1 op, elem_size, reg1, mem_operand, abits
> + op {v&reg1&.&elem_size}, [&mem_operand&], #8
> +.endm
> +
> +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
> + op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
> +.endm
> +
> +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
> + op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
> +.endm
> +
> +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
> + op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
> +.endm
> +
> +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
> + op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
> +.endm
> +
> +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
> + op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
> +.endm
> +
> +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
> +.if numbytes == 32
> + .if elem_size==32
> + pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
> + %(basereg+6), %(basereg+7), mem_operand, abits
> + .elseif elem_size==16
> + pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
> + %(basereg+6), %(basereg+7), mem_operand, abits
> + .else
> + pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
> + %(basereg+6), %(basereg+7), mem_operand, abits
> + .endif
> +.elseif numbytes == 16
> + .if elem_size==32
> + pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
> + .elseif elem_size==16
> + pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
> + .else
> + pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
> + .endif
> +.elseif numbytes == 8
> + .if elem_size==32
> + pixldst1 op, 2s, %(basereg+1), mem_operand, abits
> + .elseif elem_size==16
> + pixldst1 op, 4h, %(basereg+1), mem_operand, abits
> + .else
> + pixldst1 op, 8b, %(basereg+1), mem_operand, abits
> + .endif
> +.elseif numbytes == 4
> + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
> + pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
> + .elseif elem_size == 16
> + pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
> + pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
> + .else
> + pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
> + pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
> + pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
> + pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
> + .endif
> +.elseif numbytes == 2
> + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
> + pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
> + .else
> + pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
> + pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
> + .endif
> +.elseif numbytes == 1
> + pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
> +.else
> + .error "unsupported size: numbytes"
> +.endif
> +.endm
> +
> +.macro pixld numpix, bpp, basereg, mem_operand, abits=0
> +.if bpp > 0
> +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> + pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
> + %(basereg+6), %(basereg+7), mem_operand, abits
> +.elseif (bpp == 24) && (numpix == 8)
> + pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
> +.elseif (bpp == 24) && (numpix == 4)
> + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
> + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
> + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
> + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
> +.elseif (bpp == 24) && (numpix == 2)
> + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
> + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
> +.elseif (bpp == 24) && (numpix == 1)
> + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
> +.else
> + pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
> +.endif
> +.endif
> +.endm
> +
> +.macro pixst numpix, bpp, basereg, mem_operand, abits=0
> +.if bpp > 0
> +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> + pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
> + %(basereg+6), %(basereg+7), mem_operand, abits
> +.elseif (bpp == 24) && (numpix == 8)
> + pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
> +.elseif (bpp == 24) && (numpix == 4)
> + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
> + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
> + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
> + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
> +.elseif (bpp == 24) && (numpix == 2)
> + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
> + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
> +.elseif (bpp == 24) && (numpix == 1)
> + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
> +.else
> + pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
> +.endif
> +.endif
> +.endm
> +
> +.macro pixld_a numpix, bpp, basereg, mem_operand
> +.if (bpp * numpix) <= 128
> + pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
> +.else
> + pixld numpix, bpp, basereg, mem_operand, 128
> +.endif
> +.endm
> +
> +.macro pixst_a numpix, bpp, basereg, mem_operand
> +.if (bpp * numpix) <= 128
> + pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
> +.else
> + pixst numpix, bpp, basereg, mem_operand, 128
> +.endif
> +.endm
> +
> +/*
> + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
> + * aliases to be defined)
> + */
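
A literal scalar transcription of one 32bpp fetch performed by pixld1_s
below (sketch only; how VX is biased relative to mem_operand is decided by
the C caller and is not shown here):

    #include <stdint.h>

    static uint32_t
    fetch_one_nearest (const uint32_t *mem_operand, int32_t *vx,
                       int32_t unit_x, int32_t src_width_fixed)
    {
        int32_t idx = *vx >> 16;     /* asr  TMP1, VX, #16          */
        *vx += unit_x;               /* adds VX, VX, UNIT_X         */
        while (*vx >= 0)             /* bmi 55f / subs / bpl 5b:    */
            *vx -= src_width_fixed;  /* wrap back into the source   */
        return mem_operand[idx];     /* ld1 {v..s}[lane], [TMP1]    */
    }
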
> +.macro pixld1_s elem_size, reg1, mem_operand
> +.if elem_size == 16
> + asr TMP1, VX, #16
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP1, #1
> + add TMP1, mem_operand, DUMMY
> + asr TMP2, VX, #16
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP2, #1
> + add TMP2, mem_operand, DUMMY
> + ld1 {v&reg1&.h}[0], [TMP1]
> + asr TMP1, VX, #16
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP1, #1
> + add TMP1, mem_operand, DUMMY
> + ld1 {v&reg1&.h}[1], [TMP2]
> + asr TMP2, VX, #16
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP2, #1
> + add TMP2, mem_operand, DUMMY
> + ld1 {v&reg1&.h}[2], [TMP1]
> + ld1 {v&reg1&.h}[3], [TMP2]
> +.elseif elem_size == 32
> + asr TMP1, VX, #16
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP1, #2
> + add TMP1, mem_operand, DUMMY
> + asr TMP2, VX, #16
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP2, #2
> + add TMP2, mem_operand, DUMMY
> + ld1 {v&reg1&.s}[0], [TMP1]
> + ld1 {v&reg1&.s}[1], [TMP2]
> +.else
> + .error "unsupported"
> +.endif
> +.endm
> +
> +.macro pixld2_s elem_size, reg1, reg2, mem_operand
> +.if 0 /* elem_size == 32 */
> + mov TMP1, VX, asr #16
> + add VX, VX, UNIT_X, asl #1
> + add TMP1, mem_operand, TMP1, asl #2
> + mov TMP2, VX, asr #16
> + sub VX, VX, UNIT_X
> + add TMP2, mem_operand, TMP2, asl #2
> + ld1 {v&reg1&.s}[0], [TMP1]
> + mov TMP1, VX, asr #16
> + add VX, VX, UNIT_X, asl #1
> + add TMP1, mem_operand, TMP1, asl #2
> + ld1 {v&reg2&.s}[0], [TMP2, :32]
> + mov TMP2, VX, asr #16
> + add VX, VX, UNIT_X
> + add TMP2, mem_operand, TMP2, asl #2
> + ld1 {v&reg1&.s}[1], [TMP1]
> + ld1 {v&reg2&.s}[1], [TMP2]
> +.else
> + pixld1_s elem_size, reg1, mem_operand
> + pixld1_s elem_size, reg2, mem_operand
> +.endif
> +.endm
> +
> +.macro pixld0_s elem_size, reg1, idx, mem_operand
> +.if elem_size == 16
> + asr TMP1, VX, #16
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP1, #1
> + add TMP1, mem_operand, DUMMY
> + ld1 {v&reg1&.h}[idx], [TMP1]
> +.elseif elem_size == 32
> + asr DUMMY, VX, #16
> + mov TMP1, DUMMY
> + adds VX, VX, UNIT_X
> + bmi 55f
> +5: subs VX, VX, SRC_WIDTH_FIXED
> + bpl 5b
> +55:
> + lsl DUMMY, TMP1, #2
> + add TMP1, mem_operand, DUMMY
> + ld1 {v&reg1&.s}[idx], [TMP1]
> +.endif
> +.endm
> +
> +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
> +.if numbytes == 32
> + pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
> + pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
> + pixdeinterleave elem_size, %(basereg+4)
> +.elseif numbytes == 16
> + pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
> +.elseif numbytes == 8
> + pixld1_s elem_size, %(basereg+1), mem_operand
> +.elseif numbytes == 4
> + .if elem_size == 32
> + pixld0_s elem_size, %(basereg+0), 1, mem_operand
> + .elseif elem_size == 16
> + pixld0_s elem_size, %(basereg+0), 2, mem_operand
> + pixld0_s elem_size, %(basereg+0), 3, mem_operand
> + .else
> + pixld0_s elem_size, %(basereg+0), 4, mem_operand
> + pixld0_s elem_size, %(basereg+0), 5, mem_operand
> + pixld0_s elem_size, %(basereg+0), 6, mem_operand
> + pixld0_s elem_size, %(basereg+0), 7, mem_operand
> + .endif
> +.elseif numbytes == 2
> + .if elem_size == 16
> + pixld0_s elem_size, %(basereg+0), 1, mem_operand
> + .else
> + pixld0_s elem_size, %(basereg+0), 2, mem_operand
> + pixld0_s elem_size, %(basereg+0), 3, mem_operand
> + .endif
> +.elseif numbytes == 1
> + pixld0_s elem_size, %(basereg+0), 1, mem_operand
> +.else
> + .error "unsupported size: numbytes"
> +.endif
> +.endm
> +
> +.macro pixld_s numpix, bpp, basereg, mem_operand
> +.if bpp > 0
> + pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
> +.endif
> +.endm
> +
> +.macro vuzp8 reg1, reg2
> + umov DUMMY, v16.d[0]
> + uzp1 v16.8b, v&reg1&.8b, v&reg2&.8b
> + uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
> + mov v&reg1&.8b, v16.8b
> + mov v16.d[0], DUMMY
> +.endm
> +
> +.macro vzip8 reg1, reg2
> + umov DUMMY, v16.d[0]
> + zip1 v16.8b, v&reg1&.8b, v&reg2&.8b
> + zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
> + mov v&reg1&.8b, v16.8b
> + mov v16.d[0], DUMMY
> +.endm
> +
> +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
> +.macro pixdeinterleave bpp, basereg
> +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> + vuzp8 %(basereg+0), %(basereg+1)
> + vuzp8 %(basereg+2), %(basereg+3)
> + vuzp8 %(basereg+1), %(basereg+3)
> + vuzp8 %(basereg+0), %(basereg+2)
> +.endif
> +.endm
> +
> +/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
> +.macro pixinterleave bpp, basereg
> +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> + vzip8 %(basereg+0), %(basereg+2)
> + vzip8 %(basereg+1), %(basereg+3)
> + vzip8 %(basereg+2), %(basereg+3)
> + vzip8 %(basereg+0), %(basereg+1)
> +.endif
> +.endm
> +
> +/*
> + * This is a macro for implementing cache preload. The main idea is that
> + * cache preload logic is mostly independent from the rest of pixels
> + * processing code. It starts at the top left pixel and moves forward
> + * across pixels and can jump across scanlines. Prefetch distance is
> + * handled in an 'incremental' way: it starts from 0 and advances to the
> + * optimal distance over time. After reaching optimal prefetch distance,
> + * it is kept constant. There are some checks which prevent prefetching
> + * unneeded pixel lines below the image (but it still can prefetch a bit
> + * more data on the right side of the image - not a big issue and may
> + * be actually helpful when rendering text glyphs). Additional trick is
> + * the use of LDR instruction for prefetch instead of PLD when moving to
> + * the next line, the point is that we have a high chance of getting TLB
> + * miss in this case, and PLD would be useless.
> + *
> + * This sounds like it may introduce a noticeable overhead (when working with
> + * fully cached data). But in reality, due to having a separate pipeline and
> + * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
> + * execute simultaneously with NEON and be completely shadowed by it. Thus
> + * we get no performance overhead at all (*). This looks like a very nice
> + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
> + * but still can implement some rather advanced prefetch logic in software
> + * for almost zero cost!
> + *
> + * (*) The overhead of the prefetcher is visible when running some trivial
> + * pixels processing like simple copy. Anyway, having prefetch is a must
> + * when working with the graphics data.
> + */
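
In case the description above is hard to follow, here is a rough scalar model
of what one cache_preload invocation does when the ADVANCED prefetch type is
selected (a sketch using GCC's __builtin_prefetch; only the source stream is
shown, the destination and mask streams are handled identically, and the
bookkeeping that moves PF_SRC to the next line is omitted):

    #include <stdint.h>

    static void
    cache_preload_sketch (int std_increment, int boost_increment,
                          int *pf_x, int *pf_ctl, const uint8_t *pf_src,
                          int src_stride /* in pixels */,
                          int orig_w, int bpp_shift)
    {
        *pf_x += std_increment;
        if (*pf_ctl & 0xf)               /* still ramping the distance up  */
        {
            *pf_x += boost_increment;
            *pf_ctl -= 1;
        }

        /* prfm pldl2strm, [PF_SRC, PF_X << src_bpp_shift] */
        __builtin_prefetch (pf_src + (*pf_x << bpp_shift));

        if (*pf_x > orig_w)              /* prefetch ran past the scanline */
        {
            *pf_x -= orig_w;
            *pf_ctl -= 0x10;             /* one less scanline to prefetch  */
            if (*pf_ctl > 0)
            {
                /* touch the next line with a real load (the ldrsb) so a
                 * possible TLB miss is actually taken here, instead of
                 * being silently dropped as a plain prefetch hint would be */
                volatile uint8_t touch = pf_src[src_stride << bpp_shift];
                (void) touch;
            }
        }
    }
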
> +.macro PF a, x:vararg
> +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
> + a x
> +.endif
> +.endm
> +
> +.macro cache_preload std_increment, boost_increment
> +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
> +.if std_increment != 0
> + PF add PF_X, PF_X, #std_increment
> +.endif
> + PF tst PF_CTL, #0xF
> + PF beq 71f
> + PF add PF_X, PF_X, #boost_increment
> + PF sub PF_CTL, PF_CTL, #1
> +71:
> + PF cmp PF_X, ORIG_W
> +.if src_bpp_shift >= 0
> + PF lsl DUMMY, PF_X, #src_bpp_shift
> + PF prfm pldl2strm, [PF_SRC, DUMMY]
> +.endif
> +.if dst_r_bpp != 0
> + PF lsl DUMMY, PF_X, #dst_bpp_shift
> + PF prfm pldl2strm, [PF_DST, DUMMY]
> +.endif
> +.if mask_bpp_shift >= 0
> + PF lsl DUMMY, PF_X, #mask_bpp_shift
> + PF prfm pldl2strm, [PF_MASK, DUMMY]
> +.endif
> + PF ble 71f
> + PF sub PF_X, PF_X, ORIG_W
> + PF subs PF_CTL, PF_CTL, #0x10
> +71:
> + PF ble 72f
> +.if src_bpp_shift >= 0
> + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + PF ldrsb DUMMY, [PF_SRC, DUMMY]
> + PF add PF_SRC, PF_SRC, #1
> +.endif
> +.if dst_r_bpp != 0
> + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + PF ldrsb DUMMY, [PF_DST, DUMMY]
> + PF add PF_DST, PF_DST, #1
> +.endif
> +.if mask_bpp_shift >= 0
> + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> + PF ldrsb DUMMY, [PF_MASK, DUMMY]
> + PF add PF_MASK, PF_MASK, #1
> +.endif
> +72:
> +.endif
> +.endm
> +
> +.macro cache_preload_simple
> +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
> +.if src_bpp > 0
> + prfm pldl2strm, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
> +.endif
> +.if dst_r_bpp > 0
> + prfm pldl2strm, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
> +.endif
> +.if mask_bpp > 0
> + prfm pldl2strm, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
> +.endif
> +.endif
> +.endm
> +
> +.macro fetch_mask_pixblock
> + pixld pixblock_size, mask_bpp, \
> + (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> +.endm
> +
> +/*
> + * Macro which is used to process leading pixels until destination
> + * pointer is properly aligned (at 16 bytes boundary). When destination
> + * buffer uses 16bpp format, this is unnecessary, or even pointless.
> + */
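
In C terms the alignment handling below is roughly the following sketch (the
real macro composites the peeled pixels with a full head+tail pass rather
than calling a helper, and it also skips chunk sizes that exceed one
pixblock):

    #include <stdint.h>

    /* Peel off 1, 2, 4, ... pixels until dst is 16 byte aligned, so the
     * main loop can use aligned pixblock stores. */
    static int
    peel_leading_pixels_sketch (uintptr_t dst_addr, int dst_bpp, int *w)
    {
        int peeled = 0;
        int lowbit;

        for (lowbit = 1; lowbit < 16; lowbit <<= 1)
        {
            int npix = lowbit * 8 / dst_bpp;  /* pixels in 'lowbit' bytes */
            if (npix == 0 || !(dst_addr & lowbit))
                continue;
            /* load/composite/store npix pixels here */
            dst_addr += lowbit;
            *w -= npix;
            peeled += npix;
        }
        return peeled;
    }
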
> +.macro ensure_destination_ptr_alignment process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> +.if dst_w_bpp != 24
> + tst DST_R, #0xF
> + beq 52f
> +.irp lowbit, 1, 2, 4, 8, 16
> +local skip1
> +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
> +.if lowbit < 16 /* we don't need more than 16-byte alignment */
> + tst DST_R, #lowbit
> + beq 51f
> +.endif
> + pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
> + pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
> +.if dst_r_bpp > 0
> + pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
> +.else
> + add DST_R, DST_R, #lowbit
> +.endif
> + PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
> + sub W, W, #(lowbit * 8 / dst_w_bpp)
> +51:
> +.endif
> +.endr
> + pixdeinterleave src_bpp, src_basereg
> + pixdeinterleave mask_bpp, mask_basereg
> + pixdeinterleave dst_r_bpp, dst_r_basereg
> +
> + process_pixblock_head
> + cache_preload 0, pixblock_size
> + cache_preload_simple
> + process_pixblock_tail
> +
> + pixinterleave dst_w_bpp, dst_w_basereg
> +
> +.irp lowbit, 1, 2, 4, 8, 16
> +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
> +.if lowbit < 16 /* we don't need more than 16-byte alignment */
> + tst DST_W, #lowbit
> + beq 51f
> +.endif
> + pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
> +51:
> +.endif
> +.endr
> +.endif
> +52:
> +.endm
> +
> +/*
> + * Special code for processing up to (pixblock_size - 1) remaining
> + * trailing pixels. As SIMD processing performs operations on
> + * pixblock_size pixels, anything smaller than this has to be loaded
> + * and stored in a special way. Loading and storing of pixel data is
> + * performed in such a way that we fill some 'slots' in the NEON
> + * registers (some slots naturally are unused), then perform compositing
> + * operation as usual. In the end, the data is taken from these 'slots'
> + * and saved to memory.
> + *
> + * cache_preload_flag - allows prefetch to be suppressed when
> + * set to 0
> + * dst_aligned_flag - selects whether destination buffer
> + * is aligned
> + */
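
The remainder handling amounts to decomposing the trailing pixel count into
power-of-two chunks; each chunk that is present is loaded into its own
register 'slot', one normal head+tail pass composites everything, and the
same chunks are stored back. A compact sketch (illustration only):

    /* 'w' is the scanline width, pixblock_size the SIMD block size. */
    static void
    trailing_pixels_sketch (int w, int pixblock_size)
    {
        int remainder = w & (pixblock_size - 1);
        int loaded = 0;
        int chunk;

        for (chunk = pixblock_size / 2; chunk >= 1; chunk /= 2)
            if (remainder & chunk)
                loaded += chunk;  /* pixld 'chunk' pixels into their slot */

        /* process_pixblock_head + process_pixblock_tail run once here,
         * operating on all 'loaded' pixels at the same time */

        for (chunk = pixblock_size / 2; chunk >= 1; chunk /= 2)
            if (remainder & chunk)
                loaded -= chunk;  /* pixst 'chunk' pixels back from their slot */
    }
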
> +.macro process_trailing_pixels cache_preload_flag, \
> + dst_aligned_flag, \
> + process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> + tst W, #(pixblock_size - 1)
> + beq 52f
> +.irp chunk_size, 16, 8, 4, 2, 1
> +.if pixblock_size > chunk_size
> + tst W, #chunk_size
> + beq 51f
> + pixld_src chunk_size, src_bpp, src_basereg, SRC
> + pixld chunk_size, mask_bpp, mask_basereg, MASK
> +.if dst_aligned_flag != 0
> + pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
> +.else
> + pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
> +.endif
> +.if cache_preload_flag != 0
> + PF add PF_X, PF_X, #chunk_size
> +.endif
> +51:
> +.endif
> +.endr
> + pixdeinterleave src_bpp, src_basereg
> + pixdeinterleave mask_bpp, mask_basereg
> + pixdeinterleave dst_r_bpp, dst_r_basereg
> +
> + process_pixblock_head
> +.if cache_preload_flag != 0
> + cache_preload 0, pixblock_size
> + cache_preload_simple
> +.endif
> + process_pixblock_tail
> + pixinterleave dst_w_bpp, dst_w_basereg
> +.irp chunk_size, 16, 8, 4, 2, 1
> +.if pixblock_size > chunk_size
> + tst W, #chunk_size
> + beq 51f
> +.if dst_aligned_flag != 0
> + pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
> +.else
> + pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
> +.endif
> +51:
> +.endif
> +.endr
> +52:
> +.endm
> +
> +/*
> + * Macro, which performs all the needed operations to switch to the next
> + * scanline and start the next loop iteration unless all the scanlines
> + * are already processed.
> + */
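
I.e. the usual "advance by the stride, then step back by what was just
written" pointer bookkeeping; for a 32bpp destination it boils down to this
sketch (SRC and MASK get the same treatment):

    #include <stdint.h>

    static uint32_t *
    advance_to_next_scanline_sketch (uint32_t *dst_w, int dst_stride, int w)
    {
        /* dst_w currently points w pixels past the start of the row */
        return dst_w + dst_stride - w;
    }
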
> +.macro advance_to_next_scanline start_of_loop_label
> + mov W, ORIG_W
> + lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> + add DST_W, DST_W, DUMMY
> +.if src_bpp != 0
> + lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> + add SRC, SRC, DUMMY
> +.endif
> +.if mask_bpp != 0
> + lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> + add MASK, MASK, DUMMY
> +.endif
> +.if (dst_w_bpp != 24)
> + lsl DUMMY, W, #dst_bpp_shift
> + sub DST_W, DST_W, DUMMY
> +.endif
> +.if (src_bpp != 24) && (src_bpp != 0)
> + lsl DUMMY, W, #src_bpp_shift
> + sub SRC, SRC, DUMMY
> +.endif
> +.if (mask_bpp != 24) && (mask_bpp != 0)
> + lsl DUMMY, W, #mask_bpp_shift
> + sub MASK, MASK, DUMMY
> +.endif
> + subs H, H, #1
> + mov DST_R, DST_W
> + bge start_of_loop_label
> +.endm
> +
> +/*
> + * Registers are allocated in the following way by default:
> + * v0, v1, v2, v3 - reserved for loading source pixel data
> + * v4, v5, v6, v7 - reserved for loading destination pixel data
> + * v24, v25, v26, v27 - reserved for loading mask pixel data
> + * v28, v29, v30, v31 - final destination pixel data for writeback to memory
> + */
> +.macro generate_composite_function fname, \
> + src_bpp_, \
> + mask_bpp_, \
> + dst_w_bpp_, \
> + flags, \
> + pixblock_size_, \
> + prefetch_distance, \
> + init, \
> + cleanup, \
> + process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head, \
> + dst_w_basereg_ = 28, \
> + dst_r_basereg_ = 4, \
> + src_basereg_ = 0, \
> + mask_basereg_ = 24
> +
> + pixman_asm_function fname
> + stp x29, x30, [sp, -16]!
> + mov x29, sp
> + sub sp, sp, 232 /* push all registers */
> + sub x29, x29, 64
> + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
> + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
> + stp x8, x9, [x29, -80]
> + stp x10, x11, [x29, -96]
> + stp x12, x13, [x29, -112]
> + stp x14, x15, [x29, -128]
> + stp x16, x17, [x29, -144]
> + stp x18, x19, [x29, -160]
> + stp x20, x21, [x29, -176]
> + stp x22, x23, [x29, -192]
> + stp x24, x25, [x29, -208]
> + stp x26, x27, [x29, -224]
> + str x28, [x29, -232]
> +
> +/*
> + * Select prefetch type for this function. If prefetch distance is
> + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
> + * has to be used instead of ADVANCED.
> + */
> + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
> +.if prefetch_distance == 0
> + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
> +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
> + ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
> + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
> +.endif
> +
> +/*
> + * Make some macro arguments globally visible and accessible
> + * from other macros
> + */
> + .set src_bpp, src_bpp_
> + .set mask_bpp, mask_bpp_
> + .set dst_w_bpp, dst_w_bpp_
> + .set pixblock_size, pixblock_size_
> + .set dst_w_basereg, dst_w_basereg_
> + .set dst_r_basereg, dst_r_basereg_
> + .set src_basereg, src_basereg_
> + .set mask_basereg, mask_basereg_
> +
> + .macro pixld_src x:vararg
> + pixld x
> + .endm
> + .macro fetch_src_pixblock
> + pixld_src pixblock_size, src_bpp, \
> + (src_basereg - pixblock_size * src_bpp / 64), SRC
> + .endm
> +/*
> + * Assign symbolic names to registers
> + */
> + W .req x0 /* width (is updated during processing) */
> + H .req x1 /* height (is updated during processing) */
> + DST_W .req x2 /* destination buffer pointer for writes */
> + DST_STRIDE .req x3 /* destination image stride */
> + SRC .req x4 /* source buffer pointer */
> + SRC_STRIDE .req x5 /* source image stride */
> + MASK .req x6 /* mask pointer */
> + MASK_STRIDE .req x7 /* mask stride */
> +
> + DST_R .req x8 /* destination buffer pointer for reads */
> +
> + PF_CTL .req x9 /* combined lines counter and prefetch */
> + /* distance increment counter */
> + PF_X .req x10 /* pixel index in a scanline for current */
> + /* prefetch position */
> + PF_SRC .req x11 /* pointer to source scanline start */
> + /* for prefetch purposes */
> + PF_DST .req x12 /* pointer to destination scanline start */
> + /* for prefetch purposes */
> + PF_MASK .req x13 /* pointer to mask scanline start */
> + /* for prefetch purposes */
> +
> + ORIG_W .req x14 /* saved original width */
> + DUMMY .req x15 /* temporary register */
> +
> + sxtw x0, w0
> + sxtw x1, w1
> + sxtw x3, w3
> + sxtw x5, w5
> + sxtw x7, w7
> +
> + .set mask_bpp_shift, -1
> +.if src_bpp == 32
> + .set src_bpp_shift, 2
> +.elseif src_bpp == 24
> + .set src_bpp_shift, 0
> +.elseif src_bpp == 16
> + .set src_bpp_shift, 1
> +.elseif src_bpp == 8
> + .set src_bpp_shift, 0
> +.elseif src_bpp == 0
> + .set src_bpp_shift, -1
> +.else
> + .error "requested src bpp (src_bpp) is not supported"
> +.endif
> +.if mask_bpp == 32
> + .set mask_bpp_shift, 2
> +.elseif mask_bpp == 24
> + .set mask_bpp_shift, 0
> +.elseif mask_bpp == 8
> + .set mask_bpp_shift, 0
> +.elseif mask_bpp == 0
> + .set mask_bpp_shift, -1
> +.else
> + .error "requested mask bpp (mask_bpp) is not supported"
> +.endif
> +.if dst_w_bpp == 32
> + .set dst_bpp_shift, 2
> +.elseif dst_w_bpp == 24
> + .set dst_bpp_shift, 0
> +.elseif dst_w_bpp == 16
> + .set dst_bpp_shift, 1
> +.elseif dst_w_bpp == 8
> + .set dst_bpp_shift, 0
> +.else
> + .error "requested dst bpp (dst_w_bpp) is not supported"
> +.endif
> +
> +.if (((flags) & FLAG_DST_READWRITE) != 0)
> + .set dst_r_bpp, dst_w_bpp
> +.else
> + .set dst_r_bpp, 0
> +.endif
> +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
> + .set DEINTERLEAVE_32BPP_ENABLED, 1
> +.else
> + .set DEINTERLEAVE_32BPP_ENABLED, 0
> +.endif
> +
> +.if prefetch_distance < 0 || prefetch_distance > 15
> + .error "invalid prefetch distance (prefetch_distance)"
> +.endif
> +
> + PF mov PF_X, #0
> + mov DST_R, DST_W
> +
> +.if src_bpp == 24
> + sub SRC_STRIDE, SRC_STRIDE, W
> + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
> +.endif
> +.if mask_bpp == 24
> + sub MASK_STRIDE, MASK_STRIDE, W
> + sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
> +.endif
> +.if dst_w_bpp == 24
> + sub DST_STRIDE, DST_STRIDE, W
> + sub DST_STRIDE, DST_STRIDE, W, lsl #1
> +.endif
> +
> +/*
> + * Setup advanced prefetcher initial state
> + */
> + PF mov PF_SRC, SRC
> + PF mov PF_DST, DST_R
> + PF mov PF_MASK, MASK
> + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
> + PF lsl DUMMY, H, #4
> + PF mov PF_CTL, DUMMY
> + PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
> +
> + init
> + subs H, H, #1
> + mov ORIG_W, W
> + blt 9f
> + cmp W, #(pixblock_size * 2)
> + blt 800f
> +/*
> + * This is the start of the pipelined loop, which is optimized for
> + * long scanlines
> + */
> +0:
> + ensure_destination_ptr_alignment process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> +
> + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
> + pixld_a pixblock_size, dst_r_bpp, \
> + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
> + fetch_src_pixblock
> + pixld pixblock_size, mask_bpp, \
> + (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> + PF add PF_X, PF_X, #pixblock_size
> + process_pixblock_head
> + cache_preload 0, pixblock_size
> + cache_preload_simple
> + subs W, W, #(pixblock_size * 2)
> + blt 200f
> +
> +100:
> + process_pixblock_tail_head
> + cache_preload_simple
> + subs W, W, #pixblock_size
> + bge 100b
> +
> +200:
> + process_pixblock_tail
> + pixst_a pixblock_size, dst_w_bpp, \
> + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
> +
> + /* Process the remaining trailing pixels in the scanline */
> + process_trailing_pixels 1, 1, \
> + process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> + advance_to_next_scanline 0b
> +
> + cleanup
> +1000:
> + /* pop all registers */
> + sub x29, x29, 64
> + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> + ldp x8, x9, [x29, -80]
> + ldp x10, x11, [x29, -96]
> + ldp x12, x13, [x29, -112]
> + ldp x14, x15, [x29, -128]
> + ldp x16, x17, [x29, -144]
> + ldp x18, x19, [x29, -160]
> + ldp x20, x21, [x29, -176]
> + ldp x22, x23, [x29, -192]
> + ldp x24, x25, [x29, -208]
> + ldp x26, x27, [x29, -224]
> + ldr x28, [x29, -232]
> + mov sp, x29
> + ldp x29, x30, [sp], 16
> + ret /* exit */
> +/*
> + * This is the start of the loop, designed to process images with small width
> + * (less than pixblock_size * 2 pixels). In this case neither pipelining
> + * nor prefetch is used.
> + */
> +800:
> + /* Process exactly pixblock_size pixels if needed */
> + tst W, #pixblock_size
> + beq 100f
> + pixld pixblock_size, dst_r_bpp, \
> + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
> + fetch_src_pixblock
> + pixld pixblock_size, mask_bpp, \
> + (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> + process_pixblock_head
> + process_pixblock_tail
> + pixst pixblock_size, dst_w_bpp, \
> + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
> +100:
> + /* Process the remaining trailing pixels in the scanline */
> + process_trailing_pixels 0, 0, \
> + process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> + advance_to_next_scanline 800b
> +9:
> + cleanup
> + /* pop all registers */
> + sub x29, x29, 64
> + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> + ldp x8, x9, [x29, -80]
> + ldp x10, x11, [x29, -96]
> + ldp x12, x13, [x29, -112]
> + ldp x14, x15, [x29, -128]
> + ldp x16, x17, [x29, -144]
> + ldp x18, x19, [x29, -160]
> + ldp x20, x21, [x29, -176]
> + ldp x22, x23, [x29, -192]
> + ldp x24, x25, [x29, -208]
> + ldp x26, x27, [x29, -224]
> + ldr x28, [x29, -232]
> + mov sp, x29
> + ldp x29, x30, [sp], 16
> + ret /* exit */
> +
> + .purgem fetch_src_pixblock
> + .purgem pixld_src
> +
> + .unreq SRC
> + .unreq MASK
> + .unreq DST_R
> + .unreq DST_W
> + .unreq ORIG_W
> + .unreq W
> + .unreq H
> + .unreq SRC_STRIDE
> + .unreq DST_STRIDE
> + .unreq MASK_STRIDE
> + .unreq PF_CTL
> + .unreq PF_X
> + .unreq PF_SRC
> + .unreq PF_DST
> + .unreq PF_MASK
> + .unreq DUMMY
> + .endfunc
> +.endm
> +
> +/*
> + * A simplified variant of the function generation template for single
> + * scanline processing (for implementing pixman combine functions)
> + */
> +.macro generate_composite_function_scanline use_nearest_scaling, \
> + fname, \
> + src_bpp_, \
> + mask_bpp_, \
> + dst_w_bpp_, \
> + flags, \
> + pixblock_size_, \
> + init, \
> + cleanup, \
> + process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head, \
> + dst_w_basereg_ = 28, \
> + dst_r_basereg_ = 4, \
> + src_basereg_ = 0, \
> + mask_basereg_ = 24
> +
> + pixman_asm_function fname
> + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
> +
> +/*
> + * Make some macro arguments globally visible and accessible
> + * from other macros
> + */
> + .set src_bpp, src_bpp_
> + .set mask_bpp, mask_bpp_
> + .set dst_w_bpp, dst_w_bpp_
> + .set pixblock_size, pixblock_size_
> + .set dst_w_basereg, dst_w_basereg_
> + .set dst_r_basereg, dst_r_basereg_
> + .set src_basereg, src_basereg_
> + .set mask_basereg, mask_basereg_
> +
> +.if use_nearest_scaling != 0
> + /*
> + * Assign symbolic names to registers for nearest scaling
> + */
> + W .req x0
> + DST_W .req x1
> + SRC .req x2
> + VX .req x3
> + UNIT_X .req x4
> + SRC_WIDTH_FIXED .req x5
> + MASK .req x6
> + TMP1 .req x8
> + TMP2 .req x9
> + DST_R .req x10
> + DUMMY .req x30
> +
> + .macro pixld_src x:vararg
> + pixld_s x
> + .endm
> +
> + sxtw x0, w0
> + sxtw x3, w3
> + sxtw x4, w4
> + sxtw x5, w5
> +
> + stp x29, x30, [sp, -16]!
> + mov x29, sp
> + sub sp, sp, 88
> + sub x29, x29, 64
> + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> + stp x8, x9, [x29, -80]
> + str x10, [x29, -88]
> +.else
> + /*
> + * Assign symbolic names to registers
> + */
> + W .req x0 /* width (is updated during processing) */
> + DST_W .req x1 /* destination buffer pointer for writes */
> + SRC .req x2 /* source buffer pointer */
> + MASK .req x3 /* mask pointer */
> + DST_R .req x4 /* destination buffer pointer for reads */
> + DUMMY .req x30
> +
> + .macro pixld_src x:vararg
> + pixld x
> + .endm
> +
> + sxtw x0, w0
> +
> + stp x29, x30, [sp, -16]!
> + mov x29, sp
> + sub sp, sp, 64
> + sub x29, x29, 64
> + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +.endif
> +
> +.if (((flags) & FLAG_DST_READWRITE) != 0)
> + .set dst_r_bpp, dst_w_bpp
> +.else
> + .set dst_r_bpp, 0
> +.endif
> +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
> + .set DEINTERLEAVE_32BPP_ENABLED, 1
> +.else
> + .set DEINTERLEAVE_32BPP_ENABLED, 0
> +.endif
> +
> + .macro fetch_src_pixblock
> + pixld_src pixblock_size, src_bpp, \
> + (src_basereg - pixblock_size * src_bpp / 64), SRC
> + .endm
> +
> + init
> + mov DST_R, DST_W
> +
> + cmp W, #pixblock_size
> + blt 800f
> +
> + ensure_destination_ptr_alignment process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> +
> + subs W, W, #pixblock_size
> + blt 700f
> +
> + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
> + pixld_a pixblock_size, dst_r_bpp, \
> + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
> + fetch_src_pixblock
> + pixld pixblock_size, mask_bpp, \
> + (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> + process_pixblock_head
> + subs W, W, #pixblock_size
> + blt 200f
> +100:
> + process_pixblock_tail_head
> + subs W, W, #pixblock_size
> + bge 100b
> +200:
> + process_pixblock_tail
> + pixst_a pixblock_size, dst_w_bpp, \
> + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
> +700:
> + /* Process the remaining trailing pixels in the scanline (dst aligned) */
> + process_trailing_pixels 0, 1, \
> + process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> +
> + cleanup
> +.if use_nearest_scaling != 0
> + sub x29, x29, 64
> + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> + ldp x8, x9, [x29, -80]
> + ldr x10, [x29, -88]
> + mov sp, x29
> + ldp x29, x30, [sp], 16
> + ret /* exit */
> +.else
> + sub x29, x29, 64
> + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> + mov sp, x29
> + ldp x29, x30, [sp], 16
> + ret /* exit */
> +.endif
> +800:
> + /* Process the remaining trailing pixels in the scanline (dst unaligned) */
> + process_trailing_pixels 0, 0, \
> + process_pixblock_head, \
> + process_pixblock_tail, \
> + process_pixblock_tail_head
> +
> + cleanup
> +.if use_nearest_scaling != 0
> + sub x29, x29, 64
> + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> + ldp x8, x9, [x29, -80]
> + ldr x10, [x29, -88]
> + mov sp, x29
> + ldp x29, x30, [sp], 16
> + ret /* exit */
> +
> + .unreq DUMMY
> + .unreq DST_R
> + .unreq SRC
> + .unreq W
> + .unreq VX
> + .unreq UNIT_X
> + .unreq TMP1
> + .unreq TMP2
> + .unreq DST_W
> + .unreq MASK
> + .unreq SRC_WIDTH_FIXED
> +
> +.else
> + sub x29, x29, 64
> + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> + mov sp, x29
> + ldp x29, x30, [sp], 16
> + ret /* exit */
> +
> + .unreq DUMMY
> + .unreq SRC
> + .unreq MASK
> + .unreq DST_R
> + .unreq DST_W
> + .unreq W
> +.endif
> +
> + .purgem fetch_src_pixblock
> + .purgem pixld_src
> +
> + .endfunc
> +.endm
> +
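For anyone not used to the "head (tail_head) ... (tail_head) tail" structure above, here is a rough C model of the control flow only. The process_* stubs are hypothetical stand-ins for the process_pixblock_* macro parameters, destination alignment is glossed over, and the width bookkeeping for the trailing pixels is simplified:

static void process_head (void)      { /* load first block, start work    */ }
static void process_tail (void)      { /* finish and store the last block */ }
static void process_tail_head (void) { /* store previous, start next      */ }
static void process_trailing (void)  { /* handle the < pixblock_size rest */ }

static void
compose_scanline (int w, int pixblock_size)
{
    if (w < pixblock_size)
    {
        process_trailing ();        /* corresponds to label 800 above */
        return;
    }

    /* (the real code aligns the destination pointers here) */

    w -= pixblock_size;
    if (w >= 0)
    {
        process_head ();
        w -= pixblock_size;
        while (w >= 0)              /* label 100 above */
        {
            process_tail_head ();
            w -= pixblock_size;
        }
        process_tail ();            /* label 200 above */
    }
    process_trailing ();            /* label 700 above */
}

The point of the tail_head stage is software pipelining: the stores for block N and the loads plus first ALU work for block N+1 are interleaved in one macro, so the loop body can hide memory latency.
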
> +.macro generate_composite_function_single_scanline x:vararg
> + generate_composite_function_scanline 0, x
> +.endm
> +
> +.macro generate_composite_function_nearest_scanline x:vararg
> + generate_composite_function_scanline 1, x
> +.endm
> +
> +/* Default prologue/epilogue, nothing special needs to be done */
> +
> +.macro default_init
> +.endm
> +
> +.macro default_cleanup
> +.endm
> +
> +/*
> + * Prologue/epilogue variant which additionally saves/restores v8-v15
> + * registers (they need to be saved/restored by the callee according to the ABI).
> + * This is required if the code needs to use all the NEON registers.
> + */
> +
> +.macro default_init_need_all_regs
> +.endm
> +
> +.macro default_cleanup_need_all_regs
> +.endm
> +
> +/******************************************************************************/
> +
> +/*
> + * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
> + * into a planar a8r8g8b8 format (with a, r, g, b color components
> + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
> + *
> + * Warning: the conversion is destructive and the original
> + * value (in) is lost.
> + */
> +.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
> + shrn &out_r&.8b, &in&.8h, #8
> + shrn &out_g&.8b, &in&.8h, #3
> + sli &in&.8h, &in&.8h, #5
> + movi &out_a&.8b, #255
> + sri &out_r&.8b, &out_r&.8b, #5
> + sri &out_g&.8b, &out_g&.8b, #6
> + shrn &out_b&.8b, &in&.8h, #2
> +.endm
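As a plain C reference for what convert_0565_to_8888 computes per pixel (only a sketch for readability, not part of the patch): each 5- or 6-bit channel is widened to 8 bits by replicating its top bits into the freed low bits, which is what the sri instructions do, and alpha is forced to 0xff.

#include <stdint.h>

/* Scalar equivalent of convert_0565_to_8888, one pixel at a time. */
static inline void
r5g6b5_to_a8r8g8b8 (uint16_t p,
                    uint8_t *a, uint8_t *r, uint8_t *g, uint8_t *b)
{
    uint8_t r5 = (p >> 11) & 0x1f;
    uint8_t g6 = (p >> 5)  & 0x3f;
    uint8_t b5 =  p        & 0x1f;

    *a = 0xff;
    *r = (uint8_t)((r5 << 3) | (r5 >> 2));   /* replicate top bits downwards */
    *g = (uint8_t)((g6 << 2) | (g6 >> 4));
    *b = (uint8_t)((b5 << 3) | (b5 >> 2));
}
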
> +
> +.macro convert_0565_to_x888 in, out_r, out_g, out_b
> + shrn &out_r&.8b, &in&.8h, #8
> + shrn &out_g&.8b, &in&.8h, #3
> + sli &in&.8h, &in&.8h, #5
> + sri &out_r&.8b, &out_r&.8b, #5
> + sri &out_g&.8b, &out_g&.8b, #6
> + shrn &out_b&.8b, &in&.8h, #2
> +.endm
> +
> +/*
> + * Conversion from planar a8r8g8b8 format (with the r, g, b color components
> + * in 64-bit registers in_r, in_g, in_b respectively; alpha is not needed)
> + * into 8 r5g6b5 pixels packed in a 128-bit register (out). Requires two
> + * temporary 128-bit registers (tmp1, tmp2)
> + */
> +.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
> + ushll &tmp1&.8h, &in_g&.8b, #7
> + shl &tmp1&.8h, &tmp1&.8h, #1
> + ushll &out&.8h, &in_r&.8b, #7
> + shl &out&.8h, &out&.8h, #1
> + ushll &tmp2&.8h, &in_b&.8b, #7
> + shl &tmp2&.8h, &tmp2&.8h, #1
> + sri &out&.8h, &tmp1&.8h, #5
> + sri &out&.8h, &tmp2&.8h, #11
> +.endm
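The opposite direction, again as an illustrative scalar sketch rather than patch code: convert_8888_to_0565 keeps only the top 5/6/5 bits of each channel, which is what the two sri inserts accumulate into 'out'.

#include <stdint.h>

/* Scalar equivalent of convert_8888_to_0565, one pixel at a time. */
static inline uint16_t
a8r8g8b8_to_r5g6b5 (uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
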
> +
> +/*
> + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
> + * returned in the (out0, out1) register pair. Requires one temporary
> + * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
> + * value of 'in' is lost
> + */
> +.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
> + shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
> + shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
> + sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
> + sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
> + sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
> + ushr &out1&.4h, &in&.4h, #8 /* R is in place */
> + sri &out0&.4h, &tmp&.4h, #8 /* G & B are in place */
> + zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
> + zip2 &out1&.4h, &out0&.4h, &out1&.4h
> + mov &out0&.d[0], &tmp&.d[0]
> +.endm
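The packed variant above yields the same channel values as the planar sketch earlier, but interleaved into whole x8r8g8b8 words (that is what the zip1/zip2 pair arranges). A scalar model of a single output word, purely for illustration:

#include <stdint.h>

/* Scalar model of one output word of convert_four_0565_to_x888_packed. */
static inline uint32_t
r5g6b5_to_x8r8g8b8 (uint16_t p)
{
    uint32_t r5 = (p >> 11) & 0x1f;
    uint32_t g6 = (p >> 5)  & 0x3f;
    uint32_t b5 =  p        & 0x1f;

    uint32_t r = (r5 << 3) | (r5 >> 2);
    uint32_t g = (g6 << 2) | (g6 >> 4);
    uint32_t b = (b5 << 3) | (b5 >> 2);

    return (r << 16) | (g << 8) | b;   /* 'x' byte left as zero */
}
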
> diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
> old mode 100644
> new mode 100755
> index 73a5414..81e0f23
> --- a/pixman/pixman-private.h
> +++ b/pixman/pixman-private.h
> @@ -607,6 +607,11 @@ pixman_implementation_t *
> _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
> #endif
>
> +#ifdef USE_ARM_A64_NEON
> +pixman_implementation_t *
> +_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
> +#endif
> +
> #ifdef USE_MIPS_DSPR2
> pixman_implementation_t *
> _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
> --
> 2.7.4