pixman: Branch 'master'

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Feb 29 14:46:57 UTC 2024


 meson.build                           |    7 
 pixman/pixman-arm-asm.h               |    6 
 pixman/pixman-arm-neon-asm-bilinear.S |  362 ++++++++++----------
 pixman/pixman-arm-neon-asm.S          |  436 ++++++++++++------------
 pixman/pixman-arm-neon-asm.h          |  607 ++++++++++++++++------------------
 pixman/pixman-arm-simd-asm-scaled.S   |   42 +-
 pixman/pixman-arm-simd-asm.S          |  470 +++++++++++++-------------
 pixman/pixman-arm-simd-asm.h          |  300 ++++++++--------
 8 files changed, 1125 insertions(+), 1105 deletions(-)

New commits:
commit 74130e84c577f9ce1a54be40104f43ead8b8dac3
Author: Heiko Lewin <hlewin at worldiety.de>
Date:   Thu Feb 29 14:46:55 2024 +0000

    Allow building pixman on clang/arm32

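Most of the patch below mechanically rewrites the GAS macro bodies so that parameters are referenced with a leading backslash (\reg1 instead of a bare reg1) and concatenated macro names use the \() separator (bilinear_load_\()\src_fmt instead of bilinear_load_&src_fmt), which is the spelling clang's integrated assembler accepts. A minimal illustrative sketch of the two idioms, with invented macro names that are not part of the patch:

    /* Parameters inside a .macro body are substituted via \name. */
    .macro load_pair  reg1, reg2
        vld1.32   {\reg1}, [r0]!
        vld1.32   {\reg2}, [r0]!
    .endm

    /* \() ends the parameter name so it can be spliced into a longer
     * symbol: load_pair_\()\fmt expands to load_pair_8888 when the
     * macro is invoked with fmt=8888. */
    .macro load_pair_fmt  fmt
        load_pair_\()\fmt  d0, d1
    .endm
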
diff --git a/meson.build b/meson.build
index 4337f93..438e6cf 100644
--- a/meson.build
+++ b/meson.build
@@ -252,6 +252,13 @@ if cc.compiles('''
     config.set('ASM_HAVE_FUNC_DIRECTIVE', 1)    
 endif
 
+if cc.compiles('''
+    __asm__ (
+    ".syntax unified\n"
+    );''',
+    name : 'test for ASM .syntax unified directive')
+    config.set('ASM_HAVE_SYNTAX_UNIFIED', 1)
+endif
 
 if cc.links('''
     #include <stdint.h>
diff --git a/pixman/pixman-arm-asm.h b/pixman/pixman-arm-asm.h
index 8253906..edf8e82 100644
--- a/pixman/pixman-arm-asm.h
+++ b/pixman/pixman-arm-asm.h
@@ -50,6 +50,12 @@
 #endif
 .endm
 
+.macro pixman_syntax_unified
+#ifdef ASM_HAVE_SYNTAX_UNIFIED
+	.syntax unified
+#endif
+.endm
+
 .macro pixman_end_asm_function
 #ifdef ASM_HAVE_FUNC_DIRECTIVE
 	.endfunc
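The pixman_syntax_unified macro added here lets each .S file opt into ARM unified assembler language (UAL) when the toolchain understands the directive; the assembly hunks further below then respell the pre-UAL predicated mnemonics accordingly, moving the condition code after the size or S suffix (ldrgeb becomes ldrbge, subges becomes subsge). A hedged side-by-side illustration, not taken from the patch:

    .syntax divided
        ldrgeb  r0, [r1]          @ pre-UAL: condition, then byte suffix
        subges  r2, r2, #0x10     @ pre-UAL: condition, then S flag

    .syntax unified
        ldrbge  r0, [r1]          @ UAL: ldrb, then condition
        subsge  r2, r2, #0x10     @ UAL: subs, then condition
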
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 0fd92d6..6bd2736 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -68,6 +68,8 @@
 #include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
 
+pixman_syntax_unified
+
 /*
  * Bilinear macros from pixman-arm-neon-asm.S
  */
@@ -82,28 +84,28 @@
     mov       TMP1, X, asr #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, asl #2
-    vld1.32   {reg1}, [TMP1], STRIDE
-    vld1.32   {reg2}, [TMP1]
+    vld1.32   {\reg1}, [TMP1], STRIDE
+    vld1.32   {\reg2}, [TMP1]
 .endm
 
 .macro bilinear_load_0565 reg1, reg2, tmp
     mov       TMP1, X, asr #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, asl #1
-    vld1.32   {reg2[0]}, [TMP1], STRIDE
-    vld1.32   {reg2[1]}, [TMP1]
-    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+    vld1.32   {\reg2[0]}, [TMP1], STRIDE
+    vld1.32   {\reg2[1]}, [TMP1]
+    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_two_8888 \
                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
 
-    bilinear_load_8888 reg1, reg2, tmp1
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
-    bilinear_load_8888 reg3, reg4, tmp2
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
+    bilinear_load_8888 \reg1, \reg2, \tmp1
+    vmull.u8  \acc1, \reg1, d28
+    vmlal.u8  \acc1, \reg2, d29
+    bilinear_load_8888 \reg3, \reg4, \tmp2
+    vmull.u8  \acc2, \reg3, d28
+    vmlal.u8  \acc2, \reg4, d29
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -111,9 +113,9 @@
                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
 
     bilinear_load_and_vertical_interpolate_two_8888 \
-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
     bilinear_load_and_vertical_interpolate_two_8888 \
-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -125,19 +127,19 @@
     mov       TMP2, X, asr #16
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {acc2lo[1]}, [TMP1]
-    vld1.32   {acc2hi[1]}, [TMP2]
-    convert_0565_to_x888 acc2, reg3, reg2, reg1
-    vzip.u8   reg1, reg3
-    vzip.u8   reg2, reg4
-    vzip.u8   reg3, reg4
-    vzip.u8   reg1, reg2
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
+    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {\acc2lo[1]}, [TMP1]
+    vld1.32   {\acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+    vzip.u8   \reg1, \reg3
+    vzip.u8   \reg2, \reg4
+    vzip.u8   \reg3, \reg4
+    vzip.u8   \reg1, \reg2
+    vmull.u8  \acc1, \reg1, d28
+    vmlal.u8  \acc1, \reg2, d29
+    vmull.u8  \acc2, \reg3, d28
+    vmlal.u8  \acc2, \reg4, d29
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -150,46 +152,46 @@
     mov       TMP2, X, asr #16
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {xacc2lo[1]}, [TMP1]
-    vld1.32   {xacc2hi[1]}, [TMP2]
-    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {\xacc2lo[1]}, [TMP1]
+    vld1.32   {\xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
     mov       TMP1, X, asr #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, asl #1
     mov       TMP2, X, asr #16
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
-    vzip.u8   xreg1, xreg3
-    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
-    vzip.u8   xreg2, xreg4
-    vld1.32   {yacc2lo[1]}, [TMP1]
-    vzip.u8   xreg3, xreg4
-    vld1.32   {yacc2hi[1]}, [TMP2]
-    vzip.u8   xreg1, xreg2
-    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
-    vmull.u8  xacc1, xreg1, d28
-    vzip.u8   yreg1, yreg3
-    vmlal.u8  xacc1, xreg2, d29
-    vzip.u8   yreg2, yreg4
-    vmull.u8  xacc2, xreg3, d28
-    vzip.u8   yreg3, yreg4
-    vmlal.u8  xacc2, xreg4, d29
-    vzip.u8   yreg1, yreg2
-    vmull.u8  yacc1, yreg1, d28
-    vmlal.u8  yacc1, yreg2, d29
-    vmull.u8  yacc2, yreg3, d28
-    vmlal.u8  yacc2, yreg4, d29
+    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE
+    vzip.u8   \xreg1, \xreg3
+    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   \xreg2, \xreg4
+    vld1.32   {\yacc2lo[1]}, [TMP1]
+    vzip.u8   \xreg3, \xreg4
+    vld1.32   {\yacc2hi[1]}, [TMP2]
+    vzip.u8   \xreg1, \xreg2
+    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+    vmull.u8  \xacc1, \xreg1, d28
+    vzip.u8   \yreg1, \yreg3
+    vmlal.u8  \xacc1, \xreg2, d29
+    vzip.u8   \yreg2, \yreg4
+    vmull.u8  \xacc2, \xreg3, d28
+    vzip.u8   \yreg3, \yreg4
+    vmlal.u8  \xacc2, \xreg4, d29
+    vzip.u8   \yreg1, \yreg2
+    vmull.u8  \yacc1, \yreg1, d28
+    vmlal.u8  \yacc1, \yreg2, d29
+    vmull.u8  \yacc2, \yreg3, d28
+    vmlal.u8  \yacc2, \yreg4, d29
 .endm
 
 .macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
     vst1.32   {d0, d1}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
     vst1.32   {d0}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
     vst1.32   {d0[0]}, [OUT, :32]!
 .else
     .error bilinear_store_8888 numpix is unsupported
@@ -201,12 +203,12 @@
     vuzp.u8 d2, d3
     vuzp.u8 d1, d3
     vuzp.u8 d0, d2
-    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
     vst1.16   {d2}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
     vst1.32   {d2[0]}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
     vst1.16   {d2[0]}, [OUT]!
 .else
     .error bilinear_store_0565 numpix is unsupported
@@ -222,20 +224,20 @@
 .endm
 
 .macro bilinear_load_mask_8 numpix, mask
-.if numpix == 4
-    vld1.32     {mask[0]}, [MASK]!
-.elseif numpix == 2
-    vld1.16     {mask[0]}, [MASK]!
-.elseif numpix == 1
-    vld1.8      {mask[0]}, [MASK]!
+.if \numpix == 4
+    vld1.32     {\mask[0]}, [MASK]!
+.elseif \numpix == 2
+    vld1.16     {\mask[0]}, [MASK]!
+.elseif \numpix == 1
+    vld1.8      {\mask[0]}, [MASK]!
 .else
-    .error bilinear_load_mask_8 numpix is unsupported
+    .error bilinear_load_mask_8 \numpix is unsupported
 .endif
     pld         [MASK, #prefetch_offset]
 .endm
 
 .macro bilinear_load_mask mask_fmt, numpix, mask
-    bilinear_load_mask_&mask_fmt numpix, mask
+    bilinear_load_mask_\()\mask_fmt \numpix, \mask
 .endm
 
 
@@ -250,28 +252,28 @@
 .endm
 
 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
-.if numpix == 4
-    vld1.32     {dst0, dst1}, [OUT]
-.elseif numpix == 2
-    vld1.32     {dst0}, [OUT]
-.elseif numpix == 1
-    vld1.32     {dst0[0]}, [OUT]
+.if \numpix == 4
+    vld1.32     {\dst0, \dst1}, [OUT]
+.elseif \numpix == 2
+    vld1.32     {\dst0}, [OUT]
+.elseif \numpix == 1
+    vld1.32     {\dst0[0]}, [OUT]
 .else
-    .error bilinear_load_dst_8888 numpix is unsupported
+    .error bilinear_load_dst_8888 \numpix is unsupported
 .endif
     pld         [OUT, #(prefetch_offset * 4)]
 .endm
 
 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
-    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
 .endm
 
 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
-    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
 .endm
 
 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
-    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+    bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
 .endm
 
 /*
@@ -290,19 +292,19 @@
 .endm
 
 .macro bilinear_duplicate_mask_8 numpix, mask
-.if numpix == 4
-    vdup.32     mask, mask[0]
-.elseif numpix == 2
-    vdup.16     mask, mask[0]
-.elseif numpix == 1
-    vdup.8      mask, mask[0]
+.if \numpix == 4
+    vdup.32     \mask, \mask[0]
+.elseif \numpix == 2
+    vdup.16     \mask, \mask[0]
+.elseif \numpix == 1
+    vdup.8      \mask, \mask[0]
 .else
     .error bilinear_duplicate_mask_8 is unsupported
 .endif
 .endm
 
 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
-    bilinear_duplicate_mask_&mask_fmt numpix, mask
+    bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
 .endm
 
 /*
@@ -310,10 +312,10 @@
  * Interleave should be done when mask is enabled or operator is 'over'.
  */
 .macro bilinear_interleave src0, src1, dst0, dst1
-    vuzp.8      src0, src1
-    vuzp.8      dst0, dst1
-    vuzp.8      src0, src1
-    vuzp.8      dst0, dst1
+    vuzp.8      \src0, \src1
+    vuzp.8      \dst0, \dst1
+    vuzp.8      \src0, \src1
+    vuzp.8      \dst0, \dst1
 .endm
 
 .macro bilinear_interleave_src_dst_x_src \
@@ -323,7 +325,7 @@
 .macro bilinear_interleave_src_dst_x_over \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave \src0, \src1, \dst0, \dst1
 .endm
 
 .macro bilinear_interleave_src_dst_x_add \
@@ -333,26 +335,26 @@
 .macro bilinear_interleave_src_dst_8_src \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave \src0, \src1, \dst0, \dst1
 .endm
 
 .macro bilinear_interleave_src_dst_8_over \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave \src0, \src1, \dst0, \dst1
 .endm
 
 .macro bilinear_interleave_src_dst_8_add \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave \src0, \src1, \dst0, \dst1
 .endm
 
 .macro bilinear_interleave_src_dst \
                 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave_src_dst_&mask_fmt&_&op \
-                numpix, src0, src1, src01, dst0, dst1, dst01
+    bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
+                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
 .endm
 
 
@@ -370,23 +372,23 @@
                 numpix, src0, src1, src01, mask, \
                 tmp01, tmp23, tmp45, tmp67
 
-    vmull.u8        tmp01, src0, mask
-    vmull.u8        tmp23, src1, mask
+    vmull.u8        \tmp01, \src0, \mask
+    vmull.u8        \tmp23, \src1, \mask
     /* bubbles */
-    vrshr.u16       tmp45, tmp01, #8
-    vrshr.u16       tmp67, tmp23, #8
+    vrshr.u16       \tmp45, \tmp01, #8
+    vrshr.u16       \tmp67, \tmp23, #8
     /* bubbles */
-    vraddhn.u16     src0, tmp45, tmp01
-    vraddhn.u16     src1, tmp67, tmp23
+    vraddhn.u16     \src0, \tmp45, \tmp01
+    vraddhn.u16     \src1, \tmp67, \tmp23
 .endm
 
 .macro bilinear_apply_mask_to_src \
                 mask_fmt, numpix, src0, src1, src01, mask, \
                 tmp01, tmp23, tmp45, tmp67
 
-    bilinear_apply_mask_to_src_&mask_fmt \
-                numpix, src0, src1, src01, mask, \
-                tmp01, tmp23, tmp45, tmp67
+    bilinear_apply_mask_to_src_\()\mask_fmt \
+                \numpix, \src0, \src1, \src01, \mask, \
+                \tmp01, \tmp23, \tmp45, \tmp67
 .endm
 
 
@@ -403,79 +405,79 @@
                 numpix, src0, src1, src01, dst0, dst1, dst01, \
                 tmp01, tmp23, tmp45, tmp67, tmp8
 
-    vdup.32     tmp8, src1[1]
+    vdup.32     \tmp8, \src1[1]
     /* bubbles */
-    vmvn.8      tmp8, tmp8
+    vmvn.8      \tmp8, \tmp8
     /* bubbles */
-    vmull.u8    tmp01, dst0, tmp8
+    vmull.u8    \tmp01, \dst0, \tmp8
     /* bubbles */
-    vmull.u8    tmp23, dst1, tmp8
+    vmull.u8    \tmp23, \dst1, \tmp8
     /* bubbles */
-    vrshr.u16   tmp45, tmp01, #8
-    vrshr.u16   tmp67, tmp23, #8
+    vrshr.u16   \tmp45, \tmp01, #8
+    vrshr.u16   \tmp67, \tmp23, #8
     /* bubbles */
-    vraddhn.u16 dst0, tmp45, tmp01
-    vraddhn.u16 dst1, tmp67, tmp23
+    vraddhn.u16 \dst0, \tmp45, \tmp01
+    vraddhn.u16 \dst1, \tmp67, \tmp23
     /* bubbles */
-    vqadd.u8    src01, dst01, src01
+    vqadd.u8    \src01, \dst01, \src01
 .endm
 
 .macro bilinear_combine_add \
                 numpix, src0, src1, src01, dst0, dst1, dst01, \
                 tmp01, tmp23, tmp45, tmp67, tmp8
 
-    vqadd.u8    src01, dst01, src01
+    vqadd.u8    \src01, \dst01, \src01
 .endm
 
 .macro bilinear_combine \
                 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                 tmp01, tmp23, tmp45, tmp67, tmp8
 
-    bilinear_combine_&op \
-                numpix, src0, src1, src01, dst0, dst1, dst01, \
-                tmp01, tmp23, tmp45, tmp67, tmp8
+    bilinear_combine_\()\op \
+                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
+                \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
 .endm
 
 /*
  * Macros for final deinterleaving of destination pixels if needed.
  */
 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
-    vuzp.8      dst0, dst1
+    vuzp.8      \dst0, \dst1
     /* bubbles */
-    vuzp.8      dst0, dst1
+    vuzp.8      \dst0, \dst1
 .endm
 
 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
 .endm
 
 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
-    bilinear_deinterleave numpix, dst0, dst1, dst01
+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
 .endm
 
 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
 .endm
 
 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
-    bilinear_deinterleave numpix, dst0, dst1, dst01
+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
 .endm
 
 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
-    bilinear_deinterleave numpix, dst0, dst1, dst01
+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
 .endm
 
 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
-    bilinear_deinterleave numpix, dst0, dst1, dst01
+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
 .endm
 
 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
-    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+    bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
 .endm
 
 
 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
-    bilinear_load_&src_fmt d0, d1, d2
-    bilinear_load_mask mask_fmt, 1, d4
-    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
+    bilinear_load_\()\src_fmt d0, d1, d2
+    bilinear_load_mask \mask_fmt, 1, d4
+    bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
     vmull.u8  q1, d0, d28
     vmlal.u8  q1, d1, d29
     /* 5 cycles bubble */
@@ -483,28 +485,28 @@
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
     /* 5 cycles bubble */
-    bilinear_duplicate_mask mask_fmt, 1, d4
+    bilinear_duplicate_mask \mask_fmt, 1, d4
     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
     /* 3 cycles bubble */
     vmovn.u16 d0, q0
     /* 1 cycle bubble */
     bilinear_interleave_src_dst \
-                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+                \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
     bilinear_apply_mask_to_src \
-                mask_fmt, 1, d0, d1, q0, d4, \
+                \mask_fmt, 1, d0, d1, q0, d4, \
                 q3, q8, q10, q11
     bilinear_combine \
-                op, 1, d0, d1, q0, d18, d19, q9, \
+                \op, 1, d0, d1, q0, d18, d19, q9, \
                 q3, q8, q10, q11, d5
-    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
-    bilinear_store_&dst_fmt 1, q2, q3
+    bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
+    bilinear_store_\()\dst_fmt 1, q2, q3
 .endm
 
 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
-    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23
-    bilinear_load_mask mask_fmt, 2, d4
-    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
+    bilinear_load_mask \mask_fmt, 2, d4
+    bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
     vmlal.u16 q0, d3, d30
@@ -513,24 +515,24 @@
     vmlal.u16 q10, d23, d31
     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
-    bilinear_duplicate_mask mask_fmt, 2, d4
+    bilinear_duplicate_mask \mask_fmt, 2, d4
     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
     vmovn.u16 d0, q0
     bilinear_interleave_src_dst \
-                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+                \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
     bilinear_apply_mask_to_src \
-                mask_fmt, 2, d0, d1, q0, d4, \
+                \mask_fmt, 2, d0, d1, q0, d4, \
                 q3, q8, q10, q11
     bilinear_combine \
-                op, 2, d0, d1, q0, d18, d19, q9, \
+                \op, 2, d0, d1, q0, d18, d19, q9, \
                 q3, q8, q10, q11, d5
-    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
-    bilinear_store_&dst_fmt 2, q2, q3
+    bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
+    bilinear_store_\()\dst_fmt 2, q2, q3
 .endm
 
 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
-    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23 \
                 q3, q9,  d4, d5, d16, d17, d18, d19
     pld       [TMP1, PF_OFFS]
@@ -546,8 +548,8 @@
     vmlsl.u16 q2, d6, d30
     vmlal.u16 q2, d7, d30
     vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
-    bilinear_load_mask mask_fmt, 4, d22
-    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
+    bilinear_load_mask \mask_fmt, 4, d22
+    bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
     pld       [TMP1, PF_OFFS]
     vmlsl.u16 q8, d18, d31
     vmlal.u16 q8, d19, d31
@@ -556,21 +558,21 @@
     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
     vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
-    bilinear_duplicate_mask mask_fmt, 4, d22
+    bilinear_duplicate_mask \mask_fmt, 4, d22
     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vmovn.u16 d0, q0
     vmovn.u16 d1, q2
     vadd.u16  q12, q12, q13
     bilinear_interleave_src_dst \
-                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+                \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
     bilinear_apply_mask_to_src \
-                mask_fmt, 4, d0, d1, q0, d22, \
+                \mask_fmt, 4, d0, d1, q0, d22, \
                 q3, q8, q9, q10
     bilinear_combine \
-                op, 4, d0, d1, q0, d2, d3, q1, \
+                \op, 4, d0, d1, q0, d2, d3, q1, \
                 q3, q8, q9, q10, d23
-    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
-    bilinear_store_&dst_fmt 4, q2, q3
+    bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
+    bilinear_store_\()\dst_fmt 4, q2, q3
 .endm
 
 .set BILINEAR_FLAG_USE_MASK,		1
@@ -610,14 +612,14 @@
 	prefetch_distance, \
 	flags
 
-pixman_asm_function fname
-.if pixblock_size == 8
-.elseif pixblock_size == 4
+pixman_asm_function \fname
+.if \pixblock_size == 8
+.elseif \pixblock_size == 4
 .else
     .error unsupported pixblock size
 .endif
 
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
     OUT       .req    r0
     TOP       .req    r1
     BOTTOM    .req    r2
@@ -635,7 +637,7 @@ pixman_asm_function fname
 
     mov		ip, sp
     push	{r4, r5, r6, r7, r8, r9}
-    mov		PF_OFFS, #prefetch_distance
+    mov		PF_OFFS, #\prefetch_distance
     ldmia	ip, {WB, X, UX, WIDTH}
 .else
     OUT       .req      r0
@@ -654,17 +656,17 @@ pixman_asm_function fname
     TMP4      .req      r10
     STRIDE    .req      r3
 
-    .set prefetch_offset, prefetch_distance
+    .set prefetch_offset, \prefetch_distance
 
     mov       ip, sp
     push      {r4, r5, r6, r7, r8, r9, r10, ip}
-    mov       PF_OFFS, #prefetch_distance
+    mov       PF_OFFS, #\prefetch_distance
     ldmia     ip, {WT, WB, X, UX, WIDTH}
 .endif
 
     mul       PF_OFFS, PF_OFFS, UX
 
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
     vpush     {d8-d15}
 .endif
 
@@ -683,11 +685,11 @@ pixman_asm_function fname
     /* ensure good destination alignment  */
     cmp       WIDTH, #1
     blt       0f
-    tst       OUT, #(1 << dst_bpp_shift)
+    tst       OUT, #(1 << \dst_bpp_shift)
     beq       0f
     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
-    bilinear_process_last_pixel
+    \bilinear_process_last_pixel
     sub       WIDTH, WIDTH, #1
 0:
     vadd.u16  q13, q13, q13
@@ -696,53 +698,53 @@ pixman_asm_function fname
 
     cmp       WIDTH, #2
     blt       0f
-    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    tst       OUT, #(1 << (\dst_bpp_shift + 1))
     beq       0f
-    bilinear_process_two_pixels
+    \bilinear_process_two_pixels
     sub       WIDTH, WIDTH, #2
 0:
-.if pixblock_size == 8
+.if \pixblock_size == 8
     cmp       WIDTH, #4
     blt       0f
-    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    tst       OUT, #(1 << (\dst_bpp_shift + 2))
     beq       0f
-    bilinear_process_four_pixels
+    \bilinear_process_four_pixels
     sub       WIDTH, WIDTH, #4
 0:
 .endif
-    subs      WIDTH, WIDTH, #pixblock_size
+    subs      WIDTH, WIDTH, #\pixblock_size
     blt       1f
-    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
-    bilinear_process_pixblock_head
-    subs      WIDTH, WIDTH, #pixblock_size
+    mov       PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
+    \bilinear_process_pixblock_head
+    subs      WIDTH, WIDTH, #\pixblock_size
     blt       5f
 0:
-    bilinear_process_pixblock_tail_head
-    subs      WIDTH, WIDTH, #pixblock_size
+    \bilinear_process_pixblock_tail_head
+    subs      WIDTH, WIDTH, #\pixblock_size
     bge       0b
 5:
-    bilinear_process_pixblock_tail
+    \bilinear_process_pixblock_tail
 1:
-.if pixblock_size == 8
+.if \pixblock_size == 8
     tst       WIDTH, #4
     beq       2f
-    bilinear_process_four_pixels
+    \bilinear_process_four_pixels
 2:
 .endif
     /* handle the remaining trailing pixels */
     tst       WIDTH, #2
     beq       2f
-    bilinear_process_two_pixels
+    \bilinear_process_two_pixels
 2:
     tst       WIDTH, #1
     beq       3f
-    bilinear_process_last_pixel
+    \bilinear_process_last_pixel
 3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
     vpop      {d8-d15}
 .endif
 
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
     pop       {r4, r5, r6, r7, r8, r9}
 .else
     pop       {r4, r5, r6, r7, r8, r9, r10, ip}
@@ -762,11 +764,11 @@ pixman_asm_function fname
     .unreq    TMP3
     .unreq    TMP4
     .unreq    STRIDE
-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
     .unreq    MASK
 .endif
 
-.endfunc
+pixman_end_asm_function
 
 .endm
 
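In the pixman-arm-neon-asm.S hunks that follow, every PF prefetch-helper invocation also gains a comma after the mnemonic (PF addne, PF_X, PF_X, #8 instead of PF addne PF_X, PF_X, #8), so the instruction name is handed to the macro as its own argument. A simplified sketch of such a forwarding macro, assuming a PREFETCH_TYPE_CURRENT / PREFETCH_TYPE_ADVANCED configuration along the lines of pixman-arm-neon-asm.h; the definition here is illustrative, not the project's actual one:

    .set PREFETCH_TYPE_NONE,     0
    .set PREFETCH_TYPE_ADVANCED, 2
    .set PREFETCH_TYPE_CURRENT,  PREFETCH_TYPE_ADVANCED

    /* First argument is the mnemonic, the vararg tail is its operand list;
     * the instruction is only emitted for the advanced prefetch variant. */
    .macro PF a, x:vararg
    .if PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED
        \a \x
    .endif
    .endm

        PF add,   r4, r4, #8      @ expands to: add   r4, r4, #8
        PF addne, r4, r4, #8      @ expands to: addne r4, r4, #8
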
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 7e949a3..0e09257 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -53,6 +53,8 @@
 #include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
 
+    pixman_syntax_unified
+
 /* Global configuration options and preferences */
 
 /*
@@ -260,13 +262,13 @@
     vshrn.u16   d7, q2, #3
     vsli.u16    q2, q2, #5
         vshll.u8    q14, d16, #8
-                                    PF add PF_X, PF_X, #8
+                                    PF add, PF_X, PF_X, #8
         vshll.u8    q8, d19, #8
-                                    PF tst PF_CTL, #0xF
+                                    PF tst, PF_CTL, #0xF
     vsri.u8     d6, d6, #5
-                                    PF addne PF_X, PF_X, #8
+                                    PF addne, PF_X, PF_X, #8
     vmvn.8      d3, d3
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF subne, PF_CTL, PF_CTL, #1
     vsri.u8     d7, d7, #6
     vshrn.u16   d30, q2, #2
     vmull.u8    q10, d3, d6
@@ -275,18 +277,18 @@
     vmull.u8    q12, d3, d30
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
         vsri.u16    q14, q8, #5
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
         vshll.u8    q9, d18, #8
     vrshr.u16   q13, q10, #8
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vrshr.u16   q3, q11, #8
     vrshr.u16   q15, q12, #8
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
         vsri.u16    q14, q9, #11
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
     vraddhn.u16 d20, q10, q13
     vraddhn.u16 d23, q11, q3
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vraddhn.u16 d22, q12, q15
         vst1.16     {d28, d29}, [DST_W, :128]!
 .endm
@@ -434,20 +436,20 @@ generate_composite_function \
 
 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
         vsri.u16    q14, q8, #5
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0xF
     fetch_src_pixblock
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vsri.u16    q14, q9, #11
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
     vshll.u8    q8, d1, #8
         vst1.16     {d28, d29}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subge, PF_X, PF_X, ORIG_W
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vshll.u8    q14, d2, #8
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
     vshll.u8    q9, d0, #8
 .endm
 
@@ -509,20 +511,20 @@ generate_composite_function \
 
 .macro pixman_composite_add_8_8_process_pixblock_tail_head
     fetch_src_pixblock
-                                    PF add PF_X, PF_X, #32
-                                    PF tst PF_CTL, #0xF
+                                    PF add, PF_X, PF_X, #32
+                                    PF tst, PF_CTL, #0xF
     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
-                                    PF addne PF_X, PF_X, #32
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF addne, PF_X, PF_X, #32
+                                    PF subne, PF_CTL, PF_CTL, #1
         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subge, PF_X, PF_X, ORIG_W
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vqadd.u8    q14, q0, q2
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vqadd.u8    q15, q1, q3
 .endm
 
@@ -541,20 +543,20 @@ generate_composite_function \
 
 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
     fetch_src_pixblock
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0xF
     vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subge, PF_X, PF_X, ORIG_W
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vqadd.u8    q14, q0, q2
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vqadd.u8    q15, q1, q3
 .endm
 
@@ -604,16 +606,16 @@ generate_composite_function_single_scanline \
 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
         vrshr.u16   q14, q8, #8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0xF
         vrshr.u16   q15, q9, #8
         vrshr.u16   q12, q10, #8
         vrshr.u16   q13, q11, #8
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vraddhn.u16 d28, q14, q8
         vraddhn.u16 d29, q15, q9
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
         vraddhn.u16 d30, q12, q10
         vraddhn.u16 d31, q13, q11
     fetch_src_pixblock
@@ -621,13 +623,13 @@ generate_composite_function_single_scanline \
     vmvn.8      d22, d3
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vmull.u8    q8, d22, d4
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vmull.u8    q9, d22, d5
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
     vmull.u8    q10, d22, d6
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vmull.u8    q11, d22, d7
 .endm
 
@@ -656,16 +658,16 @@ generate_composite_function_single_scanline \
 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
         vrshr.u16   q14, q8, #8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0xF
         vrshr.u16   q15, q9, #8
         vrshr.u16   q12, q10, #8
         vrshr.u16   q13, q11, #8
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vraddhn.u16 d28, q14, q8
         vraddhn.u16 d29, q15, q9
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
         vraddhn.u16 d30, q12, q10
         vraddhn.u16 d31, q13, q11
         vqadd.u8    q14, q0, q14
@@ -675,13 +677,13 @@ generate_composite_function_single_scanline \
     vmvn.8      d22, d3
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vmull.u8    q8, d22, d4
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vmull.u8    q9, d22, d5
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
     vmull.u8    q10, d22, d6
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vmull.u8    q11, d22, d7
 .endm
 
@@ -742,20 +744,20 @@ generate_composite_function_single_scanline \
         vraddhn.u16 d31, q3, q11
     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
         vqadd.u8    q14, q0, q14
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0x0F
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0x0F
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vqadd.u8    q15, q1, q15
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
     vmull.u8    q8, d24, d4
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
     vmull.u8    q9, d24, d5
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vmull.u8    q10, d24, d6
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vmull.u8    q11, d24, d7
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
 .endm
 
@@ -784,16 +786,16 @@ generate_composite_function \
 
 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
         vrshr.u16   q14, q8, #8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0xF
         vrshr.u16   q15, q9, #8
         vrshr.u16   q12, q10, #8
         vrshr.u16   q13, q11, #8
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vraddhn.u16 d28, q14, q8
         vraddhn.u16 d29, q15, q9
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
         vraddhn.u16 d30, q12, q10
         vraddhn.u16 d31, q13, q11
         vqadd.u8    q14, q0, q14
@@ -802,12 +804,12 @@ generate_composite_function \
     vmvn.8      d22, d3
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vmull.u8    q8, d22, d4
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vmull.u8    q9, d22, d5
     vmull.u8    q10, d22, d6
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vmull.u8    q11, d22, d7
 .endm
 
@@ -1245,23 +1247,23 @@ generate_composite_function \
 
 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
     fetch_mask_pixblock
-                                    PF add PF_X, PF_X, #8
+                                    PF add, PF_X, PF_X, #8
         vrshrn.u16  d28, q8, #8
-                                    PF tst PF_CTL, #0x0F
+                                    PF tst, PF_CTL, #0x0F
         vrshrn.u16  d29, q9, #8
-                                    PF addne PF_X, PF_X, #8
+                                    PF addne, PF_X, PF_X, #8
         vrshrn.u16  d30, q10, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF subne, PF_CTL, PF_CTL, #1
         vrshrn.u16  d31, q11, #8
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
     vmull.u8    q8, d24, d0
                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
     vmull.u8    q9, d24, d1
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vmull.u8    q10, d24, d2
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vmull.u8    q11, d24, d3
-                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
     vrsra.u16   q8, q8, #8
     vrsra.u16   q9, q9, #8
@@ -1314,23 +1316,23 @@ generate_composite_function \
 
 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
     fetch_mask_pixblock
-                                    PF add PF_X, PF_X, #8
+                                    PF add, PF_X, PF_X, #8
         vrshrn.u16  d28, q0, #8
-                                    PF tst PF_CTL, #0x0F
+                                    PF tst, PF_CTL, #0x0F
         vrshrn.u16  d29, q1, #8
-                                    PF addne PF_X, PF_X, #8
+                                    PF addne, PF_X, PF_X, #8
         vrshrn.u16  d30, q2, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF subne, PF_CTL, PF_CTL, #1
         vrshrn.u16  d31, q3, #8
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
     vmull.u8    q0,  d24, d16
                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
     vmull.u8    q1,  d25, d16
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vmull.u8    q2,  d26, d16
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vmull.u8    q3,  d27, d16
-                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
     vrsra.u16   q0, q0,  #8
     vrsra.u16   q1, q1,  #8
@@ -1408,27 +1410,27 @@ generate_composite_function \
         vrshr.u16   q15, q9, #8
     fetch_mask_pixblock
         vrshr.u16   q6, q10, #8
-                                    PF add PF_X, PF_X, #8
+                                    PF add, PF_X, PF_X, #8
         vrshr.u16   q7, q11, #8
-                                    PF tst PF_CTL, #0x0F
+                                    PF tst, PF_CTL, #0x0F
         vraddhn.u16 d28, q14, q8
-                                    PF addne PF_X, PF_X, #8
+                                    PF addne, PF_X, PF_X, #8
         vraddhn.u16 d29, q15, q9
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF subne, PF_CTL, PF_CTL, #1
         vraddhn.u16 d30, q6, q10
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
         vraddhn.u16 d31, q7, q11
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
     vmull.u8    q6, d24, d8
                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
     vmull.u8    q7, d24, d9
-                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subge, PF_X, PF_X, ORIG_W
     vmull.u8    q8, d24, d10
-                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
     vmull.u8    q9, d24, d11
-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
         vqadd.u8    q14, q0, q14
-                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+                                    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
         vqadd.u8    q15, q1, q15
     vrshr.u16   q10, q6, #8
     vrshr.u16   q11, q7, #8
@@ -2425,21 +2427,21 @@ generate_composite_function \
         vrshr.u16   q13, q10, #8
     fetch_src_pixblock
         vraddhn.u16 d30, q11, q8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0xF
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vraddhn.u16 d29, q12, q9
         vraddhn.u16 d28, q13, q10
     vmull.u8    q8, d3, d0
     vmull.u8    q9, d3, d1
     vmull.u8    q10, d3, d2
         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF subge, PF_X, PF_X, ORIG_W
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 .endm
 
 generate_composite_function \
@@ -2482,21 +2484,21 @@ generate_composite_function \
         vrshr.u16   q13, q10, #8
     fetch_src_pixblock
         vraddhn.u16 d28, q11, q8
-                                    PF add PF_X, PF_X, #8
-                                    PF tst PF_CTL, #0xF
-                                    PF addne PF_X, PF_X, #8
-                                    PF subne PF_CTL, PF_CTL, #1
+                                    PF add, PF_X, PF_X, #8
+                                    PF tst, PF_CTL, #0xF
+                                    PF addne, PF_X, PF_X, #8
+                                    PF subne, PF_CTL, PF_CTL, #1
         vraddhn.u16 d29, q12, q9
         vraddhn.u16 d30, q13, q10
     vmull.u8    q8, d3, d0
     vmull.u8    q9, d3, d1
     vmull.u8    q10, d3, d2
         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
-                                    PF cmp PF_X, ORIG_W
+                                    PF cmp, PF_X, ORIG_W
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
-                                    PF subge PF_X, PF_X, ORIG_W
-                                    PF subges PF_CTL, PF_CTL, #0x10
-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF subge, PF_X, PF_X, ORIG_W
+                                    PF subsge, PF_CTL, PF_CTL, #0x10
+                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 .endm
 
 generate_composite_function \
@@ -2841,28 +2843,28 @@ generate_composite_function_nearest_scanline \
     mov       TMP1, X, asr #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, asl #2
-    vld1.32   {reg1}, [TMP1], STRIDE
-    vld1.32   {reg2}, [TMP1]
+    vld1.32   {\reg1}, [TMP1], STRIDE
+    vld1.32   {\reg2}, [TMP1]
 .endm
 
 .macro bilinear_load_0565 reg1, reg2, tmp
     mov       TMP1, X, asr #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, asl #1
-    vld1.32   {reg2[0]}, [TMP1], STRIDE
-    vld1.32   {reg2[1]}, [TMP1]
-    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+    vld1.32   {\reg2[0]}, [TMP1], STRIDE
+    vld1.32   {\reg2[1]}, [TMP1]
+    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_two_8888 \
                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
 
-    bilinear_load_8888 reg1, reg2, tmp1
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
-    bilinear_load_8888 reg3, reg4, tmp2
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
+    bilinear_load_8888 \reg1, \reg2, \tmp1
+    vmull.u8  \acc1, \reg1, d28
+    vmlal.u8  \acc1, \reg2, d29
+    bilinear_load_8888 \reg3, \reg4, \tmp2
+    vmull.u8  \acc2, \reg3, d28
+    vmlal.u8  \acc2, \reg4, d29
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -2870,9 +2872,9 @@ generate_composite_function_nearest_scanline \
                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
 
     bilinear_load_and_vertical_interpolate_two_8888 \
-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
     bilinear_load_and_vertical_interpolate_two_8888 \
-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -2884,19 +2886,19 @@ generate_composite_function_nearest_scanline \
     mov       TMP2, X, asr #16
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {acc2lo[1]}, [TMP1]
-    vld1.32   {acc2hi[1]}, [TMP2]
-    convert_0565_to_x888 acc2, reg3, reg2, reg1
-    vzip.u8   reg1, reg3
-    vzip.u8   reg2, reg4
-    vzip.u8   reg3, reg4
-    vzip.u8   reg1, reg2
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
+    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {\acc2lo[1]}, [TMP1]
+    vld1.32   {\acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+    vzip.u8   \reg1, \reg3
+    vzip.u8   \reg2, \reg4
+    vzip.u8   \reg3, \reg4
+    vzip.u8   \reg1, \reg2
+    vmull.u8  \acc1, \reg1, d28
+    vmlal.u8  \acc1, \reg2, d29
+    vmull.u8  \acc2, \reg3, d28
+    vmlal.u8  \acc2, \reg4, d29
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -2909,49 +2911,49 @@ generate_composite_function_nearest_scanline \
     mov       TMP2, X, asr #16
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {xacc2lo[1]}, [TMP1]
-    vld1.32   {xacc2hi[1]}, [TMP2]
-    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {\xacc2lo[1]}, [TMP1]
+    vld1.32   {\xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
     mov       TMP1, X, asr #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, asl #1
     mov       TMP2, X, asr #16
     add       X, X, UX
     add       TMP2, TOP, TMP2, asl #1
-    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
-    vzip.u8   xreg1, xreg3
-    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
-    vzip.u8   xreg2, xreg4
-    vld1.32   {yacc2lo[1]}, [TMP1]
-    vzip.u8   xreg3, xreg4
-    vld1.32   {yacc2hi[1]}, [TMP2]
-    vzip.u8   xreg1, xreg2
-    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
-    vmull.u8  xacc1, xreg1, d28
-    vzip.u8   yreg1, yreg3
-    vmlal.u8  xacc1, xreg2, d29
-    vzip.u8   yreg2, yreg4
-    vmull.u8  xacc2, xreg3, d28
-    vzip.u8   yreg3, yreg4
-    vmlal.u8  xacc2, xreg4, d29
-    vzip.u8   yreg1, yreg2
-    vmull.u8  yacc1, yreg1, d28
-    vmlal.u8  yacc1, yreg2, d29
-    vmull.u8  yacc2, yreg3, d28
-    vmlal.u8  yacc2, yreg4, d29
+    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE
+    vzip.u8   \xreg1, \xreg3
+    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   \xreg2, \xreg4
+    vld1.32   {\yacc2lo[1]}, [TMP1]
+    vzip.u8   \xreg3, \xreg4
+    vld1.32   {\yacc2hi[1]}, [TMP2]
+    vzip.u8   \xreg1, \xreg2
+    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+    vmull.u8  \xacc1, \xreg1, d28
+    vzip.u8   \yreg1, \yreg3
+    vmlal.u8  \xacc1, \xreg2, d29
+    vzip.u8   \yreg2, \yreg4
+    vmull.u8  \xacc2, \xreg3, d28
+    vzip.u8   \yreg3, \yreg4
+    vmlal.u8  \xacc2, \xreg4, d29
+    vzip.u8   \yreg1, \yreg2
+    vmull.u8  \yacc1, \yreg1, d28
+    vmlal.u8  \yacc1, \yreg2, d29
+    vmull.u8  \yacc2, \yreg3, d28
+    vmlal.u8  \yacc2, \yreg4, d29
 .endm
 
 .macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
     vst1.32   {d0, d1}, [OUT, :128]!
-.elseif numpix == 2
+.elseif \numpix == 2
     vst1.32   {d0}, [OUT, :64]!
-.elseif numpix == 1
+.elseif \numpix == 1
     vst1.32   {d0[0]}, [OUT, :32]!
 .else
-    .error bilinear_store_8888 numpix is unsupported
+    .error bilinear_store_8888 \numpix is unsupported
 .endif
 .endm
 
@@ -2960,20 +2962,20 @@ generate_composite_function_nearest_scanline \
     vuzp.u8 d2, d3
     vuzp.u8 d1, d3
     vuzp.u8 d0, d2
-    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
     vst1.16   {d2}, [OUT, :64]!
-.elseif numpix == 2
+.elseif \numpix == 2
     vst1.32   {d2[0]}, [OUT, :32]!
-.elseif numpix == 1
+.elseif \numpix == 1
     vst1.16   {d2[0]}, [OUT, :16]!
 .else
-    .error bilinear_store_0565 numpix is unsupported
+    .error bilinear_store_0565 \numpix is unsupported
 .endif
 .endm
 
 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
-    bilinear_load_&src_fmt d0, d1, d2
+    bilinear_load_\()\src_fmt d0, d1, d2
     vmull.u8  q1, d0, d28
     vmlal.u8  q1, d1, d29
     /* 5 cycles bubble */
@@ -2985,11 +2987,11 @@ generate_composite_function_nearest_scanline \
     /* 3 cycles bubble */
     vmovn.u16 d0, q0
     /* 1 cycle bubble */
-    bilinear_store_&dst_fmt 1, q2, q3
+    bilinear_store_\()\dst_fmt 1, q2, q3
 .endm
 
 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
-    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23
     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
     vmlsl.u16 q0, d2, d30
@@ -3002,11 +3004,11 @@ generate_composite_function_nearest_scanline \
     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
     vmovn.u16 d0, q0
-    bilinear_store_&dst_fmt 2, q2, q3
+    bilinear_store_\()\dst_fmt 2, q2, q3
 .endm
 
 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
-    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
                 q1, q11, d0, d1, d20, d21, d22, d23 \
                 q3, q9,  d4, d5, d16, d17, d18, d19
     pld       [TMP1, PF_OFFS]
@@ -3034,54 +3036,54 @@ generate_composite_function_nearest_scanline \
     vmovn.u16 d0, q0
     vmovn.u16 d1, q2
     vadd.u16  q12, q12, q13
-    bilinear_store_&dst_fmt 4, q2, q3
+    bilinear_store_\()\dst_fmt 4, q2, q3
 .endm
 
 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
 .else
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
 .endif
 .endm
 
 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
 .endif
 .endm
 
 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
 .else
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
 .endif
 .endm
 
 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
 .else
-    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
 .endif
 .endm
 
 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
 .else
-    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
 .endif
 .endm
 
 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
 .else
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
 .endif
 .endm
 
@@ -3106,7 +3108,7 @@ generate_composite_function_nearest_scanline \
                                        src_bpp_shift, dst_bpp_shift, \
                                        prefetch_distance, flags
 
-pixman_asm_function fname
+pixman_asm_function \fname
     OUT       .req      r0
     TOP       .req      r1
     BOTTOM    .req      r2
@@ -3124,11 +3126,11 @@ pixman_asm_function fname
 
     mov       ip, sp
     push      {r4, r5, r6, r7, r8, r9}
-    mov       PF_OFFS, #prefetch_distance
+    mov       PF_OFFS, #\prefetch_distance
     ldmia     ip, {WB, X, UX, WIDTH}
     mul       PF_OFFS, PF_OFFS, UX
 
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
     vpush     {d8-d15}
 .endif
 
@@ -3147,11 +3149,11 @@ pixman_asm_function fname
     /* ensure good destination alignment  */
     cmp       WIDTH, #1
     blt       0f
-    tst       OUT, #(1 << dst_bpp_shift)
+    tst       OUT, #(1 << \dst_bpp_shift)
     beq       0f
     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
     vadd.u16  q12, q12, q13
-    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
     sub       WIDTH, WIDTH, #1
 0:
     vadd.u16  q13, q13, q13
@@ -3160,64 +3162,64 @@ pixman_asm_function fname
 
     cmp       WIDTH, #2
     blt       0f
-    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    tst       OUT, #(1 << (\dst_bpp_shift + 1))
     beq       0f
-    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
     sub       WIDTH, WIDTH, #2
 0:
-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
 /*********** 8 pixels per iteration *****************/
     cmp       WIDTH, #4
     blt       0f
-    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    tst       OUT, #(1 << (\dst_bpp_shift + 2))
     beq       0f
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
     sub       WIDTH, WIDTH, #4
 0:
     subs      WIDTH, WIDTH, #8
     blt       1f
     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
-    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+    bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
     subs      WIDTH, WIDTH, #8
     blt       5f
 0:
-    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
     subs      WIDTH, WIDTH, #8
     bge       0b
 5:
-    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+    bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
 1:
     tst       WIDTH, #4
     beq       2f
-    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
 2:
 .else
 /*********** 4 pixels per iteration *****************/
     subs      WIDTH, WIDTH, #4
     blt       1f
     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
-    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
     subs      WIDTH, WIDTH, #4
     blt       5f
 0:
-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
     subs      WIDTH, WIDTH, #4
     bge       0b
 5:
-    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
 1:
 /****************************************************/
 .endif
     /* handle the remaining trailing pixels */
     tst       WIDTH, #2
     beq       2f
-    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
 2:
     tst       WIDTH, #1
     beq       3f
-    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
 3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
     vpop      {d8-d15}
 .endif
     pop       {r4, r5, r6, r7, r8, r9}
@@ -3236,7 +3238,7 @@ pixman_asm_function fname
     .unreq    TMP3
     .unreq    TMP4
     .unreq    STRIDE
-.endfunc
+    pixman_end_asm_function
 
 .endm
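The hunks in this file, and most of what follows, are one mechanical respelling: GNU as
tolerated bare macro parameter names and '&' as a token-pasting separator inside .macro
bodies, but clang's integrated assembler only substitutes parameters written as \name,
with \() as an explicit separator wherever a parameter abuts other token text. A cut-down
sketch of the pattern, using a made-up macro name (load_one) that is not part of the
patch:

        .syntax unified
        .arch   armv7-a
        .fpu    neon

        /* \() glues the 'd' register prefix to the substituted parameter, so the
         * operand becomes d0, d1, ...  The pre-patch spelling of the same line
         * was:  vld1.32 {d&reg}, [&mem_operand&]!
         */
        .macro load_one reg, mem_operand
            vld1.32   {d\()\reg}, [\mem_operand]!
        .endm

            load_one  0, r1                          @ expands to: vld1.32 {d0}, [r1]!

The same escaping applies when a parameter forms part of a macro name, as in
bilinear_store_\()\dst_fmt above, which resolves to bilinear_store_8888 or
bilinear_store_0565 depending on the dst_fmt argument.
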
 
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index bdcf6a9..06318d9 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -74,134 +74,134 @@
  */
 
 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+    \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
 .else
-    op&.&elem_size {d&reg1}, [&mem_operand&]!
+    \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
 .endif
 .endm
 
 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+    \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
 .else
-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+    \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
 .endif
 .endm
 
 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-.if abits > 0
-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
 .else
-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
 .endif
 .endm
 
 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
-    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+    \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
 .endm
 
 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
-    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
 .endm
 
 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
-    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+    \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
 .endm
 
 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
-.if numbytes == 32
-    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
-                              %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif numbytes == 16
-    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
-.elseif numbytes == 8
-    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
-.elseif numbytes == 4
-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
-        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
-    .elseif elem_size == 16
-        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
-        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+.if \numbytes == 32
+    pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
+                              %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif \numbytes == 16
+    pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+.elseif \numbytes == 8
+    pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
+.elseif \numbytes == 4
+    .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
+        pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
+    .elseif \elem_size == 16
+        pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
+        pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
     .else
-        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+        pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
+        pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
+        pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
+        pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
     .endif
-.elseif numbytes == 2
-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
-        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 2
+    .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
+        pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
     .else
-        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
-        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+        pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
+        pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
     .endif
-.elseif numbytes == 1
-    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 1
+    pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
 .else
-    .error "unsupported size: numbytes"
+    .error "unsupported size: \numbytes"
 .endif
 .endm
 
 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
-                      %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
-    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
+                      %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+    pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
 .else
-    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+    pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
 .endif
 .endif
 .endm
 
 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
-                      %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
-    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
+                      %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+    pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
 .else
-    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+    pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
 .endif
 .endif
 .endm
 
 .macro pixld_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
-    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+    pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
 .else
-    pixld numpix, bpp, basereg, mem_operand, 128
+    pixld \numpix, \bpp, \basereg, \mem_operand, 128
 .endif
 .endm
 
 .macro pixst_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
-    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+    pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
 .else
-    pixst numpix, bpp, basereg, mem_operand, 128
+    pixst \numpix, \bpp, \basereg, \mem_operand, 128
 .endif
 .endm
 
@@ -210,44 +210,44 @@
  * aliases to be defined)
  */
 .macro pixld1_s elem_size, reg1, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
     mov     TMP1, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #1
+    add     TMP1, \mem_operand, TMP1, asl #1
     mov     TMP2, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP2, mem_operand, TMP2, asl #1
-    vld1.16 {d&reg1&[0]}, [TMP1, :16]
+    add     TMP2, \mem_operand, TMP2, asl #1
+    vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
     mov     TMP1, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #1
-    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    add     TMP1, \mem_operand, TMP1, asl #1
+    vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
     mov     TMP2, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP2, mem_operand, TMP2, asl #1
-    vld1.16 {d&reg1&[2]}, [TMP1, :16]
-    vld1.16 {d&reg1&[3]}, [TMP2, :16]
-.elseif elem_size == 32
+    add     TMP2, \mem_operand, TMP2, asl #1
+    vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
+    vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
+.elseif \elem_size == 32
     mov     TMP1, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #2
+    add     TMP1, \mem_operand, TMP1, asl #2
     mov     TMP2, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[0]}, [TMP1, :32]
-    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+    add     TMP2, \mem_operand, TMP2, asl #2
+    vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
+    vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
 .else
     .error "unsupported"
 .endif
@@ -257,110 +257,110 @@
 .if 0 /* elem_size == 32 */
     mov     TMP1, VX, asr #16
     add     VX, VX, UNIT_X, asl #1
-    add     TMP1, mem_operand, TMP1, asl #2
+    add     TMP1, \mem_operand, TMP1, asl #2
     mov     TMP2, VX, asr #16
     sub     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    add     TMP2, \mem_operand, TMP2, asl #2
+    vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
     mov     TMP1, VX, asr #16
     add     VX, VX, UNIT_X, asl #1
-    add     TMP1, mem_operand, TMP1, asl #2
-    vld1.32 {d&reg2&[0]}, [TMP2, :32]
+    add     TMP1, \mem_operand, TMP1, asl #2
+    vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
     mov     TMP2, VX, asr #16
     add     VX, VX, UNIT_X
-    add     TMP2, mem_operand, TMP2, asl #2
-    vld1.32 {d&reg1&[1]}, [TMP1, :32]
-    vld1.32 {d&reg2&[1]}, [TMP2, :32]
+    add     TMP2, \mem_operand, TMP2, asl #2
+    vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
+    vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
 .else
-    pixld1_s elem_size, reg1, mem_operand
-    pixld1_s elem_size, reg2, mem_operand
+    pixld1_s \elem_size, \reg1, \mem_operand
+    pixld1_s \elem_size, \reg2, \mem_operand
 .endif
 .endm
 
 .macro pixld0_s elem_size, reg1, idx, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
     mov     TMP1, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #1
-    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
-.elseif elem_size == 32
+    add     TMP1, \mem_operand, TMP1, asl #1
+    vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
+.elseif \elem_size == 32
     mov     TMP1, VX, asr #16
     adds    VX, VX, UNIT_X
-5:  subpls  VX, VX, SRC_WIDTH_FIXED
+5:  subspl  VX, VX, SRC_WIDTH_FIXED
     bpl     5b
-    add     TMP1, mem_operand, TMP1, asl #2
-    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+    add     TMP1, \mem_operand, TMP1, asl #2
+    vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
 .endif
 .endm
 
 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
-.if numbytes == 32
-    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
-    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
-    pixdeinterleave elem_size, %(basereg+4)
-.elseif numbytes == 16
-    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
-.elseif numbytes == 8
-    pixld1_s elem_size, %(basereg+1), mem_operand
-.elseif numbytes == 4
-    .if elem_size == 32
-        pixld0_s elem_size, %(basereg+0), 1, mem_operand
-    .elseif elem_size == 16
-        pixld0_s elem_size, %(basereg+0), 2, mem_operand
-        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+.if \numbytes == 32
+    pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
+    pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
+    pixdeinterleave \elem_size, %(\basereg+4)
+.elseif \numbytes == 16
+    pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
+.elseif \numbytes == 8
+    pixld1_s \elem_size, %(\basereg+1), \mem_operand
+.elseif \numbytes == 4
+    .if \elem_size == 32
+        pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+    .elseif \elem_size == 16
+        pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+        pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
     .else
-        pixld0_s elem_size, %(basereg+0), 4, mem_operand
-        pixld0_s elem_size, %(basereg+0), 5, mem_operand
-        pixld0_s elem_size, %(basereg+0), 6, mem_operand
-        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+        pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
+        pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
+        pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
+        pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
     .endif
-.elseif numbytes == 2
-    .if elem_size == 16
-        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 2
+    .if \elem_size == 16
+        pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
     .else
-        pixld0_s elem_size, %(basereg+0), 2, mem_operand
-        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+        pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+        pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
     .endif
-.elseif numbytes == 1
-    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 1
+    pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
 .else
-    .error "unsupported size: numbytes"
+    .error "unsupported size: \numbytes"
 .endif
 .endm
 
 .macro pixld_s numpix, bpp, basereg, mem_operand
-.if bpp > 0
-    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.if \bpp > 0
+    pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
 .endif
 .endm
 
 .macro vuzp8 reg1, reg2
-    vuzp.8 d&reg1, d&reg2
+    vuzp.8 d\()\reg1, d\()\reg2
 .endm
 
 .macro vzip8 reg1, reg2
-    vzip.8 d&reg1, d&reg2
+    vzip.8 d\()\reg1, d\()\reg2
 .endm
 
 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 .macro pixdeinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    vuzp8 %(basereg+0), %(basereg+1)
-    vuzp8 %(basereg+2), %(basereg+3)
-    vuzp8 %(basereg+1), %(basereg+3)
-    vuzp8 %(basereg+0), %(basereg+2)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vuzp8 %(\basereg+0), %(\basereg+1)
+    vuzp8 %(\basereg+2), %(\basereg+3)
+    vuzp8 %(\basereg+1), %(\basereg+3)
+    vuzp8 %(\basereg+0), %(\basereg+2)
 .endif
 .endm
 
 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 .macro pixinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-    vzip8 %(basereg+0), %(basereg+2)
-    vzip8 %(basereg+1), %(basereg+3)
-    vzip8 %(basereg+2), %(basereg+3)
-    vzip8 %(basereg+0), %(basereg+1)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vzip8 %(\basereg+0), %(\basereg+2)
+    vzip8 %(\basereg+1), %(\basereg+3)
+    vzip8 %(\basereg+2), %(\basereg+3)
+    vzip8 %(\basereg+0), %(\basereg+1)
 .endif
 .endm
 
@@ -394,22 +394,22 @@
  */
 .macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
-    a x
+    \a \x
 .endif
 .endm
 
 .macro cache_preload std_increment, boost_increment
 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
 .if regs_shortage
-    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+    PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
 .endif
-.if std_increment != 0
-    PF add PF_X, PF_X, #std_increment
+.if \std_increment != 0
+    PF add, PF_X, PF_X, #\std_increment
 .endif
-    PF tst PF_CTL, #0xF
-    PF addne PF_X, PF_X, #boost_increment
-    PF subne PF_CTL, PF_CTL, #1
-    PF cmp PF_X, ORIG_W
+    PF tst, PF_CTL, #0xF
+    PF addne, PF_X, PF_X, #\boost_increment
+    PF subne, PF_CTL, PF_CTL, #1
+    PF cmp, PF_X, ORIG_W
 .if src_bpp_shift >= 0
     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 .endif
@@ -419,16 +419,16 @@
 .if mask_bpp_shift >= 0
     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
 .endif
-    PF subge PF_X, PF_X, ORIG_W
-    PF subges PF_CTL, PF_CTL, #0x10
+    PF subge, PF_X, PF_X, ORIG_W
+    PF subsge, PF_CTL, PF_CTL, #0x10
 .if src_bpp_shift >= 0
-    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 .endif
 .if dst_r_bpp != 0
-    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 .endif
 .if mask_bpp_shift >= 0
-    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
 .endif
 .endif
 .endm
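The PF macro above emits its operands only when the advanced prefetcher is selected, and
it now takes the instruction mnemonic as a separate, comma-delimited first argument
('PF add, PF_X, ...' instead of 'PF add PF_X, ...'), which keeps the argument split
unambiguous for assemblers that only separate macro arguments at commas. The conditional
mnemonics are also respelled in unified-syntax order, with the flag-setting or size
suffix ahead of the condition code (subges becomes subsge, ldrgeb becomes ldrbge). A
minimal sketch of the same wrapper shape; PREFETCH_ENABLED and the register choices are
invented for illustration:

        .syntax unified

        .macro PF a, x:vararg
        .if PREFETCH_ENABLED
            \a \x                                    @ re-emit mnemonic plus operands
        .endif
        .endm

        .set PREFETCH_ENABLED, 1
            PF add,    r0, r0, #4                    @ emitted as: add r0, r0, #4
            PF ldrbge, r1, [r2, r3, lsl #2]!         @ conditional byte load, UAL ordering
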
@@ -465,21 +465,20 @@
     beq         2f
 
 .irp lowbit, 1, 2, 4, 8, 16
-local skip1
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
-    tst         DST_R, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #\lowbit
     beq         1f
 .endif
-    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
-    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+    pixld_src   (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
 .if dst_r_bpp > 0
-    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+    pixld_a     (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
 .else
-    add         DST_R, DST_R, #lowbit
+    add         DST_R, DST_R, #\lowbit
 .endif
-    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
-    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+    PF add,     PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(\lowbit * 8 / dst_w_bpp)
 1:
 .endif
 .endr
@@ -487,19 +486,19 @@ local skip1
     pixdeinterleave mask_bpp, mask_basereg
     pixdeinterleave dst_r_bpp, dst_r_basereg
 
-    process_pixblock_head
+    \process_pixblock_head
     cache_preload 0, pixblock_size
     cache_preload_simple
-    process_pixblock_tail
+    \process_pixblock_tail
 
     pixinterleave dst_w_bpp, dst_w_basereg
 .irp lowbit, 1, 2, 4, 8, 16
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
-    tst         DST_W, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #\lowbit
     beq         1f
 .endif
-    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+    pixst_a     (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
 1:
 .endif
 .endr
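The destination-alignment loop above also shows that .irp iteration variables get the
same treatment as macro parameters and are now referenced as \lowbit (and \chunk_size
further down). A tiny self-contained sketch of that .irp form; the macro name and byte
counts are invented for illustration:

        .syntax unified

        /* Emit a run of zero bytes, one .skip per power-of-two step <= \limit. */
        .macro pad_upto limit
        .irp size, 1, 2, 4, 8
        .if \size <= \limit
            .skip \size, 0                           @ \size zero bytes
        .endif
        .endr
        .endm

            pad_upto 4                               @ emits 1 + 2 + 4 = 7 zero bytes
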
@@ -530,18 +529,18 @@ local skip1
     tst         W, #(pixblock_size - 1)
     beq         2f
 .irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
-    tst         W, #chunk_size
+.if pixblock_size > \chunk_size
+    tst         W, #\chunk_size
     beq         1f
-    pixld_src   chunk_size, src_bpp, src_basereg, SRC
-    pixld       chunk_size, mask_bpp, mask_basereg, MASK
-.if dst_aligned_flag != 0
-    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+    pixld_src   \chunk_size, src_bpp, src_basereg, SRC
+    pixld       \chunk_size, mask_bpp, mask_basereg, MASK
+.if \dst_aligned_flag != 0
+    pixld_a     \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
 .else
-    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+    pixld       \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
 .endif
-.if cache_preload_flag != 0
-    PF add      PF_X, PF_X, #chunk_size
+.if \cache_preload_flag != 0
+    PF add,     PF_X, PF_X, #\chunk_size
 .endif
 1:
 .endif
@@ -550,21 +549,21 @@ local skip1
     pixdeinterleave mask_bpp, mask_basereg
     pixdeinterleave dst_r_bpp, dst_r_basereg
 
-    process_pixblock_head
-.if cache_preload_flag != 0
+    \process_pixblock_head
+.if \cache_preload_flag != 0
     cache_preload 0, pixblock_size
     cache_preload_simple
 .endif
-    process_pixblock_tail
+    \process_pixblock_tail
     pixinterleave dst_w_bpp, dst_w_basereg
 .irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
-    tst         W, #chunk_size
+.if pixblock_size > \chunk_size
+    tst         W, #\chunk_size
     beq         1f
-.if dst_aligned_flag != 0
-    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.if \dst_aligned_flag != 0
+    pixst_a     \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
 .else
-    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+    pixst       \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
 .endif
 1:
 .endif
@@ -604,7 +603,7 @@ local skip1
 .if regs_shortage
     str         H, [sp, #4] /* save updated height to stack */
 .endif
-    bge         start_of_loop_label
+    bge         \start_of_loop_label
 .endm
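Throughout this generator, whole macros and labels travel as arguments (init, cleanup,
the process_pixblock_* callbacks, start_of_loop_label), so under the new spelling they
have to be invoked as \init, \cleanup and so on, otherwise the assembler would look for
macros literally named "init" or "cleanup" rather than the caller-supplied ones. A small
sketch of that call-by-name pattern; run_block, my_head, my_tail and the register roles
are invented for illustration:

        .syntax unified
        .arch   armv7-a
        .fpu    neon

        /* Run two caller-supplied macros per iteration and branch back to a
         * caller-supplied local label until the block count in r0 goes negative.
         */
        .macro run_block head, tail, loop_label
            \head
            \tail
            subs      r0, r0, #1
            bge       \loop_label
        .endm

        .macro my_head
            vld1.32   {d0, d1}, [r1]!                @ fetch one 16-byte block
        .endm
        .macro my_tail
            vst1.32   {d0, d1}, [r2]!                @ write it back out
        .endm

    0:  run_block my_head, my_tail, 0b
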
 
 /*
@@ -631,7 +630,7 @@ local skip1
                                    src_basereg_   = 0, \
                                    mask_basereg_  = 24
 
-    pixman_asm_function fname
+    pixman_asm_function \fname
 
     push        {r4-r12, lr}        /* save all registers */
 
@@ -641,10 +640,10 @@ local skip1
  * has to be used instead of ADVANCED.
  */
     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
-.if prefetch_distance == 0
+.if \prefetch_distance == 0
     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
-        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+        ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
 .endif
 
@@ -652,17 +651,17 @@ local skip1
  * Make some macro arguments globally visible and accessible
  * from other macros
  */
-    .set src_bpp, src_bpp_
-    .set mask_bpp, mask_bpp_
-    .set dst_w_bpp, dst_w_bpp_
-    .set pixblock_size, pixblock_size_
-    .set dst_w_basereg, dst_w_basereg_
-    .set dst_r_basereg, dst_r_basereg_
-    .set src_basereg, src_basereg_
-    .set mask_basereg, mask_basereg_
+    .set src_bpp, \src_bpp_
+    .set mask_bpp, \mask_bpp_
+    .set dst_w_bpp, \dst_w_bpp_
+    .set pixblock_size, \pixblock_size_
+    .set dst_w_basereg, \dst_w_basereg_
+    .set dst_r_basereg, \dst_r_basereg_
+    .set src_basereg, \src_basereg_
+    .set mask_basereg, \mask_basereg_
 
     .macro pixld_src x:vararg
-        pixld x
+        pixld \x
     .endm
     .macro fetch_src_pixblock
         pixld_src   pixblock_size, src_bpp, \
@@ -755,19 +754,19 @@ local skip1
     .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif
 
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
     .set dst_r_bpp, dst_w_bpp
 .else
     .set dst_r_bpp, 0
 .endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
     .set DEINTERLEAVE_32BPP_ENABLED, 1
 .else
     .set DEINTERLEAVE_32BPP_ENABLED, 0
 .endif
 
-.if prefetch_distance < 0 || prefetch_distance > 15
-    .error "invalid prefetch distance (prefetch_distance)"
+.if \prefetch_distance < 0 || \prefetch_distance > 15
+    .error "invalid prefetch distance (\prefetch_distance)"
 .endif
 
 .if src_bpp > 0
@@ -776,7 +775,7 @@ local skip1
 .if mask_bpp > 0
     ldr         MASK, [sp, #48]
 .endif
-    PF mov      PF_X, #0
+    PF mov,     PF_X, #0
 .if src_bpp > 0
     ldr         SRC_STRIDE, [sp, #44]
 .endif
@@ -801,14 +800,14 @@ local skip1
 /*
  * Setup advanced prefetcher initial state
  */
-    PF mov      PF_SRC, SRC
-    PF mov      PF_DST, DST_R
-    PF mov      PF_MASK, MASK
+    PF mov,     PF_SRC, SRC
+    PF mov,     PF_DST, DST_R
+    PF mov,     PF_MASK, MASK
     /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
-    PF mov      PF_CTL, H, lsl #4
-    PF add      PF_CTL, #(prefetch_distance - 0x10)
+    PF mov,     PF_CTL, H, lsl #4
+    PF add,     PF_CTL, #(\prefetch_distance - 0x10)
 
-    init
+    \init
 .if regs_shortage
     push        {r0, r1}
 .endif
@@ -826,9 +825,9 @@ local skip1
  * long scanlines
  */
 0:
-    ensure_destination_ptr_alignment process_pixblock_head, \
-                                     process_pixblock_tail, \
-                                     process_pixblock_tail_head
+    ensure_destination_ptr_alignment \process_pixblock_head, \
+                                     \process_pixblock_tail, \
+                                     \process_pixblock_tail_head
 
     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
     pixld_a     pixblock_size, dst_r_bpp, \
@@ -836,33 +835,33 @@ local skip1
     fetch_src_pixblock
     pixld       pixblock_size, mask_bpp, \
                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    PF add      PF_X, PF_X, #pixblock_size
-    process_pixblock_head
+    PF add,     PF_X, PF_X, #pixblock_size
+    \process_pixblock_head
     cache_preload 0, pixblock_size
     cache_preload_simple
     subs        W, W, #(pixblock_size * 2)
     blt         2f
 1:
-    process_pixblock_tail_head
+    \process_pixblock_tail_head
     cache_preload_simple
     subs        W, W, #pixblock_size
     bge         1b
 2:
-    process_pixblock_tail
+    \process_pixblock_tail
     pixst_a     pixblock_size, dst_w_bpp, \
                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 
     /* Process the remaining trailing pixels in the scanline */
     process_trailing_pixels 1, 1, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
+                            \process_pixblock_head, \
+                            \process_pixblock_tail, \
+                            \process_pixblock_tail_head
     advance_to_next_scanline 0b
 
 .if regs_shortage
     pop         {r0, r1}
 .endif
-    cleanup
+    \cleanup
     pop         {r4-r12, pc}  /* exit */
 /*
  * This is the start of the loop, designed to process images with small width
@@ -878,22 +877,22 @@ local skip1
     fetch_src_pixblock
     pixld       pixblock_size, mask_bpp, \
                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    process_pixblock_head
-    process_pixblock_tail
+    \process_pixblock_head
+    \process_pixblock_tail
     pixst       pixblock_size, dst_w_bpp, \
                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 1:
     /* Process the remaining trailing pixels in the scanline */
     process_trailing_pixels 0, 0, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
+                            \process_pixblock_head, \
+                            \process_pixblock_tail, \
+                            \process_pixblock_tail_head
     advance_to_next_scanline 8b
 9:
 .if regs_shortage
     pop         {r0, r1}
 .endif
-    cleanup
+    \cleanup
     pop         {r4-r12, pc}  /* exit */
 
     .purgem     fetch_src_pixblock
@@ -915,7 +914,7 @@ local skip1
     .unreq      PF_DST
     .unreq      PF_MASK
     .unreq      DUMMY
-    .endfunc
+    pixman_end_asm_function
 .endm
 
 /*
@@ -939,23 +938,23 @@ local skip1
                                                    src_basereg_   = 0, \
                                                    mask_basereg_  = 24
 
-    pixman_asm_function fname
+    pixman_asm_function \fname
 
     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 /*
  * Make some macro arguments globally visible and accessible
  * from other macros
  */
-    .set src_bpp, src_bpp_
-    .set mask_bpp, mask_bpp_
-    .set dst_w_bpp, dst_w_bpp_
-    .set pixblock_size, pixblock_size_
-    .set dst_w_basereg, dst_w_basereg_
-    .set dst_r_basereg, dst_r_basereg_
-    .set src_basereg, src_basereg_
-    .set mask_basereg, mask_basereg_
-
-.if use_nearest_scaling != 0
+    .set src_bpp, \src_bpp_
+    .set mask_bpp, \mask_bpp_
+    .set dst_w_bpp, \dst_w_bpp_
+    .set pixblock_size, \pixblock_size_
+    .set dst_w_basereg, \dst_w_basereg_
+    .set dst_r_basereg, \dst_r_basereg_
+    .set src_basereg, \src_basereg_
+    .set mask_basereg, \mask_basereg_
+
+.if \use_nearest_scaling != 0
     /*
      * Assign symbolic names to registers for nearest scaling
      */
@@ -971,7 +970,7 @@ local skip1
     SRC_WIDTH_FIXED .req        r7
 
     .macro pixld_src x:vararg
-        pixld_s x
+        pixld_s \x
     .endm
 
     ldr         UNIT_X, [sp]
@@ -991,16 +990,16 @@ local skip1
     MASK        .req        r3      /* mask pointer */
 
     .macro pixld_src x:vararg
-        pixld x
+        pixld \x
     .endm
 .endif
 
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
     .set dst_r_bpp, dst_w_bpp
 .else
     .set dst_r_bpp, 0
 .endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
     .set DEINTERLEAVE_32BPP_ENABLED, 1
 .else
     .set DEINTERLEAVE_32BPP_ENABLED, 0
@@ -1011,15 +1010,15 @@ local skip1
                     (src_basereg - pixblock_size * src_bpp / 64), SRC
     .endm
 
-    init
+    \init
     mov         DST_R, DST_W
 
     cmp         W, #pixblock_size
     blt         8f
 
-    ensure_destination_ptr_alignment process_pixblock_head, \
-                                     process_pixblock_tail, \
-                                     process_pixblock_tail_head
+    ensure_destination_ptr_alignment \process_pixblock_head, \
+                                     \process_pixblock_tail, \
+                                     \process_pixblock_tail_head
 
     subs        W, W, #pixblock_size
     blt         7f
@@ -1030,26 +1029,26 @@ local skip1
     fetch_src_pixblock
     pixld       pixblock_size, mask_bpp, \
                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-    process_pixblock_head
+    \process_pixblock_head
     subs        W, W, #pixblock_size
     blt         2f
 1:
-    process_pixblock_tail_head
+    \process_pixblock_tail_head
     subs        W, W, #pixblock_size
     bge         1b
 2:
-    process_pixblock_tail
+    \process_pixblock_tail
     pixst_a     pixblock_size, dst_w_bpp, \
                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 7:
     /* Process the remaining trailing pixels in the scanline (dst aligned) */
     process_trailing_pixels 0, 1, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
+                            \process_pixblock_head, \
+                            \process_pixblock_tail, \
+                            \process_pixblock_tail_head
 
-    cleanup
-.if use_nearest_scaling != 0
+    \cleanup
+.if \use_nearest_scaling != 0
     pop         {r4-r8, pc}  /* exit */
 .else
     bx          lr  /* exit */
@@ -1057,13 +1056,13 @@ local skip1
 8:
     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
     process_trailing_pixels 0, 0, \
-                            process_pixblock_head, \
-                            process_pixblock_tail, \
-                            process_pixblock_tail_head
+                            \process_pixblock_head, \
+                            \process_pixblock_tail, \
+                            \process_pixblock_tail_head
 
-    cleanup
+    \cleanup
 
-.if use_nearest_scaling != 0
+.if \use_nearest_scaling != 0
     pop         {r4-r8, pc}  /* exit */
 
     .unreq      DST_R
@@ -1090,15 +1089,15 @@ local skip1
     .purgem     fetch_src_pixblock
     .purgem     pixld_src
 
-    .endfunc
+    pixman_end_asm_function
 .endm
 
 .macro generate_composite_function_single_scanline x:vararg
-    generate_composite_function_scanline 0, x
+    generate_composite_function_scanline 0, \x
 .endm
 
 .macro generate_composite_function_nearest_scanline x:vararg
-    generate_composite_function_scanline 1, x
+    generate_composite_function_scanline 1, \x
 .endm
 
 /* Default prologue/epilogue, nothing special needs to be done */
@@ -1134,22 +1133,22 @@ local skip1
  *          value (in) is lost.
  */
 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
-    vshrn.u16   out_r, in,    #8
-    vshrn.u16   out_g, in,    #3
-    vsli.u16    in,    in,    #5
-    vmov.u8     out_a, #255
-    vsri.u8     out_r, out_r, #5
-    vsri.u8     out_g, out_g, #6
-    vshrn.u16   out_b, in,    #2
+    vshrn.u16   \out_r, \in,    #8
+    vshrn.u16   \out_g, \in,    #3
+    vsli.u16    \in,    \in,    #5
+    vmov.u8     \out_a, #255
+    vsri.u8     \out_r, \out_r, #5
+    vsri.u8     \out_g, \out_g, #6
+    vshrn.u16   \out_b, \in,    #2
 .endm
 
 .macro convert_0565_to_x888 in, out_r, out_g, out_b
-    vshrn.u16   out_r, in,    #8
-    vshrn.u16   out_g, in,    #3
-    vsli.u16    in,    in,    #5
-    vsri.u8     out_r, out_r, #5
-    vsri.u8     out_g, out_g, #6
-    vshrn.u16   out_b, in,    #2
+    vshrn.u16   \out_r, \in,    #8
+    vshrn.u16   \out_g, \in,    #3
+    vsli.u16    \in,    \in,    #5
+    vsri.u8     \out_r, \out_r, #5
+    vsri.u8     \out_g, \out_g, #6
+    vshrn.u16   \out_b, \in,    #2
 .endm
 
 /*
@@ -1159,11 +1158,11 @@ local skip1
  * registers (tmp1, tmp2)
  */
 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
-    vshll.u8    tmp1, in_g, #8
-    vshll.u8    out, in_r, #8
-    vshll.u8    tmp2, in_b, #8
-    vsri.u16    out, tmp1, #5
-    vsri.u16    out, tmp2, #11
+    vshll.u8    \tmp1, \in_g, #8
+    vshll.u8    \out, \in_r, #8
+    vshll.u8    \tmp2, \in_b, #8
+    vsri.u16    \out, \tmp1, #5
+    vsri.u16    \out, \tmp2, #11
 .endm
 
 /*
@@ -1173,12 +1172,12 @@ local skip1
  * value from 'in' is lost
  */
 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
-    vshl.u16    out0, in,   #5  /* G top 6 bits */
-    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
-    vsri.u16    in,   in,   #5  /* R is ready in top bits */
-    vsri.u16    out0, out0, #6  /* G is ready in top bits */
-    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
-    vshr.u16    out1, in,   #8  /* R is in place */
-    vsri.u16    out0, tmp,  #8  /* G & B is in place */
-    vzip.u16    out0, out1      /* everything is in place */
+    vshl.u16    \out0, \in,   #5  /* G top 6 bits */
+    vshl.u16    \tmp,  \in,   #11 /* B top 5 bits */
+    vsri.u16    \in,   \in,   #5  /* R is ready in top bits */
+    vsri.u16    \out0, \out0, #6  /* G is ready in top bits */
+    vsri.u16    \tmp,  \tmp,  #5  /* B is ready in top bits */
+    vshr.u16    \out1, \in,   #8  /* R is in place */
+    vsri.u16    \out0, \tmp,  #8  /* G & B is in place */
+    vzip.u16    \out0, \out1      /* everything is in place */
 .endm
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index e050292..cc62c81 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -39,6 +39,8 @@
 
 #include "pixman-arm-asm.h"
 
+	pixman_syntax_unified
+
 /*
  * Note: This code is only using armv5te instructions (not even armv6),
  *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
@@ -62,7 +64,7 @@
                                       prefetch_distance,        \
                                       prefetch_braking_distance
 
-pixman_asm_function fname
+pixman_asm_function \fname
 	W		.req	r0
 	DST		.req	r1
 	SRC		.req	r2
@@ -76,39 +78,39 @@ pixman_asm_function fname
 
 	ldr	UNIT_X, [sp]
 	push	{r4, r5, r6, r7, r8, r10}
-	mvn	VXMASK, #((1 << bpp_shift) - 1)
+	mvn	VXMASK, #((1 << \bpp_shift) - 1)
 	ldr	SRC_WIDTH_FIXED, [sp, #28]
 
 	/* define helper macro */
 	.macro	scale_2_pixels
-		ldr&t	TMP1, [SRC, TMP1]
-		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+		ldr\()\t	TMP1, [SRC, TMP1]
+		and	TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
 		adds	VX, VX, UNIT_X
-		str&t	TMP1, [DST], #(1 << bpp_shift)
-9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		str\()\t	TMP1, [DST], #(1 << \bpp_shift)
+9:		subspl	VX, VX, SRC_WIDTH_FIXED
 		bpl	9b
 
-		ldr&t	TMP2, [SRC, TMP2]
-		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+		ldr\()\t	TMP2, [SRC, TMP2]
+		and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
 		adds	VX, VX, UNIT_X
-		str&t	TMP2, [DST], #(1 << bpp_shift)
-9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		str\()\t	TMP2, [DST], #(1 << \bpp_shift)
+9:		subspl	VX, VX, SRC_WIDTH_FIXED
 		bpl	9b
 	.endm
 
 	/* now do the scaling */
-	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+	and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
 	adds	VX, VX, UNIT_X
-9:	subpls	VX, VX, SRC_WIDTH_FIXED
+9:	subspl	VX, VX, SRC_WIDTH_FIXED
 	bpl	9b
-	subs	W, W, #(8 + prefetch_braking_distance)
+	subs	W, W, #(8 + \prefetch_braking_distance)
 	blt	2f
 	/* calculate prefetch offset */
-	mov	PF_OFFS, #prefetch_distance
+	mov	PF_OFFS, #\prefetch_distance
 	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
 1:	/* main loop, process 8 pixels per iteration with prefetch */
-	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]
-	add	PF_OFFS, UNIT_X, lsl #3
+	pld	[SRC, PF_OFFS, asr #(16 - \bpp_shift)]
+	add	PF_OFFS, PF_OFFS, UNIT_X, lsl #3
 	scale_2_pixels
 	scale_2_pixels
 	scale_2_pixels
@@ -116,7 +118,7 @@ pixman_asm_function fname
 	subs	W, W, #8
 	bge	1b
 2:
-	subs	W, W, #(4 - 8 - prefetch_braking_distance)
+	subs	W, W, #(4 - 8 - \prefetch_braking_distance)
 	blt	2f
 1:	/* process the remaining pixels */
 	scale_2_pixels
@@ -129,8 +131,8 @@ pixman_asm_function fname
 	scale_2_pixels
 2:
 	tst	W, #1
-	ldrne&t	TMP1, [SRC, TMP1]
-	strne&t	TMP1, [DST]
+	ldr\()\t\()ne	TMP1, [SRC, TMP1]
+	str\()\t\()ne	TMP1, [DST]
 	/* cleanup helper macro */
 	.purgem	scale_2_pixels
 	.unreq	DST
@@ -146,7 +148,7 @@ pixman_asm_function fname
 	/* return */
 	pop	{r4, r5, r6, r7, r8, r10}
 	bx	lr
-.endfunc
+	pixman_end_asm_function
 .endm
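In this scaled fetcher the operand-size suffix arrives as the macro parameter t, and
unified syntax places that suffix before the condition code, so the old ldrne&t / strne&t
spellings become ldr\()\t\()ne / str\()\t\()ne, and subpls becomes subspl for the same
reason. A sketch of the composition; cond_load and the register choices are hypothetical:

        .syntax unified

        /* With t=h this expands to ldrhne (LDRH, executed only if NE); the
         * pre-unified spelling of the same instruction was ldrneh.
         */
        .macro cond_load t, rd, mem
            ldr\()\t\()ne   \rd, \mem
        .endm

            cmp       r0, #0
            cond_load h, r4, [r5]                    @ emits: ldrhne r4, [r5]
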
 
 generate_nearest_scanline_func \
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a74a0a8..34d38f1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -40,6 +40,8 @@
 #include "pixman-arm-asm.h"
 #include "pixman-arm-simd-asm.h"
 
+	pixman_syntax_unified
+
 /* A head macro should do all processing which results in an output of up to
  * 16 bytes, as far as the final load instruction. The corresponding tail macro
  * should complete the processing of the up-to-16 bytes. The calling macro will
@@ -57,7 +59,7 @@
 .endm
 
 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
 .endm
 
 .macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
@@ -65,8 +67,8 @@
     WK5     .req    STRIDE_S
     WK6     .req    MASK
     WK7     .req    STRIDE_M
-110:    pixld   , 16, 0, SRC, unaligned_src
-        pixld   , 16, 4, SRC, unaligned_src
+110:    pixld   , 16, 0, SRC, \unaligned_src
+        pixld   , 16, 4, SRC, \unaligned_src
         pld     [SRC, SCRATCH]
         pixst   , 16, 0, DST
         pixst   , 16, 4, DST
@@ -122,7 +124,7 @@ generate_composite_function \
 
 .macro src_n_0565_init
         ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #16
+        orr     SRC, SRC, SRC, lsl #16
         mov     STRIDE_S, SRC
         mov     MASK, SRC
         mov     STRIDE_M, SRC
@@ -130,8 +132,8 @@ generate_composite_function \
 
 .macro src_n_8_init
         ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
-        orr     SRC, SRC, lsl #8
-        orr     SRC, SRC, lsl #16
+        orr     SRC, SRC, SRC, lsl #8
+        orr     SRC, SRC, SRC, lsl #16
         mov     STRIDE_S, SRC
         mov     MASK, SRC
         mov     STRIDE_M, SRC
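The two solid-color setup macros above also pick up an explicit source register: the old
'orr SRC, SRC, lsl #16' leaned on the two-operand shorthand (destination reused as the
first source) together with a shifted second operand, a combination not every assembler
accepts, so the register is now spelled out in full; the same fix appears earlier in this
patch as 'add PF_OFFS, PF_OFFS, UNIT_X, lsl #3' in the scaled scanline code. A sketch of
what the idiom computes, with r0 and r1 standing in for SRC and the argument pointer:

        .syntax unified

        /* Replicate a loaded 16-bit 0565 pixel across the whole word so one
         * register holds two identical pixels.
         */
            ldrh    r0, [r1]                         @ r0 = 0x0000PPPP
            orr     r0, r0, r0, lsl #16              @ r0 = 0xPPPPPPPP
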
@@ -142,7 +144,7 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    MASK
     WK7     .req    STRIDE_M
-        pixst   cond, numbytes, 4, DST
+        pixst   \cond, \numbytes, 4, DST
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -182,20 +184,20 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro src_x888_8888_pixel, cond, reg
-        orr&cond WK&reg, WK&reg, #0xFF000000
+        orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
 .endm
 
 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
 .endm
 
 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
-        src_x888_8888_pixel cond, %(firstreg+0)
- .if numbytes >= 8
-        src_x888_8888_pixel cond, %(firstreg+1)
-  .if numbytes == 16
-        src_x888_8888_pixel cond, %(firstreg+2)
-        src_x888_8888_pixel cond, %(firstreg+3)
+        src_x888_8888_pixel \cond, %(\firstreg+0)
+ .if \numbytes >= 8
+        src_x888_8888_pixel \cond, %(\firstreg+1)
+  .if \numbytes == 16
+        src_x888_8888_pixel \cond, %(\firstreg+2)
+        src_x888_8888_pixel \cond, %(\firstreg+3)
   .endif
  .endif
 .endm
@@ -222,73 +224,73 @@ generate_composite_function \
 .endm
 
 .macro src_0565_8888_2pixels, reg1, reg2
-        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
-        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
-        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
-        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
-        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
-        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
-        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
-        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
-        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000
+        bic     WK\()\reg2, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK\()\reg1, WK\()\reg2, lsl #16             @ rrrrr000000bbbbb0000000000000000
+        mov     SCRATCH, SCRATCH, ror #19                   @ GGGG0000ggggggggggg00000GGGGGGGG
+        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
+        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
+        sel     WK\()\reg1, WK\()\reg1, SCRATCH             @ rrrrrrrrggggggggbbbbbbbb--------
+        mov     SCRATCH, SCRATCH, ror #16                   @ ggg00000GGGGGGGGGGGG0000gggggggg
+        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
+        sel     WK\()\reg2, WK\()\reg2, SCRATCH             @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8    @ 11111111rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8    @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
 .endm
 
 /* This version doesn't need STRIDE_M, but is one instruction longer.
    It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
-        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
-        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
-        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
-        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
-        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
-        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
-        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
-        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
-        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
+        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000
+        bic     WK\()\reg1, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK\()\reg2, WK\()\reg1, lsr #16             @ 0000000000000000RRRRR000000BBBBB
+        mov     SCRATCH, SCRATCH, ror #27                   @ GGGGGGGGGGGG0000ggggggggggg00000
+        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+        mov     WK\()\reg2, WK\()\reg2, lsl #3              @ 0000000000000RRRRR000000BBBBB000
+        mov     WK\()\reg1, WK\()\reg1, lsl #3              @ 0000000000000rrrrr000000bbbbb000
+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
+        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK\()\reg2, SCRATCH, WK\()\reg2             @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+        sel     WK\()\reg1, SCRATCH, WK\()\reg1             @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg2, WK\()\reg2, #0xFF000000         @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        orr     WK\()\reg1, WK\()\reg1, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb
 */
 
 .macro src_0565_8888_1pixel, reg
-        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
-        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
-        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
-        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
-        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
-        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
-        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
-        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
+        bic     SCRATCH, WK\()\reg, MASK                 @ 0000000000000000rrrrr000000bbbbb
+        and     WK\()\reg, WK\()\reg, MASK               @ 000000000000000000000gggggg00000
+        mov     SCRATCH, SCRATCH, lsl #3                 @ 0000000000000rrrrr000000bbbbb000
+        mov     WK\()\reg, WK\()\reg, lsl #5             @ 0000000000000000gggggg0000000000
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5        @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6  @ 000000000000000gggggggggggg00000
+        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5        @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK\()\reg, WK\()\reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK\()\reg, WK\()\reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
 .endm
 
 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
-        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
- .elseif numbytes == 8
-        pixld   , 4, firstreg, SRC, unaligned_src
- .elseif numbytes == 4
-        pixld   , 2, firstreg, SRC, unaligned_src
+ .if \numbytes == 16
+        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
+ .elseif \numbytes == 8
+        pixld   , 4, \firstreg, SRC, \unaligned_src
+ .elseif \numbytes == 4
+        pixld   , 2, \firstreg, SRC, \unaligned_src
  .endif
 .endm
 
 .macro src_0565_8888_process_tail   cond, numbytes, firstreg
- .if numbytes == 16
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
-        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
-        src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .if \numbytes == 16
+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
+        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
  .else
-        src_0565_8888_1pixel firstreg
+        src_0565_8888_1pixel \firstreg
  .endif
 .endm
 
@@ -311,23 +313,23 @@ generate_composite_function \
 .endm
 
 .macro src_x888_0565_1pixel  s, d
-        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
-        and     STRIDE_S, WK&s, #0xFC00            @ 0000000000000000gggggg0000000000
-        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
+        and     WK\()\d, MASK, WK\()\s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
+        and     STRIDE_S, WK\()\s, #0xFC00               @ 0000000000000000gggggg0000000000
+        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5        @ 00000000000-----rrrrr000000bbbbb
+        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
         /* Top 16 bits are discarded during the following STRH */
 .endm
 
 .macro src_x888_0565_2pixels  slo, shi, d, tmp
-        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000
-        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB
-        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb
-        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
-        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB
-        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000
-        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
-        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb
-        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+        and     SCRATCH, WK\()\shi, #0xFC00                 @ 0000000000000000GGGGGG0000000000
+        and     WK\()\tmp, MASK, WK\()\shi, lsr #3          @ 00000000000RRRRR00000000000BBBBB
+        and     WK\()\shi, MASK, WK\()\slo, lsr #3          @ 00000000000rrrrr00000000000bbbbb
+        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
+        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5       @ 00000000000-----RRRRRGGGGGGBBBBB
+        and     SCRATCH, WK\()\slo, #0xFC00                 @ 0000000000000000gggggg0000000000
+        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
+        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
+        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
 .endm
 
 .macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
@@ -335,33 +337,33 @@ generate_composite_function \
         WK5     .req    STRIDE_M
         WK6     .req    WK3
         WK7     .req    ORIG_W
- .if numbytes == 16
+ .if \numbytes == 16
         pixld   , 16, 4, SRC, 0
         src_x888_0565_2pixels  4, 5, 0, 0
         pixld   , 8, 4, SRC, 0
         src_x888_0565_2pixels  6, 7, 1, 1
         pixld   , 8, 6, SRC, 0
  .else
-        pixld   , numbytes*2, 4, SRC, 0
+        pixld   , \numbytes*2, 4, SRC, 0
  .endif
 .endm
 
 .macro src_x888_0565_process_tail   cond, numbytes, firstreg
- .if numbytes == 16
+ .if \numbytes == 16
         src_x888_0565_2pixels  4, 5, 2, 2
         src_x888_0565_2pixels  6, 7, 3, 4
- .elseif numbytes == 8
+ .elseif \numbytes == 8
         src_x888_0565_2pixels  4, 5, 1, 1
         src_x888_0565_2pixels  6, 7, 2, 2
- .elseif numbytes == 4
+ .elseif \numbytes == 4
         src_x888_0565_2pixels  4, 5, 1, 1
  .else
         src_x888_0565_1pixel  4, 1
  .endif
- .if numbytes == 16
-        pixst   , numbytes, 0, DST
+ .if \numbytes == 16
+        pixst   , \numbytes, 0, DST
  .else
-        pixst   , numbytes, 1, DST
+        pixst   , \numbytes, 1, DST
  .endif
         .unreq  WK4
         .unreq  WK5
@@ -382,37 +384,37 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro add_8_8_8pixels  cond, dst1, dst2
-        uqadd8&cond  WK&dst1, WK&dst1, MASK
-        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
+        uqadd8\()\cond  WK\()\dst1, WK\()\dst1, MASK
+        uqadd8\()\cond  WK\()\dst2, WK\()\dst2, STRIDE_M
 .endm
 
 .macro add_8_8_4pixels  cond, dst
-        uqadd8&cond  WK&dst, WK&dst, MASK
+        uqadd8\()\cond  WK\()\dst, WK\()\dst, MASK
 .endm
 
 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     WK4     .req    MASK
     WK5     .req    STRIDE_M
- .if numbytes == 16
-        pixld   cond, 8, 4, SRC, unaligned_src
-        pixld   cond, 16, firstreg, DST, 0
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
-        pixld   cond, 8, 4, SRC, unaligned_src
+ .if \numbytes == 16
+        pixld   \cond, 8, 4, SRC, \unaligned_src
+        pixld   \cond, 16, \firstreg, DST, 0
+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
+        pixld   \cond, 8, 4, SRC, \unaligned_src
  .else
-        pixld   cond, numbytes, 4, SRC, unaligned_src
-        pixld   cond, numbytes, firstreg, DST, 0
+        pixld   \cond, \numbytes, 4, SRC, \unaligned_src
+        pixld   \cond, \numbytes, \firstreg, DST, 0
  .endif
     .unreq  WK4
     .unreq  WK5
 .endm
 
 .macro add_8_8_process_tail  cond, numbytes, firstreg
- .if numbytes == 16
-        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
-        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .if \numbytes == 16
+        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
  .else
-        add_8_8_4pixels cond, firstreg
+        add_8_8_4pixels \cond, \firstreg
  .endif
 .endm
 
@@ -441,8 +443,8 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    STRIDE_M
     WK7     .req    ORIG_W
-        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -451,44 +453,44 @@ generate_composite_function \
 
 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
         /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
-        teq     WK&reg0, #0
- .if numbytes > 4
-        teqeq   WK&reg1, #0
-  .if numbytes > 8
-        teqeq   WK&reg2, #0
-        teqeq   WK&reg3, #0
+        teq     WK\()\reg0, #0
+ .if \numbytes > 4
+        teqeq   WK\()\reg1, #0
+  .if \numbytes > 8
+        teqeq   WK\()\reg2, #0
+        teqeq   WK\()\reg3, #0
   .endif
  .endif
 .endm
 
 .macro over_8888_8888_prepare  next
-        mov     WK&next, WK&next, lsr #24
+        mov     WK\()\next, WK\()\next, lsr #24
 .endm
 
 .macro over_8888_8888_1pixel src, dst, offset, next
         /* src = destination component multiplier */
-        rsb     WK&src, WK&src, #255
+        rsb     WK\()\src, WK\()\src, #255
         /* Split even/odd bytes of dst into SCRATCH/dst */
-        uxtb16  SCRATCH, WK&dst
-        uxtb16  WK&dst, WK&dst, ror #8
+        uxtb16  SCRATCH, WK\()\dst
+        uxtb16  WK\()\dst, WK\()\dst, ror #8
         /* Multiply through, adding 0.5 to the upper byte of result for rounding */
-        mla     SCRATCH, SCRATCH, WK&src, MASK
-        mla     WK&dst, WK&dst, WK&src, MASK
+        mla     SCRATCH, SCRATCH, WK\()\src, MASK
+        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
         /* Where we would have had a stall between the result of the first MLA and the shifter input,
          * reload the complete source pixel */
-        ldr     WK&src, [SRC, #offset]
+        ldr     WK\()\src, [SRC, #\offset]
         /* Multiply by 257/256 to approximate 256/255 */
         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
         /* In this stall, start processing the next pixel */
- .if offset < -4
-        mov     WK&next, WK&next, lsr #24
+ .if \offset < -4
+        mov     WK\()\next, WK\()\next, lsr #24
  .endif
-        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
         /* Recombine even/odd bytes of multiplied destination */
         mov     SCRATCH, SCRATCH, ror #8
-        sel     WK&dst, SCRATCH, WK&dst
+        sel     WK\()\dst, SCRATCH, WK\()\dst
         /* Saturated add of source to multiplied destination */
-        uqadd8  WK&dst, WK&dst, WK&src
+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
 .endm
 
 .macro over_8888_8888_process_tail  cond, numbytes, firstreg
@@ -496,17 +498,17 @@ generate_composite_function \
     WK5     .req    STRIDE_S
     WK6     .req    STRIDE_M
     WK7     .req    ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
         beq     10f
-        over_8888_8888_prepare  %(4+firstreg)
- .set PROCESS_REG, firstreg
- .set PROCESS_OFF, -numbytes
- .rept numbytes / 4
+        over_8888_8888_prepare  %(4+\firstreg)
+ .set PROCESS_REG, \firstreg
+ .set PROCESS_OFF, -\numbytes
+ .rept \numbytes / 4
         over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
   .set PROCESS_REG, PROCESS_REG+1
   .set PROCESS_OFF, PROCESS_OFF+4
  .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
     .unreq  WK5
@@ -536,16 +538,16 @@ generate_composite_function \
  */
 .macro mul_8888_8  word, byte, tmp, half
         /* Split even/odd bytes of word apart */
-        uxtb16  tmp, word
-        uxtb16  word, word, ror #8
+        uxtb16  \tmp, \word
+        uxtb16  \word, \word, ror #8
         /* Multiply bytes together with rounding, then by 257/256 */
-        mla     tmp, tmp, byte, half
-        mla     word, word, byte, half /* 1 stall follows */
-        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
-        uxtab16 word, word, word, ror #8
+        mla     \tmp, \tmp, \byte, \half
+        mla     \word, \word, \byte, \half /* 1 stall follows */
+        uxtab16 \tmp, \tmp, \tmp, ror #8  /* 1 stall follows */
+        uxtab16 \word, \word, \word, ror #8
         /* Recombine bytes */
-        mov     tmp, tmp, ror #8
-        sel     word, tmp, word
+        mov     \tmp, \tmp, ror #8
+        sel     \word, \tmp, \word
 .endm
 
 /******************************************************************************/
@@ -567,8 +569,8 @@ generate_composite_function \
     WK5     .req    STRIDE_D
     WK6     .req    STRIDE_S
     WK7     .req    ORIG_W
-        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
     .unreq  WK5
     .unreq  WK6
@@ -576,10 +578,10 @@ generate_composite_function \
 .endm
 
 .macro over_8888_n_8888_1pixel src, dst
-        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
-        sub     WK7, WK6, WK&src, lsr #24
-        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
-        uqadd8  WK&dst, WK&dst, WK&src
+        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK\()\src, lsr #24
+        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
 .endm
 
 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
@@ -587,12 +589,12 @@ generate_composite_function \
     WK5     .req    STRIDE_D
     WK6     .req    STRIDE_S
     WK7     .req    ORIG_W
-        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
         beq     10f
         mov     WK6, #255
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
-  .if numbytes == 16 && PROCESS_REG == 2
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+  .if \numbytes == 16 && PROCESS_REG == 2
         /* We're using WK6 and WK7 as temporaries, so half way through
          * 4 pixels, reload the second two source pixels but this time
          * into WK4 and WK5 */
@@ -601,7 +603,7 @@ generate_composite_function \
         over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
   .set PROCESS_REG, PROCESS_REG+1
  .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
     .unreq  WK5
@@ -642,13 +644,13 @@ generate_composite_function \
 
 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     WK4     .req    STRIDE_M
-        pixld   , numbytes/4, 4, MASK, unaligned_mask
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes/4, 4, MASK, \unaligned_mask
+        pixld   , \numbytes, \firstreg, DST, 0
     .unreq  WK4
 .endm
 
 .macro over_n_8_8888_1pixel src, dst
-        uxtb    Y, WK4, ror #src*8
+        uxtb    Y, WK4, ror #\src*8
         /* Trailing part of multiplication of source */
         mla     SCRATCH, STRIDE_S, Y, STRIDE_D
         mla     Y, SRC, Y, STRIDE_D
@@ -659,20 +661,20 @@ generate_composite_function \
         sub     ORIG_W, ORIG_W, Y, lsr #24
         sel     Y, SCRATCH, Y
         /* Then multiply the destination */
-        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
-        uqadd8  WK&dst, WK&dst, Y
+        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK\()\dst, WK\()\dst, Y
 .endm
 
 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg
     WK4     .req    STRIDE_M
         teq     WK4, #0
         beq     10f
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
-        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+        over_n_8_8888_1pixel  %(PROCESS_REG-\firstreg), %(PROCESS_REG)
   .set PROCESS_REG, PROCESS_REG+1
  .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 10:
     .unreq  WK4
 .endm
@@ -705,14 +707,14 @@ generate_composite_function \
 .endm
 
 .macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, \firstreg, DST, 0
 .endm
 
 .macro over_reverse_n_8888_1pixel  d, is_only
-        teq     WK&d, #0
+        teq     WK\()\d, #0
         beq     8f       /* replace with source */
-        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
- .if is_only == 1
+        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24
+ .if \is_only == 1
         beq     49f      /* skip store */
  .else
         beq     9f       /* write same value back */
@@ -723,36 +725,36 @@ generate_composite_function \
         uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
         mov     SCRATCH, SCRATCH, ror #8
         sel     ORIG_W, SCRATCH, ORIG_W
-        uqadd8  WK&d, WK&d, ORIG_W
+        uqadd8  WK\()\d, WK\()\d, ORIG_W
         b       9f
-8:      mov     WK&d, SRC
+8:      mov     WK\()\d, SRC
 9:
 .endm
 
 .macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
-        over_reverse_n_8888_1pixel  reg1, 1
+ .if \numbytes == 4
+        over_reverse_n_8888_1pixel  \reg1, 1
  .else
-        and     SCRATCH, WK&reg1, WK&reg2
-  .if numbytes == 16
-        and     SCRATCH, SCRATCH, WK&reg3
-        and     SCRATCH, SCRATCH, WK&reg4
+        and     SCRATCH, WK\()\reg1, WK\()\reg2
+  .if \numbytes == 16
+        and     SCRATCH, SCRATCH, WK\()\reg3
+        and     SCRATCH, SCRATCH, WK\()\reg4
   .endif
         mvns    SCRATCH, SCRATCH, asr #24
         beq     49f /* skip store if all opaque */
-        over_reverse_n_8888_1pixel  reg1, 0
-        over_reverse_n_8888_1pixel  reg2, 0
-  .if numbytes == 16
-        over_reverse_n_8888_1pixel  reg3, 0
-        over_reverse_n_8888_1pixel  reg4, 0
+        over_reverse_n_8888_1pixel  \reg1, 0
+        over_reverse_n_8888_1pixel  \reg2, 0
+  .if \numbytes == 16
+        over_reverse_n_8888_1pixel  \reg3, 0
+        over_reverse_n_8888_1pixel  \reg4, 0
   .endif
  .endif
-        pixst   , numbytes, reg1, DST
+        pixst   , \numbytes, \reg1, DST
 49:
 .endm
 
 .macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
-        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+        over_reverse_n_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
 .endm
 
 generate_composite_function \
@@ -794,20 +796,20 @@ generate_composite_function \
 
 .macro over_white_8888_8888_ca_combine  m, d
         uxtb16  TMP1, TMP0                /* rb_notmask */
-        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
+        uxtb16  TMP2, \d                  /* rb_dest; 1 stall follows */
         smlatt  TMP3, TMP2, TMP1, HALF    /* red */
         smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
         uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
-        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
-        smlatt  d, TMP1, TMP0, HALF       /* alpha */
+        uxtb16  TMP1, \d, ror #8          /* ag_dest; 1 stall follows */
+        smlatt  \d, TMP1, TMP0, HALF      /* alpha */
         smlabb  TMP1, TMP1, TMP0, HALF    /* green */
         pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
-        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
+        pkhbt   TMP1, TMP1, \d, lsl #16   /* ag */
         uxtab16 TMP0, TMP0, TMP0, ror #8
         uxtab16 TMP1, TMP1, TMP1, ror #8
         mov     TMP0, TMP0, ror #8
-        sel     d, TMP0, TMP1
-        uqadd8  d, d, m                   /* d is a late result */
+        sel     \d, TMP0, TMP1
+        uqadd8  \d, \d, \m                 /* d is a late result */
 .endm
 
 .macro over_white_8888_8888_ca_1pixel_head
@@ -853,10 +855,10 @@ generate_composite_function \
 .endm
 
 .macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 4
+ .if \numbytes == 4
         over_white_8888_8888_ca_1pixel_head
  .else
-  .if numbytes == 16
+  .if \numbytes == 16
         over_white_8888_8888_ca_2pixels_head
         over_white_8888_8888_ca_2pixels_tail
   .endif
@@ -865,7 +867,7 @@ generate_composite_function \
 .endm
 
 .macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
- .if numbytes == 4
+ .if \numbytes == 4
         over_white_8888_8888_ca_1pixel_tail
  .else
         over_white_8888_8888_ca_2pixels_tail
@@ -1004,7 +1006,7 @@ generate_composite_function \
 .endm
 
 .macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .rept (numbytes / 4) - 1
+ .rept (\numbytes / 4) - 1
         over_n_8888_8888_ca_1pixel_head
         over_n_8888_8888_ca_1pixel_tail
  .endr
@@ -1020,7 +1022,7 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
         cmp     ip, #-1
         beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
         /* else drop through... */
- .endfunc
+pixman_end_asm_function
 generate_composite_function \
     pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
@@ -1045,84 +1047,84 @@ generate_composite_function \
 
 .macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
         ldrb    ORIG_W, [SRC], #4
- .if numbytes >= 8
-        ldrb    WK&reg1, [SRC], #4
-  .if numbytes == 16
-        ldrb    WK&reg2, [SRC], #4
-        ldrb    WK&reg3, [SRC], #4
+ .if \numbytes >= 8
+        ldrb    WK\()\reg1, [SRC], #4
+  .if \numbytes == 16
+        ldrb    WK\()\reg2, [SRC], #4
+        ldrb    WK\()\reg3, [SRC], #4
   .endif
  .endif
-        add     DST, DST, #numbytes
+        add     DST, DST, #\numbytes
 .endm
 
 .macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+        in_reverse_8888_8888_head  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
 .endm
 
 .macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
- .if is_only != 1
-        movs    s, ORIG_W
-  .if offset != 0
-        ldrb    ORIG_W, [SRC, #offset]
+ .if \is_only != 1
+        movs    \s, ORIG_W
+  .if \offset != 0
+        ldrb    ORIG_W, [SRC, #\offset]
   .endif
         beq     01f
         teq     STRIDE_M, #0xFF
         beq     02f
  .endif
-        uxtb16  SCRATCH, d                 /* rb_dest */
-        uxtb16  d, d, ror #8               /* ag_dest */
-        mla     SCRATCH, SCRATCH, s, MASK
-        mla     d, d, s, MASK
+        uxtb16  SCRATCH, \d                 /* rb_dest */
+        uxtb16  \d, \d, ror #8               /* ag_dest */
+        mla     SCRATCH, SCRATCH, \s, MASK
+        mla     \d, \d, \s, MASK
         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-        uxtab16 d, d, d, ror #8
+        uxtab16 \d, \d, \d, ror #8
         mov     SCRATCH, SCRATCH, ror #8
-        sel     d, SCRATCH, d
+        sel     \d, SCRATCH, \d
         b       02f
- .if offset == 0
+ .if \offset == 0
 48:     /* Last mov d,#0 of the set - used as part of shortcut for
          * source values all 0 */
  .endif
-01:     mov     d, #0
+01:     mov     \d, #0
 02:
 .endm
 
 .macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
+ .if \numbytes == 4
         teq     ORIG_W, ORIG_W, asr #32
-        ldrne   WK&reg1, [DST, #-4]
- .elseif numbytes == 8
-        teq     ORIG_W, WK&reg1
+        ldrne   WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+        teq     ORIG_W, WK\()\reg1
         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg2}
+        ldmdbne DST, {WK\()\reg1-WK\()\reg2}
  .else
-        teq     ORIG_W, WK&reg1
-        teqeq   ORIG_W, WK&reg2
-        teqeq   ORIG_W, WK&reg3
+        teq     ORIG_W, WK\()\reg1
+        teqeq   ORIG_W, WK\()\reg2
+        teqeq   ORIG_W, WK\()\reg3
         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
-        ldmnedb DST, {WK&reg1-WK&reg4}
+        ldmdbne DST, {WK\()\reg1-WK\()\reg4}
  .endif
         cmnne   DST, #0   /* clear C if NE */
         bcs     49f       /* no writes to dest if source all -1 */
         beq     48f       /* set dest to all 0 if source all 0 */
- .if numbytes == 4
-        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1
-        str     WK&reg1, [DST, #-4]
- .elseif numbytes == 8
-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0
-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg2}
+ .if \numbytes == 4
+        in_reverse_8888_8888_1pixel  ORIG_W, WK\()\reg1, 0, 1
+        str     WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, 0, 0
+        stmdb   DST, {WK\()\reg1-WK\()\reg2}
  .else
-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0
-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0
-        stmdb   DST, {WK&reg1-WK&reg4}
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -12, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, -8, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg3, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg4, 0, 0
+        stmdb   DST, {WK\()\reg1-WK\()\reg4}
  .endif
 49:
 .endm
 
 .macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
-        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+        in_reverse_8888_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
 .endm
 
 generate_composite_function \
@@ -1149,21 +1151,21 @@ generate_composite_function \
 .endm
 
 .macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-        pixld   , numbytes, firstreg, DST, 0
+        pixld   , \numbytes, \firstreg, DST, 0
 .endm
 
 .macro over_n_8888_1pixel dst
-        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
-        uqadd8  WK&dst, WK&dst, SRC
+        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK
+        uqadd8  WK\()\dst, WK\()\dst, SRC
 .endm
 
 .macro over_n_8888_process_tail  cond, numbytes, firstreg
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
         over_n_8888_1pixel %(PROCESS_REG)
   .set PROCESS_REG, PROCESS_REG+1
  .endr
-        pixst   , numbytes, firstreg, DST
+        pixst   , \numbytes, \firstreg, DST
 .endm
 
 generate_composite_function \
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index da153c3..5ec19e0 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -112,64 +112,64 @@
  */
 
 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
- .if numbytes == 16
-  .if unaligned == 1
-        op&r&cond    WK&reg0, [base], #4
-        op&r&cond    WK&reg1, [base], #4
-        op&r&cond    WK&reg2, [base], #4
-        op&r&cond    WK&reg3, [base], #4
+ .if \numbytes == 16
+  .if \unaligned == 1
+        \op\()r\()\cond    WK\()\reg0, [\base], #4
+        \op\()r\()\cond    WK\()\reg1, [\base], #4
+        \op\()r\()\cond    WK\()\reg2, [\base], #4
+        \op\()r\()\cond    WK\()\reg3, [\base], #4
   .else
-        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
   .endif
- .elseif numbytes == 8
-  .if unaligned == 1
-        op&r&cond    WK&reg0, [base], #4
-        op&r&cond    WK&reg1, [base], #4
+ .elseif \numbytes == 8
+  .if \unaligned == 1
+        \op\()r\()\cond    WK\()\reg0, [\base], #4
+        \op\()r\()\cond    WK\()\reg1, [\base], #4
   .else
-        op&m&cond&ia base!, {WK&reg0,WK&reg1}
-  .endif
- .elseif numbytes == 4
-        op&r&cond    WK&reg0, [base], #4
- .elseif numbytes == 2
-        op&r&cond&h  WK&reg0, [base], #2
- .elseif numbytes == 1
-        op&r&cond&b  WK&reg0, [base], #1
+        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1}
+  .endif
+ .elseif \numbytes == 4
+        \op\()r\()\cond    WK\()\reg0, [\base], #4
+ .elseif \numbytes == 2
+        \op\()rh\()\cond   WK\()\reg0, [\base], #2
+ .elseif \numbytes == 1
+        \op\()rb\()\cond   WK\()\reg0, [\base], #1
  .else
-  .error "unsupported size: numbytes"
+  .error "unsupported size: \numbytes"
  .endif
 .endm
 
 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
- .if numbytes == 16
-        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
- .elseif numbytes == 8
-        stm&cond&db base, {WK&reg0,WK&reg1}
- .elseif numbytes == 4
-        str&cond    WK&reg0, [base, #-4]
- .elseif numbytes == 2
-        str&cond&h  WK&reg0, [base, #-2]
- .elseif numbytes == 1
-        str&cond&b  WK&reg0, [base, #-1]
+ .if \numbytes == 16
+        stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
+ .elseif \numbytes == 8
+        stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
+ .elseif \numbytes == 4
+        str\()\cond    WK\()\reg0, [\base, #-4]
+ .elseif \numbytes == 2
+        strh\()\cond   WK\()\reg0, [\base, #-2]
+ .elseif \numbytes == 1
+        strb\()\cond   WK\()\reg0, [\base, #-1]
  .else
-  .error "unsupported size: numbytes"
+  .error "unsupported size: \numbytes"
  .endif
 .endm
 
 .macro pixld cond, numbytes, firstreg, base, unaligned
-        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+        pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
 .endm
 
 .macro pixst cond, numbytes, firstreg, base
  .if (flags) & FLAG_DST_READWRITE
-        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+        pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
  .else
-        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+        pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
  .endif
 .endm
 
 .macro PF a, x:vararg
  .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
-        a x
+        \a \x
  .endif
 .endm
 
@@ -179,11 +179,11 @@
  * between 0 and prefetch_distance (inclusive) cache lines ahead so there
  * are no gaps when the inner loop starts.
  */
- .if bpp > 0
-        PF  bic,    ptr, base, #31
+ .if \bpp > 0
+        PF  bic,    \ptr, \base, #31
   .set OFFSET, 0
   .rept prefetch_distance+1
-        PF  pld,    [ptr, #OFFSET]
+        PF  pld,    [\ptr, #OFFSET]
    .set OFFSET, OFFSET+32
   .endr
  .endif
@@ -201,42 +201,42 @@
  * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
  * possible when there are 4 src bytes for every 1 dst byte).
  */
- .if bpp > 0
-  .ifc base,DST
+ .if \bpp > 0
+  .ifc \base,DST
         /* The test can be simplified further when preloading the destination */
-        PF  tst,    base, #16
+        PF  tst,    \base, #16
         PF  beq,    61f
   .else
-   .if bpp/dst_w_bpp == 4
-        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+   .if \bpp/dst_w_bpp == 4
+        PF  add,    SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
         PF  and,    SCRATCH, SCRATCH, #31
-        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
+        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
         PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
         PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
         PF  bcs,    61f
         PF  bpl,    60f
         PF  pld,    [ptr, #32*(prefetch_distance+2)]
    .else
-        PF  mov,    SCRATCH, base, lsl #32-5
-        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
-        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+        PF  mov,    SCRATCH, \base, lsl #32-5
+        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
+        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
         PF  bls,    61f
    .endif
   .endif
-60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
+60:     PF  pld,    [\ptr, #32*(prefetch_distance+1)]
 61:
  .endif
 .endm
 
 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
 .macro preload_middle   bpp, base, scratch_holds_offset
- .if bpp > 0
+ .if \bpp > 0
         /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
-  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
-   .if scratch_holds_offset
-        PF  pld,    [base, SCRATCH]
+  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
+   .if \scratch_holds_offset
+        PF  pld,    [\base, SCRATCH]
    .else
-        PF  bic,    SCRATCH, base, #31
+        PF  bic,    SCRATCH, \base, #31
         PF  pld,    [SCRATCH, #32*prefetch_distance]
    .endif
   .endif
@@ -244,28 +244,28 @@
 .endm
 
 .macro preload_trailing  bpp, bpp_shift, base
- .if bpp > 0
-  .if bpp*pix_per_block > 256
+ .if \bpp > 0
+  .if \bpp*pix_per_block > 256
         /* Calculations are more complex if more than one fetch per block */
-        PF  and,    WK1, base, #31
-        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
-        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
-        PF  bic,    SCRATCH, base, #31
+        PF  and,    WK1, \base, #31
+        PF  add,    WK1, WK1, WK0, lsl #\bpp_shift
+        PF  add,    WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
+        PF  bic,    SCRATCH, \base, #31
 80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
         PF  add,    SCRATCH, SCRATCH, #32
         PF  subs,   WK1, WK1, #32
         PF  bhi,    80b
   .else
         /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
-        PF  mov,    SCRATCH, base, lsl #32-5
-        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
-        PF  adceqs, SCRATCH, SCRATCH, #0
+        PF  mov,    SCRATCH, \base, lsl #32-5
+        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
+        PF  adcseq, SCRATCH, SCRATCH, #0
         /* The instruction above has two effects: ensures Z is only
          * set if C was clear (so Z indicates that both shifted quantities
          * were 0), and clears C if Z was set (so C indicates that the sum
          * of the shifted quantities was greater and not equal to 32) */
         PF  beq,    82f
-        PF  bic,    SCRATCH, base, #31
+        PF  bic,    SCRATCH, \base, #31
         PF  bcc,    81f
         PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
 81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
@@ -288,12 +288,12 @@
  * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
  * "base" - base address register of channel to preload (SRC, MASK or DST)
  */
- .if bpp > 0
-  .if narrow_case && (bpp <= dst_w_bpp)
+ .if \bpp > 0
+  .if \narrow_case && (\bpp <= dst_w_bpp)
         /* In these cases, each line for each channel is in either 1 or 2 cache lines */
-        PF  bic,    WK0, base, #31
+        PF  bic,    WK0, \base, #31
         PF  pld,    [WK0]
-        PF  add,    WK1, base, X, LSL #bpp_shift
+        PF  add,    WK1, \base, X, LSL #\bpp_shift
         PF  sub,    WK1, WK1, #1
         PF  bic,    WK1, WK1, #31
         PF  cmp,    WK1, WK0
@@ -301,9 +301,9 @@
         PF  pld,    [WK1]
 90:
   .else
-        PF  bic,    WK0, base, #31
+        PF  bic,    WK0, \base, #31
         PF  pld,    [WK0]
-        PF  add,    WK1, base, X, lsl #bpp_shift
+        PF  add,    WK1, \base, X, lsl #\bpp_shift
         PF  sub,    WK1, WK1, #1
         PF  bic,    WK1, WK1, #31
         PF  cmp,    WK1, WK0
@@ -319,56 +319,56 @@
 
 
 .macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
-        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
- .if decrementx
-        sub&cond X, X, #8*numbytes/dst_w_bpp
+        \process_head  \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
+ .if \decrementx
+        sub\()\cond X, X, #8*\numbytes/dst_w_bpp
  .endif
-        process_tail  cond, numbytes, firstreg
+        \process_tail  \cond, \numbytes, \firstreg
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
-        pixst   cond, numbytes, firstreg, DST
+        pixst   \cond, \numbytes, \firstreg, DST
  .endif
 .endm
 
 .macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_BRANCH_OVER
-  .ifc cond,mi
+  .ifc \cond,mi
         bpl     100f
   .endif
-  .ifc cond,cs
+  .ifc \cond,cs
         bcc     100f
   .endif
-  .ifc cond,ne
+  .ifc \cond,ne
         beq     100f
   .endif
-        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+        conditional_process1_helper  , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
 100:
  .else
-        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+        conditional_process1_helper  \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
  .endif
 .endm
 
 .macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
  .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
         /* Can't interleave reads and writes */
-        test
-        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+        \test
+        conditional_process1  \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
   .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
-        test
+        \test
   .endif
-        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+        conditional_process1  \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
  .else
         /* Can interleave reads and writes for better scheduling */
-        test
-        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
-        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
-  .if decrementx
-        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
-        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
-  .endif
-        process_tail  cond1, numbytes1, firstreg1
-        process_tail  cond2, numbytes2, firstreg2
-        pixst   cond1, numbytes1, firstreg1, DST
-        pixst   cond2, numbytes2, firstreg2, DST
+        \test
+        \process_head  \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
+        \process_head  \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
+  .if \decrementx
+        sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
+        sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
+  .endif
+        \process_tail  \cond1, \numbytes1, \firstreg1
+        \process_tail  \cond2, \numbytes2, \firstreg2
+        pixst   \cond1, \numbytes1, \firstreg1, DST
+        pixst   \cond2, \numbytes2, \firstreg2, DST
  .endif
 .endm
 
@@ -400,12 +400,12 @@
  .endif
         /* Use unaligned loads in all cases for simplicity */
  .if dst_w_bpp == 8
-        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
+        conditional_process2  test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
  .elseif dst_w_bpp == 16
         test_bits_1_0_ptr
-        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+        conditional_process1  cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
  .endif
-        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+        conditional_process2  test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
         ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
  .endif
@@ -424,12 +424,12 @@
 .endm
 
 .macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
-        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+        conditional_process2  test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
  .if dst_w_bpp == 16
         test_bits_1_0_pix
-        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+        conditional_process1  cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
  .elseif dst_w_bpp == 8
-        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+        conditional_process2  test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
  .endif
 .endm
 
@@ -438,7 +438,7 @@
 110:
  .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
  .rept pix_per_block*dst_w_bpp/128
-        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
+        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 1
   .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
         preload_middle  src_bpp, SRC, 1
   .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -453,9 +453,9 @@
          * preloads for, to achieve staggered prefetches for multiple channels, because there are
          * always two STMs per prefetch, so there is always an opposite STM on which to put the
          * preload. Note, no need to BIC the base register here */
-        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
+        PF  pld,    [DST, #32*prefetch_distance - \dst_alignment]
   .endif
-        process_tail  , 16, 0
+        \process_tail  , 16, 0
   .if !((flags) & FLAG_PROCESS_DOES_STORE)
         pixst   , 16, 0, DST
   .endif
@@ -470,11 +470,11 @@
  .if dst_r_bpp > 0
         tst     DST, #16
         bne     111f
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
         b       112f
 111:
  .endif
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
 112:
         /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
  .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
@@ -487,13 +487,13 @@
  .endif
         add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
         /* The remainder of the line is handled identically to the medium case */
-        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+        medium_case_inner_loop_and_trailing_pixels  \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
 .endm
 
 .macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
 120:
-        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
-        process_tail  , 16, 0
+        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 0
+        \process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
         pixst   , 16, 0, DST
  .endif
@@ -501,16 +501,16 @@
         bhs     120b
         /* Trailing pixels */
         tst     X, #128/dst_w_bpp - 1
-        beq     exit_label
-        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+        beq     \exit_label
+        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask
 .endm
 
 .macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
         tst     X, #16*8/dst_w_bpp
-        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+        conditional_process1  ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
         /* Trailing pixels */
         /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
-        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask
 .endm
 
 .macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
@@ -523,37 +523,37 @@
         tst     SRC, #3
         bne     140f
   .endif
-        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
   .if src_bpp == 8 || src_bpp == 16
-        b       exit_label
+        b       \exit_label
 140:
-        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
   .endif
  .if mask_bpp == 8 || mask_bpp == 16
-        b       exit_label
+        b       \exit_label
 141:
   .if src_bpp == 8 || src_bpp == 16
         tst     SRC, #3
         bne     142f
   .endif
-        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
   .if src_bpp == 8 || src_bpp == 16
-        b       exit_label
+        b       \exit_label
 142:
-        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
   .endif
  .endif
 .endm
 
 
 .macro end_of_line      restore_x, vars_spilled, loop_label, last_one
- .if vars_spilled
+ .if \vars_spilled
         /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
         /* This is ldmia sp,{} */
         .word   0xE89D0000 | LINE_SAVED_REGS
  .endif
         subs    Y, Y, #1
- .if vars_spilled
+ .if \vars_spilled
   .if (LINE_SAVED_REGS) & (1<<1)
         str     Y, [sp]
   .endif
@@ -565,18 +565,18 @@
  .if mask_bpp > 0
         add     MASK, MASK, STRIDE_M
  .endif
- .if restore_x
+ .if \restore_x
         mov     X, ORIG_W
  .endif
-        bhs     loop_label
- .ifc "last_one",""
-  .if vars_spilled
+        bhs     \loop_label
+ .ifc "\last_one",""
+  .if \vars_spilled
         b       197f
   .else
         b       198f
   .endif
  .else
-  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+  .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
         b       198f
   .endif
  .endif
@@ -596,17 +596,17 @@
                                    process_tail, \
                                    process_inner_loop
 
-    pixman_asm_function fname
+    pixman_asm_function \fname
 
 /*
  * Make some macro arguments globally visible and accessible
  * from other macros
  */
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set flags, flags_
- .set prefetch_distance, prefetch_distance_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set flags, \flags_
+ .set prefetch_distance, \prefetch_distance_
 
 /*
  * Select prefetch type for this function.
@@ -732,7 +732,7 @@
         sub     Y, Y, #1
 #endif
 
-        init
+        \init
 
  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
         /* Reserve a word in which to store X during leading pixels */
@@ -773,7 +773,7 @@
    .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
   .endif
 151:    /* New line */
-        newline
+        \newline
         preload_leading_step1  src_bpp, WK1, SRC
         preload_leading_step1  mask_bpp, WK2, MASK
   .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -790,7 +790,7 @@
         preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
   .endif
 
-        leading_15bytes  process_head, process_tail
+        leading_15bytes  \process_head, \process_tail
         
 154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
   .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -800,10 +800,10 @@
         and     SCRATCH, MASK, #31
         rsb     SCRATCH, SCRATCH, #32*prefetch_distance
   .endif
-  .ifc "process_inner_loop",""
-        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+  .ifc "\process_inner_loop",""
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
   .else
-        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
   .endif
 
 157:    /* Check for another line */
@@ -825,7 +825,7 @@
   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .endif
 161:    /* New line */
-        newline
+        \newline
         preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
         preload_line 0, mask_bpp, mask_bpp_shift, MASK
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -837,10 +837,10 @@
         beq     164f
         rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
         
-        leading_15bytes  process_head, process_tail
+        leading_15bytes  \process_head, \process_tail
         
 164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
-        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
         
 167:    /* Check for another line */
         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
@@ -856,7 +856,7 @@
         .word   0xE92D0000 | LINE_SAVED_REGS
  .endif
 171:    /* New line */
-        newline
+        \newline
         preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
         preload_line 1, mask_bpp, mask_bpp_shift, MASK
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -868,8 +868,8 @@
         beq     174f
 172:    subs    X, X, #1
         blo     177f
-        process_head  , 1, 0, 1, 1, 0
-        process_tail  , 1, 0
+        \process_head  , 1, 0, 1, 1, 0
+        \process_tail  , 1, 0
   .if !((flags) & FLAG_PROCESS_DOES_STORE)
         pixst   , 1, 0, DST
   .endif
@@ -880,15 +880,15 @@
         beq     174f
         subs    X, X, #1
         blo     177f
-        process_head  , 2, 0, 1, 1, 0
-        process_tail  , 2, 0
+        \process_head  , 2, 0, 1, 1, 0
+        \process_tail  , 2, 0
   .if !((flags) & FLAG_PROCESS_DOES_STORE)
         pixst   , 2, 0, DST
   .endif
  .endif
 
 174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
-        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
 
 177:    /* Check for another line */
         end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
@@ -908,7 +908,7 @@
         add     sp, sp, #4
  .endif
 
-        cleanup
+        \cleanup
 
 #ifdef DEBUG_PARAMS
         add     sp, sp, #9*4 /* junk the debug copy of arguments */
@@ -932,13 +932,13 @@
     .unreq  WK3
     .unreq  SCRATCH
     .unreq  ORIG_W
-    .endfunc
+    pixman_end_asm_function
 .endm
 
 .macro line_saved_regs  x:vararg
  .set LINE_SAVED_REGS, 0
  .set LINE_SAVED_REG_COUNT, 0
- .irp SAVED_REG,x
+ .irp SAVED_REG,\x
   .ifc "SAVED_REG","Y"
    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1


More information about the xorg-commit mailing list