[Pixman] [PATCH 3/5] ARMv6: Rewrite of non-scaled fast paths
Ben Avison
bavison at riscosopen.org
Fri Dec 21 10:51:03 PST 2012
There is very little in common with the previous revision of this source
file, but I present it as a patch nevertheless.
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index b438001..8700da9 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1,14 +1,14 @@
/*
- * Copyright © 2008 Mozilla Corporation
- * Copyright © 2010 Nokia Corporation
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
+ * documentation, and that the name of the copyright holders not be used in
* advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Mozilla Corporation makes no
+ * specific, written prior permission. The copyright holders make no
* representations about the suitability of this software for any purpose. It
* is provided "as is" without express or implied warranty.
*
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
- * Author: Jeff Muizelaar (jeff at infidigm.net)
+ * Author: Ben Avison (bavison at riscosopen.org)
*
*/
@@ -37,412 +37,552 @@
.altmacro
.p2align 2
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
- .func fname
- .global fname
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
-.endm
+#include "pixman-arm-simd-asm.h"
-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ * cond ARM condition code for code block
+ * numbytes Number of output bytes that should be generated this time
+ * firstreg First WK register in which to place output
+ * unaligned_src Whether to use non-wordaligned loads of source image
+ * unaligned_mask Whether to use non-wordaligned loads of mask image
+ * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
*/
-pixman_asm_function pixman_composite_add_8_8_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- mov r10, r1
- sub sp, sp, #4
- subs r10, r10, #1
- mov r11, r0
- mov r8, r2
- str r3, [sp]
- ldr r7, [sp, #36]
- bcc 0f
-6: cmp r11, #0
- beq 1f
- orr r3, r8, r7
- tst r3, #3
- beq 2f
- mov r1, r8
- mov r0, r7
- mov r12, r11
- b 3f
-5: tst r3, #3
- beq 4f
-3: ldrb r2, [r0], #1
- subs r12, r12, #1
- ldrb r3, [r1]
- uqadd8 r3, r2, r3
- strb r3, [r1], #1
- orr r3, r1, r0
- bne 5b
-1: ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
-10: subs r10, r10, #1
- bcs 6b
-0: add sp, sp, #4
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-2: mov r12, r11
- mov r1, r8
- mov r0, r7
-4: cmp r12, #3
- subgt r6, r12, #4
- movgt r9, r12
- lsrgt r5, r6, #2
- addgt r3, r5, #1
- movgt r12, #0
- lslgt r4, r3, #2
- ble 7f
-8: ldr r3, [r0, r12]
- ldr r2, [r1, r12]
- uqadd8 r3, r3, r2
- str r3, [r1, r12]
- add r12, r12, #4
- cmp r12, r4
- bne 8b
- sub r3, r9, #4
- bic r3, r3, #3
- add r3, r3, #4
- subs r12, r6, r5, lsl #2
- add r1, r1, r3
- add r0, r0, r3
- beq 1b
-7: mov r4, #0
-9: ldrb r3, [r1, r4]
- ldrb r2, [r0, r4]
- uqadd8 r3, r2, r3
- strb r3, [r1, r4]
- add r4, r4, #1
- cmp r4, r12
- bne 9b
- ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
- b 10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #20
- cmp r1, #0
- mov r12, r2
- str r1, [sp, #12]
- str r0, [sp, #16]
- ldr r2, [sp, #52]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp]
- ldr r3, [sp, #56]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-6: ldr r11, [sp, #8]
-1: ldr r9, [sp]
- mov r0, r12
- add r12, r12, r9
- mov r1, r2
- str r12, [sp, #4]
- add r2, r2, r11
- ldr r12, [sp, #16]
- ldr r3, =0x00800080
- ldr r9, =0xff00ff00
- mov r11, #255
- cmp r12, #0
- beq 4f
-5: ldr r5, [r1], #4
- ldr r4, [r0]
- sub r8, r11, r5, lsr #24
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mla r6, r6, r8, r3
- mla r7, r7, r8, r3
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- and r7, r7, r9
- uxtab16 r6, r7, r6, ror #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 5b
-4: ldr r3, [sp, #12]
- add r10, r10, #1
- cmp r10, r3
- ldr r12, [sp, #4]
- bne 6b
-0: add sp, sp, #20
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- str r1, [sp, #12]
- ldrb r1, [sp, #71]
- mov r12, r2
- str r0, [sp, #16]
- ldr r2, [sp, #60]
- str r1, [sp, #24]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #20]
- ldr r3, [sp, #64]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-5: ldr r11, [sp, #8]
-1: ldr r4, [sp, #20]
- mov r0, r12
- mov r1, r2
- add r12, r12, r4
- add r2, r2, r11
- str r12, [sp]
- str r2, [sp, #4]
- ldr r12, [sp, #16]
- ldr r2, =0x00800080
- ldr r3, [sp, #24]
- mov r11, #255
- cmp r12, #0
- beq 3f
-4: ldr r5, [r1], #4
- ldr r4, [r0]
- uxtb16 r6, r5
- uxtb16 r7, r5, ror #8
- mla r6, r6, r3, r2
- mla r7, r7, r3, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- sub r8, r11, r5, lsr #24
- mla r6, r6, r8, r2
- mla r7, r7, r8, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r1, [sp, #12]
- add r10, r10, #1
- cmp r10, r1
- ldr r12, [sp]
- ldr r2, [sp, #4]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- ldr r9, [sp, #60]
- str r1, [sp, #12]
- bic r1, r9, #-16777216
- str r1, [sp, #20]
- mov r12, r2
- lsr r1, r9, #8
- ldr r2, [sp, #20]
- bic r1, r1, #-16777216
- bic r2, r2, #65280
- bic r1, r1, #65280
- str r2, [sp, #20]
- str r0, [sp, #16]
- str r1, [sp, #4]
- ldr r2, [sp, #68]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #24]
- mov r0, #0
- b 1f
-5: ldr r3, [sp, #24]
-1: ldr r4, [sp, #72]
- mov r10, r12
- mov r1, r2
- add r12, r12, r3
- add r2, r2, r4
- str r12, [sp, #8]
- str r2, [sp]
- ldr r12, [sp, #16]
- ldr r11, =0x00800080
- ldr r2, [sp, #4]
- ldr r3, [sp, #20]
- cmp r12, #0
- beq 3f
-4: ldrb r5, [r1], #1
- ldr r4, [r10]
- mla r6, r3, r5, r11
- mla r7, r2, r5, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mvn r8, r5
- lsr r8, r8, #24
- mla r6, r6, r8, r11
- mla r7, r7, r8, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r10], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r4, [sp, #12]
- add r0, r0, #1
- cmp r0, r4
- ldr r12, [sp, #8]
- ldr r2, [sp]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
+.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld cond, numbytes, firstreg, SRC, unaligned_src
+.endm
-/*
- * Note: This code is only using armv5te instructions (not even armv6),
- * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
- * be split into a few variants, tuned for each microarchitecture.
- *
- * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
- * have efficient write combining), it needs to be changed to use 16-byte
- * aligned writes using STM instruction.
- *
- * Nearest scanline scaler macro template uses the following arguments:
- * fname - name of the function to generate
- * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
- * t - type suffix for LDR/STR instructions
- * prefetch_distance - prefetch in the source image by that many
- * pixels ahead
- * prefetch_braking_distance - stop prefetching when that many pixels are
- * remaining before the end of scanline
+generate_composite_function \
+ pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+ 3, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ blit_process_head, \
+ nop_macro /* process tail */
+
+generate_composite_function \
+ pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+ 3, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ blit_process_head, \
+ nop_macro /* process tail */
+
+generate_composite_function \
+ pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+ 3, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ blit_process_head, \
+ nop_macro /* process tail */
+
+/******************************************************************************/
+
+.macro src_n_8888_init
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro src_n_0565_init
+ ldrh SRC, [sp, #ARGS_STACK_OFFSET]
+ orr SRC, SRC, lsl #16
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro src_n_8_init
+ ldrb SRC, [sp, #ARGS_STACK_OFFSET]
+ orr SRC, SRC, lsl #8
+ orr SRC, SRC, lsl #16
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro fill_process_tail cond, numbytes, firstreg
+ WK4 .req SRC
+ WK5 .req STRIDE_S
+ WK6 .req MASK
+ WK7 .req STRIDE_M
+ pixst cond, numbytes, 4, DST
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_8888_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+generate_composite_function \
+ pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_0565_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+generate_composite_function \
+ pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_8_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+/******************************************************************************/
+
+.macro src_x888_8888_pixel, cond, reg
+ orr&cond WK®, WK®, #0xFF000000
+.endm
+
+.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
+ src_x888_8888_pixel cond, %(firstreg+0)
+ .if numbytes >= 8
+ src_x888_8888_pixel cond, %(firstreg+1)
+ .if numbytes == 16
+ src_x888_8888_pixel cond, %(firstreg+2)
+ src_x888_8888_pixel cond, %(firstreg+3)
+ .endif
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
+ 3, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ pixman_composite_src_x888_8888_process_head, \
+ pixman_composite_src_x888_8888_process_tail
+
+/******************************************************************************/
+
+.macro src_0565_8888_init
+ /* Hold loop invariants in MASK and STRIDE_M */
+ ldr MASK, =0x07E007E0
+ mov STRIDE_M, #0xFF000000
+ /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+ ldr SCRATCH, =0x80008000
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro src_0565_8888_2pixels, reg1, reg2
+ and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK®2, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK®1, WK®2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+ bic WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK®1, WK®1, WK®1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr WK®2, WK®2, WK®2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK®1, WK®1, WK®1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+ sel WK®1, WK®1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+ pkhtb WK®2, WK®2, WK®2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+ sel WK®2, WK®2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+ orr WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ orr WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/* This version doesn't need STRIDE_M, but is one instruction longer.
+ It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+ and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK®1, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK®2, WK®1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+ bic WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+ mov WK®2, WK®2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+ mov WK®1, WK®1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ orr WK®2, WK®2, WK®2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+ orr WK®1, WK®1, WK®1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ pkhbt WK®2, WK®2, WK®2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+ pkhbt WK®1, WK®1, WK®1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK®2, SCRATCH, WK®2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+ sel WK®1, SCRATCH, WK®1 @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK®2, WK®2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ orr WK®1, WK®1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+*/
+
+.macro src_0565_8888_1pixel, reg
+ bic SCRATCH, WK®, MASK @ 0000000000000000rrrrr000000bbbbb
+ and WK®, WK®, MASK @ 000000000000000000000gggggg00000
+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ mov WK®, WK®, lsl #5 @ 0000000000000000gggggg0000000000
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ orr WK®, WK®, WK®, lsr #6 @ 000000000000000gggggggggggg00000
+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK®, WK®, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK®, WK®, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+ pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8
+ pixld , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4
+ pixld , 2, firstreg, SRC, unaligned_src
+ .endif
+.endm
+
+.macro src_0565_8888_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .else
+ src_0565_8888_1pixel firstreg
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+ 3, /* prefetch distance */ \
+ src_0565_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_0565_8888_process_head, \
+ src_0565_8888_process_tail
+
+/******************************************************************************/
+
+.macro add_8_8_8pixels cond, dst1, dst2
+ uqadd8&cond WK&dst1, WK&dst1, MASK
+ uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
+.endm
+
+.macro add_8_8_4pixels cond, dst
+ uqadd8&cond WK&dst, WK&dst, MASK
+.endm
+
+.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req MASK
+ WK5 .req STRIDE_M
+ .if numbytes == 16
+ pixld cond, 8, 4, SRC, unaligned_src
+ pixld cond, 16, firstreg, DST, 0
+ add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ pixld cond, 8, 4, SRC, unaligned_src
+ .else
+ pixld cond, numbytes, 4, SRC, unaligned_src
+ pixld cond, numbytes, firstreg, DST, 0
+ .endif
+ .unreq WK4
+ .unreq WK5
+.endm
+
+.macro add_8_8_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+ add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .else
+ add_8_8_4pixels cond, firstreg
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER, \
+ 2, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ add_8_8_process_head, \
+ add_8_8_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_init
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x00800080
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+ line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req STRIDE_M
+ WK7 .req ORIG_W
+ pixld , numbytes, %(4+firstreg), SRC, unaligned_src
+ pixld , numbytes, firstreg, DST, 0
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
+ /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+ teq WK®0, #0
+ .if numbytes > 4
+ teqeq WK®1, #0
+ .if numbytes > 8
+ teqeq WK®2, #0
+ teqeq WK®3, #0
+ .endif
+ .endif
+.endm
+
+.macro over_8888_8888_prepare next
+ mov WK&next, WK&next, lsr #24
+.endm
+
+.macro over_8888_8888_1pixel src, dst, offset, next
+ /* src = destination component multiplier */
+ rsb WK&src, WK&src, #255
+ /* Split even/odd bytes of dst into SCRATCH/dst */
+ uxtb16 SCRATCH, WK&dst
+ uxtb16 WK&dst, WK&dst, ror #8
+ /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+ mla SCRATCH, SCRATCH, WK&src, MASK
+ mla WK&dst, WK&dst, WK&src, MASK
+ /* Where we would have had a stall between the result of the first MLA and the shifter input,
+ * reload the complete source pixel */
+ ldr WK&src, [SRC, #offset]
+ /* Multiply by 257/256 to approximate 256/255 */
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ /* In this stall, start processing the next pixel */
+ .if offset < -4
+ mov WK&next, WK&next, lsr #24
+ .endif
+ uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+ /* Recombine even/odd bytes of multiplied destination */
+ mov SCRATCH, SCRATCH, ror #8
+ sel WK&dst, SCRATCH, WK&dst
+ /* Saturated add of source to multiplied destination */
+ uqadd8 WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_8888_process_tail cond, numbytes, firstreg
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req STRIDE_M
+ WK7 .req ORIG_W
+ over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+ beq 10f
+ over_8888_8888_prepare %(4+firstreg)
+ .set PROCESS_REG, firstreg
+ .set PROCESS_OFF, -numbytes
+ .rept numbytes / 4
+ over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+ pixst , numbytes, firstreg, DST
+10:
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+ 2, /* prefetch distance */ \
+ over_8888_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ over_8888_8888_process_head, \
+ over_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* Multiply each byte of a word by a byte.
+ * Useful when there aren't any obvious ways to fill the stalls with other instructions.
+ * word Register containing 4 bytes
+ * byte Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp Scratch register
+ * half Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
*/
+.macro mul_8888_8 word, byte, tmp, half
+ /* Split even/odd bytes of word apart */
+ uxtb16 tmp, word
+ uxtb16 word, word, ror #8
+ /* Multiply bytes together with rounding, then by 257/256 */
+ mla tmp, tmp, byte, half
+ mla word, word, byte, half /* 1 stall follows */
+ uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
+ uxtab16 word, word, word, ror #8
+ /* Recombine bytes */
+ mov tmp, tmp, ror #8
+ sel word, tmp, word
+.endm
+
+/******************************************************************************/
+
+.macro over_8888_n_8888_init
+ /* Mask is constant */
+ ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
+ /* Hold loop invariant in STRIDE_M */
+ ldr STRIDE_M, =0x00800080
+ /* We only want the alpha bits of the constant mask */
+ mov MASK, MASK, lsr #24
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, STRIDE_M, STRIDE_M
+ line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req Y
+ WK5 .req STRIDE_D
+ WK6 .req STRIDE_S
+ WK7 .req ORIG_W
+ pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+ pixld , numbytes, firstreg, DST, 0
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+.macro over_8888_n_8888_1pixel src, dst
+ mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
+ sub WK7, WK6, WK&src, lsr #24
+ mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
+ uqadd8 WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
+ WK4 .req Y
+ WK5 .req STRIDE_D
+ WK6 .req STRIDE_S
+ WK7 .req ORIG_W
+ over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+ beq 10f
+ mov WK6, #255
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+ .if numbytes == 16 && PROCESS_REG == 2
+ /* We're using WK6 and WK7 as temporaries, so half way through
+ * 4 pixels, reload the second two source pixels but this time
+ * into WK4 and WK5 */
+ ldmdb SRC, {WK4, WK5}
+ .endif
+ over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ pixst , numbytes, firstreg, DST
+10:
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+ 2, /* prefetch distance */ \
+ over_8888_n_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ over_8888_n_8888_process_head, \
+ over_8888_n_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8_8888_init
+ /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
+ ldr SCRATCH, =0x00800080
+ uxtb16 STRIDE_S, SRC
+ uxtb16 SRC, SRC, ror #8
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+ line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8_8888_newline
+ ldr STRIDE_D, =0x00800080
+ b 1f
+ .ltorg
+1:
+.endm
+
+.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_M
+ pixld , numbytes/4, 4, MASK, unaligned_mask
+ pixld , numbytes, firstreg, DST, 0
+ .unreq WK4
+.endm
+
+.macro over_n_8_8888_1pixel src, dst
+ uxtb Y, WK4, ror #src*8
+ /* Trailing part of multiplication of source */
+ mla SCRATCH, STRIDE_S, Y, STRIDE_D
+ mla Y, SRC, Y, STRIDE_D
+ mov ORIG_W, #255
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ uxtab16 Y, Y, Y, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+ sub ORIG_W, ORIG_W, Y, lsr #24
+ sel Y, SCRATCH, Y
+ /* Then multiply the destination */
+ mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
+ uqadd8 WK&dst, WK&dst, Y
+.endm
+
+.macro over_n_8_8888_process_tail cond, numbytes, firstreg
+ WK4 .req STRIDE_M
+ teq WK4, #0
+ beq 10f
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+ over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ pixst , numbytes, firstreg, DST
+10:
+ .unreq WK4
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_ONLY_PRELOAD_WIDE \
+ 2, /* prefetch distance */ \
+ over_n_8_8888_init, \
+ over_n_8_8888_newline, \
+ nop_macro, /* cleanup */ \
+ over_n_8_8888_process_head, \
+ over_n_8_8888_process_tail
+
+/******************************************************************************/
-.macro generate_nearest_scanline_func fname, bpp_shift, t, \
- prefetch_distance, \
- prefetch_braking_distance
-
-pixman_asm_function fname
- W .req r0
- DST .req r1
- SRC .req r2
- VX .req r3
- UNIT_X .req ip
- TMP1 .req r4
- TMP2 .req r5
- VXMASK .req r6
- PF_OFFS .req r7
- SRC_WIDTH_FIXED .req r8
-
- ldr UNIT_X, [sp]
- push {r4, r5, r6, r7, r8, r10}
- mvn VXMASK, #((1 << bpp_shift) - 1)
- ldr SRC_WIDTH_FIXED, [sp, #28]
-
- /* define helper macro */
- .macro scale_2_pixels
- ldr&t TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
- adds VX, VX, UNIT_X
- str&t TMP1, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 9b
-
- ldr&t TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
- adds VX, VX, UNIT_X
- str&t TMP2, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 9b
- .endm
-
- /* now do the scaling */
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
- adds VX, VX, UNIT_X
-9: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 9b
- subs W, W, #(8 + prefetch_braking_distance)
- blt 2f
- /* calculate prefetch offset */
- mov PF_OFFS, #prefetch_distance
- mla PF_OFFS, UNIT_X, PF_OFFS, VX
-1: /* main loop, process 8 pixels per iteration with prefetch */
- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
- add PF_OFFS, UNIT_X, lsl #3
- scale_2_pixels
- scale_2_pixels
- scale_2_pixels
- scale_2_pixels
- subs W, W, #8
- bge 1b
-2:
- subs W, W, #(4 - 8 - prefetch_braking_distance)
- blt 2f
-1: /* process the remaining pixels */
- scale_2_pixels
- scale_2_pixels
- subs W, W, #4
- bge 1b
-2:
- tst W, #2
- beq 2f
- scale_2_pixels
-2:
- tst W, #1
- ldrne&t TMP1, [SRC, TMP1]
- strne&t TMP1, [DST]
- /* cleanup helper macro */
- .purgem scale_2_pixels
- .unreq DST
- .unreq SRC
- .unreq W
- .unreq VX
- .unreq UNIT_X
- .unreq TMP1
- .unreq TMP2
- .unreq VXMASK
- .unreq PF_OFFS
- .unreq SRC_WIDTH_FIXED
- /* return */
- pop {r4, r5, r6, r7, r8, r10}
- bx lr
-.endfunc
-.endm
-
-generate_nearest_scanline_func \
- pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
-
-generate_nearest_scanline_func \
- pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
More information about the Pixman
mailing list