pixman: Branch 'master' - 5 commits
Siarhei Siamashka
siamashka at kemper.freedesktop.org
Tue Jan 29 12:00:52 PST 2013
pixman/Makefile.am | 4
pixman/pixman-arm-simd-asm-scaled.S | 165 ++++++
pixman/pixman-arm-simd-asm.S | 981 +++++++++++++++++++++---------------
pixman/pixman-arm-simd-asm.h | 908 +++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 505 ++++++------------
5 files changed, 1814 insertions(+), 749 deletions(-)
New commits:
commit 7e53e5866458fe592fc109cb1455c21c4b61dee9
Author: Ben Avison <bavison at riscosopen.org>
Date: Sat Jan 19 16:16:53 2013 +0000
ARMv6: Replacement add_8_8, over_8888_8888, over_8888_n_8888 and over_n_8_8888 routines
Improved by adding preloads, combining writes and using the SEL
instruction.
add_8_8
Before After
Mean StdDev Mean StdDev Confidence Change
L1 62.1 0.2 543.4 12.4 100.0% +774.9%
L2 38.7 0.4 116.8 1.7 100.0% +201.8%
M 40.0 0.1 110.1 0.5 100.0% +175.3%
HT 30.9 0.2 43.4 0.5 100.0% +40.4%
VT 30.6 0.3 39.2 0.5 100.0% +28.0%
R 21.3 0.2 35.4 0.4 100.0% +66.6%
RT 8.6 0.2 10.2 0.3 100.0% +19.4%
over_8888_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 32.3 0.1 38.0 0.2 100.0% +17.7%
L2 15.9 0.4 30.6 0.5 100.0% +92.8%
M 13.3 0.0 25.6 0.0 100.0% +92.9%
HT 10.5 0.1 15.5 0.1 100.0% +47.1%
VT 10.4 0.1 14.6 0.1 100.0% +40.8%
R 10.3 0.1 15.8 0.1 100.0% +53.3%
RT 6.0 0.1 7.6 0.1 100.0% +25.9%
over_8888_n_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 17.6 0.1 21.0 0.1 100.0% +19.2%
L2 11.2 0.2 19.2 0.1 100.0% +71.2%
M 10.2 0.0 19.6 0.0 100.0% +92.6%
HT 8.4 0.0 11.9 0.1 100.0% +41.7%
VT 8.3 0.0 11.3 0.1 100.0% +36.4%
R 8.3 0.0 11.8 0.1 100.0% +43.1%
RT 5.1 0.1 6.2 0.1 100.0% +21.3%
over_n_8_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 17.5 0.1 22.8 0.8 100.0% +30.1%
L2 14.2 0.3 21.7 0.2 100.0% +52.6%
M 12.0 0.0 22.3 0.0 100.0% +84.8%
HT 10.5 0.1 14.1 0.1 100.0% +34.5%
VT 10.0 0.1 13.5 0.1 100.0% +35.3%
R 9.4 0.0 12.9 0.2 100.0% +37.7%
RT 5.5 0.1 6.5 0.2 100.0% +19.2%
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index b438001..7110995 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -49,289 +49,6 @@ fname:
.endm
/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
- */
-
-pixman_asm_function pixman_composite_add_8_8_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- mov r10, r1
- sub sp, sp, #4
- subs r10, r10, #1
- mov r11, r0
- mov r8, r2
- str r3, [sp]
- ldr r7, [sp, #36]
- bcc 0f
-6: cmp r11, #0
- beq 1f
- orr r3, r8, r7
- tst r3, #3
- beq 2f
- mov r1, r8
- mov r0, r7
- mov r12, r11
- b 3f
-5: tst r3, #3
- beq 4f
-3: ldrb r2, [r0], #1
- subs r12, r12, #1
- ldrb r3, [r1]
- uqadd8 r3, r2, r3
- strb r3, [r1], #1
- orr r3, r1, r0
- bne 5b
-1: ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
-10: subs r10, r10, #1
- bcs 6b
-0: add sp, sp, #4
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-2: mov r12, r11
- mov r1, r8
- mov r0, r7
-4: cmp r12, #3
- subgt r6, r12, #4
- movgt r9, r12
- lsrgt r5, r6, #2
- addgt r3, r5, #1
- movgt r12, #0
- lslgt r4, r3, #2
- ble 7f
-8: ldr r3, [r0, r12]
- ldr r2, [r1, r12]
- uqadd8 r3, r3, r2
- str r3, [r1, r12]
- add r12, r12, #4
- cmp r12, r4
- bne 8b
- sub r3, r9, #4
- bic r3, r3, #3
- add r3, r3, #4
- subs r12, r6, r5, lsl #2
- add r1, r1, r3
- add r0, r0, r3
- beq 1b
-7: mov r4, #0
-9: ldrb r3, [r1, r4]
- ldrb r2, [r0, r4]
- uqadd8 r3, r2, r3
- strb r3, [r1, r4]
- add r4, r4, #1
- cmp r4, r12
- bne 9b
- ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
- b 10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #20
- cmp r1, #0
- mov r12, r2
- str r1, [sp, #12]
- str r0, [sp, #16]
- ldr r2, [sp, #52]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp]
- ldr r3, [sp, #56]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-6: ldr r11, [sp, #8]
-1: ldr r9, [sp]
- mov r0, r12
- add r12, r12, r9
- mov r1, r2
- str r12, [sp, #4]
- add r2, r2, r11
- ldr r12, [sp, #16]
- ldr r3, =0x00800080
- ldr r9, =0xff00ff00
- mov r11, #255
- cmp r12, #0
- beq 4f
-5: ldr r5, [r1], #4
- ldr r4, [r0]
- sub r8, r11, r5, lsr #24
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mla r6, r6, r8, r3
- mla r7, r7, r8, r3
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- and r7, r7, r9
- uxtab16 r6, r7, r6, ror #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 5b
-4: ldr r3, [sp, #12]
- add r10, r10, #1
- cmp r10, r3
- ldr r12, [sp, #4]
- bne 6b
-0: add sp, sp, #20
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- str r1, [sp, #12]
- ldrb r1, [sp, #71]
- mov r12, r2
- str r0, [sp, #16]
- ldr r2, [sp, #60]
- str r1, [sp, #24]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #20]
- ldr r3, [sp, #64]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-5: ldr r11, [sp, #8]
-1: ldr r4, [sp, #20]
- mov r0, r12
- mov r1, r2
- add r12, r12, r4
- add r2, r2, r11
- str r12, [sp]
- str r2, [sp, #4]
- ldr r12, [sp, #16]
- ldr r2, =0x00800080
- ldr r3, [sp, #24]
- mov r11, #255
- cmp r12, #0
- beq 3f
-4: ldr r5, [r1], #4
- ldr r4, [r0]
- uxtb16 r6, r5
- uxtb16 r7, r5, ror #8
- mla r6, r6, r3, r2
- mla r7, r7, r3, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- sub r8, r11, r5, lsr #24
- mla r6, r6, r8, r2
- mla r7, r7, r8, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r1, [sp, #12]
- add r10, r10, #1
- cmp r10, r1
- ldr r12, [sp]
- ldr r2, [sp, #4]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- ldr r9, [sp, #60]
- str r1, [sp, #12]
- bic r1, r9, #-16777216
- str r1, [sp, #20]
- mov r12, r2
- lsr r1, r9, #8
- ldr r2, [sp, #20]
- bic r1, r1, #-16777216
- bic r2, r2, #65280
- bic r1, r1, #65280
- str r2, [sp, #20]
- str r0, [sp, #16]
- str r1, [sp, #4]
- ldr r2, [sp, #68]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #24]
- mov r0, #0
- b 1f
-5: ldr r3, [sp, #24]
-1: ldr r4, [sp, #72]
- mov r10, r12
- mov r1, r2
- add r12, r12, r3
- add r2, r2, r4
- str r12, [sp, #8]
- str r2, [sp]
- ldr r12, [sp, #16]
- ldr r11, =0x00800080
- ldr r2, [sp, #4]
- ldr r3, [sp, #20]
- cmp r12, #0
- beq 3f
-4: ldrb r5, [r1], #1
- ldr r4, [r10]
- mla r6, r3, r5, r11
- mla r7, r2, r5, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mvn r8, r5
- lsr r8, r8, #24
- mla r6, r6, r8, r11
- mla r7, r7, r8, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r10], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r4, [sp, #12]
- add r0, r0, #1
- cmp r0, r4
- ldr r12, [sp, #8]
- ldr r2, [sp]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-/*
* Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
* be split into a few variants, tuned for each microarchitecture.
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a3e2d04..c209688 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -303,3 +303,311 @@ generate_composite_function \
/******************************************************************************/
+.macro add_8_8_8pixels cond, dst1, dst2
+ uqadd8&cond WK&dst1, WK&dst1, MASK
+ uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
+.endm
+
+.macro add_8_8_4pixels cond, dst
+ uqadd8&cond WK&dst, WK&dst, MASK
+.endm
+
+.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req MASK
+ WK5 .req STRIDE_M
+ .if numbytes == 16
+ pixld cond, 8, 4, SRC, unaligned_src
+ pixld cond, 16, firstreg, DST, 0
+ add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ pixld cond, 8, 4, SRC, unaligned_src
+ .else
+ pixld cond, numbytes, 4, SRC, unaligned_src
+ pixld cond, numbytes, firstreg, DST, 0
+ .endif
+ .unreq WK4
+ .unreq WK5
+.endm
+
+.macro add_8_8_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+ add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .else
+ add_8_8_4pixels cond, firstreg
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 2, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ add_8_8_process_head, \
+ add_8_8_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_init
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x00800080
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+ line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req STRIDE_M
+ WK7 .req ORIG_W
+ pixld , numbytes, %(4+firstreg), SRC, unaligned_src
+ pixld , numbytes, firstreg, DST, 0
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
+ /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+ teq WK®0, #0
+ .if numbytes > 4
+ teqeq WK®1, #0
+ .if numbytes > 8
+ teqeq WK®2, #0
+ teqeq WK®3, #0
+ .endif
+ .endif
+.endm
+
+.macro over_8888_8888_prepare next
+ mov WK&next, WK&next, lsr #24
+.endm
+
+.macro over_8888_8888_1pixel src, dst, offset, next
+ /* src = destination component multiplier */
+ rsb WK&src, WK&src, #255
+ /* Split even/odd bytes of dst into SCRATCH/dst */
+ uxtb16 SCRATCH, WK&dst
+ uxtb16 WK&dst, WK&dst, ror #8
+ /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+ mla SCRATCH, SCRATCH, WK&src, MASK
+ mla WK&dst, WK&dst, WK&src, MASK
+ /* Where we would have had a stall between the result of the first MLA and the shifter input,
+ * reload the complete source pixel */
+ ldr WK&src, [SRC, #offset]
+ /* Multiply by 257/256 to approximate 256/255 */
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ /* In this stall, start processing the next pixel */
+ .if offset < -4
+ mov WK&next, WK&next, lsr #24
+ .endif
+ uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+ /* Recombine even/odd bytes of multiplied destination */
+ mov SCRATCH, SCRATCH, ror #8
+ sel WK&dst, SCRATCH, WK&dst
+ /* Saturated add of source to multiplied destination */
+ uqadd8 WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_8888_process_tail cond, numbytes, firstreg
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req STRIDE_M
+ WK7 .req ORIG_W
+ over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+ beq 10f
+ over_8888_8888_prepare %(4+firstreg)
+ .set PROCESS_REG, firstreg
+ .set PROCESS_OFF, -numbytes
+ .rept numbytes / 4
+ over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+ pixst , numbytes, firstreg, DST
+10:
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+ 2, /* prefetch distance */ \
+ over_8888_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ over_8888_8888_process_head, \
+ over_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* Multiply each byte of a word by a byte.
+ * Useful when there aren't any obvious ways to fill the stalls with other instructions.
+ * word Register containing 4 bytes
+ * byte Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp Scratch register
+ * half Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
+ */
+.macro mul_8888_8 word, byte, tmp, half
+ /* Split even/odd bytes of word apart */
+ uxtb16 tmp, word
+ uxtb16 word, word, ror #8
+ /* Multiply bytes together with rounding, then by 257/256 */
+ mla tmp, tmp, byte, half
+ mla word, word, byte, half /* 1 stall follows */
+ uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
+ uxtab16 word, word, word, ror #8
+ /* Recombine bytes */
+ mov tmp, tmp, ror #8
+ sel word, tmp, word
+.endm
+
+/******************************************************************************/
+
+.macro over_8888_n_8888_init
+ /* Mask is constant */
+ ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
+ /* Hold loop invariant in STRIDE_M */
+ ldr STRIDE_M, =0x00800080
+ /* We only want the alpha bits of the constant mask */
+ mov MASK, MASK, lsr #24
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, STRIDE_M, STRIDE_M
+ line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req Y
+ WK5 .req STRIDE_D
+ WK6 .req STRIDE_S
+ WK7 .req ORIG_W
+ pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+ pixld , numbytes, firstreg, DST, 0
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+.macro over_8888_n_8888_1pixel src, dst
+ mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
+ sub WK7, WK6, WK&src, lsr #24
+ mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
+ uqadd8 WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
+ WK4 .req Y
+ WK5 .req STRIDE_D
+ WK6 .req STRIDE_S
+ WK7 .req ORIG_W
+ over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+ beq 10f
+ mov WK6, #255
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+ .if numbytes == 16 && PROCESS_REG == 2
+ /* We're using WK6 and WK7 as temporaries, so half way through
+ * 4 pixels, reload the second two source pixels but this time
+ * into WK4 and WK5 */
+ ldmdb SRC, {WK4, WK5}
+ .endif
+ over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ pixst , numbytes, firstreg, DST
+10:
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+ 2, /* prefetch distance */ \
+ over_8888_n_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ over_8888_n_8888_process_head, \
+ over_8888_n_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8_8888_init
+ /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
+ ldr SCRATCH, =0x00800080
+ uxtb16 STRIDE_S, SRC
+ uxtb16 SRC, SRC, ror #8
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+ line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8_8888_newline
+ ldr STRIDE_D, =0x00800080
+ b 1f
+ .ltorg
+1:
+.endm
+
+.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_M
+ pixld , numbytes/4, 4, MASK, unaligned_mask
+ pixld , numbytes, firstreg, DST, 0
+ .unreq WK4
+.endm
+
+.macro over_n_8_8888_1pixel src, dst
+ uxtb Y, WK4, ror #src*8
+ /* Trailing part of multiplication of source */
+ mla SCRATCH, STRIDE_S, Y, STRIDE_D
+ mla Y, SRC, Y, STRIDE_D
+ mov ORIG_W, #255
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ uxtab16 Y, Y, Y, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+ sub ORIG_W, ORIG_W, Y, lsr #24
+ sel Y, SCRATCH, Y
+ /* Then multiply the destination */
+ mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
+ uqadd8 WK&dst, WK&dst, Y
+.endm
+
+.macro over_n_8_8888_process_tail cond, numbytes, firstreg
+ WK4 .req STRIDE_M
+ teq WK4, #0
+ beq 10f
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+ over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ pixst , numbytes, firstreg, DST
+10:
+ .unreq WK4
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+ 2, /* prefetch distance */ \
+ over_n_8_8888_init, \
+ over_n_8_8888_newline, \
+ nop_macro, /* cleanup */ \
+ over_n_8_8888_process_head, \
+ over_n_8_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 09a5036..af062e1 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -31,351 +31,6 @@
#include "pixman-arm-common.h"
#include "pixman-inlines.h"
-#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
-
-void
-pixman_composite_add_8_8_asm_armv6 (int32_t width,
- int32_t height,
- uint8_t *dst_line,
- int32_t dst_stride,
- uint8_t *src_line,
- int32_t src_stride)
-{
- uint8_t *dst, *src;
- int32_t w;
- uint8_t s, d;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
- /* ensure both src and dst are properly aligned before doing 32 bit reads
- * we'll stay in this loop if src and dst have differing alignments
- */
- while (w && (((uintptr_t)dst & 3) || ((uintptr_t)src & 3)))
- {
- s = *src;
- d = *dst;
- asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
- *dst = d;
-
- dst++;
- src++;
- w--;
- }
-
- while (w >= 4)
- {
- asm ("uqadd8 %0, %1, %2"
- : "=r" (*(uint32_t*)dst)
- : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
- dst += 4;
- src += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *src;
- d = *dst;
- asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
- *dst = d;
-
- dst++;
- src++;
- w--;
- }
- }
-
-}
-
-void
-pixman_composite_over_8888_8888_asm_armv6 (int32_t width,
- int32_t height,
- uint32_t *dst_line,
- int32_t dst_stride,
- uint32_t *src_line,
- int32_t src_stride)
-{
- uint32_t *dst;
- uint32_t *src;
- int32_t w;
- uint32_t component_half = 0x800080;
- uint32_t upper_component_mask = 0xff00ff00;
- uint32_t alpha_mask = 0xff;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
-/* #define inner_branch */
- asm volatile (
- "cmp %[w], #0\n\t"
- "beq 2f\n\t"
- "1:\n\t"
- /* load src */
- "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
- /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
- * The 0x0 case also allows us to avoid doing an unecessary data
- * write which is more valuable so we only check for that
- */
- "cmp r5, #0\n\t"
- "beq 3f\n\t"
-
- /* = 255 - alpha */
- "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
- "ldr r4, [%[dest]] \n\t"
-
-#else
- "ldr r4, [%[dest]] \n\t"
-
- /* = 255 - alpha */
- "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-#endif
- "uxtb16 r6, r4\n\t"
- "uxtb16 r7, r4, ror #8\n\t"
-
- /* multiply by 257 and divide by 65536 */
- "mla r6, r6, r8, %[component_half]\n\t"
- "mla r7, r7, r8, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- /* recombine the 0xff00ff00 bytes of r6 and r7 */
- "and r7, r7, %[upper_component_mask]\n\t"
- "uxtab16 r6, r7, r6, ror #8\n\t"
-
- "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
- "3:\n\t"
-
-#endif
- "str r5, [%[dest]], #4\n\t"
- /* increment counter and jmp to top */
- "subs %[w], %[w], #1\n\t"
- "bne 1b\n\t"
- "2:\n\t"
- : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
- : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
- [alpha_mask] "r" (alpha_mask)
- : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
- );
- }
-}
-
-void
-pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width,
- int32_t height,
- uint32_t *dst_line,
- int32_t dst_stride,
- uint32_t *src_line,
- int32_t src_stride,
- uint32_t mask)
-{
- uint32_t *dst;
- uint32_t *src;
- int32_t w;
- uint32_t component_half = 0x800080;
- uint32_t alpha_mask = 0xff;
-
- mask = (mask) >> 24;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- src = src_line;
- src_line += src_stride;
- w = width;
-
-/* #define inner_branch */
- asm volatile (
- "cmp %[w], #0\n\t"
- "beq 2f\n\t"
- "1:\n\t"
- /* load src */
- "ldr r5, [%[src]], #4\n\t"
-#ifdef inner_branch
- /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
- * The 0x0 case also allows us to avoid doing an unecessary data
- * write which is more valuable so we only check for that
- */
- "cmp r5, #0\n\t"
- "beq 3f\n\t"
-
-#endif
- "ldr r4, [%[dest]] \n\t"
-
- "uxtb16 r6, r5\n\t"
- "uxtb16 r7, r5, ror #8\n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
- "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r5, r6, r7, lsl #8\n\t"
-
- "uxtb16 r6, r4\n\t"
- "uxtb16 r7, r4, ror #8\n\t"
-
- /* 255 - alpha */
- "sub r8, %[alpha_mask], r5, lsr #24\n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, r6, r8, %[component_half]\n\t"
- "mla r7, r7, r8, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r6, r6, r7, lsl #8\n\t"
-
- "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
- "3:\n\t"
-
-#endif
- "str r5, [%[dest]], #4\n\t"
- /* increment counter and jmp to top */
- "subs %[w], %[w], #1\n\t"
- "bne 1b\n\t"
- "2:\n\t"
- : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
- : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
- [alpha_mask] "r" (alpha_mask)
- : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
- );
- }
-}
-
-void
-pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
- int32_t height,
- uint32_t *dst_line,
- int32_t dst_stride,
- uint32_t src,
- int32_t unused,
- uint8_t *mask_line,
- int32_t mask_stride)
-{
- uint32_t srca;
- uint32_t *dst;
- uint8_t *mask;
- int32_t w;
-
- srca = src >> 24;
-
- uint32_t component_mask = 0xff00ff;
- uint32_t component_half = 0x800080;
-
- uint32_t src_hi = (src >> 8) & component_mask;
- uint32_t src_lo = src & component_mask;
-
- while (height--)
- {
- dst = dst_line;
- dst_line += dst_stride;
- mask = mask_line;
- mask_line += mask_stride;
- w = width;
-
-/* #define inner_branch */
- asm volatile (
- "cmp %[w], #0\n\t"
- "beq 2f\n\t"
- "1:\n\t"
- /* load mask */
- "ldrb r5, [%[mask]], #1\n\t"
-#ifdef inner_branch
- /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
- * The 0x0 case also allows us to avoid doing an unecessary data
- * write which is more valuable so we only check for that
- */
- "cmp r5, #0\n\t"
- "beq 3f\n\t"
-
-#endif
- "ldr r4, [%[dest]] \n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, %[src_lo], r5, %[component_half]\n\t"
- "mla r7, %[src_hi], r5, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r5, r6, r7, lsl #8\n\t"
-
- "uxtb16 r6, r4\n\t"
- "uxtb16 r7, r4, ror #8\n\t"
-
- /* we could simplify this to use 'sub' if we were
- * willing to give up a register for alpha_mask
- */
- "mvn r8, r5\n\t"
- "mov r8, r8, lsr #24\n\t"
-
- /* multiply by alpha (r8) then by 257 and divide by 65536 */
- "mla r6, r6, r8, %[component_half]\n\t"
- "mla r7, r7, r8, %[component_half]\n\t"
-
- "uxtab16 r6, r6, r6, ror #8\n\t"
- "uxtab16 r7, r7, r7, ror #8\n\t"
-
- "uxtb16 r6, r6, ror #8\n\t"
- "uxtb16 r7, r7, ror #8\n\t"
-
- /* recombine */
- "orr r6, r6, r7, lsl #8\n\t"
-
- "uqadd8 r5, r6, r5\n\t"
-
-#ifdef inner_branch
- "3:\n\t"
-
-#endif
- "str r5, [%[dest]], #4\n\t"
- /* increment counter and jmp to top */
- "subs %[w], %[w], #1\n\t"
- "bne 1b\n\t"
- "2:\n\t"
- : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
- : [component_half] "r" (component_half),
- [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
- : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
- }
-}
-
-#endif
-
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
commit f87dfd6f37a29c69320edd92f28aed5334b09366
Author: Ben Avison <bavison at riscosopen.org>
Date: Sat Jan 19 16:16:52 2013 +0000
ARMv6: New conversion routines
There was no previous attempt at accelerating these specifically for
ARMv6.
src_x888_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 96.7 0.5 270.4 2.6 100.0% +179.5%
L2 44.6 2.7 110.6 9.7 100.0% +148.0%
M 26.9 0.1 87.6 0.5 100.0% +226.1%
HT 19.3 0.2 37.5 0.4 100.0% +93.7%
VT 18.6 0.1 33.7 0.4 100.0% +81.6%
R 18.4 0.1 32.2 0.3 100.0% +75.2%
RT 9.2 0.2 12.1 0.3 100.0% +31.4%
src_0565_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 37.0 0.3 66.9 0.2 100.0% +80.8%
L2 30.3 0.2 55.9 0.3 100.0% +84.4%
M 25.9 0.0 62.3 0.2 100.0% +140.3%
HT 15.2 0.1 33.1 0.3 100.0% +116.9%
VT 15.1 0.1 30.7 0.3 100.0% +103.6%
R 14.2 0.1 27.6 0.3 100.0% +94.0%
RT 6.0 0.1 11.2 0.3 100.0% +87.2%
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 49993b5..a3e2d04 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -180,3 +180,126 @@ generate_composite_function \
/******************************************************************************/
+.macro src_x888_8888_pixel, cond, reg
+ orr&cond WK®, WK®, #0xFF000000
+.endm
+
+.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
+ src_x888_8888_pixel cond, %(firstreg+0)
+ .if numbytes >= 8
+ src_x888_8888_pixel cond, %(firstreg+1)
+ .if numbytes == 16
+ src_x888_8888_pixel cond, %(firstreg+2)
+ src_x888_8888_pixel cond, %(firstreg+3)
+ .endif
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 3, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ pixman_composite_src_x888_8888_process_head, \
+ pixman_composite_src_x888_8888_process_tail
+
+/******************************************************************************/
+
+.macro src_0565_8888_init
+ /* Hold loop invariants in MASK and STRIDE_M */
+ ldr MASK, =0x07E007E0
+ mov STRIDE_M, #0xFF000000
+ /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+ ldr SCRATCH, =0x80008000
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro src_0565_8888_2pixels, reg1, reg2
+ and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK®2, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK®1, WK®2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+ bic WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK®1, WK®1, WK®1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr WK®2, WK®2, WK®2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK®1, WK®1, WK®1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+ sel WK®1, WK®1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+ pkhtb WK®2, WK®2, WK®2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+ sel WK®2, WK®2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+ orr WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ orr WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/* This version doesn't need STRIDE_M, but is one instruction longer.
+ It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+ and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK®1, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK®2, WK®1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+ bic WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+ mov WK®2, WK®2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+ mov WK®1, WK®1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ orr WK®2, WK®2, WK®2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+ orr WK®1, WK®1, WK®1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ pkhbt WK®2, WK®2, WK®2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+ pkhbt WK®1, WK®1, WK®1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK®2, SCRATCH, WK®2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+ sel WK®1, SCRATCH, WK®1 @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK®2, WK®2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ orr WK®1, WK®1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+*/
+
+.macro src_0565_8888_1pixel, reg
+ bic SCRATCH, WK®, MASK @ 0000000000000000rrrrr000000bbbbb
+ and WK®, WK®, MASK @ 000000000000000000000gggggg00000
+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ mov WK®, WK®, lsl #5 @ 0000000000000000gggggg0000000000
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ orr WK®, WK®, WK®, lsr #6 @ 000000000000000gggggggggggg00000
+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK®, WK®, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK®, WK®, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+ pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8
+ pixld , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4
+ pixld , 2, firstreg, SRC, unaligned_src
+ .endif
+.endm
+
+.macro src_0565_8888_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .else
+ src_0565_8888_1pixel firstreg
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+ 3, /* prefetch distance */ \
+ src_0565_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_0565_8888_process_head, \
+ src_0565_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index de66e57..09a5036 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -378,10 +378,14 @@ pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565,
uint16_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
+ uint16_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
uint8_t, 1, uint8_t, 1)
@@ -523,6 +527,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888),
+
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565),
PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565),
PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565),
@@ -549,6 +556,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8),
PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
+
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
commit a0f59f3b2884b056428229363576666f158a9bb4
Author: Ben Avison <bavison at riscosopen.org>
Date: Sat Jan 19 16:16:51 2013 +0000
ARMv6: New blit routines
These are usable either as various composite operations, or via the
top-level function pixman_blt() which now does some blitting for the
first time on an ARMv6 platform (previously it just returned FALSE).
src_8888_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 414.5 9.4 445.8 3.6 100.0% +7.6%
L2 93.3 20.7 114.5 12.9 100.0% +22.7%
M 57.0 0.2 89.2 0.5 100.0% +56.4%
HT 28.7 0.3 39.6 0.4 100.0% +37.9%
VT 25.5 0.2 35.3 0.4 100.0% +38.4%
R 20.1 0.1 33.8 0.3 100.0% +67.8%
RT 7.8 0.2 12.7 0.4 100.0% +62.7%
src_0565_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 397.4 6.1 412.5 5.2 100.0% +3.8%
L2 143.2 10.9 141.9 6.5 68.9% -0.9% (insignificant)
M 90.7 0.4 133.5 0.7 100.0% +47.1%
HT 38.6 0.3 53.7 0.7 100.0% +39.0%
VT 33.0 0.3 47.3 0.6 100.0% +43.3%
R 25.7 0.2 42.1 0.5 100.0% +64.1%
RT 8.0 0.2 13.3 0.3 100.0% +65.6%
src_8_8
Before After
Mean StdDev Mean StdDev Confidence Change
L1 716.5 9.8 768.2 20.4 100.0% +7.2%
L2 246.2 12.7 260.5 8.8 100.0% +5.8%
M 146.8 0.7 227.9 0.7 100.0% +55.2%
HT 44.9 0.6 62.1 1.0 100.0% +38.2%
VT 35.6 0.4 53.4 0.7 100.0% +50.0%
R 29.7 0.3 48.2 0.6 100.0% +62.2%
RT 8.6 0.2 12.9 0.4 100.0% +49.3%
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 64588a1..49993b5 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -51,6 +51,67 @@
* preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
*/
+.macro blit_init
+ line_saved_regs STRIDE_D, STRIDE_S
+.endm
+
+.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req MASK
+ WK7 .req STRIDE_M
+110: pixld , 16, 0, SRC, unaligned_src
+ pixld , 16, 4, SRC, unaligned_src
+ pld [SRC, SCRATCH]
+ pixst , 16, 0, DST
+ pixst , 16, 4, DST
+ subs X, X, #32*8/src_bpp
+ bhs 110b
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 4, /* prefetch distance */ \
+ blit_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ blit_process_head, \
+ nop_macro, /* process tail */ \
+ blit_inner_loop
+
+generate_composite_function \
+ pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 4, /* prefetch distance */ \
+ blit_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ blit_process_head, \
+ nop_macro, /* process tail */ \
+ blit_inner_loop
+
+generate_composite_function \
+ pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 3, /* prefetch distance */ \
+ blit_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ blit_process_head, \
+ nop_macro, /* process tail */ \
+ blit_inner_loop
+
+/******************************************************************************/
+
.macro src_n_8888_init
ldr SRC, [sp, #ARGS_STACK_OFFSET]
mov STRIDE_S, SRC
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index f313df3..de66e57 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -376,6 +376,13 @@ pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
#endif
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
+ uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565,
+ uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
+ uint8_t, 1, uint8_t, 1)
+
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
@@ -458,8 +465,90 @@ arm_simd_fill (pixman_implementation_t *imp,
}
}
+static pixman_bool_t
+arm_simd_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride, /* in 32-bit words */
+ int dst_stride, /* in 32-bit words */
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dest_x,
+ int dest_y,
+ int width,
+ int height)
+{
+ if (src_bpp != dst_bpp)
+ return FALSE;
+
+ switch (src_bpp)
+ {
+ case 8:
+ pixman_composite_src_8_8_asm_armv6 (
+ width, height,
+ (uint8_t *)(((char *) dst_bits) +
+ dest_y * dst_stride * 4 + dest_x * 1), dst_stride * 4,
+ (uint8_t *)(((char *) src_bits) +
+ src_y * src_stride * 4 + src_x * 1), src_stride * 4);
+ return TRUE;
+ case 16:
+ pixman_composite_src_0565_0565_asm_armv6 (
+ width, height,
+ (uint16_t *)(((char *) dst_bits) +
+ dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2,
+ (uint16_t *)(((char *) src_bits) +
+ src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+ return TRUE;
+ case 32:
+ pixman_composite_src_8888_8888_asm_armv6 (
+ width, height,
+ (uint32_t *)(((char *) dst_bits) +
+ dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
+ (uint32_t *)(((char *) src_bits) +
+ src_y * src_stride * 4 + src_x * 4), src_stride);
+ return TRUE;
+ default:
+ return FALSE;
+ }
+}
+
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, a1b5g5r5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, a4r4g4b4, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, a4b4g4r4, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565),
+
+ PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, r3g3b2, null, r3g3b2, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, b2g3r3, null, b2g3r3, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, a2r2g2b2, null, a2r2g2b2, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, a2b2g2r2, null, a2b2g2r2, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, c8, null, c8, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, g8, null, g8, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, x4a4, null, x4a4, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8),
+
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
@@ -494,6 +583,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+ imp->blt = arm_simd_blt;
imp->fill = arm_simd_fill;
return imp;
commit 3cff56c5b091d2e584503e7887414e224876de37
Author: Ben Avison <bavison at riscosopen.org>
Date: Sat Jan 19 16:16:50 2013 +0000
ARMv6: New fill routines
Note that this also effectively accelerates src_n_8888, src_n_0565 and
src_n_8 composite types, because of the fast paths in
pixman-fast-path.c implemented by fast_composite_solid_fill(), which
end up dispatching these platform-specific fill routines.
src_n_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 157.3 1.1 574.2 8.7 100.0% +265.0%
L2 94.2 0.5 364.8 4.2 100.0% +287.3%
M 92.7 0.4 358.7 1.1 100.0% +287.1%
HT 68.5 0.9 133.6 4.0 100.0% +95.2%
VT 61.3 0.8 111.8 2.6 100.0% +82.4%
R 61.1 0.9 108.7 2.8 100.0% +78.1%
RT 24.6 1.0 28.6 1.6 100.0% +16.0%
src_n_0565
Before After
Mean StdDev Mean StdDev Confidence Change
L1 157.4 1.0 983.1 38.5 100.0% +524.6%
L2 93.6 0.5 696.0 14.3 100.0% +643.4%
M 92.7 0.4 680.5 1.0 100.0% +634.0%
HT 68.3 0.9 160.3 6.6 100.0% +134.6%
VT 61.1 0.8 130.1 3.4 100.0% +112.9%
R 61.0 0.8 125.4 4.1 100.0% +105.7%
RT 24.9 1.3 29.5 1.5 100.0% +18.2%
src_n_8
Before After
Mean StdDev Mean StdDev Confidence Change
L1 154.7 1.0 1324.4 48.5 100.0% +756.3%
L2 92.4 0.4 1178.4 10.9 100.0% +1175.6%
M 92.9 0.4 1275.7 2.1 100.0% +1273.5%
HT 68.2 1.0 169.8 5.5 100.0% +149.0%
VT 61.2 1.0 138.5 3.6 100.0% +126.3%
R 61.3 0.9 130.1 3.8 100.0% +112.4%
RT 25.5 1.3 29.2 1.9 100.0% +14.6%
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index e69de29..64588a1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -0,0 +1,121 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison at riscosopen.org)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+#include "pixman-arm-simd-asm.h"
+
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ * cond ARM condition code for code block
+ * numbytes Number of output bytes that should be generated this time
+ * firstreg First WK register in which to place output
+ * unaligned_src Whether to use non-wordaligned loads of source image
+ * unaligned_mask Whether to use non-wordaligned loads of mask image
+ * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
+ */
+
+.macro src_n_8888_init
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro src_n_0565_init
+ ldrh SRC, [sp, #ARGS_STACK_OFFSET]
+ orr SRC, SRC, lsl #16
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro src_n_8_init
+ ldrb SRC, [sp, #ARGS_STACK_OFFSET]
+ orr SRC, SRC, lsl #8
+ orr SRC, SRC, lsl #16
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro fill_process_tail cond, numbytes, firstreg
+ WK4 .req SRC
+ WK5 .req STRIDE_S
+ WK6 .req MASK
+ WK7 .req STRIDE_M
+ pixst cond, numbytes, 4, DST
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_8888_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+generate_composite_function \
+ pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_0565_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+generate_composite_function \
+ pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ src_n_8_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 94f9a0c..f313df3 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -392,6 +392,72 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
uint32_t, uint32_t)
+void
+pixman_composite_src_n_8888_asm_armv6 (int32_t w,
+ int32_t h,
+ uint32_t *dst,
+ int32_t dst_stride,
+ uint32_t src);
+
+void
+pixman_composite_src_n_0565_asm_armv6 (int32_t w,
+ int32_t h,
+ uint16_t *dst,
+ int32_t dst_stride,
+ uint16_t src);
+
+void
+pixman_composite_src_n_8_asm_armv6 (int32_t w,
+ int32_t h,
+ uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t src);
+
+static pixman_bool_t
+arm_simd_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride, /* in 32-bit words */
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t _xor)
+{
+ /* stride is always multiple of 32bit units in pixman */
+ uint32_t byte_stride = stride * sizeof(uint32_t);
+
+ switch (bpp)
+ {
+ case 8:
+ pixman_composite_src_n_8_asm_armv6 (
+ width,
+ height,
+ (uint8_t *)(((char *) bits) + y * byte_stride + x),
+ byte_stride,
+ _xor & 0xff);
+ return TRUE;
+ case 16:
+ pixman_composite_src_n_0565_asm_armv6 (
+ width,
+ height,
+ (uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+ byte_stride / 2,
+ _xor & 0xffff);
+ return TRUE;
+ case 32:
+ pixman_composite_src_n_8888_asm_armv6 (
+ width,
+ height,
+ (uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+ byte_stride / 4,
+ _xor);
+ return TRUE;
+ default:
+ return FALSE;
+ }
+}
+
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
@@ -428,5 +494,7 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+ imp->fill = arm_simd_fill;
+
return imp;
}
commit 2e173326aaf232d84ed71faf3517bd7989680e27
Author: Ben Avison <bavison at riscosopen.org>
Date: Mon Jan 28 17:03:50 2013 +0000
ARMv6: Lay the groundwork for later patches in the series
Move the entire contents of pixman-arm-simd-asm.S to a new file;
ultimately this will only retain the scaled operations, so it is
named pixman-arm-simd-asm-scaled.S. Added new header file
pixman-arm-simd-asm.h, containing the macros which are the basis of
all the new ARMv6 implementations, although at this point in the
series, nothing uses them and the library should be binary-identical.
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index beebdd0..b9ea754 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -58,7 +58,9 @@ noinst_LTLIBRARIES += libpixman-arm-simd.la
libpixman_arm_simd_la_SOURCES = \
pixman-arm-simd.c \
pixman-arm-common.h \
- pixman-arm-simd-asm.S
+ pixman-arm-simd-asm.S \
+ pixman-arm-simd-asm-scaled.S \
+ pixman-arm-simd-asm.h
libpixman_1_la_LIBADD += libpixman-arm-simd.la
ASM_CFLAGS_arm_simd=
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
new file mode 100644
index 0000000..b438001
--- /dev/null
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -0,0 +1,448 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff at infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+ .arm
+ .altmacro
+ .p2align 2
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * The code below was generated by gcc 4.3.4 from the commented out
+ * functions in 'pixman-arm-simd.c' file with the following optimization
+ * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
+ *
+ * TODO: replace gcc generated code with hand tuned versions because
+ * the code quality is not very good, introduce symbolic register
+ * aliases for better readability and maintainability.
+ */
+
+pixman_asm_function pixman_composite_add_8_8_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ mov r10, r1
+ sub sp, sp, #4
+ subs r10, r10, #1
+ mov r11, r0
+ mov r8, r2
+ str r3, [sp]
+ ldr r7, [sp, #36]
+ bcc 0f
+6: cmp r11, #0
+ beq 1f
+ orr r3, r8, r7
+ tst r3, #3
+ beq 2f
+ mov r1, r8
+ mov r0, r7
+ mov r12, r11
+ b 3f
+5: tst r3, #3
+ beq 4f
+3: ldrb r2, [r0], #1
+ subs r12, r12, #1
+ ldrb r3, [r1]
+ uqadd8 r3, r2, r3
+ strb r3, [r1], #1
+ orr r3, r1, r0
+ bne 5b
+1: ldr r3, [sp]
+ add r8, r8, r3
+ ldr r3, [sp, #40]
+ add r7, r7, r3
+10: subs r10, r10, #1
+ bcs 6b
+0: add sp, sp, #4
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+2: mov r12, r11
+ mov r1, r8
+ mov r0, r7
+4: cmp r12, #3
+ subgt r6, r12, #4
+ movgt r9, r12
+ lsrgt r5, r6, #2
+ addgt r3, r5, #1
+ movgt r12, #0
+ lslgt r4, r3, #2
+ ble 7f
+8: ldr r3, [r0, r12]
+ ldr r2, [r1, r12]
+ uqadd8 r3, r3, r2
+ str r3, [r1, r12]
+ add r12, r12, #4
+ cmp r12, r4
+ bne 8b
+ sub r3, r9, #4
+ bic r3, r3, #3
+ add r3, r3, #4
+ subs r12, r6, r5, lsl #2
+ add r1, r1, r3
+ add r0, r0, r3
+ beq 1b
+7: mov r4, #0
+9: ldrb r3, [r1, r4]
+ ldrb r2, [r0, r4]
+ uqadd8 r3, r2, r3
+ strb r3, [r1, r4]
+ add r4, r4, #1
+ cmp r4, r12
+ bne 9b
+ ldr r3, [sp]
+ add r8, r8, r3
+ ldr r3, [sp, #40]
+ add r7, r7, r3
+ b 10b
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ sub sp, sp, #20
+ cmp r1, #0
+ mov r12, r2
+ str r1, [sp, #12]
+ str r0, [sp, #16]
+ ldr r2, [sp, #52]
+ beq 0f
+ lsl r3, r3, #2
+ str r3, [sp]
+ ldr r3, [sp, #56]
+ mov r10, #0
+ lsl r3, r3, #2
+ str r3, [sp, #8]
+ mov r11, r3
+ b 1f
+6: ldr r11, [sp, #8]
+1: ldr r9, [sp]
+ mov r0, r12
+ add r12, r12, r9
+ mov r1, r2
+ str r12, [sp, #4]
+ add r2, r2, r11
+ ldr r12, [sp, #16]
+ ldr r3, =0x00800080
+ ldr r9, =0xff00ff00
+ mov r11, #255
+ cmp r12, #0
+ beq 4f
+5: ldr r5, [r1], #4
+ ldr r4, [r0]
+ sub r8, r11, r5, lsr #24
+ uxtb16 r6, r4
+ uxtb16 r7, r4, ror #8
+ mla r6, r6, r8, r3
+ mla r7, r7, r8, r3
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ and r7, r7, r9
+ uxtab16 r6, r7, r6, ror #8
+ uqadd8 r5, r6, r5
+ str r5, [r0], #4
+ subs r12, r12, #1
+ bne 5b
+4: ldr r3, [sp, #12]
+ add r10, r10, #1
+ cmp r10, r3
+ ldr r12, [sp, #4]
+ bne 6b
+0: add sp, sp, #20
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ sub sp, sp, #28
+ cmp r1, #0
+ str r1, [sp, #12]
+ ldrb r1, [sp, #71]
+ mov r12, r2
+ str r0, [sp, #16]
+ ldr r2, [sp, #60]
+ str r1, [sp, #24]
+ beq 0f
+ lsl r3, r3, #2
+ str r3, [sp, #20]
+ ldr r3, [sp, #64]
+ mov r10, #0
+ lsl r3, r3, #2
+ str r3, [sp, #8]
+ mov r11, r3
+ b 1f
+5: ldr r11, [sp, #8]
+1: ldr r4, [sp, #20]
+ mov r0, r12
+ mov r1, r2
+ add r12, r12, r4
+ add r2, r2, r11
+ str r12, [sp]
+ str r2, [sp, #4]
+ ldr r12, [sp, #16]
+ ldr r2, =0x00800080
+ ldr r3, [sp, #24]
+ mov r11, #255
+ cmp r12, #0
+ beq 3f
+4: ldr r5, [r1], #4
+ ldr r4, [r0]
+ uxtb16 r6, r5
+ uxtb16 r7, r5, ror #8
+ mla r6, r6, r3, r2
+ mla r7, r7, r3, r2
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r5, r6, r7, lsl #8
+ uxtb16 r6, r4
+ uxtb16 r7, r4, ror #8
+ sub r8, r11, r5, lsr #24
+ mla r6, r6, r8, r2
+ mla r7, r7, r8, r2
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r6, r6, r7, lsl #8
+ uqadd8 r5, r6, r5
+ str r5, [r0], #4
+ subs r12, r12, #1
+ bne 4b
+3: ldr r1, [sp, #12]
+ add r10, r10, #1
+ cmp r10, r1
+ ldr r12, [sp]
+ ldr r2, [sp, #4]
+ bne 5b
+0: add sp, sp, #28
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ sub sp, sp, #28
+ cmp r1, #0
+ ldr r9, [sp, #60]
+ str r1, [sp, #12]
+ bic r1, r9, #-16777216
+ str r1, [sp, #20]
+ mov r12, r2
+ lsr r1, r9, #8
+ ldr r2, [sp, #20]
+ bic r1, r1, #-16777216
+ bic r2, r2, #65280
+ bic r1, r1, #65280
+ str r2, [sp, #20]
+ str r0, [sp, #16]
+ str r1, [sp, #4]
+ ldr r2, [sp, #68]
+ beq 0f
+ lsl r3, r3, #2
+ str r3, [sp, #24]
+ mov r0, #0
+ b 1f
+5: ldr r3, [sp, #24]
+1: ldr r4, [sp, #72]
+ mov r10, r12
+ mov r1, r2
+ add r12, r12, r3
+ add r2, r2, r4
+ str r12, [sp, #8]
+ str r2, [sp]
+ ldr r12, [sp, #16]
+ ldr r11, =0x00800080
+ ldr r2, [sp, #4]
+ ldr r3, [sp, #20]
+ cmp r12, #0
+ beq 3f
+4: ldrb r5, [r1], #1
+ ldr r4, [r10]
+ mla r6, r3, r5, r11
+ mla r7, r2, r5, r11
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r5, r6, r7, lsl #8
+ uxtb16 r6, r4
+ uxtb16 r7, r4, ror #8
+ mvn r8, r5
+ lsr r8, r8, #24
+ mla r6, r6, r8, r11
+ mla r7, r7, r8, r11
+ uxtab16 r6, r6, r6, ror #8
+ uxtab16 r7, r7, r7, ror #8
+ uxtb16 r6, r6, ror #8
+ uxtb16 r7, r7, ror #8
+ orr r6, r6, r7, lsl #8
+ uqadd8 r5, r6, r5
+ str r5, [r10], #4
+ subs r12, r12, #1
+ bne 4b
+3: ldr r4, [sp, #12]
+ add r0, r0, #1
+ cmp r0, r4
+ ldr r12, [sp, #8]
+ ldr r2, [sp]
+ bne 5b
+0: add sp, sp, #28
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr
+.endfunc
+
+/*
+ * Note: This code is only using armv5te instructions (not even armv6),
+ * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ * be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
+ * t - type suffix for LDR/STR instructions
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ * prefetch_braking_distance - stop prefetching when that many pixels are
+ * remaining before the end of scanline
+ */
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t, \
+ prefetch_distance, \
+ prefetch_braking_distance
+
+pixman_asm_function fname
+ W .req r0
+ DST .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ TMP1 .req r4
+ TMP2 .req r5
+ VXMASK .req r6
+ PF_OFFS .req r7
+ SRC_WIDTH_FIXED .req r8
+
+ ldr UNIT_X, [sp]
+ push {r4, r5, r6, r7, r8, r10}
+ mvn VXMASK, #((1 << bpp_shift) - 1)
+ ldr SRC_WIDTH_FIXED, [sp, #28]
+
+ /* define helper macro */
+ .macro scale_2_pixels
+ ldr&t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+ adds VX, VX, UNIT_X
+ str&t TMP1, [DST], #(1 << bpp_shift)
+9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
+
+ ldr&t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ adds VX, VX, UNIT_X
+ str&t TMP2, [DST], #(1 << bpp_shift)
+9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
+ .endm
+
+ /* now do the scaling */
+ and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ adds VX, VX, UNIT_X
+9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
+ subs W, W, #(8 + prefetch_braking_distance)
+ blt 2f
+ /* calculate prefetch offset */
+ mov PF_OFFS, #prefetch_distance
+ mla PF_OFFS, UNIT_X, PF_OFFS, VX
+1: /* main loop, process 8 pixels per iteration with prefetch */
+ pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
+ add PF_OFFS, UNIT_X, lsl #3
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ subs W, W, #8
+ bge 1b
+2:
+ subs W, W, #(4 - 8 - prefetch_braking_distance)
+ blt 2f
+1: /* process the remaining pixels */
+ scale_2_pixels
+ scale_2_pixels
+ subs W, W, #4
+ bge 1b
+2:
+ tst W, #2
+ beq 2f
+ scale_2_pixels
+2:
+ tst W, #1
+ ldrne&t TMP1, [SRC, TMP1]
+ strne&t TMP1, [DST]
+ /* cleanup helper macro */
+ .purgem scale_2_pixels
+ .unreq DST
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq VXMASK
+ .unreq PF_OFFS
+ .unreq SRC_WIDTH_FIXED
+ /* return */
+ pop {r4, r5, r6, r7, r8, r10}
+ bx lr
+.endfunc
+.endm
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index b438001..e69de29 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1,448 +0,0 @@
-/*
- * Copyright © 2008 Mozilla Corporation
- * Copyright © 2010 Nokia Corporation
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that
- * copyright notice and this permission notice appear in supporting
- * documentation, and that the name of Mozilla Corporation not be used in
- * advertising or publicity pertaining to distribution of the software without
- * specific, written prior permission. Mozilla Corporation makes no
- * representations about the suitability of this software for any purpose. It
- * is provided "as is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
- * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Jeff Muizelaar (jeff at infidigm.net)
- *
- */
-
-/* Prevent the stack from becoming executable */
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack,"",%progbits
-#endif
-
- .text
- .arch armv6
- .object_arch armv4
- .arm
- .altmacro
- .p2align 2
-
-/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
- .func fname
- .global fname
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
-.endm
-
-/*
- * The code below was generated by gcc 4.3.4 from the commented out
- * functions in 'pixman-arm-simd.c' file with the following optimization
- * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
- *
- * TODO: replace gcc generated code with hand tuned versions because
- * the code quality is not very good, introduce symbolic register
- * aliases for better readability and maintainability.
- */
-
-pixman_asm_function pixman_composite_add_8_8_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- mov r10, r1
- sub sp, sp, #4
- subs r10, r10, #1
- mov r11, r0
- mov r8, r2
- str r3, [sp]
- ldr r7, [sp, #36]
- bcc 0f
-6: cmp r11, #0
- beq 1f
- orr r3, r8, r7
- tst r3, #3
- beq 2f
- mov r1, r8
- mov r0, r7
- mov r12, r11
- b 3f
-5: tst r3, #3
- beq 4f
-3: ldrb r2, [r0], #1
- subs r12, r12, #1
- ldrb r3, [r1]
- uqadd8 r3, r2, r3
- strb r3, [r1], #1
- orr r3, r1, r0
- bne 5b
-1: ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
-10: subs r10, r10, #1
- bcs 6b
-0: add sp, sp, #4
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-2: mov r12, r11
- mov r1, r8
- mov r0, r7
-4: cmp r12, #3
- subgt r6, r12, #4
- movgt r9, r12
- lsrgt r5, r6, #2
- addgt r3, r5, #1
- movgt r12, #0
- lslgt r4, r3, #2
- ble 7f
-8: ldr r3, [r0, r12]
- ldr r2, [r1, r12]
- uqadd8 r3, r3, r2
- str r3, [r1, r12]
- add r12, r12, #4
- cmp r12, r4
- bne 8b
- sub r3, r9, #4
- bic r3, r3, #3
- add r3, r3, #4
- subs r12, r6, r5, lsl #2
- add r1, r1, r3
- add r0, r0, r3
- beq 1b
-7: mov r4, #0
-9: ldrb r3, [r1, r4]
- ldrb r2, [r0, r4]
- uqadd8 r3, r2, r3
- strb r3, [r1, r4]
- add r4, r4, #1
- cmp r4, r12
- bne 9b
- ldr r3, [sp]
- add r8, r8, r3
- ldr r3, [sp, #40]
- add r7, r7, r3
- b 10b
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #20
- cmp r1, #0
- mov r12, r2
- str r1, [sp, #12]
- str r0, [sp, #16]
- ldr r2, [sp, #52]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp]
- ldr r3, [sp, #56]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-6: ldr r11, [sp, #8]
-1: ldr r9, [sp]
- mov r0, r12
- add r12, r12, r9
- mov r1, r2
- str r12, [sp, #4]
- add r2, r2, r11
- ldr r12, [sp, #16]
- ldr r3, =0x00800080
- ldr r9, =0xff00ff00
- mov r11, #255
- cmp r12, #0
- beq 4f
-5: ldr r5, [r1], #4
- ldr r4, [r0]
- sub r8, r11, r5, lsr #24
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mla r6, r6, r8, r3
- mla r7, r7, r8, r3
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- and r7, r7, r9
- uxtab16 r6, r7, r6, ror #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 5b
-4: ldr r3, [sp, #12]
- add r10, r10, #1
- cmp r10, r3
- ldr r12, [sp, #4]
- bne 6b
-0: add sp, sp, #20
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- str r1, [sp, #12]
- ldrb r1, [sp, #71]
- mov r12, r2
- str r0, [sp, #16]
- ldr r2, [sp, #60]
- str r1, [sp, #24]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #20]
- ldr r3, [sp, #64]
- mov r10, #0
- lsl r3, r3, #2
- str r3, [sp, #8]
- mov r11, r3
- b 1f
-5: ldr r11, [sp, #8]
-1: ldr r4, [sp, #20]
- mov r0, r12
- mov r1, r2
- add r12, r12, r4
- add r2, r2, r11
- str r12, [sp]
- str r2, [sp, #4]
- ldr r12, [sp, #16]
- ldr r2, =0x00800080
- ldr r3, [sp, #24]
- mov r11, #255
- cmp r12, #0
- beq 3f
-4: ldr r5, [r1], #4
- ldr r4, [r0]
- uxtb16 r6, r5
- uxtb16 r7, r5, ror #8
- mla r6, r6, r3, r2
- mla r7, r7, r3, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- sub r8, r11, r5, lsr #24
- mla r6, r6, r8, r2
- mla r7, r7, r8, r2
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r0], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r1, [sp, #12]
- add r10, r10, #1
- cmp r10, r1
- ldr r12, [sp]
- ldr r2, [sp, #4]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
- push {r4, r5, r6, r7, r8, r9, r10, r11}
- sub sp, sp, #28
- cmp r1, #0
- ldr r9, [sp, #60]
- str r1, [sp, #12]
- bic r1, r9, #-16777216
- str r1, [sp, #20]
- mov r12, r2
- lsr r1, r9, #8
- ldr r2, [sp, #20]
- bic r1, r1, #-16777216
- bic r2, r2, #65280
- bic r1, r1, #65280
- str r2, [sp, #20]
- str r0, [sp, #16]
- str r1, [sp, #4]
- ldr r2, [sp, #68]
- beq 0f
- lsl r3, r3, #2
- str r3, [sp, #24]
- mov r0, #0
- b 1f
-5: ldr r3, [sp, #24]
-1: ldr r4, [sp, #72]
- mov r10, r12
- mov r1, r2
- add r12, r12, r3
- add r2, r2, r4
- str r12, [sp, #8]
- str r2, [sp]
- ldr r12, [sp, #16]
- ldr r11, =0x00800080
- ldr r2, [sp, #4]
- ldr r3, [sp, #20]
- cmp r12, #0
- beq 3f
-4: ldrb r5, [r1], #1
- ldr r4, [r10]
- mla r6, r3, r5, r11
- mla r7, r2, r5, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r5, r6, r7, lsl #8
- uxtb16 r6, r4
- uxtb16 r7, r4, ror #8
- mvn r8, r5
- lsr r8, r8, #24
- mla r6, r6, r8, r11
- mla r7, r7, r8, r11
- uxtab16 r6, r6, r6, ror #8
- uxtab16 r7, r7, r7, ror #8
- uxtb16 r6, r6, ror #8
- uxtb16 r7, r7, ror #8
- orr r6, r6, r7, lsl #8
- uqadd8 r5, r6, r5
- str r5, [r10], #4
- subs r12, r12, #1
- bne 4b
-3: ldr r4, [sp, #12]
- add r0, r0, #1
- cmp r0, r4
- ldr r12, [sp, #8]
- ldr r2, [sp]
- bne 5b
-0: add sp, sp, #28
- pop {r4, r5, r6, r7, r8, r9, r10, r11}
- bx lr
-.endfunc
-
-/*
- * Note: This code is only using armv5te instructions (not even armv6),
- * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
- * be split into a few variants, tuned for each microarchitecture.
- *
- * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
- * have efficient write combining), it needs to be changed to use 16-byte
- * aligned writes using STM instruction.
- *
- * Nearest scanline scaler macro template uses the following arguments:
- * fname - name of the function to generate
- * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
- * t - type suffix for LDR/STR instructions
- * prefetch_distance - prefetch in the source image by that many
- * pixels ahead
- * prefetch_braking_distance - stop prefetching when that many pixels are
- * remaining before the end of scanline
- */
-
-.macro generate_nearest_scanline_func fname, bpp_shift, t, \
- prefetch_distance, \
- prefetch_braking_distance
-
-pixman_asm_function fname
- W .req r0
- DST .req r1
- SRC .req r2
- VX .req r3
- UNIT_X .req ip
- TMP1 .req r4
- TMP2 .req r5
- VXMASK .req r6
- PF_OFFS .req r7
- SRC_WIDTH_FIXED .req r8
-
- ldr UNIT_X, [sp]
- push {r4, r5, r6, r7, r8, r10}
- mvn VXMASK, #((1 << bpp_shift) - 1)
- ldr SRC_WIDTH_FIXED, [sp, #28]
-
- /* define helper macro */
- .macro scale_2_pixels
- ldr&t TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
- adds VX, VX, UNIT_X
- str&t TMP1, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 9b
-
- ldr&t TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
- adds VX, VX, UNIT_X
- str&t TMP2, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 9b
- .endm
-
- /* now do the scaling */
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
- adds VX, VX, UNIT_X
-9: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 9b
- subs W, W, #(8 + prefetch_braking_distance)
- blt 2f
- /* calculate prefetch offset */
- mov PF_OFFS, #prefetch_distance
- mla PF_OFFS, UNIT_X, PF_OFFS, VX
-1: /* main loop, process 8 pixels per iteration with prefetch */
- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
- add PF_OFFS, UNIT_X, lsl #3
- scale_2_pixels
- scale_2_pixels
- scale_2_pixels
- scale_2_pixels
- subs W, W, #8
- bge 1b
-2:
- subs W, W, #(4 - 8 - prefetch_braking_distance)
- blt 2f
-1: /* process the remaining pixels */
- scale_2_pixels
- scale_2_pixels
- subs W, W, #4
- bge 1b
-2:
- tst W, #2
- beq 2f
- scale_2_pixels
-2:
- tst W, #1
- ldrne&t TMP1, [SRC, TMP1]
- strne&t TMP1, [DST]
- /* cleanup helper macro */
- .purgem scale_2_pixels
- .unreq DST
- .unreq SRC
- .unreq W
- .unreq VX
- .unreq UNIT_X
- .unreq TMP1
- .unreq TMP2
- .unreq VXMASK
- .unreq PF_OFFS
- .unreq SRC_WIDTH_FIXED
- /* return */
- pop {r4, r5, r6, r7, r8, r10}
- bx lr
-.endfunc
-.endm
-
-generate_nearest_scanline_func \
- pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
-
-generate_nearest_scanline_func \
- pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
new file mode 100644
index 0000000..6543606
--- /dev/null
+++ b/pixman/pixman-arm-simd-asm.h
@@ -0,0 +1,908 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison at riscosopen.org)
+ *
+ */
+
+/*
+ * Because the alignment of pixel data to cachelines, and even the number of
+ * cachelines per row can vary from row to row, and because of the need to
+ * preload each scanline once and only once, this prefetch strategy treats
+ * each row of pixels independently. When a pixel row is long enough, there
+ * are three distinct phases of prefetch:
+ * * an inner loop section, where each time a cacheline of data is
+ * processed, another cacheline is preloaded (the exact distance ahead is
+ * determined empirically using profiling results from lowlevel-blt-bench)
+ * * a leading section, where enough cachelines are preloaded to ensure no
+ * cachelines escape being preloaded when the inner loop starts
+ * * a trailing section, where a limited number (0 or more) of cachelines
+ * are preloaded to deal with data (if any) that hangs off the end of the
+ * last iteration of the inner loop, plus any trailing bytes that were not
+ * enough to make up one whole iteration of the inner loop
+ *
+ * There are (in general) three distinct code paths, selected between
+ * depending upon how long the pixel row is. If it is long enough that there
+ * is at least one iteration of the inner loop (as described above) then
+ * this is described as the "wide" case. If it is shorter than that, but
+ * there are still enough bytes output that there is at least one 16-byte-
+ * long, 16-byte-aligned write to the destination (the optimum type of
+ * write), then this is the "medium" case. If it is not even this long, then
+ * this is the "narrow" case, and there is no attempt to align writes to
+ * 16-byte boundaries. In the "medium" and "narrow" cases, all the
+ * cachelines containing data from the pixel row are prefetched up-front.
+ */
+
+/*
+ * Determine whether we put the arguments on the stack for debugging.
+ */
+#undef DEBUG_PARAMS
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY, 0
+.set FLAG_DST_READWRITE, 1
+.set FLAG_COND_EXEC, 0
+.set FLAG_BRANCH_OVER, 2
+.set FLAG_PROCESS_PRESERVES_PSR, 0
+.set FLAG_PROCESS_CORRUPTS_PSR, 4
+.set FLAG_PROCESS_DOESNT_STORE, 0
+.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
+.set FLAG_NO_SPILL_LINE_VARS, 0
+.set FLAG_SPILL_LINE_VARS_WIDE, 16
+.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
+.set FLAG_SPILL_LINE_VARS, 48
+.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
+.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+
+/*
+ * Offset into stack where mask and source pointer/stride can be accessed.
+ */
+#ifdef DEBUG_PARAMS
+.set ARGS_STACK_OFFSET, (9*4+9*4)
+#else
+.set ARGS_STACK_OFFSET, (9*4)
+#endif
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE, 0
+.set PREFETCH_TYPE_STANDARD, 1
+
+/*
+ * Definitions of macros for load/store of pixel data.
+ */
+
+.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+ .if numbytes == 16
+ .if unaligned == 1
+ op&r&cond WK®0, [base], #4
+ op&r&cond WK®1, [base], #4
+ op&r&cond WK®2, [base], #4
+ op&r&cond WK®3, [base], #4
+ .else
+ op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3}
+ .endif
+ .elseif numbytes == 8
+ .if unaligned == 1
+ op&r&cond WK®0, [base], #4
+ op&r&cond WK®1, [base], #4
+ .else
+ op&m&cond&ia base!, {WK®0,WK®1}
+ .endif
+ .elseif numbytes == 4
+ op&r&cond WK®0, [base], #4
+ .elseif numbytes == 2
+ op&r&cond&h WK®0, [base], #2
+ .elseif numbytes == 1
+ op&r&cond&b WK®0, [base], #1
+ .else
+ .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+ .if numbytes == 16
+ stm&cond&db base, {WK®0,WK®1,WK®2,WK®3}
+ .elseif numbytes == 8
+ stm&cond&db base, {WK®0,WK®1}
+ .elseif numbytes == 4
+ str&cond WK®0, [base, #-4]
+ .elseif numbytes == 2
+ str&cond&h WK®0, [base, #-2]
+ .elseif numbytes == 1
+ str&cond&b WK®0, [base, #-1]
+ .else
+ .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixld cond, numbytes, firstreg, base, unaligned
+ pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+.endm
+
+.macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+ pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .else
+ pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .endif
+.endm
+
+.macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+ a x
+ .endif
+.endm
+
+
+.macro preload_leading_step1 bpp, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if bpp > 0
+ PF bic, ptr, base, #31
+ .set OFFSET, 0
+ .rept prefetch_distance+1
+ PF pld, [ptr, #OFFSET]
+ .set OFFSET, OFFSET+32
+ .endr
+ .endif
+.endm
+
+.macro preload_leading_step2 bpp, bpp_shift, ptr, base
+/* However, if the destination is not 16-byte aligned, we may need to
+ * preload more cache lines than that. The question we need to ask is:
+ * are the bytes corresponding to the leading pixels more than the amount
+ * by which the source pointer will be rounded down for preloading, and if
+ * so, by how many cache lines? Effectively, we want to calculate
+ * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
+ * inner_loop_offset = (src+leading_bytes)&31
+ * extra_needed = leading_bytes - inner_loop_offset
+ * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
+ * possible when there are 4 src bytes for every 1 dst byte).
+ */
+ .if bpp > 0
+ .ifc base,DST
+ /* The test can be simplified further when preloading the destination */
+ PF tst, base, #16
+ PF beq, 61f
+ .else
+ .if bpp/dst_w_bpp == 4
+ PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+ PF and, SCRATCH, SCRATCH, #31
+ PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
+ PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
+ PF movs, SCRATCH, SCRATCH, #32-6 /* so this sets NC / nc / Nc */
+ PF bcs, 61f
+ PF bpl, 60f
+ PF pld, [ptr, #32*(prefetch_distance+2)]
+ .else
+ PF mov, SCRATCH, base, lsl #32-5
+ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+ PF bls, 61f
+ .endif
+ .endif
+60: PF pld, [ptr, #32*(prefetch_distance+1)]
+61:
+ .endif
+.endm
+
+#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
+.macro preload_middle bpp, base, scratch_holds_offset
+ .if bpp > 0
+ /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
+ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
+ .if scratch_holds_offset
+ PF pld, [base, SCRATCH]
+ .else
+ PF bic, SCRATCH, base, #31
+ PF pld, [SCRATCH, #32*prefetch_distance]
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro preload_trailing bpp, bpp_shift, base
+ .if bpp > 0
+ .if bpp*pix_per_block > 256
+ /* Calculations are more complex if more than one fetch per block */
+ PF and, WK1, base, #31
+ PF add, WK1, WK1, WK0, lsl #bpp_shift
+ PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
+ PF bic, SCRATCH, base, #31
+80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
+ PF add, SCRATCH, SCRATCH, #32
+ PF subs, WK1, WK1, #32
+ PF bhi, 80b
+ .else
+ /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
+ PF mov, SCRATCH, base, lsl #32-5
+ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
+ PF adceqs, SCRATCH, SCRATCH, #0
+ /* The instruction above has two effects: ensures Z is only
+ * set if C was clear (so Z indicates that both shifted quantities
+ * were 0), and clears C if Z was set (so C indicates that the sum
+ * of the shifted quantities was greater and not equal to 32) */
+ PF beq, 82f
+ PF bic, SCRATCH, base, #31
+ PF bcc, 81f
+ PF pld, [SCRATCH, #32*(prefetch_distance+2)]
+81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
+82:
+ .endif
+ .endif
+.endm
+
+
+.macro preload_line narrow_case, bpp, bpp_shift, base
+/* "narrow_case" - just means that the macro was invoked from the "narrow"
+ * code path rather than the "medium" one - because in the narrow case,
+ * the row of pixels is known to output no more than 30 bytes, then
+ * (assuming the source pixels are no wider than the the destination
+ * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
+ * meaning there's no need for a loop.
+ * "bpp" - number of bits per pixel in the channel (source, mask or
+ * destination) that's being preloaded, or 0 if this channel is not used
+ * for reading
+ * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
+ * "base" - base address register of channel to preload (SRC, MASK or DST)
+ */
+ .if bpp > 0
+ .if narrow_case && (bpp <= dst_w_bpp)
+ /* In these cases, each line for each channel is in either 1 or 2 cache lines */
+ PF bic, WK0, base, #31
+ PF pld, [WK0]
+ PF add, WK1, base, X, LSL #bpp_shift
+ PF sub, WK1, WK1, #1
+ PF bic, WK1, WK1, #31
+ PF cmp, WK1, WK0
+ PF beq, 90f
+ PF pld, [WK1]
+90:
+ .else
+ PF bic, WK0, base, #31
+ PF pld, [WK0]
+ PF add, WK1, base, X, lsl #bpp_shift
+ PF sub, WK1, WK1, #1
+ PF bic, WK1, WK1, #31
+ PF cmp, WK1, WK0
+ PF beq, 92f
+91: PF add, WK0, WK0, #32
+ PF cmp, WK0, WK1
+ PF pld, [WK0]
+ PF bne, 91b
+92:
+ .endif
+ .endif
+.endm
+
+
+.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
+ .if decrementx
+ sub&cond X, X, #8*numbytes/dst_w_bpp
+ .endif
+ process_tail cond, numbytes, firstreg
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst cond, numbytes, firstreg, DST
+ .endif
+.endm
+
+.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_BRANCH_OVER
+ .ifc cond,mi
+ bpl 100f
+ .endif
+ .ifc cond,cs
+ bcc 100f
+ .endif
+ .ifc cond,ne
+ beq 100f
+ .endif
+ conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+100:
+ .else
+ conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .endif
+.endm
+
+.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
+ /* Can't interleave reads and writes */
+ test
+ conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
+ test
+ .endif
+ conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .else
+ /* Can interleave reads and writes for better scheduling */
+ test
+ process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
+ process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
+ .if decrementx
+ sub&cond1 X, X, #8*numbytes1/dst_w_bpp
+ sub&cond2 X, X, #8*numbytes2/dst_w_bpp
+ .endif
+ process_tail cond1, numbytes1, firstreg1
+ process_tail cond2, numbytes2, firstreg2
+ pixst cond1, numbytes1, firstreg1, DST
+ pixst cond2, numbytes2, firstreg2, DST
+ .endif
+.endm
+
+
+.macro test_bits_1_0_ptr
+ movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
+.endm
+
+.macro test_bits_3_2_ptr
+ movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
+.endm
+
+.macro leading_15bytes process_head, process_tail
+ /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ /* Use unaligned loads in all cases for simplicity */
+ .if dst_w_bpp == 8
+ conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+ .elseif dst_w_bpp == 16
+ test_bits_1_0_ptr
+ conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
+ .endif
+ conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
+.endm
+
+.macro test_bits_3_2_pix
+ movs SCRATCH, X, lsl #dst_bpp_shift+32-3
+.endm
+
+.macro test_bits_1_0_pix
+ .if dst_w_bpp == 8
+ movs SCRATCH, X, lsl #dst_bpp_shift+32-1
+ .else
+ movs SCRATCH, X, lsr #1
+ .endif
+.endm
+
+.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ .if dst_w_bpp == 16
+ test_bits_1_0_pix
+ conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ .elseif dst_w_bpp == 8
+ conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ .endif
+.endm
+
+
+.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+110:
+ .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
+ .rept pix_per_block*dst_w_bpp/128
+ process_head , 16, 0, unaligned_src, unaligned_mask, 1
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ preload_middle src_bpp, SRC, 1
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ preload_middle mask_bpp, MASK, 1
+ .else
+ preload_middle src_bpp, SRC, 0
+ preload_middle mask_bpp, MASK, 0
+ .endif
+ .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
+ /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
+ * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
+ * preloads for, to achieve staggered prefetches for multiple channels, because there are
+ * always two STMs per prefetch, so there is always an opposite STM on which to put the
+ * preload. Note, no need to BIC the base register here */
+ PF pld, [DST, #32*prefetch_distance - dst_alignment]
+ .endif
+ process_tail , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 16, 0, DST
+ .endif
+ .set SUBBLOCK, SUBBLOCK+1
+ .endr
+ subs X, X, #pix_per_block
+ bhs 110b
+.endm
+
+.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
+ /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
+ .if dst_r_bpp > 0
+ tst DST, #16
+ bne 111f
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
+ b 112f
+111:
+ .endif
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
+112:
+ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
+ PF and, WK0, X, #pix_per_block-1
+ .endif
+ preload_trailing src_bpp, src_bpp_shift, SRC
+ preload_trailing mask_bpp, mask_bpp_shift, MASK
+ preload_trailing dst_r_bpp, dst_bpp_shift, DST
+ add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
+ /* The remainder of the line is handled identically to the medium case */
+ medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+.endm
+
+.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+120:
+ process_head , 16, 0, unaligned_src, unaligned_mask, 0
+ process_tail , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 16, 0, DST
+ .endif
+ subs X, X, #128/dst_w_bpp
+ bhs 120b
+ /* Trailing pixels */
+ tst X, #128/dst_w_bpp - 1
+ beq exit_label
+ trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+ tst X, #16*8/dst_w_bpp
+ conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+ /* Trailing pixels */
+ /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
+ trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
+ /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
+ .if mask_bpp == 8 || mask_bpp == 16
+ tst MASK, #3
+ bne 141f
+ .endif
+ .if src_bpp == 8 || src_bpp == 16
+ tst SRC, #3
+ bne 140f
+ .endif
+ action process_head, process_tail, process_inner_loop, exit_label, 0, 0
+ .if src_bpp == 8 || src_bpp == 16
+ b exit_label
+140:
+ action process_head, process_tail, process_inner_loop, exit_label, 1, 0
+ .endif
+ .if mask_bpp == 8 || mask_bpp == 16
+ b exit_label
+141:
+ .if src_bpp == 8 || src_bpp == 16
+ tst SRC, #3
+ bne 142f
+ .endif
+ action process_head, process_tail, process_inner_loop, exit_label, 0, 1
+ .if src_bpp == 8 || src_bpp == 16
+ b exit_label
+142:
+ action process_head, process_tail, process_inner_loop, exit_label, 1, 1
+ .endif
+ .endif
+.endm
+
+
+.macro end_of_line restore_x, vars_spilled, loop_label, last_one
+ .if vars_spilled
+ /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
+ /* This is ldmia sp,{} */
+ .word 0xE89D0000 | LINE_SAVED_REGS
+ .endif
+ subs Y, Y, #1
+ .if vars_spilled
+ .if (LINE_SAVED_REGS) & (1<<1)
+ str Y, [sp]
+ .endif
+ .endif
+ add DST, DST, STRIDE_D
+ .if src_bpp > 0
+ add SRC, SRC, STRIDE_S
+ .endif
+ .if mask_bpp > 0
+ add MASK, MASK, STRIDE_M
+ .endif
+ .if restore_x
+ mov X, ORIG_W
+ .endif
+ bhs loop_label
+ .ifc "last_one",""
+ .if vars_spilled
+ b 197f
+ .else
+ b 198f
+ .endif
+ .else
+ .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ b 198f
+ .endif
+ .endif
+.endm
+
+
+.macro generate_composite_function fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags_, \
+ prefetch_distance_, \
+ init, \
+ newline, \
+ cleanup, \
+ process_head, \
+ process_tail, \
+ process_inner_loop
+
+ .func fname
+ .global fname
+ /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set flags, flags_
+ .set prefetch_distance, prefetch_distance_
+
+/*
+ * Select prefetch type for this function.
+ */
+ .if prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ .else
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
+ .endif
+
+ .if src_bpp == 32
+ .set src_bpp_shift, 2
+ .elseif src_bpp == 24
+ .set src_bpp_shift, 0
+ .elseif src_bpp == 16
+ .set src_bpp_shift, 1
+ .elseif src_bpp == 8
+ .set src_bpp_shift, 0
+ .elseif src_bpp == 0
+ .set src_bpp_shift, -1
+ .else
+ .error "requested src bpp (src_bpp) is not supported"
+ .endif
+
+ .if mask_bpp == 32
+ .set mask_bpp_shift, 2
+ .elseif mask_bpp == 24
+ .set mask_bpp_shift, 0
+ .elseif mask_bpp == 8
+ .set mask_bpp_shift, 0
+ .elseif mask_bpp == 0
+ .set mask_bpp_shift, -1
+ .else
+ .error "requested mask bpp (mask_bpp) is not supported"
+ .endif
+
+ .if dst_w_bpp == 32
+ .set dst_bpp_shift, 2
+ .elseif dst_w_bpp == 24
+ .set dst_bpp_shift, 0
+ .elseif dst_w_bpp == 16
+ .set dst_bpp_shift, 1
+ .elseif dst_w_bpp == 8
+ .set dst_bpp_shift, 0
+ .else
+ .error "requested dst bpp (dst_w_bpp) is not supported"
+ .endif
+
+ .if (((flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+ .else
+ .set dst_r_bpp, 0
+ .endif
+
+ .set pix_per_block, 16*8/dst_w_bpp
+ .if src_bpp != 0
+ .if 32*8/src_bpp > pix_per_block
+ .set pix_per_block, 32*8/src_bpp
+ .endif
+ .endif
+ .if mask_bpp != 0
+ .if 32*8/mask_bpp > pix_per_block
+ .set pix_per_block, 32*8/mask_bpp
+ .endif
+ .endif
+ .if dst_r_bpp != 0
+ .if 32*8/dst_r_bpp > pix_per_block
+ .set pix_per_block, 32*8/dst_r_bpp
+ .endif
+ .endif
+
+/* The standard entry conditions set up by pixman-arm-common.h are:
+ * r0 = width (pixels)
+ * r1 = height (rows)
+ * r2 = pointer to top-left pixel of destination
+ * r3 = destination stride (pixels)
+ * [sp] = source pixel value, or pointer to top-left pixel of source
+ * [sp,#4] = 0 or source stride (pixels)
+ * The following arguments are unused for non-mask operations
+ * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
+ * [sp,#12] = 0 or mask stride (pixels)
+ */
+
+/*
+ * Assign symbolic names to registers
+ */
+ X .req r0 /* pixels to go on this line */
+ Y .req r1 /* lines to go */
+ DST .req r2 /* destination pixel pointer */
+ STRIDE_D .req r3 /* destination stride (bytes, minus width) */
+ SRC .req r4 /* source pixel pointer */
+ STRIDE_S .req r5 /* source stride (bytes, minus width) */
+ MASK .req r6 /* mask pixel pointer (if applicable) */
+ STRIDE_M .req r7 /* mask stride (bytes, minus width) */
+ WK0 .req r8 /* pixel data registers */
+ WK1 .req r9
+ WK2 .req r10
+ WK3 .req r11
+ SCRATCH .req r12
+ ORIG_W .req r14 /* width (pixels) */
+
+fname:
+ push {r4-r11, lr} /* save all registers */
+
+ subs Y, Y, #1
+ blo 199f
+
+#ifdef DEBUG_PARAMS
+ sub sp, sp, #9*4
+#endif
+
+ .if src_bpp > 0
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
+ .endif
+ .if mask_bpp > 0
+ ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
+ ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
+ .endif
+
+#ifdef DEBUG_PARAMS
+ add Y, Y, #1
+ stmia sp, {r0-r7,pc}
+ sub Y, Y, #1
+#endif
+
+ init
+
+ lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
+ sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
+ .if src_bpp > 0
+ lsl STRIDE_S, #src_bpp_shift
+ sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
+ .endif
+ .if mask_bpp > 0
+ lsl STRIDE_M, #mask_bpp_shift
+ sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
+ .endif
+
+ /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
+ cmp X, #2*16*8/dst_w_bpp - 1
+ blo 170f
+ .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
+ /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
+ cmp X, #(prefetch_distance+3)*pix_per_block - 1
+ blo 160f
+
+ /* Wide case */
+ /* Adjust X so that the decrement instruction can also test for
+ * inner loop termination. We want it to stop when there are
+ * (prefetch_distance+1) complete blocks to go. */
+ sub X, X, #(prefetch_distance+2)*pix_per_block
+ mov ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .endif
+151: /* New line */
+ newline
+ preload_leading_step1 src_bpp, WK1, SRC
+ preload_leading_step1 mask_bpp, WK2, MASK
+ preload_leading_step1 dst_r_bpp, WK3, DST
+
+ tst DST, #15
+ beq 154f
+ rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
+ .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
+ PF and, WK0, WK0, #15
+ .endif
+
+ preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
+ preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
+ preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
+
+ leading_15bytes process_head, process_tail
+
+154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ and SCRATCH, SRC, #31
+ rsb SCRATCH, SCRATCH, #32*prefetch_distance
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ and SCRATCH, MASK, #31
+ rsb SCRATCH, SCRATCH, #32*prefetch_distance
+ .endif
+ .ifc "process_inner_loop",""
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .else
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ .endif
+
+157: /* Check for another line */
+ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .endif
+
+ .ltorg
+
+160: /* Medium case */
+ mov ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .endif
+161: /* New line */
+ newline
+ preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
+ preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+
+ sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
+ tst DST, #15
+ beq 164f
+ rsb WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
+
+ leading_15bytes process_head, process_tail
+
+164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+
+167: /* Check for another line */
+ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
+
+ .ltorg
+
+170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ .if dst_w_bpp < 32
+ mov ORIG_W, X
+ .endif
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .endif
+171: /* New line */
+ newline
+ preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
+ preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+
+ .if dst_w_bpp == 8
+ tst DST, #3
+ beq 174f
+172: subs X, X, #1
+ blo 177f
+ process_head , 1, 0, 1, 1, 0
+ process_tail , 1, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 1, 0, DST
+ .endif
+ tst DST, #3
+ bne 172b
+ .elseif dst_w_bpp == 16
+ tst DST, #2
+ beq 174f
+ subs X, X, #1
+ blo 177f
+ process_head , 2, 0, 1, 1, 0
+ process_tail , 2, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 2, 0, DST
+ .endif
+ .endif
+
+174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+
+177: /* Check for another line */
+ end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+
+197:
+ .if (flags) & FLAG_SPILL_LINE_VARS
+ add sp, sp, #LINE_SAVED_REG_COUNT*4
+ .endif
+198:
+ cleanup
+
+#ifdef DEBUG_PARAMS
+ add sp, sp, #9*4 /* junk the debug copy of arguments */
+#endif
+199:
+ pop {r4-r11, pc} /* exit */
+
+ .ltorg
+
+ .unreq X
+ .unreq Y
+ .unreq DST
+ .unreq STRIDE_D
+ .unreq SRC
+ .unreq STRIDE_S
+ .unreq MASK
+ .unreq STRIDE_M
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ .unreq SCRATCH
+ .unreq ORIG_W
+ .endfunc
+.endm
+
+.macro line_saved_regs x:vararg
+ .set LINE_SAVED_REGS, 0
+ .set LINE_SAVED_REG_COUNT, 0
+ .irp SAVED_REG,x
+ .ifc "SAVED_REG","Y"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","STRIDE_D"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","STRIDE_S"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","STRIDE_M"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","ORIG_W"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .endr
+.endm
+
+.macro nop_macro x:vararg
+.endm
More information about the xorg-commit
mailing list