[Pixman] [PATCH 13/32] armv6: Add over_8888_n_0565 fast path
Ben Avison
bavison at riscosopen.org
Thu Aug 7 09:50:09 PDT 2014
lowlevel-blt-bench results:

        Before          After
      Mean  StdDev    Mean  StdDev  Confidence    Change
L1     5.7     0.0    20.6     0.1      100.0%   +263.8%
L2     4.9     0.0    17.4     0.3      100.0%   +254.0%
M      4.8     0.0    19.9     0.0      100.0%   +312.9%
HT     4.5     0.0    12.4     0.1      100.0%   +175.4%
VT     4.5     0.0    12.0     0.0      100.0%   +168.9%
R      4.3     0.0    11.4     0.1      100.0%   +163.3%
RT     2.9     0.0     6.0     0.1      100.0%   +106.9%
---
pixman/pixman-arm-simd-asm.S | 69 +++++++++++++++++++++++++++++------------
pixman/pixman-arm-simd.c | 4 ++
2 files changed, 53 insertions(+), 20 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 43d3c63..f4b3a3e 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1403,7 +1403,7 @@ generate_composite_function \
/******************************************************************************/
-.macro over_8888_8_0565_init
+.macro over_8888_x_0565_init
.unreq WK2
.unreq WK3
HALF .req Y
@@ -1419,13 +1419,13 @@ generate_composite_function \
line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
.endm
-.macro over_8888_8_0565_newline
+.macro over_8888_x_0565_newline
ldr HALF, =0x00800080
ldr BITMSK5, =0x001f001f
ldr BITMSK6, =0xfc00fc00
.endm
-.macro over_8888_8_0565_cleanup
+.macro over_8888_x_0565_cleanup
.unreq HALF
.unreq WK2
.unreq WK3
@@ -1440,10 +1440,14 @@ generate_composite_function \
WK3 .req r11
.endm
-.macro over_8888_8_0565_2pixels_head
+.macro over_8888_x_0565_2pixels_head
ldmia SRC!, {WK0, WK1}
+ .if OVER_8888_X_0565_SOLID
+ ldrb WK2, [sp, #ARGS_STACK_OFFSET + 8 + 3]
+ .else
ldrb WK2, [MASK], #1
ldrb WK3, [MASK], #1
+ .endif
@ Because we'll be writing the destination in sub-cacheline
@ chunks either way, it needs to be preloaded, so there's no
@ penalty in loading its existing value even if it's unused
@@ -1452,15 +1456,20 @@ generate_composite_function \
.set COUNTER, 0
-.macro over_8888_8_0565_2pixels_tail
+.macro over_8888_x_0565_2pixels_tail
uxtb16 WK5, WK0 @ 00000000rrrrrrrr00000000bbbbbbbb
uxtb16 WK0, WK0, ror #8 @ 00000000aaaaaaaa00000000gggggggg
uxtb16 WK6, WK1 @ 00000000RRRRRRRR00000000BBBBBBBB
uxtb16 WK1, WK1, ror #8 @ 00000000AAAAAAAA00000000GGGGGGGG
mla WK5, WK5, WK2, HALF @ rrrrrrrrrrrrrrrrbbbbbbbbbbbbbbbb
mla WK0, WK0, WK2, HALF @ aaaaaaaaaaaaaaaagggggggggggggggg
+ .if OVER_8888_X_0565_SOLID
+ mla WK6, WK6, WK2, HALF @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
+ mla WK1, WK1, WK2, HALF @ AAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ .else
mla WK6, WK6, WK3, HALF @ RRRRRRRRRRRRRRRRBBBBBBBBBBBBBBBB
mla WK1, WK1, WK3, HALF @ AAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ .endif
uxtab16 WK5, WK5, WK5, ror #8
uxtab16 WK0, WK0, WK0, ror #8
uxtab16 WK6, WK6, WK6, ror #8
@@ -1517,27 +1526,31 @@ generate_composite_function \
str WK0, [DST, #-4]
.endm
-.macro over_8888_8_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+.macro over_8888_x_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.if numbytes == 2
ldr WK0, [SRC], #4
+ .if OVER_8888_X_0565_SOLID
+ ldrb WK1, [sp, #ARGS_STACK_OFFSET + 8 + 3]
+ .else
ldrb WK1, [MASK], #1
+ .endif
ldrh WK2, [DST], #2
.else
.if numbytes >= 8
.if numbytes == 16
- over_8888_8_0565_2pixels_head
- over_8888_8_0565_2pixels_tail
- over_8888_8_0565_2pixels_head
- over_8888_8_0565_2pixels_tail
+ over_8888_x_0565_2pixels_head
+ over_8888_x_0565_2pixels_tail
+ over_8888_x_0565_2pixels_head
+ over_8888_x_0565_2pixels_tail
.endif
- over_8888_8_0565_2pixels_head
- over_8888_8_0565_2pixels_tail
+ over_8888_x_0565_2pixels_head
+ over_8888_x_0565_2pixels_tail
.endif
- over_8888_8_0565_2pixels_head
+ over_8888_x_0565_2pixels_head
.endif
.endm
-.macro over_8888_8_0565_process_tail cond, numbytes, firstreg
+.macro over_8888_x_0565_process_tail cond, numbytes, firstreg
.if numbytes == 2
uxtb16 WK3, WK0 @ 00000000rrrrrrrr00000000bbbbbbbb
uxtb16 WK0, WK0, ror #8 @ 00000000aaaaaaaa00000000gggggggg
@@ -1568,18 +1581,34 @@ generate_composite_function \
orr WK0, WK3, WK0, lsr #5 @ 000000xxxxxxxxxxrrrrrggggggbbbbb
strh WK0, [DST, #-2]
.else
- over_8888_8_0565_2pixels_tail
+ over_8888_x_0565_2pixels_tail
.endif
.endm
+.set OVER_8888_X_0565_SOLID, 0
+
generate_composite_function \
pixman_composite_over_8888_8_0565_asm_armv6, 32, 8, 16, \
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
2, /* prefetch distance */ \
- over_8888_8_0565_init, \
- over_8888_8_0565_newline, \
- over_8888_8_0565_cleanup, \
- over_8888_8_0565_process_head, \
- over_8888_8_0565_process_tail
+ over_8888_x_0565_init, \
+ over_8888_x_0565_newline, \
+ over_8888_x_0565_cleanup, \
+ over_8888_x_0565_process_head, \
+ over_8888_x_0565_process_tail
+
+/******************************************************************************/
+
+.set OVER_8888_X_0565_SOLID, 1
+
+generate_composite_function \
+ pixman_composite_over_8888_n_0565_asm_armv6, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
+ 2, /* prefetch distance */ \
+ over_8888_x_0565_init, \
+ over_8888_x_0565_newline, \
+ over_8888_x_0565_cleanup, \
+ over_8888_x_0565_process_head, \
+ over_8888_x_0565_process_tail
/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 63bf320..9b9b926 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -62,6 +62,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_0565,
+ uint32_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
@@ -250,6 +252,8 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, r5g6b5, armv6_composite_over_8888_8_0565),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, armv6_composite_over_8888_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, armv6_composite_over_8888_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, armv6_composite_over_8888_n_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, armv6_composite_over_n_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
--
1.7.5.4
More information about the Pixman mailing list