[Pixman] [PATCH 12/12] ARMv6: Add fast path for src_x888_0565
Ben Avison
bavison at riscosopen.org
Mon Mar 4 09:42:29 PST 2013
This isn't used in the trimmed cairo-perf-trace tests at all, but these are
the lowlevel-blt-bench results:
Before After
Mean StdDev Mean StdDev Confidence Change
L1 68.5 1.0 116.3 0.6 100.0% +69.8%
L2 31.1 1.8 60.9 5.0 100.0% +96.1%
M 33.6 0.1 86.4 0.4 100.0% +157.0%
HT 19.1 0.1 35.3 0.4 100.0% +84.3%
VT 17.7 0.2 32.1 0.3 100.0% +81.3%
R 17.5 0.2 29.9 0.3 100.0% +70.7%
RT 7.0 0.1 11.8 0.3 100.0% +68.4%
---
pixman/pixman-arm-simd-asm.S | 77 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 7 ++++
2 files changed, 84 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 158de73..423a16d 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -303,6 +303,83 @@ generate_composite_function \
/******************************************************************************/
+.macro src_x888_0565_init
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x001F001F
+ line_saved_regs STRIDE_S, ORIG_W
+.endm
+
+.macro src_x888_0565_1pixel s, d
+ and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ /* Top 16 bits are discarded during the following STRH */
+.endm
+
+.macro src_x888_0565_2pixels slo, shi, d, tmp
+ and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
+ and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
+ and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
+ orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
+ and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+.endm
+
+.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_S
+ WK5 .req STRIDE_M
+ WK6 .req WK3
+ WK7 .req ORIG_W
+ .if numbytes == 16
+ pixld , 16, 4, SRC, 0
+ src_x888_0565_2pixels 4, 5, 0, 0
+ pixld , 8, 4, SRC, 0
+ src_x888_0565_2pixels 6, 7, 1, 1
+ pixld , 8, 6, SRC, 0
+ .else
+ pixld , numbytes*2, 4, SRC, 0
+ .endif
+.endm
+
+.macro src_x888_0565_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ src_x888_0565_2pixels 4, 5, 2, 2
+ src_x888_0565_2pixels 6, 7, 3, 4
+ .elseif numbytes == 8
+ src_x888_0565_2pixels 4, 5, 1, 1
+ src_x888_0565_2pixels 6, 7, 2, 2
+ .elseif numbytes == 4
+ src_x888_0565_2pixels 4, 5, 1, 1
+ .else
+ src_x888_0565_1pixel 4, 1
+ .endif
+ .if numbytes == 16
+ pixst , numbytes, 0, DST
+ .else
+ pixst , numbytes, 1, DST
+ .endif
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+ 3, /* prefetch distance */ \
+ src_x888_0565_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_x888_0565_process_head, \
+ src_x888_0565_process_tail
+
+/******************************************************************************/
+
.macro add_8_8_8pixels cond, dst1, dst2
uqadd8&cond WK&dst1, WK&dst1, MASK
uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index d227065..5a1708f 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -41,6 +41,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_0565,
+ uint32_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
uint8_t, 1, uint8_t, 1)
@@ -227,6 +229,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565),
+
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
--
1.7.5.4
More information about the Pixman
mailing list