[Pixman] [PATCH 4/5] ARMv6: New conversion routines
Ben Avison
bavison at riscosopen.org
Sat Jan 19 08:16:52 PST 2013
There was no previous attempt at accelerating these specifically for
ARMv6.
src_x888_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 96.7 0.5 270.4 2.6 100.0% +179.5%
L2 44.6 2.7 110.6 9.7 100.0% +148.0%
M 26.9 0.1 87.6 0.5 100.0% +226.1%
HT 19.3 0.2 37.5 0.4 100.0% +93.7%
VT 18.6 0.1 33.7 0.4 100.0% +81.6%
R 18.4 0.1 32.2 0.3 100.0% +75.2%
RT 9.2 0.2 12.1 0.3 100.0% +31.4%
src_0565_8888
Before After
Mean StdDev Mean StdDev Confidence Change
L1 37.0 0.3 66.9 0.2 100.0% +80.8%
L2 30.3 0.2 55.9 0.3 100.0% +84.4%
M 25.9 0.0 62.3 0.2 100.0% +140.3%
HT 15.2 0.1 33.1 0.3 100.0% +116.9%
VT 15.1 0.1 30.7 0.3 100.0% +103.6%
R 14.2 0.1 27.6 0.3 100.0% +94.0%
RT 6.0 0.1 11.2 0.3 100.0% +87.2%
---
pixman/pixman-arm-simd-asm.S | 123 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 12 ++++
2 files changed, 135 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index c6d8263..42291da 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -180,3 +180,126 @@ generate_composite_function \
/******************************************************************************/
+.macro src_x888_8888_pixel, cond, reg
+ orr&cond WK®, WK®, #0xFF000000
+.endm
+
+.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
+ src_x888_8888_pixel cond, %(firstreg+0)
+ .if numbytes >= 8
+ src_x888_8888_pixel cond, %(firstreg+1)
+ .if numbytes == 16
+ src_x888_8888_pixel cond, %(firstreg+2)
+ src_x888_8888_pixel cond, %(firstreg+3)
+ .endif
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 3, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ pixman_composite_src_x888_8888_process_head, \
+ pixman_composite_src_x888_8888_process_tail
+
+/******************************************************************************/
+
+.macro src_0565_8888_init
+ /* Hold loop invariants in MASK and STRIDE_M */
+ ldr MASK, =0x07E007E0
+ mov STRIDE_M, #0xFF000000
+ /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+ ldr SCRATCH, =0x80008000
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro src_0565_8888_2pixels, reg1, reg2
+ and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK®2, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK®1, WK®2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+ bic WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK®1, WK®1, WK®1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr WK®2, WK®2, WK®2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK®1, WK®1, WK®1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+ sel WK®1, WK®1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+ pkhtb WK®2, WK®2, WK®2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+ sel WK®2, WK®2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+ orr WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ orr WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/* This version doesn't need STRIDE_M, but is one instruction longer.
+ It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+ and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK®1, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK®2, WK®1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+ bic WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+ mov WK®2, WK®2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+ mov WK®1, WK®1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ orr WK®2, WK®2, WK®2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+ orr WK®1, WK®1, WK®1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ pkhbt WK®2, WK®2, WK®2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+ pkhbt WK®1, WK®1, WK®1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK®2, SCRATCH, WK®2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+ sel WK®1, SCRATCH, WK®1 @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK®2, WK®2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ orr WK®1, WK®1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+*/
+
+.macro src_0565_8888_1pixel, reg
+ bic SCRATCH, WK®, MASK @ 0000000000000000rrrrr000000bbbbb
+ and WK®, WK®, MASK @ 000000000000000000000gggggg00000
+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ mov WK®, WK®, lsl #5 @ 0000000000000000gggggg0000000000
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ orr WK®, WK®, WK®, lsr #6 @ 000000000000000gggggggggggg00000
+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK®, WK®, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK®, WK®, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+ pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8
+ pixld , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4
+ pixld , 2, firstreg, SRC, unaligned_src
+ .endif
+.endm
+
+.macro src_0565_8888_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+ src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .else
+ src_0565_8888_1pixel firstreg
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+ 3, /* prefetch distance */ \
+ src_0565_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_0565_8888_process_head, \
+ src_0565_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index de66e57..09a5036 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -378,10 +378,14 @@ pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565,
uint16_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
+ uint16_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
uint8_t, 1, uint8_t, 1)
@@ -523,6 +527,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888),
+
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565),
PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565),
PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565),
@@ -549,6 +556,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8),
PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
+
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
--
1.7.5.4
More information about the Pixman
mailing list