[Pixman] [PATCH 18/32] armv6: Add ADD combiner
Ben Avison
bavison at riscosopen.org
Thu Aug 7 09:50:14 PDT 2014
lowlevel-blt-bench results for two example operations, one unmasked and one
masked, neither of which has a dedicated fast path at the time of writing:
add_n_1555

      Before            After
      Mean    StdDev    Mean    StdDev    Confidence    Change
  L1  11.6    0.1       13.7    0.1       100.0%        +18.4%
  L2  11.6    0.1       13.6    0.2       100.0%        +18.1%
  M   10.5    0.0       12.5    0.0       100.0%        +19.2%
  HT   9.3    0.0       10.1    0.0       100.0%         +8.1%
  VT   9.3    0.0       10.0    0.0       100.0%         +7.9%
  R    8.9    0.0        9.6    0.0       100.0%         +7.7%
  RT   5.4    0.1        5.4    0.1        96.2%         -0.5%  (insignificant)
add_n_8_1555

      Before            After
      Mean    StdDev    Mean    StdDev    Confidence    Change
  L1   6.2    0.0        9.3    0.0       100.0%        +49.2%
  L2   6.2    0.0        9.2    0.1       100.0%        +47.9%
  M    5.7    0.0        8.8    0.0       100.0%        +52.7%
  HT   5.3    0.0        6.8    0.0       100.0%        +27.8%
  VT   5.3    0.0        6.7    0.0       100.0%        +26.7%
  R    5.1    0.0        6.4    0.0       100.0%        +26.1%
  RT   3.1    0.0        3.3    0.0       100.0%         +8.7%
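
For context: the operation these paths accelerate is pixman's unified ADD
combiner, a per-channel saturating addition of the source into the
destination, with the source first scaled by the mask's alpha byte when a
mask is present. Because neither operation above has a dedicated fast path,
the general pipeline fetches each scanline to a8r8g8b8, combines, and writes
back, which is why a faster 32-bit combiner helps even with a 1555
destination. A minimal C sketch of the semantics; names and structure are
illustrative, not pixman's actual internals:

#include <stdint.h>

/* Rounding-correct (x * a) / 255 for two 8-bit values. */
static uint32_t
mul_un8 (uint32_t x, uint32_t a)
{
    uint32_t t = x * a + 0x80;
    return (t + (t >> 8)) >> 8;
}

/* OP_ADD over a8r8g8b8 scanlines: dest = saturate (dest + src * mask.alpha).
 * 'mask' may be NULL, giving the unmasked variant. */
static void
combine_add_sketch (uint32_t *dest, const uint32_t *src,
                    const uint32_t *mask, int width)
{
    for (int i = 0; i < width; i++)
    {
        uint32_t s = src[i];

        if (mask)
        {
            /* Unified (non-component-alpha) combining scales the whole
             * source pixel by the mask's alpha byte. */
            uint32_t m = mask[i] >> 24;
            s = (mul_un8 (s >> 24, m) << 24)
              | (mul_un8 ((s >> 16) & 0xff, m) << 16)
              | (mul_un8 ((s >> 8) & 0xff, m) << 8)
              |  mul_un8 (s & 0xff, m);
        }

        /* Per-byte saturating add, as UQADD8 does four at a time. */
        uint32_t d = dest[i], out = 0;
        for (int shift = 0; shift < 32; shift += 8)
        {
            uint32_t c = ((d >> shift) & 0xff) + ((s >> shift) & 0xff);
            out |= (c > 0xff ? 0xff : c) << shift;
        }
        dest[i] = out;
    }
}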
---
pixman/pixman-arm-simd-asm.S | 95 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd.c | 4 ++
2 files changed, 99 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 37e9f33..ce0edfc 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -426,6 +426,16 @@ generate_composite_function \
add_8_8_process_head, \
add_8_8_process_tail
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_asm_armv6, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 2, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ add_8_8_process_head, \
+ add_8_8_process_tail
+
/******************************************************************************/
.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
@@ -1698,3 +1708,88 @@ generate_composite_function \
in_n_8888_process_tail
/******************************************************************************/
+
+.macro add_8888_8888_8888_init
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ msr CPSR_s, #0x50000
+ /* Point at alpha bytes in mask */
+ add MASK, MASK, #3
+ line_saved_regs Y, STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+.endm
+
+.macro add_8888_8888_8888_newline
+ ldr Y, =0x00800080
+.endm
+
+.macro add_8888_8888_8888_1pixel_head d, s, m
+ ldr s, [SRC], #4
+ ldrb m, [MASK], #4
+ ldr WK&d, [DST], #4
+.endm
+
+.macro add_8888_8888_8888_1pixel_tail d, s, m, tmp, half
+ mul_8888_8 s, m, tmp, half
+ uqadd8 WK&d, WK&d, s
+.endm
+
+.macro add_8888_8888_8888_2pixels_head d0, d1, s0, s1, tmp0, tmp1, half
+ ldm SRC!, {s0, s1}
+ ldrb WK&d0, [MASK], #4
+ ldrb WK&d1, [MASK], #4
+ uxtb16 tmp0, s0
+ uxtb16 s0, s0, ror #8
+ uxtb16 tmp1, s1
+ uxtb16 s1, s1, ror #8
+ mla tmp0, tmp0, WK&d0, half
+ mla s0, s0, WK&d0, half
+ mla tmp1, tmp1, WK&d1, half
+ mla s1, s1, WK&d1, half
+ ldm DST!, {WK&d0, WK&d1}
+.endm
+
+.macro add_8888_8888_8888_2pixels_tail d0, d1, s0, s1, tmp0, tmp1
+ uxtab16 tmp0, tmp0, tmp0, ror #8
+ uxtab16 s0, s0, s0, ror #8
+ uxtab16 tmp1, tmp1, tmp1, ror #8
+ uxtab16 s1, s1, s1, ror #8
+ mov tmp0, tmp0, ror #8
+ mov tmp1, tmp1, ror #8
+ sel s0, tmp0, s0
+ sel s1, tmp1, s1
+ uqadd8 WK&d0, WK&d0, s0
+ uqadd8 WK&d1, WK&d1, s1
+.endm
+
+.macro add_8888_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+ add_8888_8888_8888_2pixels_head %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W, Y
+ add_8888_8888_8888_2pixels_tail %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+ add_8888_8888_8888_2pixels_head %(firstreg+2), %(firstreg+3), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W, Y
+ .elseif numbytes == 8
+ add_8888_8888_8888_2pixels_head %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W, Y
+ .else // numbytes == 4
+ add_8888_8888_8888_1pixel_head firstreg, STRIDE_S, STRIDE_M
+ .endif
+.endm
+
+.macro add_8888_8888_8888_process_tail cond, numbytes, firstreg
+ .if numbytes == 16
+ add_8888_8888_8888_2pixels_tail %(firstreg+2), %(firstreg+3), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+ .elseif numbytes == 8
+ add_8888_8888_8888_2pixels_tail %(firstreg+0), %(firstreg+1), STRIDE_D, STRIDE_S, STRIDE_M, ORIG_W
+ .else // numbytes == 4
+ add_8888_8888_8888_1pixel_tail firstreg, STRIDE_S, STRIDE_M, ORIG_W, Y
+ .endif
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_mask_asm_armv6, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER, \
+ 2, /* prefetch distance */ \
+ add_8888_8888_8888_init, \
+ add_8888_8888_8888_newline, \
+ nop_macro, /* cleanup */ \
+ add_8888_8888_8888_process_head, \
+ add_8888_8888_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 8bdda82..834995a 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -81,6 +81,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
uint32_t, uint32_t)
+PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
+
void
pixman_composite_src_n_8888_asm_armv6 (int32_t w,
int32_t h,
@@ -305,6 +307,8 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+ imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
+
imp->blt = arm_simd_blt;
imp->fill = arm_simd_fill;
--
1.7.5.4
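
A note on the arithmetic in the masked path, for anyone reading the
add_8888_8888_8888_* macros above: Y holds 0x00800080 so that each MLA
leaves byte * mask + 0x80 in a 16-bit lane; the UXTAB16 ..., ROR #8 in the
tail then adds each lane's high byte back in, and the high byte of each
lane is the rounding-correct result of dividing by 255. The init macro
presets GE[3:0] to 0101 precisely so that, after MOV ..., ROR #8 lines up
the even-lane results, SEL can interleave the even and odd lanes back into
an ARGB pixel in one instruction. A scalar C model of one pixel
(illustrative only, not part of the patch):

#include <stdint.h>

/* Scalar model of scaling one a8r8g8b8 pixel by an 8-bit mask value,
 * mirroring the uxtb16/mla/uxtab16/sel sequence above. */
static uint32_t
scale_8888_by_8 (uint32_t s, uint32_t m)
{
    /* UXTB16: split into even (b, r) and odd (g, a) bytes,
     * one byte per 16-bit lane. */
    uint32_t even = s & 0x00ff00ff;
    uint32_t odd  = (s >> 8) & 0x00ff00ff;

    /* MLA with 0x00800080: each lane becomes byte * m + 0x80;
     * the products fit in 16 bits, so the lanes cannot collide. */
    even = even * m + 0x00800080;
    odd  = odd  * m + 0x00800080;

    /* UXTAB16 ..., ROR #8: add each lane's high byte into the lane,
     * completing the rounding-correct /255 (t += t >> 8; t >>= 8). */
    even += (even >> 8) & 0x00ff00ff;
    odd  += (odd  >> 8) & 0x00ff00ff;

    /* The results now sit in the high byte of every lane; MOV ROR #8
     * plus SEL with GE = 0101 interleave them back into one pixel. */
    return ((even >> 8) & 0x00ff00ff) | (odd & 0xff00ff00);
}

UQADD8 then adds the scaled source into the destination with per-byte
saturation, which is exactly the clamp the ADD operator requires.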