[Pixman] [PATCH 11/14] ARMv6: Add fast path for in_reverse_8888_8888
Siarhei Siamashka
siarhei.siamashka at gmail.com
Sun Oct 13 18:47:28 PDT 2013
On Wed, 2 Oct 2013 00:00:31 +0100
Ben Avison <bavison at riscosopen.org> wrote:
> lowlevel-blt-bench results:
>
> Before After
> Mean StdDev Mean StdDev Confidence Change
> L1 21.3 0.1 32.5 0.2 100.0% +52.1%
> L2 12.1 0.2 19.5 0.5 100.0% +61.2%
> M 11.0 0.0 17.1 0.0 100.0% +54.6%
> HT 8.7 0.0 12.8 0.1 100.0% +46.9%
> VT 8.6 0.0 12.5 0.1 100.0% +46.0%
> R 8.6 0.0 12.0 0.1 100.0% +40.6%
> RT 5.1 0.1 6.6 0.1 100.0% +28.8%
>
> Trimmed cairo-perf-trace results:
>
> Before After
> Mean StdDev Mean StdDev Confidence Change
> t-firefox-paintball 18.9 0.1 15.4 0.2 100.0% +22.9%
> ---
> pixman/pixman-arm-simd-asm.S | 104 ++++++++++++++++++++++++++++++++++++++++++
> pixman/pixman-arm-simd.c | 7 +++
> 2 files changed, 111 insertions(+)
>
> diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
> index 9357e57..8080c9c 100644
> --- a/pixman/pixman-arm-simd-asm.S
> +++ b/pixman/pixman-arm-simd-asm.S
> @@ -923,6 +923,110 @@ generate_composite_function \
>
> /******************************************************************************/
>
> +.macro in_reverse_8888_8888_init
> + /* Hold loop invariant in MASK */
> + ldr MASK, =0x00800080
> + /* Set GE[3:0] to 0101 so SEL instructions do what we want */
> + uadd8 SCRATCH, MASK, MASK
> + /* Offset the source pointer: we only need the alpha bytes */
> + add SRC, SRC, #3
> + line_saved_regs ORIG_W
> +.endm
> +
> +.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
> + ldrb ORIG_W, [SRC], #4
> + .if numbytes >= 8
> + ldrb WK&reg1, [SRC], #4
> + .if numbytes == 16
> + ldrb WK&reg2, [SRC], #4
> + ldrb WK&reg3, [SRC], #4
> + .endif
> + .endif
> + add DST, DST, #numbytes
> +.endm
> +
> +.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
> + in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
> +.endm
> +
> +.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
> + .if is_only != 1
> + movs s, ORIG_W
> + .if offset != 0
> + ldrb ORIG_W, [SRC, #offset]
> + .endif
> + beq 01f
> + teq STRIDE_M, #0xFF
> + beq 02f
> + .endif
> + uxtb16 SCRATCH, d /* rb_dest */
> + uxtb16 d, d, ror #8 /* ag_dest */
> + mla SCRATCH, SCRATCH, s, MASK
> + mla d, d, s, MASK
> + uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
> + uxtab16 d, d, d, ror #8
> + mov SCRATCH, SCRATCH, ror #8
> + sel d, SCRATCH, d
> + b 02f
> + .if offset == 0
> +48: /* Last mov d,#0 of the set - used as part of shortcut for
> + * source values all 0 */
> + .endif
> +01: mov d, #0
> +02:
> +.endm
> +
> +.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
> + .if numbytes == 4
> + teq ORIG_W, ORIG_W, asr #32
> + ldrne WK&reg1, [DST, #-4]
> + .elseif numbytes == 8
> + teq ORIG_W, WK&reg1
> + teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
> + ldmnedb DST, {WK&reg1-WK&reg2}
> + .else
> + teq ORIG_W, WK&reg1
> + teqeq ORIG_W, WK&reg2
> + teqeq ORIG_W, WK&reg3
> + teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
> + ldmnedb DST, {WK&reg1-WK&reg4}
> + .endif
> + cmnne DST, #0 /* clear C if NE */
> + bcs 49f /* no writes to dest if source all -1 */
> + beq 48f /* set dest to all 0 if source all 0 */
> + .if numbytes == 4
> + in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
> + str WK&reg1, [DST, #-4]
> + .elseif numbytes == 8
> + in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
> + in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
> + stmdb DST, {WK&reg1-WK&reg2}
> + .else
> + in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
> + in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
> + in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
> + in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
> + stmdb DST, {WK&reg1-WK&reg4}
> + .endif
> +49:
> +.endm
> +
> +.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
> + in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
> +.endm
> +
> +generate_composite_function \
> + pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
> + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
Is there a good justification for using the new FLAG_NO_PRELOAD_DST flag
here?
--
Best regards,
Siarhei Siamashka
More information about the Pixman
mailing list