[Pixman] [PATCH] MIPS: DSPr2: Added mips_dspr2_blt and mips_dspr2_fill routines.
Lukic, Nemanja
nlukic at mips.com
Tue Feb 28 10:20:18 PST 2012
Good point.
The only problem is that the address we are storing to might not be 4-byte aligned (since we are doing a memset on an array of uint16_t).
But *dest can be aligned (with a simple check) before the main loop, and then we can use 8 x sw instead of 16 x sh.
I will do that and resubmit the patch.
Thanks,
Nemanja Lukic
-----Original Message-----
From: Matt Turner [mailto:mattst88 at gmail.com]
Sent: Tuesday, February 28, 2012 6:01 PM
To: Nemanja Lukic
Cc: pixman at lists.freedesktop.org; Nemanja Lukic
Subject: Re: [Pixman] [PATCH] MIPS: DSPr2: Added mips_dspr2_blt and mips_dspr2_fill routines.
On Tue, Feb 28, 2012 at 7:47 AM, Nemanja Lukic <nlukic at mips.com> wrote:
> From: Nemanja Lukic <nemanja.lukic at rt-rk.com>
>
> Performance numbers before/after on MIPS-74kc @ 1GHz
>
> Referent (before):
> cairo-perf-trace:
> [ # ] backend test min(s) median(s) stddev. count
> [ # ] image: pixman 0.25.1
> [ 0] image gnome-system-monitor 268.460 269.712 0.22% 6/6
>
> Optimized:
> cairo-perf-trace:
> [ # ] backend test min(s) median(s) stddev. count
> [ # ] image: pixman 0.25.1
> [ 0] image gnome-system-monitor 246.565 246.706 0.04% 6/6
> ---
> pixman/pixman-mips-dspr2-asm.S | 114 ++++++++++++++++++++++++++++
> pixman/pixman-mips-dspr2.c | 163 ++++++++++++++++++++++++++++++++++++++++
> pixman/pixman-mips-dspr2.h | 4 +
> 3 files changed, 281 insertions(+), 0 deletions(-)
>
> diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
> index 0a4c87e..4125beb 100644
> --- a/pixman/pixman-mips-dspr2-asm.S
> +++ b/pixman/pixman-mips-dspr2-asm.S
> @@ -31,6 +31,120 @@
>
> #include "pixman-mips-dspr2-asm.h"
>
> +LEAF_MIPS32R2(pixman_fill_buff16_mips)
> +/*
> + * a0 - *dest
> + * a1 - count (bytes)
> + * a2 - value to fill buffer with
> + */
> +
> + beqz a1, 3f
> + nop
> + srl t1, a1, 5 /* t1 how many multiples of 32 bytes */
> + beqz t1, 2f
> + nop
> +1:
> + addiu t1, t1, -1
> + beqz t1, 11f
> + addiu a1, a1, -32
> + pref 30, 32(a0)
> + sh a2, 0(a0)
> + sh a2, 2(a0)
> + sh a2, 4(a0)
> + sh a2, 6(a0)
> + sh a2, 8(a0)
> + sh a2, 10(a0)
> + sh a2, 12(a0)
> + sh a2, 14(a0)
> + sh a2, 16(a0)
> + sh a2, 18(a0)
> + sh a2, 20(a0)
> + sh a2, 22(a0)
> + sh a2, 24(a0)
> + sh a2, 26(a0)
> + sh a2, 28(a0)
> + sh a2, 30(a0)
> + b 1b
> + addiu a0, a0, 32
> +11:
> + sh a2, 0(a0)
> + sh a2, 2(a0)
> + sh a2, 4(a0)
> + sh a2, 6(a0)
> + sh a2, 8(a0)
> + sh a2, 10(a0)
> + sh a2, 12(a0)
> + sh a2, 14(a0)
> + sh a2, 16(a0)
> + sh a2, 18(a0)
> + sh a2, 20(a0)
> + sh a2, 22(a0)
> + sh a2, 24(a0)
> + sh a2, 26(a0)
> + sh a2, 28(a0)
> + sh a2, 30(a0)
> + addiu a0, a0, 32
> +2:
> + blez a1, 3f
> + addiu a1, a1, -2
> + sh a2, 0(a0)
> + b 2b
> + addiu a0,a0, 2
> +3:
> + jr ra
> + nop
> +
> +END(pixman_fill_buff16_mips)
Couldn't we do 4-byte stores in the main loop? I would think that
would be faster.
> +
> +LEAF_MIPS32R2(pixman_fill_buff32_mips)
> +/*
> + * a0 - *dest
> + * a1 - count (bytes)
> + * a2 - value to fill buffer with
> + */
> +
> + beqz a1, 3f
> + nop
> + srl t1, a1, 5 /* t1 how many multiples of 32 bytes */
> + beqz t1, 2f
> + nop
> +1:
> + addiu t1, t1, -1
> + beqz t1, 11f
> + addiu a1, a1, -32
> + pref 30, 32(a0)
> + sw a2, 0(a0)
> + sw a2, 4(a0)
> + sw a2, 8(a0)
> + sw a2, 12(a0)
> + sw a2, 16(a0)
> + sw a2, 20(a0)
> + sw a2, 24(a0)
> + sw a2, 28(a0)
> + b 1b
> + addiu a0, a0, 32
> +11:
> + sw a2, 0(a0)
> + sw a2, 4(a0)
> + sw a2, 8(a0)
> + sw a2, 12(a0)
> + sw a2, 16(a0)
> + sw a2, 20(a0)
> + sw a2, 24(a0)
> + sw a2, 28(a0)
> + addiu a0, a0, 32
> +2:
> + blez a1, 3f
> + addiu a1, a1, -4
> + sw a2, 0(a0)
> + b 2b
> + addiu a0,a0, 4
> +3:
> + jr ra
> + nop
> +
> +END(pixman_fill_buff32_mips)
> +
> LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm_mips)
> /*
> * a0 - dst (r5g6b5)
> diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
> index e331853..2beada3 100644
> --- a/pixman/pixman-mips-dspr2.c
> +++ b/pixman/pixman-mips-dspr2.c
> @@ -49,6 +49,119 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
> PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
> uint8_t, 3, uint8_t, 3)
>
> +static pixman_bool_t
> +pixman_fill_mips (uint32_t *bits,
> + int stride,
> + int bpp,
> + int x,
> + int y,
> + int width,
> + int height,
> + uint32_t _xor)
> +{
> + uint8_t *byte_line;
> + uint32_t byte_width;
> + switch (bpp)
> + {
> + case 16:
> + stride = stride * (int) sizeof (uint32_t) / 2;
> + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
> + byte_width = width * 2;
> + stride *= 2;
> +
> + while (height--)
> + {
> + uint8_t *dst = byte_line;
> + byte_line += stride;
> + pixman_fill_buff16_mips (dst, byte_width, _xor & 0xffff);
> + }
> + return TRUE;
> + case 32:
> + stride = stride * (int) sizeof (uint32_t) / 4;
> + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
> + byte_width = width * 4;
> + stride *= 4;
> +
> + while (height--)
> + {
> + uint8_t *dst = byte_line;
> + byte_line += stride;
> + pixman_fill_buff32_mips (dst, byte_width, _xor);
> + }
> + return TRUE;
> + default:
> + return FALSE;
> + }
> +}
> +
> +static pixman_bool_t
> +pixman_blt_mips (uint32_t *src_bits,
> + uint32_t *dst_bits,
> + int src_stride,
> + int dst_stride,
> + int src_bpp,
> + int dst_bpp,
> + int src_x,
> + int src_y,
> + int dest_x,
> + int dest_y,
> + int width,
> + int height)
> +{
> + if (src_bpp != dst_bpp)
> + return FALSE;
> +
> + uint8_t *src_bytes;
> + uint8_t *dst_bytes;
> + uint32_t byte_width;
> +
> + switch (src_bpp)
> + {
> + case 16:
> + src_stride = src_stride * (int) sizeof (uint32_t) / 2;
> + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
> + src_bytes =(uint8_t *)(((uint16_t *)src_bits)
> + + src_stride * (src_y) + (src_x));
> + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits)
> + + dst_stride * (dest_y) + (dest_x));
> + byte_width = width * 2;
> + src_stride *= 2;
> + dst_stride *= 2;
> +
> + while (height--)
> + {
> + uint8_t *src = src_bytes;
> + uint8_t *dst = dst_bytes;
> + src_bytes += src_stride;
> + dst_bytes += dst_stride;
> + pixman_mips_fast_memcpy (dst, src, byte_width);
> + }
> + return TRUE;
> + case 32:
> + src_stride = src_stride * (int) sizeof (uint32_t) / 4;
> + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
> + src_bytes = (uint8_t *)(((uint32_t *)src_bits)
> + + src_stride * (src_y) + (src_x));
> + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits)
> + + dst_stride * (dest_y) + (dest_x));
> + byte_width = width * 4;
> + src_stride *= 4;
> + dst_stride *= 4;
> +
> + while (height--)
> + {
> + uint8_t *src = src_bytes;
> + uint8_t *dst = dst_bytes;
> + src_bytes += src_stride;
> + dst_bytes += dst_stride;
> + pixman_mips_fast_memcpy (dst, src, byte_width);
> + }
> + return TRUE;
> + default:
> + return FALSE;
> + }
> +}
> +
> static const pixman_fast_path_t mips_dspr2_fast_paths[] =
> {
> PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mips_composite_src_0565_0565),
> @@ -74,11 +187,61 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
> { PIXMAN_OP_NONE },
> };
>
> +static pixman_bool_t
> +mips_dspr2_blt (pixman_implementation_t *imp,
> + uint32_t * src_bits,
> + uint32_t * dst_bits,
> + int src_stride,
> + int dst_stride,
> + int src_bpp,
> + int dst_bpp,
> + int src_x,
> + int src_y,
> + int dest_x,
> + int dest_y,
> + int width,
> + int height)
> +{
> + if (!pixman_blt_mips (
> + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
> + src_x, src_y, dest_x, dest_y, width, height))
> +
> + {
> + return _pixman_implementation_blt (
> + imp->delegate,
> + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
> + src_x, src_y, dest_x, dest_y, width, height);
> + }
> +
> + return TRUE;
> +}
> +
> +static pixman_bool_t
> +mips_dspr2_fill (pixman_implementation_t *imp,
> + uint32_t * bits,
> + int stride,
> + int bpp,
> + int x,
> + int y,
> + int width,
> + int height,
> + uint32_t xor)
> +{
> + if (pixman_fill_mips (bits, stride, bpp, x, y, width, height, xor))
> + return TRUE;
> +
> + return _pixman_implementation_fill (
> + imp->delegate, bits, stride, bpp, x, y, width, height, xor);
> +}
> +
> pixman_implementation_t *
> _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback)
> {
> pixman_implementation_t *imp =
> _pixman_implementation_create (fallback, mips_dspr2_fast_paths);
>
> + imp->blt = mips_dspr2_blt;
> + imp->fill = mips_dspr2_fill;
> +
> return imp;
> }
> diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
> index 449c42a..a40e7c8 100644
> --- a/pixman/pixman-mips-dspr2.h
> +++ b/pixman/pixman-mips-dspr2.h
> @@ -41,6 +41,10 @@
>
> void
> pixman_mips_fast_memcpy (void *dst, void *src, uint32_t n_bytes);
> +void
> +pixman_fill_buff16_mips (void *dst, uint32_t n_bytes, uint16_t value);
> +void
> +pixman_fill_buff32_mips (void *dst, uint32_t n_bytes, uint32_t value);
>
> /****************************************************************/
>
> --
> 1.7.3
>
> _______________________________________________
> Pixman mailing list
> Pixman at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/pixman
More information about the Pixman
mailing list