[Pixman] [PATCH] vmx: implement fast path vmx_composite_over_n_8888

Fri Sep 4 06:05:48 PDT 2015

On Fri,  4 Sep 2015 15:39:00 +0300
Siarhei Siamashka <siarhei.siamashka at gmail.com> wrote:

> Running "lowlevel-blt-bench over_n_8888" on Playstation3 3.2GHz,
> Gentoo ppc (32-bit userland) gave the following results:
> 
> before:  over_n_8888 =  L1: 147.47  L2: 205.86  M:121.07
> after:   over_n_8888 =  L1: 287.27  L2: 261.09  M:133.48
> 
> Signed-off-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
> ---
>  pixman/pixman-vmx.c |   54 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 54 insertions(+), 0 deletions(-)
> 
> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
> index a9bd024..9e551b3 100644
> --- a/pixman/pixman-vmx.c
> +++ b/pixman/pixman-vmx.c
> @@ -2745,6 +2745,58 @@ vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
>  }
>  
>  static void
> +vmx_composite_over_n_8888 (pixman_implementation_t *imp,
> +                           pixman_composite_info_t *info)
> +{
> +    PIXMAN_COMPOSITE_ARGS (info);
> +    uint32_t *dst_line, *dst;
> +    uint32_t src, ia;
> +    int      i, w, dst_stride;
> +    vector unsigned int vdst, vsrc, via;
> +
> +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
> +
> +    if (src == 0)
> +	return;
> +
> +    PIXMAN_IMAGE_GET_LINE (
> +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
> +
> +    vsrc = (vector unsigned int){src, src, src, src};
> +    via = negate (splat_alpha (vsrc));
> +    ia = ALPHA_8 (~src);
> +
> +    while (height--)
> +    {
> +	dst = dst_line;
> +	dst_line += dst_stride;
> +	w = width;
> +
> +	while (w && ((uintptr_t)dst & 15))
> +	{
> +	    uint32_t d = *dst;
> +	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
> +	    *dst++ = d;
> +	    w--;
> +	}
> +
> +	for (i = w / 4; i > 0; i--)
> +	{

By the way, there is a major speed boost on PlayStation 3 (Cell BE)
if we add a software prefetch at this point in the vector loop:

           __builtin_prefetch (dst + 32, 0, 0);

In this case we get the following result from lowlevel-blt-bench:

             over_n_8888 =  L1: 293.84  L2: 303.40  M:224.09 (153.67%)
             HT:131.41  VT:105.68  R: 74.14  RT: 25.80 ( 214Kops/s)

The 'M' performance increases from ~133 MPix/s to ~224 MPix/s.

However, I would guess that POWER8 and other large desktop/server PowerPC
systems have automatic hardware prefetch and so do not need software prefetch.

> +	    vdst = pix_multiply (load_128_aligned (dst), via);
> +	    save_128_aligned (dst, pix_add (vsrc, vdst));
> +	    dst += 4;
> +	}
> +
> +	for (i = w % 4; --i >= 0;)
> +	{
> +	    uint32_t d = dst[i];
> +	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
> +	    dst[i] = d;
> +	}
> +    }
> +}
> +
> +static void
>  vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
>                                 pixman_composite_info_t *info)
>  {
> @@ -3079,6 +3131,8 @@ FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
>  
>  static const pixman_fast_path_t vmx_fast_paths[] =
>  {
> +    PIXMAN_STD_FAST_PATH (OVER, solid,    null, a8r8g8b8, vmx_composite_over_n_8888),
> +    PIXMAN_STD_FAST_PATH (OVER, solid,    null, x8r8g8b8, vmx_composite_over_n_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),

-- 
Best regards,
Siarhei Siamashka