[Pixman] [PATCH] sse2: Implement simple bilinear scaling for x8r8g8b8 to a8r8g8b8

Wed Jan 23 14:32:02 PST 2013

On Wed, 23 Jan 2013 14:37:41 +0000
Chris Wilson <chris at chris-wilson.co.uk> wrote:

> Improves firefox-tron on an IVB i7-3720qm: 68.6s to 45.2s.

Looks good, except for one possible additional optimization.

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  pixman/pixman-sse2.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
> 
> diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
> index fc873cc..bc3e2f1 100644
> --- a/pixman/pixman-sse2.c
> +++ b/pixman/pixman-sse2.c
> @@ -5679,6 +5679,67 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
>  			       NORMAL, FLAG_NONE)
>  
>  static force_inline void
> +scaled_bilinear_scanline_sse2_0888_8888_SRC (uint32_t *       dst,
> +					     const uint32_t * mask,
> +					     const uint32_t * src_top,
> +					     const uint32_t * src_bottom,
> +					     int32_t          w,
> +					     int              wt,
> +					     int              wb,
> +					     pixman_fixed_t   vx,
> +					     pixman_fixed_t   unit_x,
> +					     pixman_fixed_t   max_vx,
> +					     pixman_bool_t    zero_src)
> +{
> +    BILINEAR_DECLARE_VARIABLES;
> +    uint32_t pix1, pix2, pix3, pix4;
> +
> +    while ((w -= 4) >= 0)
> +    {
> +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
> +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
> +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
> +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
> +	*dst++ = pix1 | 0xff000000;
> +	*dst++ = pix2 | 0xff000000;
> +	*dst++ = pix3 | 0xff000000;
> +	*dst++ = pix4 | 0xff000000;
> +    }
> +
> +    if (w & 2)
> +    {
> +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
> +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
> +	*dst++ = pix1 | 0xff000000;
> +	*dst++ = pix2 | 0xff000000;
> +    }
> +
> +    if (w & 1)
> +    {
> +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

This particular part of the code gets compiled into the following:

    ad3d:	c1 fa 10             	sar    $0x10,%edx
    ad40:	66 41 0f 6f d5       	movdqa %xmm13,%xmm2
    ad45:	48 63 d2             	movslq %edx,%rdx
    ad48:	f3 0f 7e 0c 91       	movq   (%rcx,%rdx,4),%xmm1
    ad4d:	66 0f 71 d2 09       	psrlw  $0x9,%xmm2
    ad52:	66 41 0f ef d3       	pxor   %xmm11,%xmm2
    ad57:	66 0f 60 c8          	punpcklbw %xmm0,%xmm1
    ad5b:	66 41 0f fd d2       	paddw  %xmm10,%xmm2
    ad60:	66 0f d5 f1          	pmullw %xmm1,%xmm6
    ad64:	f3 41 0f 7e 0c 91    	movq   (%r9,%rdx,4),%xmm1
    ad6a:	66 0f 60 c8          	punpcklbw %xmm0,%xmm1
    ad6e:	66 0f d5 e9          	pmullw %xmm1,%xmm5
    ad72:	66 0f fd ee          	paddw  %xmm6,%xmm5
    ad76:	66 0f 70 cd 4e       	pshufd $0x4e,%xmm5,%xmm1
    ad7b:	66 0f 69 cd          	punpckhwd %xmm5,%xmm1
    ad7f:	66 0f f5 ca          	pmaddwd %xmm2,%xmm1
    ad83:	66 0f 72 d1 0e       	psrld  $0xe,%xmm1
    ad88:	66 0f 6b c9          	packssdw %xmm1,%xmm1
    ad8c:	66 0f 67 c9          	packuswb %xmm1,%xmm1

> +	*dst = pix1 | 0xff000000;

    ad90:	66 0f 7e 4c 24 04    	movd   %xmm1,0x4(%rsp)
    ad96:	8b 54 24 04          	mov    0x4(%rsp),%edx
    ad9a:	81 ca 00 00 00 ff    	or     $0xff000000,%edx
    ada0:	89 10                	mov    %edx,(%rax)

Here the compiler goes through some gymnastics: saving the data to the
stack, reloading it into a general purpose register, doing the OR
operation and finally storing the result to the destination.

Doing the OR operation directly on the XMM register could require fewer
instructions and be potentially faster.

But I guess a bigger job of making sure that no cycles are wasted
in SSE2 backend is still waiting for its champion :-)

> +    }
> +
> +}
> +
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_cover_SRC,
> +			       scaled_bilinear_scanline_sse2_0888_8888_SRC,
> +			       uint32_t, uint32_t, uint32_t,
> +			       COVER, FLAG_NONE)
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_pad_SRC,
> +			       scaled_bilinear_scanline_sse2_0888_8888_SRC,
> +			       uint32_t, uint32_t, uint32_t,
> +			       PAD, FLAG_NONE)
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_none_SRC,
> +			       scaled_bilinear_scanline_sse2_0888_8888_SRC,
> +			       uint32_t, uint32_t, uint32_t,
> +			       NONE, FLAG_NONE)
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_normal_SRC,
> +			       scaled_bilinear_scanline_sse2_0888_8888_SRC,
> +			       uint32_t, uint32_t, uint32_t,
> +			       NORMAL, FLAG_NONE)
> +
> +static force_inline void
>  scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
>  					      const uint32_t * mask,
>  					      const uint32_t * src_top,
> @@ -6185,6 +6246,8 @@ static const pixman_fast_path_t sse2_fast_paths[] =
>      SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
>      SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
>      SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
> +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8, sse2_0888_8888),
> +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8, sse2_0888_8888),
>  
>      SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
>      SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),

-- 
Best regards,
Siarhei Siamashka