[Pixman] [PATCH] sse2: Implement simple bilinear scaling for x8r8g8b8 to a8r8g8b8
Siarhei Siamashka
siarhei.siamashka at gmail.com
Wed Jan 23 14:32:02 PST 2013
On Wed, 23 Jan 2013 14:37:41 +0000
Chris Wilson <chris at chris-wilson.co.uk> wrote:
> Improves firefox-tron on an IVB i7-3720qm: 68.6s to 45.2s.
Looks good, except for one possible additional optimization.
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> pixman/pixman-sse2.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 63 insertions(+)
>
> diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
> index fc873cc..bc3e2f1 100644
> --- a/pixman/pixman-sse2.c
> +++ b/pixman/pixman-sse2.c
> @@ -5679,6 +5679,67 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
> NORMAL, FLAG_NONE)
>
> static force_inline void
> +scaled_bilinear_scanline_sse2_0888_8888_SRC (uint32_t * dst,
> + const uint32_t * mask,
> + const uint32_t * src_top,
> + const uint32_t * src_bottom,
> + int32_t w,
> + int wt,
> + int wb,
> + pixman_fixed_t vx,
> + pixman_fixed_t unit_x,
> + pixman_fixed_t max_vx,
> + pixman_bool_t zero_src)
> +{
> + BILINEAR_DECLARE_VARIABLES;
> + uint32_t pix1, pix2, pix3, pix4;
> +
> + while ((w -= 4) >= 0)
> + {
> + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
> + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
> + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
> + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
> + *dst++ = pix1 | 0xff000000;
> + *dst++ = pix2 | 0xff000000;
> + *dst++ = pix3 | 0xff000000;
> + *dst++ = pix4 | 0xff000000;
> + }
> +
> + if (w & 2)
> + {
> + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
> + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
> + *dst++ = pix1 | 0xff000000;
> + *dst++ = pix2 | 0xff000000;
> + }
> +
> + if (w & 1)
> + {
> + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
This particular part of the code gets compiled into the following:
ad3d: c1 fa 10 sar $0x10,%edx
ad40: 66 41 0f 6f d5 movdqa %xmm13,%xmm2
ad45: 48 63 d2 movslq %edx,%rdx
ad48: f3 0f 7e 0c 91 movq (%rcx,%rdx,4),%xmm1
ad4d: 66 0f 71 d2 09 psrlw $0x9,%xmm2
ad52: 66 41 0f ef d3 pxor %xmm11,%xmm2
ad57: 66 0f 60 c8 punpcklbw %xmm0,%xmm1
ad5b: 66 41 0f fd d2 paddw %xmm10,%xmm2
ad60: 66 0f d5 f1 pmullw %xmm1,%xmm6
ad64: f3 41 0f 7e 0c 91 movq (%r9,%rdx,4),%xmm1
ad6a: 66 0f 60 c8 punpcklbw %xmm0,%xmm1
ad6e: 66 0f d5 e9 pmullw %xmm1,%xmm5
ad72: 66 0f fd ee paddw %xmm6,%xmm5
ad76: 66 0f 70 cd 4e pshufd $0x4e,%xmm5,%xmm1
ad7b: 66 0f 69 cd punpckhwd %xmm5,%xmm1
ad7f: 66 0f f5 ca pmaddwd %xmm2,%xmm1
ad83: 66 0f 72 d1 0e psrld $0xe,%xmm1
ad88: 66 0f 6b c9 packssdw %xmm1,%xmm1
ad8c: 66 0f 67 c9 packuswb %xmm1,%xmm1
> + *dst = pix1 | 0xff000000;
ad90: 66 0f 7e 4c 24 04 movd %xmm1,0x4(%rsp)
ad96: 8b 54 24 04 mov 0x4(%rsp),%edx
ad9a: 81 ca 00 00 00 ff or $0xff000000,%edx
ada0: 89 10 mov %edx,(%rax)
Here it involves some gymnastics: saving the data to the stack, reloading
it into a general-purpose register, doing the OR operation and finally
storing it to the destination.
Doing the OR operation directly on the XMM register could require fewer
instructions and be potentially faster.
But I guess the bigger job of making sure that no cycles are wasted
in the SSE2 backend is still waiting for its champion :-)
> + }
> +
> +}
> +
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_cover_SRC,
> + scaled_bilinear_scanline_sse2_0888_8888_SRC,
> + uint32_t, uint32_t, uint32_t,
> + COVER, FLAG_NONE)
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_pad_SRC,
> + scaled_bilinear_scanline_sse2_0888_8888_SRC,
> + uint32_t, uint32_t, uint32_t,
> + PAD, FLAG_NONE)
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_none_SRC,
> + scaled_bilinear_scanline_sse2_0888_8888_SRC,
> + uint32_t, uint32_t, uint32_t,
> + NONE, FLAG_NONE)
> +FAST_BILINEAR_MAINLOOP_COMMON (sse2_0888_8888_normal_SRC,
> + scaled_bilinear_scanline_sse2_0888_8888_SRC,
> + uint32_t, uint32_t, uint32_t,
> + NORMAL, FLAG_NONE)
> +
> +static force_inline void
> scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
> const uint32_t * mask,
> const uint32_t * src_top,
> @@ -6185,6 +6246,8 @@ static const pixman_fast_path_t sse2_fast_paths[] =
> SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
> SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
> SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
> + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8, sse2_0888_8888),
> + SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8, sse2_0888_8888),
>
> SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
> SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
--
Best regards,
Siarhei Siamashka
More information about the Pixman
mailing list