[Pixman] [PATCH] sse2: Using MMX and SSE 4.1
Makoto Kato
m_kato at ga2.so-net.ne.jp
Mon May 7 23:34:21 PDT 2012
Hi, Matt.
The Win64 MSVC target doesn't support MMX intrinsics. If you add MMX code to
pixman-sse2.c, please guard all of it with USE_X86_MMX checks.
Also, when using MMX, you have to call _mm_empty() after the MMX code is finished.
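
Roughly something like this (just an illustrative sketch, not code from the
patch; the loop shape is made up):

    #if USE_X86_MMX
        while (w >= 4)
        {
            /* ... work on __m64 values with MMX intrinsics ... */
            w -= 4;
        }
        _mm_empty (); /* clear the MMX state before any later x87/FPU use */
    #endif
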
I think you should split the SSE4.1 code into a separate file
(pixman-sse41.c?), since gcc needs the -msse4.1 option to build it.
-- Makoto
(2012/05/03 12:42), Matt Turner wrote:
> I started porting my src_8888_0565 MMX function to SSE2, and in the
> process started thinking about using SSE3+. The useful instructions
> added post SSE2 that I see are
> SSE3:   lddqu - for unaligned loads across cache lines
> SSSE3:  palignr - for unaligned loads (but requires software pipelining...)
>         pmaddubsw - maybe?
> SSE4.1: pextr*, pinsr*
>         pcmpeqq, ptest
>         packusdw - for 888 -> 565 packing (see the example below)
>
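> For reference, packusdw (_mm_packus_epi32) takes two registers of 32-bit
> values and packs them into one register of unsigned 16-bit values with
> unsigned saturation -- exactly the shape needed once eight 565 values have
> been computed in 32-bit lanes. A tiny stand-alone illustration (not part of
> the patch; build with -msse4.1):
>
>     #include <stdio.h>
>     #include <smmintrin.h>   /* SSE4.1: _mm_packus_epi32 */
>
>     int
>     main (void)
>     {
>         __m128i lo = _mm_set_epi32 (0x0001f800, -1, 0x8000, 0x1234);
>         __m128i hi = _mm_set_epi32 (3, 2, 1, 0);
>
>         /* each signed 32-bit lane is clamped to [0, 0xffff] and narrowed */
>         __m128i packed = _mm_packus_epi32 (lo, hi);
>
>         unsigned short out[8];
>         _mm_storeu_si128 ((__m128i *) out, packed);
>         for (int i = 0; i < 8; i++)
>             printf ("%04x ", out[i]);  /* 1234 8000 0000 ffff 0000 0001 0002 0003 */
>         printf ("\n");
>         return 0;
>     }
>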
> I first wrote a basic src_8888_0565 for SSE2 and discovered that the
> performance was worse than MMX (which we've been saying has no use in
> modern systems -- oops!). I figured the cool pmadd algorithm of MMX was
> the cause, but I wondered if 16-byte SSE chunks are too large sometimes.
>
> I added an 8-byte MMX loop before and after the main 16-byte SSE loop
> and got a nice improvement.
>
> Porting the pmadd algorithm to SSE4.1 gave another (very large)
> improvement.
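>
> (For the record, the arithmetic behind the pmadd trick, written out as a
> scalar sketch -- this is not code from either patch, just an illustration of
> why multiplying the red/blue lanes by 0x2000 and 0x0004 and summing, which
> is what pmaddwd does, lands everything in the right 565 positions:)
>
>     static inline unsigned short
>     pack_8888_to_0565_scalar (unsigned int p)
>     {
>         unsigned int r = (p >> 16) & 0xf8;  /* top 5 bits of red   */
>         unsigned int g = (p >>  8) & 0xfc;  /* top 6 bits of green */
>         unsigned int b =  p        & 0xf8;  /* top 5 bits of blue  */
>
>         /* madd on lanes (r, b) with (0x2000, 0x0004):
>          * r * 0x2000 + b * 0x0004 == r << 13 | b << 2 (no overlapping bits) */
>         unsigned int t = r * 0x2000 + b * 0x0004;
>
>         /* put green back at its 8888 position, OR it in, and shift down by 5:
>          * the result is r << 8 | g << 3 | b >> 3, which is exactly r5g6b5 */
>         return (unsigned short) ((t | (g << 8)) >> 5);
>     }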
>
> fast: src_8888_0565 = L1: 655.18 L2: 675.94 M:642.31 ( 23.44%) HT:403.00 VT:286.45 R:307.61 RT:150.59 (1675Kops/s)
> mmx: src_8888_0565 = L1:2050.45 L2:1988.97 M:1586.16 ( 57.34%) HT:529.12 VT:374.28 R:412.09 RT:177.35 (1913Kops/s)
> sse2: src_8888_0565 = L1:1518.61 L2:1493.10 M:1279.18 ( 46.24%) HT:433.65 VT:314.48 R:349.14 RT:151.84 (1685Kops/s)
> sse2mmx:src_8888_0565 = L1:1544.91 L2:1520.83 M:1307.79 ( 47.01%) HT:447.82 VT:326.81 R:379.60 RT:174.07 (1878Kops/s)
> sse4: src_8888_0565 = L1:4654.11 L2:4202.98 M:1885.01 ( 69.35%) HT:540.65 VT:421.04 R:427.73 RT:161.45 (1773Kops/s)
> sse4mmx:src_8888_0565 = L1:4786.27 L2:4255.13 M:1920.18 ( 69.93%) HT:581.42 VT:447.99 R:482.27 RT:193.15 (2049Kops/s)
>
> I'd like to isolate exactly how much of the performance improvement comes
> from the single SSE4.1 instruction used (i.e., _mm_packus_epi32) before
> declaring SSE4.1 a fantastic improvement. If you can come up with a reasonable way
> to pack the two xmm registers together in pack_565_2packedx128_128,
> please tell me. Shuffle only works on hi/lo 8-bytes, so it'd be a pain.
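>
> (One SSE2-only candidate I have not benchmarked: the usual bias trick for
> emulating packusdw with packssdw, which is exact here because every 32-bit
> lane already fits in [0, 0xffff]. Untested sketch, not part of the patch,
> and the helper name is made up:)
>
>     static force_inline __m128i
>     packus_epi32_sse2 (__m128i lo, __m128i hi)
>     {
>         const __m128i bias32 = _mm_set1_epi32 (0x8000);
>         const __m128i bias16 = _mm_set1_epi16 ((short) 0x8000);
>
>         /* shift into signed 16-bit range, pack with signed saturation,
>          * then undo the bias in the 16-bit domain */
>         __m128i packed = _mm_packs_epi32 (_mm_sub_epi32 (lo, bias32),
>                                           _mm_sub_epi32 (hi, bias32));
>         return _mm_add_epi16 (packed, bias16);
>     }
>
> That is three extra SSE2 instructions compared to a single packusdw, so
> whether it actually pays off would need measuring.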
>
> This got me wondering how to proceed. I'd rather not duplicate a bunch
> of code from pixman-mmx.c, and I'd rather not add #ifdef USE_SSE41 to
> pixman-sse2.c and make it a compile-time option (or recompile the whole
> file to get a few improvements from SSE4.1).
>
> It seems like we need a generic solution that would say for each
> compositing function
> - this is what you do for 1-byte;
> - this is what you do for 8-bytes if you have MMX;
> - this is what you do for 16-bytes if you have SSE2;
> - this is what you do for 16-bytes if you have SSE3;
> - this is what you do for 16-bytes if you have SSE4.1.
> and then construct the functions for generic/MMX/SSE2/SSE4 at build time.
>
> Does this seem like a reasonable approach? *How* to do it -- suggestions
> welcome.
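>
> One toy illustration of the general shape I have in mind -- all names here
> are hypothetical, and a trivial scalar body stands in for the per-ISA inner
> loops -- is to stamp the same scanline core out once per ISA and pick a
> variant at runtime:
>
>     #include <stdio.h>
>     #include <stdint.h>
>
>     /* same conversion pixman's CONVERT_8888_TO_0565 performs */
>     #define CONVERT_8888_TO_0565(s)          \
>         ((uint16_t) ((((s) >> 8) & 0xf800) | \
>                      (((s) >> 5) & 0x07e0) | \
>                      (((s) >> 3) & 0x001f)))
>
>     /* "template": one core stamped out under different names; the real
>      * thing would swap in MMX/SSE2/SSE4.1 bodies for the wide loops */
>     #define DEFINE_SRC_X888_0565(suffix)                                   \
>     static void                                                            \
>     src_x888_0565_ ## suffix (uint16_t *dst, const uint32_t *src, int w)   \
>     {                                                                      \
>         while (w--)                                                        \
>             *dst++ = CONVERT_8888_TO_0565 (*src++);                        \
>     }
>
>     DEFINE_SRC_X888_0565 (generic)
>     DEFINE_SRC_X888_0565 (sse2)   /* imagine a different inner loop here */
>
>     int
>     main (void)
>     {
>         uint32_t src[2] = { 0x00ff0000, 0x0000ff00 };
>         uint16_t dst[2];
>         int      have_sse2 = 1;   /* stands in for real CPU detection */
>
>         void (*pack) (uint16_t *, const uint32_t *, int) =
>             have_sse2 ? src_x888_0565_sse2 : src_x888_0565_generic;
>
>         pack (dst, src, 2);
>         printf ("%04x %04x\n", dst[0], dst[1]);   /* f800 07e0 */
>         return 0;
>     }
>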
> ---
> pixman/pixman-sse2.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 152 insertions(+), 0 deletions(-)
>
> diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
> index e217ca3..763c7b3 100644
> --- a/pixman/pixman-sse2.c
> +++ b/pixman/pixman-sse2.c
> @@ -30,8 +30,12 @@
> #include <config.h>
> #endif
>
> +#include <mmintrin.h>
> #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
> #include <emmintrin.h> /* for SSE2 intrinsics */
> +#if USE_SSE41
> +#include <smmintrin.h>
> +#endif
> #include "pixman-private.h"
> #include "pixman-combine32.h"
> #include "pixman-inlines.h"
> @@ -53,6 +57,9 @@ static __m128i mask_blue;
> static __m128i mask_565_fix_rb;
> static __m128i mask_565_fix_g;
>
> +static __m128i mask_565_rb;
> +static __m128i mask_565_pack_multiplier;
> +
> static force_inline __m128i
> unpack_32_1x128 (uint32_t data)
> {
> @@ -120,7 +127,59 @@ pack_2x128_128 (__m128i lo, __m128i hi)
> return _mm_packus_epi16 (lo, hi);
> }
>
> +#if USE_X86_MMX
> +#define MC(x) ((__m64)mmx_ ## x)
> +
> +static force_inline __m64
> +pack_4xpacked565 (__m64 a, __m64 b)
> +{
> + static const uint64_t mmx_565_pack_multiplier = 0x2000000420000004ULL;
> + static const uint64_t mmx_packed_565_rb = 0x00f800f800f800f8ULL;
> + static const uint64_t mmx_packed_565_g = 0x0000fc000000fc00ULL;
> +
> + __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
> + __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
> +
> + __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
> + __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
> +
> + __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
> + __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
> +
> + t0 = _mm_or_si64 (t0, g0);
> + t1 = _mm_or_si64 (t1, g1);
> +
> + t0 = _mm_srli_si64 (t0, 5);
> + t1 = _mm_slli_si64 (t1, 11);
> + return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
> +}
> +#endif
> +
> +#ifdef USE_SSE41
> static force_inline __m128i
> +pack_565_2packedx128_128 (__m128i lo, __m128i hi)
> +{
> + __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
> + __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
> +
> + __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
> + __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
> +
> + __m128i g0 = _mm_and_si128 (lo, mask_green);
> + __m128i g1 = _mm_and_si128 (hi, mask_green);
> +
> + t0 = _mm_or_si128 (t0, g0);
> + t1 = _mm_or_si128 (t1, g1);
> +
> + t0 = _mm_srli_epi32 (t0, 5);
> + t1 = _mm_srli_epi32 (t1, 5);
> +
> + /* XXX: maybe there's a way to do this relatively efficiently with SSE2? */
> + return _mm_packus_epi32 (t0, t1);
> +}
> +#endif
> +
> +static force_inline __m128i
> pack_565_2x128_128 (__m128i lo, __m128i hi)
> {
> __m128i data;
> @@ -2832,6 +2891,93 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
> }
>
> static void
> +sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
> + pixman_composite_info_t *info)
> +{
> + PIXMAN_COMPOSITE_ARGS (info);
> + uint16_t *dst_line, *dst;
> + uint32_t *src_line, *src, s;
> + int dst_stride, src_stride;
> + int32_t w;
> +
> + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
> + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
> +
> + while (height--)
> + {
> + dst = dst_line;
> + dst_line += dst_stride;
> + src = src_line;
> + src_line += src_stride;
> + w = width;
> +
> + while (w && (unsigned long)dst & 7)
> + {
> + s = *src++;
> + *dst = CONVERT_8888_TO_0565 (s);
> + dst++;
> + w--;
> + }
> +
> +#if USE_X86_MMX
> + while (w >= 4 && (unsigned long)dst & 15)
> + {
> + __m64 vsrc0 = *(__m64 *)(src + 0);
> + __m64 vsrc1 = *(__m64 *)(src + 2);
> +
> + *(__m64 *)dst = pack_4xpacked565 (vsrc0, vsrc1);
> +
> + w -= 4;
> + src += 4;
> + dst += 4;
> + }
> +#endif
> +
> + while (w >= 8)
> + {
> + __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
> + __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
> +
> +#if USE_SSE41
> + save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
> +#else
> + __m128i xmm_src0_lo, xmm_src0_hi, xmm_src1_lo, xmm_src1_hi;
> + unpack_128_2x128 (xmm_src0, &xmm_src0_lo, &xmm_src0_hi);
> + unpack_128_2x128 (xmm_src1, &xmm_src1_lo, &xmm_src1_hi);
> +
> + save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_src0_lo, &xmm_src0_hi, &xmm_src1_lo, &xmm_src1_hi));
> +#endif
> +
> + w -= 8;
> + src += 8;
> + dst += 8;
> + }
> +
> +#if USE_X86_MMX
> + while (w >= 4)
> + {
> + __m64 vsrc0 = *(__m64 *)(src + 0);
> + __m64 vsrc1 = *(__m64 *)(src + 2);
> +
> + *(__m64 *)dst = pack_4xpacked565 (vsrc0, vsrc1);
> +
> + w -= 4;
> + src += 4;
> + dst += 4;
> + }
> +#endif
> +
> + while (w)
> + {
> + s = *src++;
> + *dst = CONVERT_8888_TO_0565 (s);
> + dst++;
> + w--;
> + }
> + }
> +}
> +
> +static void
> sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
> pixman_composite_info_t *info)
> {
> @@ -5727,6 +5873,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
> PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
> PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
> PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
> + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
> + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
> + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
> + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
> PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
> PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
> PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
> @@ -6035,6 +6185,8 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
> mask_ffff = create_mask_16_128 (0xffff);
> mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
> mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
> + mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
> + mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
>
> /* Set up function pointers */
> imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;