[Pixman] [PATCH 2/3] Moving helper functions in pixman-sse2.c to pixman-sse2.h.

Matt Turner mattst88 at gmail.com
Tue Jan 22 01:14:46 UTC 2019


On Wed, Jan 16, 2019 at 4:57 PM Raghuveer Devulapalli
<raghuveer.devulapalli at intel.com> wrote:
>
> From: raghuveer devulapalli <raghuveer.devulapalli at intel.com>
>
> These helper function will be reused in pixman-avx2.c implementations in
> the future.
> ---
>  pixman/pixman-sse2.c | 504 +------------------------------------------
>  pixman/pixman-sse2.h | 502 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 503 insertions(+), 503 deletions(-)
>  create mode 100644 pixman/pixman-sse2.h
>
> diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
> index 8955103..8dea0c2 100644
> --- a/pixman/pixman-sse2.c
> +++ b/pixman/pixman-sse2.c
> @@ -32,509 +32,7 @@
>
>  /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
>  #define PSHUFD_IS_FAST 0
> -
> -#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
> -#include <emmintrin.h> /* for SSE2 intrinsics */
> -#include "pixman-private.h"
> -#include "pixman-combine32.h"
> -#include "pixman-inlines.h"
> -
> -static __m128i mask_0080;
> -static __m128i mask_00ff;
> -static __m128i mask_0101;
> -static __m128i mask_ffff;
> -static __m128i mask_ff000000;
> -static __m128i mask_alpha;
> -
> -static __m128i mask_565_r;
> -static __m128i mask_565_g1, mask_565_g2;
> -static __m128i mask_565_b;
> -static __m128i mask_red;
> -static __m128i mask_green;
> -static __m128i mask_blue;
> -
> -static __m128i mask_565_fix_rb;
> -static __m128i mask_565_fix_g;
> -
> -static __m128i mask_565_rb;
> -static __m128i mask_565_pack_multiplier;
> -

These are moving to pixman-sse2.h to be used by the code below, which
is to be used by the AVX2 code. But they're initialized in
_pixman_implementation_create_sse2(), which means if you used
PIXMAN_DISABLE=sse2 the AVX2 paths would fail.

I suspect these constants do need to be prefixed with "sse2_", and in
_pixman_x86_get_implementations() you should disable avx2 if
PIXMAN_DISABLE=sse2.
> -static force_inline __m128i
> -unpack_32_1x128 (uint32_t data)
> -{
> -    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
> -}
> -
> -static force_inline void
> -unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
> -{
> -    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
> -    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
> -}
> -
> -static force_inline __m128i
> -unpack_565_to_8888 (__m128i lo)
> -{
> -    __m128i r, g, b, rb, t;
> -
> -    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
> -    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
> -    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
> -
> -    rb = _mm_or_si128 (r, b);
> -    t  = _mm_and_si128 (rb, mask_565_fix_rb);
> -    t  = _mm_srli_epi32 (t, 5);
> -    rb = _mm_or_si128 (rb, t);
> -
> -    t  = _mm_and_si128 (g, mask_565_fix_g);
> -    t  = _mm_srli_epi32 (t, 6);
> -    g  = _mm_or_si128 (g, t);
> -
> -    return _mm_or_si128 (rb, g);
> -}
> -
> -static force_inline void
> -unpack_565_128_4x128 (__m128i  data,
> -                      __m128i* data0,
> -                      __m128i* data1,
> -                      __m128i* data2,
> -                      __m128i* data3)
> -{
> -    __m128i lo, hi;
> -
> -    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
> -    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
> -
> -    lo = unpack_565_to_8888 (lo);
> -    hi = unpack_565_to_8888 (hi);
> -
> -    unpack_128_2x128 (lo, data0, data1);
> -    unpack_128_2x128 (hi, data2, data3);
> -}
> -
> -static force_inline uint16_t
> -pack_565_32_16 (uint32_t pixel)
> -{
> -    return (uint16_t) (((pixel >> 8) & 0xf800) |
> -                      ((pixel >> 5) & 0x07e0) |
> -                      ((pixel >> 3) & 0x001f));
> -}
> -
> -static force_inline __m128i
> -pack_2x128_128 (__m128i lo, __m128i hi)
> -{
> -    return _mm_packus_epi16 (lo, hi);
> -}
> -
> -static force_inline __m128i
> -pack_565_2packedx128_128 (__m128i lo, __m128i hi)
> -{
> -    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
> -    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
> -
> -    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
> -    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
> -
> -    __m128i g0 = _mm_and_si128 (lo, mask_green);
> -    __m128i g1 = _mm_and_si128 (hi, mask_green);
> -
> -    t0 = _mm_or_si128 (t0, g0);
> -    t1 = _mm_or_si128 (t1, g1);
> -
> -    /* Simulates _mm_packus_epi32 */
> -    t0 = _mm_slli_epi32 (t0, 16 - 5);
> -    t1 = _mm_slli_epi32 (t1, 16 - 5);
> -    t0 = _mm_srai_epi32 (t0, 16);
> -    t1 = _mm_srai_epi32 (t1, 16);
> -    return _mm_packs_epi32 (t0, t1);
> -}
> -
> -static force_inline __m128i
> -pack_565_2x128_128 (__m128i lo, __m128i hi)
> -{
> -    __m128i data;
> -    __m128i r, g1, g2, b;
> -
> -    data = pack_2x128_128 (lo, hi);
> -
> -    r  = _mm_and_si128 (data, mask_565_r);
> -    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
> -    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
> -    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
> -
> -    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
> -}
> -
> -static force_inline __m128i
> -pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
> -{
> -    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
> -                            pack_565_2x128_128 (*xmm2, *xmm3));
> -}
> -
> -static force_inline int
> -is_opaque (__m128i x)
> -{
> -    __m128i ffs = _mm_cmpeq_epi8 (x, x);
> -
> -    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
> -}
> -
> -static force_inline int
> -is_zero (__m128i x)
> -{
> -    return _mm_movemask_epi8 (
> -       _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
> -}
> -
> -static force_inline int
> -is_transparent (__m128i x)
> -{
> -    return (_mm_movemask_epi8 (
> -               _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
> -}
> -
> -static force_inline __m128i
> -expand_pixel_32_1x128 (uint32_t data)
> -{
> -    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
> -}
> -
> -static force_inline __m128i
> -expand_alpha_1x128 (__m128i data)
> -{
> -    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
> -                                                    _MM_SHUFFLE (3, 3, 3, 3)),
> -                               _MM_SHUFFLE (3, 3, 3, 3));
> -}
> -
> -static force_inline void
> -expand_alpha_2x128 (__m128i  data_lo,
> -                    __m128i  data_hi,
> -                    __m128i* alpha_lo,
> -                    __m128i* alpha_hi)
> -{
> -    __m128i lo, hi;
> -
> -    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
> -    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
> -
> -    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
> -    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
> -}
> -
> -static force_inline void
> -expand_alpha_rev_2x128 (__m128i  data_lo,
> -                        __m128i  data_hi,
> -                        __m128i* alpha_lo,
> -                        __m128i* alpha_hi)
> -{
> -    __m128i lo, hi;
> -
> -    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
> -    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
> -    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
> -    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
> -}
> -
> -static force_inline void
> -pix_multiply_2x128 (__m128i* data_lo,
> -                    __m128i* data_hi,
> -                    __m128i* alpha_lo,
> -                    __m128i* alpha_hi,
> -                    __m128i* ret_lo,
> -                    __m128i* ret_hi)
> -{
> -    __m128i lo, hi;
> -
> -    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
> -    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
> -    lo = _mm_adds_epu16 (lo, mask_0080);
> -    hi = _mm_adds_epu16 (hi, mask_0080);
> -    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
> -    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
> -}
> -
> -static force_inline void
> -pix_add_multiply_2x128 (__m128i* src_lo,
> -                        __m128i* src_hi,
> -                        __m128i* alpha_dst_lo,
> -                        __m128i* alpha_dst_hi,
> -                        __m128i* dst_lo,
> -                        __m128i* dst_hi,
> -                        __m128i* alpha_src_lo,
> -                        __m128i* alpha_src_hi,
> -                        __m128i* ret_lo,
> -                        __m128i* ret_hi)
> -{
> -    __m128i t1_lo, t1_hi;
> -    __m128i t2_lo, t2_hi;
> -
> -    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
> -    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
> -
> -    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
> -    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
> -}
> -
> -static force_inline void
> -negate_2x128 (__m128i  data_lo,
> -              __m128i  data_hi,
> -              __m128i* neg_lo,
> -              __m128i* neg_hi)
> -{
> -    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
> -    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
> -}
> -
> -static force_inline void
> -invert_colors_2x128 (__m128i  data_lo,
> -                     __m128i  data_hi,
> -                     __m128i* inv_lo,
> -                     __m128i* inv_hi)
> -{
> -    __m128i lo, hi;
> -
> -    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
> -    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
> -    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
> -    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
> -}
> -
> -static force_inline void
> -over_2x128 (__m128i* src_lo,
> -            __m128i* src_hi,
> -            __m128i* alpha_lo,
> -            __m128i* alpha_hi,
> -            __m128i* dst_lo,
> -            __m128i* dst_hi)
> -{
> -    __m128i t1, t2;
> -
> -    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
> -
> -    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
> -
> -    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
> -    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
> -}
> -
> -static force_inline void
> -over_rev_non_pre_2x128 (__m128i  src_lo,
> -                        __m128i  src_hi,
> -                        __m128i* dst_lo,
> -                        __m128i* dst_hi)
> -{
> -    __m128i lo, hi;
> -    __m128i alpha_lo, alpha_hi;
> -
> -    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
> -
> -    lo = _mm_or_si128 (alpha_lo, mask_alpha);
> -    hi = _mm_or_si128 (alpha_hi, mask_alpha);
> -
> -    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
> -
> -    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
> -
> -    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
> -}
> -
> -static force_inline void
> -in_over_2x128 (__m128i* src_lo,
> -               __m128i* src_hi,
> -               __m128i* alpha_lo,
> -               __m128i* alpha_hi,
> -               __m128i* mask_lo,
> -               __m128i* mask_hi,
> -               __m128i* dst_lo,
> -               __m128i* dst_hi)
> -{
> -    __m128i s_lo, s_hi;
> -    __m128i a_lo, a_hi;
> -
> -    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
> -    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
> -
> -    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
> -}
> -
> -/* load 4 pixels from a 16-byte boundary aligned address */
> -static force_inline __m128i
> -load_128_aligned (__m128i* src)
> -{
> -    return _mm_load_si128 (src);
> -}
> -
> -/* load 4 pixels from a unaligned address */
> -static force_inline __m128i
> -load_128_unaligned (const __m128i* src)
> -{
> -    return _mm_loadu_si128 (src);
> -}
> -
> -/* save 4 pixels using Write Combining memory on a 16-byte
> - * boundary aligned address
> - */
> -static force_inline void
> -save_128_write_combining (__m128i* dst,
> -                          __m128i  data)
> -{
> -    _mm_stream_si128 (dst, data);
> -}
> -
> -/* save 4 pixels on a 16-byte boundary aligned address */
> -static force_inline void
> -save_128_aligned (__m128i* dst,
> -                  __m128i  data)
> -{
> -    _mm_store_si128 (dst, data);
> -}
> -
> -/* save 4 pixels on a unaligned address */
> -static force_inline void
> -save_128_unaligned (__m128i* dst,
> -                    __m128i  data)
> -{
> -    _mm_storeu_si128 (dst, data);
> -}
> -
> -static force_inline __m128i
> -load_32_1x128 (uint32_t data)
> -{
> -    return _mm_cvtsi32_si128 (data);
> -}
> -
> -static force_inline __m128i
> -expand_alpha_rev_1x128 (__m128i data)
> -{
> -    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
> -}
> -
> -static force_inline __m128i
> -expand_pixel_8_1x128 (uint8_t data)
> -{
> -    return _mm_shufflelo_epi16 (
> -       unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
> -}
> -
> -static force_inline __m128i
> -pix_multiply_1x128 (__m128i data,
> -                   __m128i alpha)
> -{
> -    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
> -                                           mask_0080),
> -                           mask_0101);
> -}
> -
> -static force_inline __m128i
> -pix_add_multiply_1x128 (__m128i* src,
> -                       __m128i* alpha_dst,
> -                       __m128i* dst,
> -                       __m128i* alpha_src)
> -{
> -    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
> -    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
> -
> -    return _mm_adds_epu8 (t1, t2);
> -}
> -
> -static force_inline __m128i
> -negate_1x128 (__m128i data)
> -{
> -    return _mm_xor_si128 (data, mask_00ff);
> -}
> -
> -static force_inline __m128i
> -invert_colors_1x128 (__m128i data)
> -{
> -    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
> -}
> -
> -static force_inline __m128i
> -over_1x128 (__m128i src, __m128i alpha, __m128i dst)
> -{
> -    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
> -}
> -
> -static force_inline __m128i
> -in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
> -{
> -    return over_1x128 (pix_multiply_1x128 (*src, *mask),
> -                      pix_multiply_1x128 (*alpha, *mask),
> -                      *dst);
> -}
> -
> -static force_inline __m128i
> -over_rev_non_pre_1x128 (__m128i src, __m128i dst)
> -{
> -    __m128i alpha = expand_alpha_1x128 (src);
> -
> -    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
> -                                          _mm_or_si128 (alpha, mask_alpha)),
> -                      alpha,
> -                      dst);
> -}
> -
> -static force_inline uint32_t
> -pack_1x128_32 (__m128i data)
> -{
> -    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
> -}
> -
> -static force_inline __m128i
> -expand565_16_1x128 (uint16_t pixel)
> -{
> -    __m128i m = _mm_cvtsi32_si128 (pixel);
> -
> -    m = unpack_565_to_8888 (m);
> -
> -    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
> -}
> -
> -static force_inline uint32_t
> -core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
> -{
> -    uint8_t a;
> -    __m128i xmms;
> -
> -    a = src >> 24;
> -
> -    if (a == 0xff)
> -    {
> -       return src;
> -    }
> -    else if (src)
> -    {
> -       xmms = unpack_32_1x128 (src);
> -       return pack_1x128_32 (
> -           over_1x128 (xmms, expand_alpha_1x128 (xmms),
> -                       unpack_32_1x128 (dst)));
> -    }
> -
> -    return dst;
> -}
> -
> -static force_inline uint32_t
> -combine1 (const uint32_t *ps, const uint32_t *pm)
> -{
> -    uint32_t s = *ps;
> -
> -    if (pm)
> -    {
> -       __m128i ms, mm;
> -
> -       mm = unpack_32_1x128 (*pm);
> -       mm = expand_alpha_1x128 (mm);
> -
> -       ms = unpack_32_1x128 (s);
> -       ms = pix_multiply_1x128 (ms, mm);
> -
> -       s = pack_1x128_32 (ms);
> -    }
> -
> -    return s;
> -}
> +#include "pixman-sse2.h"
>
>  static force_inline __m128i
>  combine4 (const __m128i *ps, const __m128i *pm)
> diff --git a/pixman/pixman-sse2.h b/pixman/pixman-sse2.h
> new file mode 100644
> index 0000000..37105e4
> --- /dev/null
> +++ b/pixman/pixman-sse2.h
> @@ -0,0 +1,502 @@
> +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
> +#include <emmintrin.h> /* for SSE2 intrinsics */
> +#include "pixman-private.h"
> +#include "pixman-combine32.h"
> +#include "pixman-inlines.h"
> +
> +__m128i mask_0080;
> +__m128i mask_00ff;
> +__m128i mask_0101;
> +__m128i mask_ffff;
> +__m128i mask_ff000000;
> +__m128i mask_alpha;
> +
> +__m128i mask_565_r;
> +__m128i mask_565_g1, mask_565_g2;
> +__m128i mask_565_b;
> +__m128i mask_red;
> +__m128i mask_green;
> +__m128i mask_blue;
> +
> +__m128i mask_565_fix_rb;
> +__m128i mask_565_fix_g;
> +
> +__m128i mask_565_rb;
> +__m128i mask_565_pack_multiplier;

I think you want to make these declarations 'extern', and keep their
storage in pixman-sse2.c. As it is, we're now allocating space for
them in every translation unit this header is included in.


More information about the Pixman mailing list