[Pixman] [PATCH 2/3] Moving helper functions in pixman-sse2.c to pixman-sse2.h.
Matt Turner
mattst88 at gmail.com
Tue Jan 22 01:14:46 UTC 2019
On Wed, Jan 16, 2019 at 4:57 PM Raghuveer Devulapalli
<raghuveer.devulapalli at intel.com> wrote:
>
> From: raghuveer devulapalli <raghuveer.devulapalli at intel.com>
>
> These helper function will be reused in pixman-avx2.c implementations in
> the future.
> ---
> pixman/pixman-sse2.c | 504 +------------------------------------------
> pixman/pixman-sse2.h | 502 ++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 503 insertions(+), 503 deletions(-)
> create mode 100644 pixman/pixman-sse2.h
>
> diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
> index 8955103..8dea0c2 100644
> --- a/pixman/pixman-sse2.c
> +++ b/pixman/pixman-sse2.c
> @@ -32,509 +32,7 @@
>
> /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
> #define PSHUFD_IS_FAST 0
> -
> -#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
> -#include <emmintrin.h> /* for SSE2 intrinsics */
> -#include "pixman-private.h"
> -#include "pixman-combine32.h"
> -#include "pixman-inlines.h"
> -
> -static __m128i mask_0080;
> -static __m128i mask_00ff;
> -static __m128i mask_0101;
> -static __m128i mask_ffff;
> -static __m128i mask_ff000000;
> -static __m128i mask_alpha;
> -
> -static __m128i mask_565_r;
> -static __m128i mask_565_g1, mask_565_g2;
> -static __m128i mask_565_b;
> -static __m128i mask_red;
> -static __m128i mask_green;
> -static __m128i mask_blue;
> -
> -static __m128i mask_565_fix_rb;
> -static __m128i mask_565_fix_g;
> -
> -static __m128i mask_565_rb;
> -static __m128i mask_565_pack_multiplier;
> -
These are moving to pixman-sse2.h to be used by the code below, which
is to be used by the AVX2 code. But they're initialized in
_pixman_implementation_create_sse2(), which means if you used
PIXMAN_DISABLE=sse2 the AVX2 paths would fail.
I suspect these constants do need to be prefixed with "sse2_", and in
_pixman_x86_get_implementations() you should disable avx2 if
PIXMAN_DISABLE=sse2.
> -static force_inline __m128i
> -unpack_32_1x128 (uint32_t data)
> -{
> - return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
> -}
> -
> -static force_inline void
> -unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
> -{
> - *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
> - *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
> -}
> -
> -static force_inline __m128i
> -unpack_565_to_8888 (__m128i lo)
> -{
> - __m128i r, g, b, rb, t;
> -
> - r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
> - g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
> - b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
> -
> - rb = _mm_or_si128 (r, b);
> - t = _mm_and_si128 (rb, mask_565_fix_rb);
> - t = _mm_srli_epi32 (t, 5);
> - rb = _mm_or_si128 (rb, t);
> -
> - t = _mm_and_si128 (g, mask_565_fix_g);
> - t = _mm_srli_epi32 (t, 6);
> - g = _mm_or_si128 (g, t);
> -
> - return _mm_or_si128 (rb, g);
> -}
> -
> -static force_inline void
> -unpack_565_128_4x128 (__m128i data,
> - __m128i* data0,
> - __m128i* data1,
> - __m128i* data2,
> - __m128i* data3)
> -{
> - __m128i lo, hi;
> -
> - lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
> - hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
> -
> - lo = unpack_565_to_8888 (lo);
> - hi = unpack_565_to_8888 (hi);
> -
> - unpack_128_2x128 (lo, data0, data1);
> - unpack_128_2x128 (hi, data2, data3);
> -}
> -
> -static force_inline uint16_t
> -pack_565_32_16 (uint32_t pixel)
> -{
> - return (uint16_t) (((pixel >> 8) & 0xf800) |
> - ((pixel >> 5) & 0x07e0) |
> - ((pixel >> 3) & 0x001f));
> -}
> -
> -static force_inline __m128i
> -pack_2x128_128 (__m128i lo, __m128i hi)
> -{
> - return _mm_packus_epi16 (lo, hi);
> -}
> -
> -static force_inline __m128i
> -pack_565_2packedx128_128 (__m128i lo, __m128i hi)
> -{
> - __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
> - __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
> -
> - __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
> - __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
> -
> - __m128i g0 = _mm_and_si128 (lo, mask_green);
> - __m128i g1 = _mm_and_si128 (hi, mask_green);
> -
> - t0 = _mm_or_si128 (t0, g0);
> - t1 = _mm_or_si128 (t1, g1);
> -
> - /* Simulates _mm_packus_epi32 */
> - t0 = _mm_slli_epi32 (t0, 16 - 5);
> - t1 = _mm_slli_epi32 (t1, 16 - 5);
> - t0 = _mm_srai_epi32 (t0, 16);
> - t1 = _mm_srai_epi32 (t1, 16);
> - return _mm_packs_epi32 (t0, t1);
> -}
> -
> -static force_inline __m128i
> -pack_565_2x128_128 (__m128i lo, __m128i hi)
> -{
> - __m128i data;
> - __m128i r, g1, g2, b;
> -
> - data = pack_2x128_128 (lo, hi);
> -
> - r = _mm_and_si128 (data, mask_565_r);
> - g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
> - g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
> - b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
> -
> - return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
> -}
> -
> -static force_inline __m128i
> -pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
> -{
> - return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
> - pack_565_2x128_128 (*xmm2, *xmm3));
> -}
> -
> -static force_inline int
> -is_opaque (__m128i x)
> -{
> - __m128i ffs = _mm_cmpeq_epi8 (x, x);
> -
> - return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
> -}
> -
> -static force_inline int
> -is_zero (__m128i x)
> -{
> - return _mm_movemask_epi8 (
> - _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
> -}
> -
> -static force_inline int
> -is_transparent (__m128i x)
> -{
> - return (_mm_movemask_epi8 (
> - _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
> -}
> -
> -static force_inline __m128i
> -expand_pixel_32_1x128 (uint32_t data)
> -{
> - return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
> -}
> -
> -static force_inline __m128i
> -expand_alpha_1x128 (__m128i data)
> -{
> - return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
> - _MM_SHUFFLE (3, 3, 3, 3)),
> - _MM_SHUFFLE (3, 3, 3, 3));
> -}
> -
> -static force_inline void
> -expand_alpha_2x128 (__m128i data_lo,
> - __m128i data_hi,
> - __m128i* alpha_lo,
> - __m128i* alpha_hi)
> -{
> - __m128i lo, hi;
> -
> - lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
> - hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
> -
> - *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
> - *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
> -}
> -
> -static force_inline void
> -expand_alpha_rev_2x128 (__m128i data_lo,
> - __m128i data_hi,
> - __m128i* alpha_lo,
> - __m128i* alpha_hi)
> -{
> - __m128i lo, hi;
> -
> - lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
> - hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
> - *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
> - *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
> -}
> -
> -static force_inline void
> -pix_multiply_2x128 (__m128i* data_lo,
> - __m128i* data_hi,
> - __m128i* alpha_lo,
> - __m128i* alpha_hi,
> - __m128i* ret_lo,
> - __m128i* ret_hi)
> -{
> - __m128i lo, hi;
> -
> - lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
> - hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
> - lo = _mm_adds_epu16 (lo, mask_0080);
> - hi = _mm_adds_epu16 (hi, mask_0080);
> - *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
> - *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
> -}
> -
> -static force_inline void
> -pix_add_multiply_2x128 (__m128i* src_lo,
> - __m128i* src_hi,
> - __m128i* alpha_dst_lo,
> - __m128i* alpha_dst_hi,
> - __m128i* dst_lo,
> - __m128i* dst_hi,
> - __m128i* alpha_src_lo,
> - __m128i* alpha_src_hi,
> - __m128i* ret_lo,
> - __m128i* ret_hi)
> -{
> - __m128i t1_lo, t1_hi;
> - __m128i t2_lo, t2_hi;
> -
> - pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
> - pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
> -
> - *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
> - *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
> -}
> -
> -static force_inline void
> -negate_2x128 (__m128i data_lo,
> - __m128i data_hi,
> - __m128i* neg_lo,
> - __m128i* neg_hi)
> -{
> - *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
> - *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
> -}
> -
> -static force_inline void
> -invert_colors_2x128 (__m128i data_lo,
> - __m128i data_hi,
> - __m128i* inv_lo,
> - __m128i* inv_hi)
> -{
> - __m128i lo, hi;
> -
> - lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
> - hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
> - *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
> - *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
> -}
> -
> -static force_inline void
> -over_2x128 (__m128i* src_lo,
> - __m128i* src_hi,
> - __m128i* alpha_lo,
> - __m128i* alpha_hi,
> - __m128i* dst_lo,
> - __m128i* dst_hi)
> -{
> - __m128i t1, t2;
> -
> - negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
> -
> - pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
> -
> - *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
> - *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
> -}
> -
> -static force_inline void
> -over_rev_non_pre_2x128 (__m128i src_lo,
> - __m128i src_hi,
> - __m128i* dst_lo,
> - __m128i* dst_hi)
> -{
> - __m128i lo, hi;
> - __m128i alpha_lo, alpha_hi;
> -
> - expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
> -
> - lo = _mm_or_si128 (alpha_lo, mask_alpha);
> - hi = _mm_or_si128 (alpha_hi, mask_alpha);
> -
> - invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
> -
> - pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
> -
> - over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
> -}
> -
> -static force_inline void
> -in_over_2x128 (__m128i* src_lo,
> - __m128i* src_hi,
> - __m128i* alpha_lo,
> - __m128i* alpha_hi,
> - __m128i* mask_lo,
> - __m128i* mask_hi,
> - __m128i* dst_lo,
> - __m128i* dst_hi)
> -{
> - __m128i s_lo, s_hi;
> - __m128i a_lo, a_hi;
> -
> - pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
> - pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
> -
> - over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
> -}
> -
> -/* load 4 pixels from a 16-byte boundary aligned address */
> -static force_inline __m128i
> -load_128_aligned (__m128i* src)
> -{
> - return _mm_load_si128 (src);
> -}
> -
> -/* load 4 pixels from a unaligned address */
> -static force_inline __m128i
> -load_128_unaligned (const __m128i* src)
> -{
> - return _mm_loadu_si128 (src);
> -}
> -
> -/* save 4 pixels using Write Combining memory on a 16-byte
> - * boundary aligned address
> - */
> -static force_inline void
> -save_128_write_combining (__m128i* dst,
> - __m128i data)
> -{
> - _mm_stream_si128 (dst, data);
> -}
> -
> -/* save 4 pixels on a 16-byte boundary aligned address */
> -static force_inline void
> -save_128_aligned (__m128i* dst,
> - __m128i data)
> -{
> - _mm_store_si128 (dst, data);
> -}
> -
> -/* save 4 pixels on a unaligned address */
> -static force_inline void
> -save_128_unaligned (__m128i* dst,
> - __m128i data)
> -{
> - _mm_storeu_si128 (dst, data);
> -}
> -
> -static force_inline __m128i
> -load_32_1x128 (uint32_t data)
> -{
> - return _mm_cvtsi32_si128 (data);
> -}
> -
> -static force_inline __m128i
> -expand_alpha_rev_1x128 (__m128i data)
> -{
> - return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
> -}
> -
> -static force_inline __m128i
> -expand_pixel_8_1x128 (uint8_t data)
> -{
> - return _mm_shufflelo_epi16 (
> - unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
> -}
> -
> -static force_inline __m128i
> -pix_multiply_1x128 (__m128i data,
> - __m128i alpha)
> -{
> - return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
> - mask_0080),
> - mask_0101);
> -}
> -
> -static force_inline __m128i
> -pix_add_multiply_1x128 (__m128i* src,
> - __m128i* alpha_dst,
> - __m128i* dst,
> - __m128i* alpha_src)
> -{
> - __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
> - __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
> -
> - return _mm_adds_epu8 (t1, t2);
> -}
> -
> -static force_inline __m128i
> -negate_1x128 (__m128i data)
> -{
> - return _mm_xor_si128 (data, mask_00ff);
> -}
> -
> -static force_inline __m128i
> -invert_colors_1x128 (__m128i data)
> -{
> - return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
> -}
> -
> -static force_inline __m128i
> -over_1x128 (__m128i src, __m128i alpha, __m128i dst)
> -{
> - return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
> -}
> -
> -static force_inline __m128i
> -in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
> -{
> - return over_1x128 (pix_multiply_1x128 (*src, *mask),
> - pix_multiply_1x128 (*alpha, *mask),
> - *dst);
> -}
> -
> -static force_inline __m128i
> -over_rev_non_pre_1x128 (__m128i src, __m128i dst)
> -{
> - __m128i alpha = expand_alpha_1x128 (src);
> -
> - return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
> - _mm_or_si128 (alpha, mask_alpha)),
> - alpha,
> - dst);
> -}
> -
> -static force_inline uint32_t
> -pack_1x128_32 (__m128i data)
> -{
> - return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
> -}
> -
> -static force_inline __m128i
> -expand565_16_1x128 (uint16_t pixel)
> -{
> - __m128i m = _mm_cvtsi32_si128 (pixel);
> -
> - m = unpack_565_to_8888 (m);
> -
> - return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
> -}
> -
> -static force_inline uint32_t
> -core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
> -{
> - uint8_t a;
> - __m128i xmms;
> -
> - a = src >> 24;
> -
> - if (a == 0xff)
> - {
> - return src;
> - }
> - else if (src)
> - {
> - xmms = unpack_32_1x128 (src);
> - return pack_1x128_32 (
> - over_1x128 (xmms, expand_alpha_1x128 (xmms),
> - unpack_32_1x128 (dst)));
> - }
> -
> - return dst;
> -}
> -
> -static force_inline uint32_t
> -combine1 (const uint32_t *ps, const uint32_t *pm)
> -{
> - uint32_t s = *ps;
> -
> - if (pm)
> - {
> - __m128i ms, mm;
> -
> - mm = unpack_32_1x128 (*pm);
> - mm = expand_alpha_1x128 (mm);
> -
> - ms = unpack_32_1x128 (s);
> - ms = pix_multiply_1x128 (ms, mm);
> -
> - s = pack_1x128_32 (ms);
> - }
> -
> - return s;
> -}
> +#include "pixman-sse2.h"
>
> static force_inline __m128i
> combine4 (const __m128i *ps, const __m128i *pm)
> diff --git a/pixman/pixman-sse2.h b/pixman/pixman-sse2.h
> new file mode 100644
> index 0000000..37105e4
> --- /dev/null
> +++ b/pixman/pixman-sse2.h
> @@ -0,0 +1,502 @@
> +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
> +#include <emmintrin.h> /* for SSE2 intrinsics */
> +#include "pixman-private.h"
> +#include "pixman-combine32.h"
> +#include "pixman-inlines.h"
> +
> +__m128i mask_0080;
> +__m128i mask_00ff;
> +__m128i mask_0101;
> +__m128i mask_ffff;
> +__m128i mask_ff000000;
> +__m128i mask_alpha;
> +
> +__m128i mask_565_r;
> +__m128i mask_565_g1, mask_565_g2;
> +__m128i mask_565_b;
> +__m128i mask_red;
> +__m128i mask_green;
> +__m128i mask_blue;
> +
> +__m128i mask_565_fix_rb;
> +__m128i mask_565_fix_g;
> +
> +__m128i mask_565_rb;
> +__m128i mask_565_pack_multiplier;
I think you want to make these declarations 'extern', and keep their
storage in pixman-sse2.c. As it is, we're now allocating space for
them in every translation unit this header is included in.
More information about the Pixman
mailing list