[Pixman] [PATCH] sse2: faster bilinear scaling (use _mm_loadl_epi64)

Mon Jun 25 07:49:31 PDT 2012

On Mon, Jun 25, 2012 at 12:50 AM, Siarhei Siamashka
<siarhei.siamashka at gmail.com> wrote:
> Using _mm_loadl_epi64() to load two pixels at once (pairs of top
> and bottom pixels) is faster than loading each pixel separately
> and combining them with _mm_set_epi32().
>
> === cairo-perf-trace ===
>
> before: image             firefox-fishtank   66.912   66.931   0.13%    3/3
> after:  image             firefox-fishtank   57.584   58.349   0.74%    3/3
>
> === lowlevel-blt-bench ===
>
> before: src_8888_8888 =  L1: 181.10  L2: 179.14  M:178.08 ( 11.02%)  HT:153.22  VT:133.45  R:142.24  RT: 95.32
> after:  src_8888_8888 =  L1: 228.68  L2: 225.75  M:223.98 ( 14.23%)  HT:185.32  VT:155.06  R:162.73  RT:102.52
> ---
>  pixman/pixman-sse2.c |   15 +++++++--------
>  1 files changed, 7 insertions(+), 8 deletions(-)
>
> diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
> index 0604254..ef82a18 100644
> --- a/pixman/pixman-sse2.c
> +++ b/pixman/pixman-sse2.c
> @@ -5377,17 +5377,16 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
>  #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                    \
>  do {                                                                           \
>     __m128i xmm_wh, xmm_lo, xmm_hi, a;                                         \
> -    /* fetch 2x2 pixel block into sse2 register */                             \
> -    uint32_t tl = src_top [pixman_fixed_to_int (vx)];                          \
> -    uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];                      \
> -    uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];                       \
> -    uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];                   \
> -    a = _mm_set_epi32 (tr, tl, br, bl);                                                \
> +    /* fetch 2x2 pixel block into sse2 registers */                            \
> +    __m128i tltr = _mm_loadl_epi64 (                                           \
> +                           (__m128i *)&src_top[pixman_fixed_to_int (vx)]);     \
> +    __m128i blbr = _mm_loadl_epi64 (                                           \
> +                           (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);  \
>     vx += unit_x;                                                              \
>     /* vertical interpolation */                                               \
> -    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),       \
> +    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),    \
>                                        xmm_wt),                                \
> -                      _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),        \
> +                      _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),     \
>                                        xmm_wb));                               \
>     /* calculate horizontal weights */                                         \
>     xmm_wh = _mm_add_epi16 (xmm_addc,                                          \
> --
> 1.7.3.4

Great. I'm glad that suggestion actually made a difference.