[Pixman] [PATCH 2/3] SSE2 optimizations for scaled over_8888_8888 with nearest filter

Thu Sep 9 02:49:22 PDT 2010

On Wednesday 08 September 2010 10:45:07 Siarhei Siamashka wrote:
> +/* A variant of 'core_combine_over_u_sse2' with minor tweaks.
> + *
> + * Processes one scanline of w destination pixels at pd.  Each source
> + * pixel is fetched from ps at index (vx >> 16), and vx is advanced by
> + * unit_x after every fetch.  pm is initialised to NULL and only ever
> + * changed inside 'if (pm)', so it stays NULL for the whole call.
> + * max_vx is not referenced anywhere in this body.
> + *
> + * Three phases:
> + *   1. one pixel at a time until pd is 16-byte aligned (or w runs out);
> + *   2. four pixels at a time using aligned 128-bit loads/stores on pd.
> + *      If is_opaque() accepts the combined source it is stored to pd
> + *      without reading the destination; otherwise, if is_zero() accepts
> + *      it, pd is left unwritten; otherwise the destination is loaded,
> + *      passed through over_2x128() and stored back;
> + *   3. one pixel at a time for whatever is left (fewer than 4 pixels).
> + *
> + * NOTE(review): the follow-up below this quoted patch reports that the
> + * floating point registers may become corrupted on 32-bit x86 unless
> + * _mm_empty() is added at the end of this fast path; this quoted
> + * version does not call it.
> + */
> +static force_inline void
> +scaled_nearest_scanline_sse2_8888_8888_none_OVER (uint32_t*       pd,
> +                                                  const uint32_t* ps,
> +                                                  int32_t         w,
> +                                                  pixman_fixed_t  vx,
> +                                                  pixman_fixed_t  unit_x,
> +                                                  pixman_fixed_t  max_vx)
> +{
> +    uint32_t s, d;
> +    const uint32_t* pm = NULL;
> +
> +    __m128i xmm_dst_lo, xmm_dst_hi;
> +    __m128i xmm_src_lo, xmm_src_hi;
> +    __m128i xmm_alpha_lo, xmm_alpha_hi;
> +
> +    /* Align dst on a 16-byte boundary */
> +    while (w && ((unsigned long)pd & 15))
> +    {
> +	d = *pd;
> +	s = combine1 (ps + (vx >> 16), pm);
> +	vx += unit_x;
> +
> +	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
> +	if (pm)
> +	    pm++;
> +	w--;
> +    }
> +
> +    while (w >= 4)
> +    {
> +	__m128i tmp;
> +	uint32_t tmp1, tmp2, tmp3, tmp4;
> +
> +	tmp1 = ps[vx >> 16];
> +	vx += unit_x;
> +	tmp2 = ps[vx >> 16];
> +	vx += unit_x;
> +	tmp3 = ps[vx >> 16];
> +	vx += unit_x;
> +	tmp4 = ps[vx >> 16];
> +	vx += unit_x;
> +
> +	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
> +
> +	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
> +
> +	if (is_opaque (xmm_src_hi))
> +	{
> +	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
> +	}
> +	else if (!is_zero (xmm_src_hi))
> +	{
> +	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
> +
> +	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
> +	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
> +
> +	    expand_alpha_2x128 (
> +		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
> +
> +	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
> +			&xmm_alpha_lo, &xmm_alpha_hi,
> +			&xmm_dst_lo, &xmm_dst_hi);
> +
> +	    /* rebuild the 4 pixel data and save */
> +	    save_128_aligned ((__m128i*)pd,
> +			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
> +	}
> +
> +	w -= 4;
> +	pd += 4;
> +	if (pm)
> +	    pm += 4;
> +    }
> +
> +    while (w)
> +    {
> +	d = *pd;
> +	s = combine1 (ps + (vx >> 16), pm);
> +	vx += unit_x;
> +
> +	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
> +	if (pm)
> +	    pm++;
> +
> +	w--;
> +    }
> +}

Actually there is a problem here which I discovered after also trying the
patch on a 32-bit x86 system. The floating point registers may become corrupted
unless _mm_empty() is added somewhere at the end of this scaling fast path
code.

I have sent a patch which can help detect such issues automatically just by
running the pixman test suite (maybe not totally bulletproof, but still quite
useful):
http://lists.freedesktop.org/archives/pixman/2010-September/000486.html

-- 
Best regards,
Siarhei Siamashka