[Pixman] [PATCH 8/8] SSE2 optimization for nearest scaled over_8888_n_8888

Siarhei Siamashka siarhei.siamashka at gmail.com
Thu Feb 3 18:11:01 PST 2011


From: Siarhei Siamashka <siarhei.siamashka at nokia.com>

This operation shows up a little bit in some of the html5 based
games from http://www.kesiev.com/akihabara/

=== Cairo trace of the game intro animation for 'Legend of Sadness' ===

before:
[  0]    image    firefox-legend-of-sadness   46.286   46.298   0.01%    5/6

after:
[  0]    image    firefox-legend-of-sadness   45.088   45.102   0.04%    6/6

=== Microbenchmark (scaling ~2000x~2000 -> ~2000x~2000) ===

before:
    translucent: op=3, src=8888, mask=s dst=8888, speed=131.30 MPix/s
    transparent: op=3, src=8888, mask=s dst=8888, speed=132.38 MPix/s
    opaque:      op=3, src=8888, mask=s dst=8888, speed=167.90 MPix/s
after:
    translucent: op=3, src=8888, mask=s dst=8888, speed=301.93 MPix/s
    transparent: op=3, src=8888, mask=s dst=8888, speed=770.70 MPix/s
    opaque:      op=3, src=8888, mask=s dst=8888, speed=301.80 MPix/s
---
 pixman/pixman-sse2.c |  118 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 118 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 6c494bc..cf17c0c 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5898,6 +5898,119 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
 		       uint32_t, uint32_t, PAD)
 
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+					       uint32_t *       dst,
+					       const uint32_t * src,
+					       int32_t          w,
+					       pixman_fixed_t   vx,
+					       pixman_fixed_t   unit_x,
+					       pixman_fixed_t   max_vx,
+					       pixman_bool_t    zero_src)
+{
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (zero_src || (*mask >> 24) == 0)
+	return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && (unsigned long)dst & 15)
+    {
+	uint32_t s = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+
+	if (s)
+	{
+	    uint32_t d = *dst;
+
+	    __m64 ms = unpack_32_1x64 (s);
+	    __m64 alpha     = expand_alpha_1x64 (ms);
+	    __m64 dest      = _mm_movepi64_pi64 (xmm_mask);
+	    __m64 alpha_dst = unpack_32_1x64 (d);
+
+	    *dst = pack_1x64_32 (
+		in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+	}
+	dst++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	uint32_t tmp1, tmp2, tmp3, tmp4;
+
+	tmp1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp3 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp4 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+
+	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+	if (!is_zero (xmm_src))
+	{
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			        &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha_lo, &xmm_alpha_hi,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	dst += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	uint32_t s = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+
+	if (s)
+	{
+	    uint32_t d = *dst;
+
+	    __m64 ms = unpack_32_1x64 (s);
+	    __m64 alpha = expand_alpha_1x64 (ms);
+	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
+	    __m64 dest  = unpack_32_1x64 (d);
+
+	    *dst = pack_1x64_32 (
+		in_over_1x64 (&ms, &alpha, &mask, &dest));
+	}
+
+	dst++;
+	w--;
+    }
+
+    _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -5994,6 +6107,11 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
+    SIMPLE_NEAREST_SOLIDMASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLIDMASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLIDMASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLIDMASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
     { PIXMAN_OP_NONE },
 };
 
-- 
1.7.3.4



More information about the Pixman mailing list