[Pixman] [PATCH 2/8] sse2: Convert all uses of MMX registers to use SSE2 registers instead.

Søren Sandmann Pedersen sandmann at cs.au.dk
Fri Feb 18 05:02:06 PST 2011


From: Søren Sandmann Pedersen <ssp at redhat.com>

MMX registers alias the x87 floating point stack, so code that uses
them has to call emms before any floating point code can run again. By
avoiding MMX registers entirely we won't need to call emms all over
the place, which also avoids various miscompilation issues.
---
 pixman/pixman-sse2.c |  788 ++++++++++++++++++++++++++++----------------------
 1 files changed, 440 insertions(+), 348 deletions(-)

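For reference, here is a minimal standalone sketch (not part of this
patch) of the conversion pattern the hunks below repeat. The function
names are hypothetical and only illustrate why the __m64 path needs
emms while the __m128i path does not:

#include <stdint.h>
#include <mmintrin.h>   /* MMX:  __m64, _mm_empty */
#include <xmmintrin.h>  /* SSE:  _mm_shuffle_pi16 */
#include <emmintrin.h>  /* SSE2: __m128i */

/* Old style: __m64 values live in the MMX registers, which alias the
 * x87 stack, so emms must run before returning to floating point code. */
static uint32_t
broadcast_alpha_mmx (uint32_t pixel)
{
    __m64 p = _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (pixel),
				_mm_setzero_si64 ());
    __m64 a = _mm_shuffle_pi16 (p, _MM_SHUFFLE (3, 3, 3, 3));
    uint32_t r = _mm_cvtsi64_si32 (_mm_packs_pu16 (a, _mm_setzero_si64 ()));

    _mm_empty ();  /* omit this and later x87 code computes garbage */
    return r;
}

/* New style: __m128i values live in the XMM registers, which are
 * independent of x87, so no emms is ever needed. */
static uint32_t
broadcast_alpha_sse2 (uint32_t pixel)
{
    __m128i p = _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (pixel),
				   _mm_setzero_si128 ());
    __m128i a = _mm_shufflelo_epi16 (p, _MM_SHUFFLE (3, 3, 3, 3));

    return _mm_cvtsi128_si32 (_mm_packus_epi16 (a, _mm_setzero_si128 ()));
}
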
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c4ff3c1..283e4c4 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -407,6 +407,12 @@ load_32_1x64 (uint32_t data)
     return _mm_cvtsi32_si64 (data);
 }
 
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    return _mm_cvtsi32_si128 (data);
+}
+
 static force_inline __m64
 unpack_32_1x64 (uint32_t data)
 {
@@ -425,6 +431,12 @@ expand_alpha_rev_1x64 (__m64 data)
     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
 static force_inline __m64
 expand_pixel_8_1x64 (uint8_t data)
 {
@@ -432,6 +444,13 @@ expand_pixel_8_1x64 (uint8_t data)
 	unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }
 
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    return _mm_shufflelo_epi16 (
+	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
 static force_inline __m64
 pix_multiply_1x64 (__m64 data,
                    __m64 alpha)
@@ -441,6 +460,15 @@ pix_multiply_1x64 (__m64 data,
                            mask_x0101);
 }
 
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+		    __m128i alpha)
+{
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+					    mask_0080),
+			    mask_0101);
+}
+
 static force_inline __m64
 pix_add_multiply_1x64 (__m64* src,
                        __m64* alpha_dst,
@@ -453,24 +481,54 @@ pix_add_multiply_1x64 (__m64* src,
     return _mm_adds_pu8 (t1, t2);
 }
 
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+			__m128i* alpha_dst,
+			__m128i* dst,
+			__m128i* alpha_src)
+{
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+    return _mm_adds_epu8 (t1, t2);
+}
+
 static force_inline __m64
 negate_1x64 (__m64 data)
 {
     return _mm_xor_si64 (data, mask_x00ff);
 }
 
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    return _mm_xor_si128 (data, mask_00ff);
+}
+
 static force_inline __m64
 invert_colors_1x64 (__m64 data)
 {
     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
 static force_inline __m64
 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
 {
     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
 }
 
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
 static force_inline __m64
 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
 {
@@ -479,6 +537,14 @@ in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
                       *dst);
 }
 
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+		       pix_multiply_1x128 (*alpha, *mask),
+		       *dst);
+}
+
 static force_inline __m64
 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
 {
@@ -490,12 +556,29 @@ over_rev_non_pre_1x64 (__m64 src, __m64 dst)
                       dst);
 }
 
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+					   _mm_or_si128 (alpha, mask_alpha)),
+		       alpha,
+		       dst);
+}
+
 static force_inline uint32_t
 pack_1x64_32 (__m64 data)
 {
     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
 }
 
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
  *
  *    00RR00GG00BB
@@ -529,6 +612,16 @@ expand565_16_1x64 (uint16_t pixel)
     return _mm_srli_pi16 (p, 8);
 }
 
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i m = _mm_cvtsi32_si128 (pixel);
+
+    m = unpack_565_to_8888 (m);
+
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
 /* ----------------------------------------------------------------------------
  * Compose Core transformations
  */
@@ -536,7 +629,7 @@ static force_inline uint32_t
 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
     uint8_t a;
-    __m64 ms;
+    __m128i xmms;
 
     a = src >> 24;
 
@@ -546,9 +639,10 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
     }
     else if (src)
     {
-	ms = unpack_32_1x64 (src);
-	return pack_1x64_32 (
-	    over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+	xmms = unpack_32_1x128 (src);
+	return pack_1x128_32 (
+	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
+			unpack_32_1x128 (dst)));
     }
 
     return dst;
@@ -561,15 +655,15 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
 
     if (pm)
     {
-	__m64 ms, mm;
+	__m128i ms, mm;
 
-	mm = unpack_32_1x64 (*pm);
-	mm = expand_alpha_1x64 (mm);
+	mm = unpack_32_1x128 (*pm);
+	mm = expand_alpha_1x128 (mm);
 
-	ms = unpack_32_1x64 (s);
-	ms = pix_multiply_1x64 (ms, mm);
+	ms = unpack_32_1x128 (s);
+	ms = pix_multiply_1x128 (ms, mm);
 
-	s = pack_1x64_32 (ms);
+	s = pack_1x128_32 (ms);
     }
 
     return s;
@@ -857,9 +951,9 @@ core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
     }
     else if (maska != 0xff)
     {
-	return pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (dst),
-			       expand_alpha_1x64 (unpack_32_1x64 (src))));
+	return pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst),
+				expand_alpha_1x128 (unpack_32_1x128 (src))));
     }
 
     return dst;
@@ -994,10 +1088,10 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
 
 	if (pm)
 	    pm++;
@@ -1039,10 +1133,10 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
 	ps++;
 	if (pm)
 	    pm++;
@@ -1061,10 +1155,10 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (s), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
 	w--;
 	ps++;
 	if (pm)
@@ -1104,10 +1198,10 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	uint32_t s = combine1 (ps, pm);
 	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (s), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
 	w--;
 	ps++;
 	if (pm)
@@ -1119,13 +1213,13 @@ static force_inline uint32_t
 core_combine_atop_u_pixel_sse2 (uint32_t src,
                                 uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+    __m128i da = expand_alpha_1x128 (d);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
 static force_inline void
@@ -1201,13 +1295,13 @@ static force_inline uint32_t
 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                         uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
 static force_inline void
@@ -1283,13 +1377,13 @@ static force_inline uint32_t
 core_combine_xor_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
+    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
 }
 
 static force_inline void
@@ -1387,8 +1481,8 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	ps++;
 	if (pm)
 	    pm++;
-	*pd++ = _mm_cvtsi64_si32 (
-	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
 	w--;
     }
 
@@ -1414,8 +1508,8 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	d = *pd;
 
 	ps++;
-	*pd++ = _mm_cvtsi64_si32 (
-	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
 	if (pm)
 	    pm++;
     }
@@ -1425,18 +1519,18 @@ static force_inline uint32_t
 core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                     uint32_t dst)
 {
-    __m64 ms = unpack_32_1x64 (src);
-    __m64 md = unpack_32_1x64 (dst);
+    __m128i ms = unpack_32_1x128 (src);
+    __m128i md = unpack_32_1x128 (dst);
     uint32_t sa = src >> 24;
     uint32_t da = ~dst >> 24;
 
     if (sa > da)
     {
-	ms = pix_multiply_1x64 (
-	    ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
+	ms = pix_multiply_1x128 (
+	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
     }
 
-    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
 }
 
 static force_inline void
@@ -1540,8 +1634,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     {
 	s = *ps++;
 	m = *pm++;
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
 	w--;
     }
 
@@ -1570,8 +1664,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     {
 	s = *ps++;
 	m = *pm++;
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
 	w--;
     }
 }
@@ -1581,12 +1675,12 @@ core_combine_over_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 expAlpha = expand_alpha_1x64 (s);
-    __m64 unpk_mask = unpack_32_1x64 (mask);
-    __m64 unpk_dst  = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i expAlpha = expand_alpha_1x128 (s);
+    __m128i unpk_mask = unpack_32_1x128 (mask);
+    __m128i unpk_dst  = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
 }
 
 static force_inline void
@@ -1655,12 +1749,12 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i d = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (
-	over_1x64 (d, expand_alpha_1x64 (d),
-		   pix_multiply_1x64 (unpack_32_1x64 (src),
-				      unpack_32_1x64 (mask))));
+    return pack_1x128_32 (
+	over_1x128 (d, expand_alpha_1x128 (d),
+		    pix_multiply_1x128 (unpack_32_1x128 (src),
+					unpack_32_1x128 (mask))));
 }
 
 static force_inline void
@@ -1745,10 +1839,10 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		expand_alpha_1x64 (unpack_32_1x64 (d))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
 
 	w--;
     }
@@ -1789,11 +1883,11 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		expand_alpha_1x64 (unpack_32_1x64 (d))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
 
 	w--;
     }
@@ -1818,11 +1912,11 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		pix_multiply_1x64 (unpack_32_1x64 (m),
-				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
 	w--;
     }
 
@@ -1861,11 +1955,11 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		pix_multiply_1x64 (unpack_32_1x64 (m),
-				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
 	w--;
     }
 }
@@ -1889,11 +1983,11 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
 	w--;
     }
 
@@ -1934,11 +2028,11 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
 
 	w--;
     }
@@ -1963,12 +2057,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		negate_1x64 (pix_multiply_1x64 (
-				 unpack_32_1x64 (m),
-				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				 unpack_32_1x128 (m),
+				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
 	w--;
     }
 
@@ -2011,12 +2105,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		negate_1x64 (pix_multiply_1x64 (
-				 unpack_32_1x64 (m),
-				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				 unpack_32_1x128 (m),
+				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
 	w--;
     }
 }
@@ -2026,16 +2120,16 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = expand_alpha_1x128 (d);
 
-    s = pix_multiply_1x64 (s, m);
-    m = negate_1x64 (pix_multiply_1x64 (m, sa));
+    s = pix_multiply_1x128 (s, m);
+    m = negate_1x128 (pix_multiply_1x128 (m, sa));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
 static force_inline void
@@ -2116,17 +2210,17 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 sa = expand_alpha_1x64 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i sa = expand_alpha_1x128 (s);
 
-    s = pix_multiply_1x64 (s, m);
-    m = pix_multiply_1x64 (m, sa);
+    s = pix_multiply_1x128 (s, m);
+    m = pix_multiply_1x128 (m, sa);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
 static force_inline void
@@ -2208,16 +2302,16 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
 {
-    __m64 a = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i a = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
-				       a, expand_alpha_1x64 (s)));
-    __m64 dest      = pix_multiply_1x64 (s, a);
-    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+				       a, expand_alpha_1x128 (s)));
+    __m128i dest      = pix_multiply_1x128 (s, a);
+    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
                                                 &alpha_dst,
                                                 &dest,
                                                 &alpha_src));
@@ -2317,10 +2411,10 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-					     unpack_32_1x64 (m)),
-			  unpack_32_1x64 (d)));
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
 	w--;
     }
 
@@ -2355,10 +2449,10 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 	m = *pm++;
 	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-					     unpack_32_1x64 (m)),
-			  unpack_32_1x64 (d)));
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
 	w--;
     }
 }
@@ -2711,9 +2805,9 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	while (w && (unsigned long)dst & 15)
 	{
 	    d = *dst;
-	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-	                                      _mm_movepi64_pi64 (xmm_alpha),
-	                                      unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -2738,9 +2832,9 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	while (w)
 	{
 	    d = *dst;
-	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-	                                      _mm_movepi64_pi64 (xmm_alpha),
-	                                      unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -2796,9 +2890,9 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	    d = *dst;
 
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					 _mm_movepi64_pi64 (xmm_alpha),
-					 expand565_16_1x64 (d))));
+		pack_1x128_32 (over_1x128 (xmm_src,
+					   xmm_alpha,
+					   expand565_16_1x128 (d))));
 	    w--;
 	}
 
@@ -2829,9 +2923,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	{
 	    d = *dst;
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					 _mm_movepi64_pi64 (xmm_alpha),
-					 expand565_16_1x64 (d))));
+		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+					   expand565_16_1x128 (d))));
 	}
     }
 
@@ -2866,7 +2959,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     __m128i xmm_dst;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
     srca = src >> 24;
@@ -2882,8 +2975,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     xmm_src = _mm_unpacklo_epi8 (
 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -2902,11 +2995,11 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    {
 		d = *pd;
 
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
 	    }
 
 	    pd++;
@@ -2950,11 +3043,11 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    {
 		d = *pd;
 
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
 	    }
 
 	    pd++;
@@ -2994,7 +3087,7 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -3009,8 +3102,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     xmm_src = _mm_unpacklo_epi8 (
 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3028,10 +3121,10 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *pd;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
 		                                  &mmx_alpha,
 		                                  &mmx_mask,
 		                                  &mmx_dest));
@@ -3078,11 +3171,11 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *pd;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+		*pd = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
 	    }
 
 	    pd++;
@@ -3148,13 +3241,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	    {
 		uint32_t d = *dst;
 		
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha    = expand_alpha_1x64 (ms);
-		__m64 dest     = _mm_movepi64_pi64 (xmm_mask);
-		__m64 alpha_dst = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha    = expand_alpha_1x128 (ms);
+		__m128i dest     = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
 		
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
 	    }
 	    dst++;
 	    w--;
@@ -3195,13 +3288,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	    {
 		uint32_t d = *dst;
 		
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha = expand_alpha_1x64 (ms);
-		__m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-		__m64 dest  = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i mask  = xmm_mask;
+		__m128i dest  = unpack_32_1x128 (d);
 		
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &mask, &dest));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &mask, &dest));
 	    }
 
 	    dst++;
@@ -3336,13 +3429,13 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    uint32_t s = (*src++) | 0xff000000;
 	    uint32_t d = *dst;
 
-	    __m64 src   = unpack_32_1x64 (s);
-	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i src   = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst++ = pack_1x64_32 (
-		in_over_1x64 (&src, &alpha, &mask, &dest));
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
 
 	    w--;
 	}
@@ -3375,13 +3468,13 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    uint32_t s = (*src++) | 0xff000000;
 	    uint32_t d = *dst;
 
-	    __m64 src  = unpack_32_1x64 (s);
-	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i src  = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst++ = pack_1x64_32 (
-		in_over_1x64 (&src, &alpha, &mask, &dest));
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
 
 	    w--;
 	}
@@ -3436,13 +3529,13 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
 static force_inline uint16_t
 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 {
-    __m64 ms;
+    __m128i ms;
 
-    ms = unpack_32_1x64 (src);
+    ms = unpack_32_1x128 (src);
     return pack_565_32_16 (
-	pack_1x64_32 (
-	    over_1x64 (
-		ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
+	pack_1x128_32 (
+	    over_1x128 (
+		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
 }
 
 static void
@@ -3588,7 +3681,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -3604,8 +3697,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     xmm_def = create_mask_2x32_128 (src, src);
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3622,10 +3715,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_pixel_8_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
 		                                   &mmx_alpha,
 		                                   &mmx_mask,
 		                                   &mmx_dest));
@@ -3677,10 +3770,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_pixel_8_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
 		                                   &mmx_alpha,
 		                                   &mmx_mask,
 		                                   &mmx_dest));
@@ -3907,9 +4000,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
 	    if (m)
 	    {
-		*dst = pack_1x64_32 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
 	    }
 	    else
 	    {
@@ -3962,9 +4054,9 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
 	    if (m)
 	    {
-		*dst = pack_1x64_32 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (
+			xmm_src, expand_pixel_8_1x128 (m)));
 	    }
 	    else
 	    {
@@ -4004,7 +4096,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     int dst_stride, mask_stride;
     int32_t w;
     uint32_t m;
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -4023,8 +4115,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4041,12 +4133,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4114,12 +4206,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4156,7 +4248,7 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     int32_t w;
     uint32_t opaque, zero;
 
-    __m64 ms;
+    __m128i ms;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
@@ -4187,11 +4279,11 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    ms = unpack_32_1x64 (s);
+	    ms = unpack_32_1x128 (s);
 
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (
-		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
 	    w--;
 	}
 
@@ -4253,11 +4345,11 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    ms = unpack_32_1x64 (s);
+	    ms = unpack_32_1x128 (s);
 
 	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (
-		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
 	    w--;
 	}
     }
@@ -4320,9 +4412,9 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    *dst++ = pack_1x64_32 (
-		over_rev_non_pre_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
 	    w--;
 	}
@@ -4367,9 +4459,9 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	    s = *src++;
 	    d = *dst;
 
-	    *dst++ = pack_1x64_32 (
-		over_rev_non_pre_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
 	    w--;
 	}
@@ -4408,7 +4500,7 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -4422,8 +4514,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4440,12 +4532,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4509,12 +4601,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *dst;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
 
 		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
 	    }
 
@@ -4582,11 +4674,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
-				       unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (xmm_alpha,
+				       unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4619,11 +4711,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
     }
@@ -4689,10 +4781,10 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	{
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    _mm_movepi64_pi64 (xmm_alpha),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4717,10 +4809,10 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	{
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    _mm_movepi64_pi64 (xmm_alpha),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
     }
@@ -4774,9 +4866,9 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	    s = (uint32_t) *src++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4805,8 +4897,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	    s = (uint32_t) *src++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
 	    w--;
 	}
     }
@@ -4869,11 +4961,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		_mm_adds_pu16 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 	    w--;
 	}
 
@@ -4905,11 +4997,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	    m = (uint32_t) *mask++;
 	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		_mm_adds_pu16 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
 
 	    w--;
 	}
@@ -4973,10 +5065,10 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
 	while (w && ((unsigned long)dst & 15))
 	{
-	    *dst = (uint8_t)_mm_cvtsi64_si32 (
-		_mm_adds_pu8 (
-		    _mm_movepi64_pi64 (xmm_src),
-		    _mm_cvtsi32_si64 (*dst)));
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
 
 	    w--;
 	    dst++;
@@ -4993,10 +5085,10 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
 	while (w)
 	{
-	    *dst = (uint8_t)_mm_cvtsi64_si32 (
-		_mm_adds_pu8 (
-		    _mm_movepi64_pi64 (xmm_src),
-		    _mm_cvtsi32_si64 (*dst)));
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
 
 	    w--;
 	    dst++;
@@ -5284,7 +5376,7 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
     uint32_t m;
     int src_stride, mask_stride, dst_stride;
     int32_t w;
-    __m64 ms;
+    __m128i ms;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -5313,17 +5405,17 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             s = 0xff000000 | *src++;
             m = (uint32_t) *mask++;
             d = *dst;
-            ms = unpack_32_1x64 (s);
+            ms = unpack_32_1x128 (s);
 
             if (m != 0xff)
             {
-		__m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		__m64 md = unpack_32_1x64 (d);
+		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		__m128i md = unpack_32_1x128 (d);
 
-                ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
             }
 
-            *dst++ = pack_1x64_32 (ms);
+            *dst++ = pack_1x128_32 (ms);
             w--;
         }
 
@@ -5373,15 +5465,15 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                 }
                 else
                 {
-		    __m64 ma, md, ms;
+		    __m128i ma, md, ms;
 
                     d = *dst;
 
-		    ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		    md = unpack_32_1x64 (d);
-		    ms = unpack_32_1x64 (s);
+		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		    md = unpack_32_1x128 (d);
+		    ms = unpack_32_1x128 (s);
 
-                    *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                 }
 
             }
@@ -5457,15 +5549,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5529,15 +5621,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5591,12 +5683,12 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 
 	while (w && (unsigned long)dst & 15)
 	{
-	    __m64 vd;
+	    __m128i vd;
 
-	    vd = unpack_32_1x64 (*dst);
+	    vd = unpack_32_1x128 (*dst);
 
-	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
-					    _mm_movepi64_pi64 (xmm_src)));
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
 	    w--;
 	    dst++;
 	}
@@ -5626,12 +5718,12 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 
 	while (w)
 	{
-	    __m64 vd;
+	    __m128i vd;
 
-	    vd = unpack_32_1x64 (*dst);
+	    vd = unpack_32_1x128 (*dst);
 
-	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
-					    _mm_movepi64_pi64 (xmm_src)));
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
 	    w--;
 	    dst++;
 	}
@@ -5703,15 +5795,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5773,15 +5865,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 		}
 		else
 		{
-		    __m64 ms, md, ma, msa;
+		    __m128i ms, md, ma, msa;
 
-		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		    ms = unpack_32_1x64 (s);
-		    md = unpack_32_1x64 (d);
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
 
-		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
 		}
 	    }
 
@@ -5927,13 +6019,13 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 	{
 	    uint32_t d = *dst;
 
-	    __m64 ms = unpack_32_1x64 (s);
-	    __m64 alpha     = expand_alpha_1x64 (ms);
-	    __m64 dest      = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 alpha_dst = unpack_32_1x64 (d);
+	    __m128i ms = unpack_32_1x128 (s);
+	    __m128i alpha     = expand_alpha_1x128 (ms);
+	    __m128i dest      = xmm_mask;
+	    __m128i alpha_dst = unpack_32_1x128 (d);
 
-	    *dst = pack_1x64_32 (
-		in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+	    *dst = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
 	}
 	dst++;
 	w--;
@@ -5985,13 +6077,13 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 	{
 	    uint32_t d = *dst;
 
-	    __m64 ms = unpack_32_1x64 (s);
-	    __m64 alpha = expand_alpha_1x64 (ms);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i ms = unpack_32_1x128 (s);
+	    __m128i alpha = expand_alpha_1x128 (ms);
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst = pack_1x64_32 (
-		in_over_1x64 (&ms, &alpha, &mask, &dest));
+	    *dst = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &mask, &dest));
 	}
 
 	dst++;
-- 
1.7.3.1


