[Pixman] [PATCH 1/2] mmx: make ldq_u take __m64* directly

Wed Feb 15 15:40:47 PST 2012

Before, if __m64 is allocated in vector or floating-point registers,

	__m64 vs = ldq_u((uint64_t *)src);

would cause the value at src to be loaded into an integer register and
then transferred to an __m64 register. By switching ldq_u's argument
type to __m64 * we give the compiler enough information to recognize
that it can load to the vector register directly.

This patch is necessary for the Loongson optimizations when __m64 is
typedef'd as double.

Signed-off-by: Matt Turner <mattst88 at gmail.com>
---
 pixman/pixman-mmx.c |   42 +++++++++++++++++++++---------------------
 1 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 937ce8f..25557a6 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -328,7 +328,7 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
 
 /* Elemental unaligned loads */
 
-static __inline__ __m64 ldq_u(uint64_t *p)
+static __inline__ __m64 ldq_u(__m64 *p)
 {
 #ifdef USE_X86_MMX
     /* x86's alignment restrictions are very relaxed. */
@@ -341,7 +341,7 @@ static __inline__ __m64 ldq_u(uint64_t *p)
     aligned_p = (__m64 *)((uintptr_t)p & ~7);
     return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
 #else
-    struct __una_u64 { uint64_t x __attribute__((packed)); };
+    struct __una_u64 { __m64 x __attribute__((packed)); };
     const struct __una_u64 *ptr = (const struct __una_u64 *) p;
     return (__m64) ptr->x;
 #endif
@@ -1427,7 +1427,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 
 	while (w >= 2)
 	{
-	    __m64 vs = ldq_u((uint64_t *)src);
+	    __m64 vs = ldq_u((__m64 *)src);
 	    __m64 vd = *(__m64 *)dst;
 	    __m64 vsrc0 = expand8888 (vs, 0);
 	    __m64 vsrc1 = expand8888 (vs, 1);
@@ -1508,14 +1508,14 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    __m64 vd6 = *(__m64 *)(dst + 12);
 	    __m64 vd7 = *(__m64 *)(dst + 14);
 
-	    __m64 vs0 = ldq_u((uint64_t *)(src + 0));
-	    __m64 vs1 = ldq_u((uint64_t *)(src + 2));
-	    __m64 vs2 = ldq_u((uint64_t *)(src + 4));
-	    __m64 vs3 = ldq_u((uint64_t *)(src + 6));
-	    __m64 vs4 = ldq_u((uint64_t *)(src + 8));
-	    __m64 vs5 = ldq_u((uint64_t *)(src + 10));
-	    __m64 vs6 = ldq_u((uint64_t *)(src + 12));
-	    __m64 vs7 = ldq_u((uint64_t *)(src + 14));
+	    __m64 vs0 = ldq_u((__m64 *)(src + 0));
+	    __m64 vs1 = ldq_u((__m64 *)(src + 2));
+	    __m64 vs2 = ldq_u((__m64 *)(src + 4));
+	    __m64 vs3 = ldq_u((__m64 *)(src + 6));
+	    __m64 vs4 = ldq_u((__m64 *)(src + 8));
+	    __m64 vs5 = ldq_u((__m64 *)(src + 10));
+	    __m64 vs6 = ldq_u((__m64 *)(src + 12));
+	    __m64 vs7 = ldq_u((__m64 *)(src + 14));
 
 	    vd0 = pack8888 (
 	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
@@ -2794,7 +2794,7 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp,
 
 	while (w >= 8)
 	{
-	    *(__m64*)dst = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
+	    *(__m64*)dst = _mm_adds_pu8 (ldq_u((__m64 *)src), *(__m64*)dst);
 	    dst += 8;
 	    src += 8;
 	    w -= 8;
@@ -2852,7 +2852,7 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 	while (w >= 2)
 	{
-	    dst64 = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
+	    dst64 = _mm_adds_pu8 (ldq_u((__m64 *)src), *(__m64*)dst);
 	    *(uint64_t*)dst = to_uint64 (dst64);
 	    dst += 2;
 	    src += 2;
@@ -2977,14 +2977,14 @@ pixman_blt_mmx (uint32_t *src_bits,
 		  "%mm0", "%mm1", "%mm2", "%mm3",
 		  "%mm4", "%mm5", "%mm6", "%mm7");
 #else
-	    __m64 v0 = ldq_u((uint64_t *)(s + 0));
-	    __m64 v1 = ldq_u((uint64_t *)(s + 8));
-	    __m64 v2 = ldq_u((uint64_t *)(s + 16));
-	    __m64 v3 = ldq_u((uint64_t *)(s + 24));
-	    __m64 v4 = ldq_u((uint64_t *)(s + 32));
-	    __m64 v5 = ldq_u((uint64_t *)(s + 40));
-	    __m64 v6 = ldq_u((uint64_t *)(s + 48));
-	    __m64 v7 = ldq_u((uint64_t *)(s + 56));
+	    __m64 v0 = ldq_u((__m64 *)(s + 0));
+	    __m64 v1 = ldq_u((__m64 *)(s + 8));
+	    __m64 v2 = ldq_u((__m64 *)(s + 16));
+	    __m64 v3 = ldq_u((__m64 *)(s + 24));
+	    __m64 v4 = ldq_u((__m64 *)(s + 32));
+	    __m64 v5 = ldq_u((__m64 *)(s + 40));
+	    __m64 v6 = ldq_u((__m64 *)(s + 48));
+	    __m64 v7 = ldq_u((__m64 *)(s + 56));
 	    *(__m64 *)(d + 0)  = v0;
 	    *(__m64 *)(d + 8)  = v1;
 	    *(__m64 *)(d + 16) = v2;
-- 
1.7.3.4