[Pixman] [PATCH 03/11] mmx: make ldq_u take __m64* directly
Matt Turner
mattst88 at gmail.com
Wed Mar 14 21:00:41 PDT 2012
Before, if __m64 is allocated in vector or floating-point registers,
__m64 vs = ldq_u((uint64_t *)src);
would cause the data at src to be loaded into an integer register and then
transferred to an __m64 register. By switching ldq_u's argument type to
__m64 * we give the compiler enough information to recognize that it can
load into the vector register directly.
This patch is necessary for the Loongson optimizations when __m64 is
typedef'd as double.
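For illustration only (not part of this patch), here is a minimal standalone
sketch of the two shapes of the generic packed-struct fallback, with __m64
modeled as a plain double the way the Loongson port typedefs it. The names
m64_t, ldq_u_via_gpr and ldq_u_direct are made up for this example; only the
second form matches the patched ldq_u (__m64 *p) signature.

#include <stdint.h>
#include <string.h>

typedef double m64_t;  /* stand-in for __m64 on targets where it is double */

/* Integer-typed unaligned load: the data lands in a general-purpose
 * register first, and a separate bit-for-bit transfer is needed to reach
 * the FP/vector register file. */
static inline m64_t
ldq_u_via_gpr (const void *p)
{
    struct una_u64 { uint64_t x __attribute__ ((packed)); };
    uint64_t bits = ((const struct una_u64 *) p)->x;
    m64_t v;
    memcpy (&v, &bits, sizeof v);   /* GPR -> FP move, bit pattern preserved */
    return v;
}

/* FP-typed unaligned load, the shape this patch gives ldq_u: the pointee
 * type already says "FP/vector", so the compiler can load the unaligned
 * data straight into the FP register. */
static inline m64_t
ldq_u_direct (const m64_t *p)
{
    struct una_m64 { m64_t x __attribute__ ((packed)); };
    return ((const struct una_m64 *) p)->x;
}

int
main (void)
{
    unsigned char buf[16] = { 0 };
    /* Both calls read 8 bytes from a possibly unaligned address. */
    m64_t a = ldq_u_via_gpr (buf + 1);
    m64_t b = ldq_u_direct ((const m64_t *) (buf + 1));
    return (a == b) ? 0 : 1;
}

The packed-struct dereference mirrors the idiom already used in
pixman-mmx.c; the point is only that the pointee type of the load decides
which register file the compiler targets first.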
Signed-off-by: Matt Turner <mattst88 at gmail.com>
---
pixman/pixman-mmx.c | 54 +++++++++++++++++++++++++-------------------------
1 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 137a214..7142bd8 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -317,7 +317,7 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
/* Elemental unaligned loads */
-static __inline__ __m64 ldq_u(uint64_t *p)
+static __inline__ __m64 ldq_u (__m64 *p)
{
#ifdef USE_X86_MMX
/* x86's alignment restrictions are very relaxed. */
@@ -330,13 +330,13 @@ static __inline__ __m64 ldq_u(uint64_t *p)
aligned_p = (__m64 *)((uintptr_t)p & ~7);
return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
- struct __una_u64 { uint64_t x __attribute__((packed)); };
+ struct __una_u64 { __m64 x __attribute__((packed)); };
const struct __una_u64 *ptr = (const struct __una_u64 *) p;
return (__m64) ptr->x;
#endif
}
-static __inline__ uint32_t ldl_u(const uint32_t *p)
+static __inline__ uint32_t ldl_u (const uint32_t *p)
{
#ifdef USE_X86_MMX
/* x86's alignment restrictions are very relaxed. */
@@ -363,8 +363,8 @@ load8888 (const uint32_t *v)
static force_inline __m64
load8888u (const uint32_t *v)
{
- uint32_t l = ldl_u(v);
- return load8888(&l);
+ uint32_t l = ldl_u (v);
+ return load8888 (&l);
}
static force_inline __m64
@@ -382,7 +382,7 @@ store (uint32_t *dest, __m64 v)
static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
- v = pack8888 (v, _mm_setzero_si64());
+ v = pack8888 (v, _mm_setzero_si64 ());
store (dest, v);
}
@@ -1445,7 +1445,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
while (w >= 2)
{
- __m64 vs = ldq_u((uint64_t *)src);
+ __m64 vs = ldq_u ((__m64 *)src);
__m64 vd = *(__m64 *)dst;
__m64 vsrc0 = expand8888 (vs, 0);
__m64 vsrc1 = expand8888 (vs, 1);
@@ -1527,14 +1527,14 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
__m64 vd6 = *(__m64 *)(dst + 12);
__m64 vd7 = *(__m64 *)(dst + 14);
- __m64 vs0 = ldq_u((uint64_t *)(src + 0));
- __m64 vs1 = ldq_u((uint64_t *)(src + 2));
- __m64 vs2 = ldq_u((uint64_t *)(src + 4));
- __m64 vs3 = ldq_u((uint64_t *)(src + 6));
- __m64 vs4 = ldq_u((uint64_t *)(src + 8));
- __m64 vs5 = ldq_u((uint64_t *)(src + 10));
- __m64 vs6 = ldq_u((uint64_t *)(src + 12));
- __m64 vs7 = ldq_u((uint64_t *)(src + 14));
+ __m64 vs0 = ldq_u ((__m64 *)(src + 0));
+ __m64 vs1 = ldq_u ((__m64 *)(src + 2));
+ __m64 vs2 = ldq_u ((__m64 *)(src + 4));
+ __m64 vs3 = ldq_u ((__m64 *)(src + 6));
+ __m64 vs4 = ldq_u ((__m64 *)(src + 8));
+ __m64 vs5 = ldq_u ((__m64 *)(src + 10));
+ __m64 vs6 = ldq_u ((__m64 *)(src + 12));
+ __m64 vs7 = ldq_u ((__m64 *)(src + 14));
vd0 = pack8888 (
in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
@@ -2814,7 +2814,7 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp,
while (w >= 8)
{
- *(__m64*)dst = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
+ *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
dst += 8;
src += 8;
w -= 8;
@@ -2872,7 +2872,7 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
while (w >= 2)
{
- dst64 = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst);
+ dst64 = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
*(uint64_t*)dst = to_uint64 (dst64);
dst += 2;
src += 2;
@@ -2963,7 +2963,7 @@ pixman_blt_mmx (uint32_t *src_bits,
while (w >= 4 && ((unsigned long)d & 7))
{
- *(uint32_t *)d = ldl_u((uint32_t *)s);
+ *(uint32_t *)d = ldl_u ((uint32_t *)s);
w -= 4;
s += 4;
@@ -2997,14 +2997,14 @@ pixman_blt_mmx (uint32_t *src_bits,
"%mm0", "%mm1", "%mm2", "%mm3",
"%mm4", "%mm5", "%mm6", "%mm7");
#else
- __m64 v0 = ldq_u((uint64_t *)(s + 0));
- __m64 v1 = ldq_u((uint64_t *)(s + 8));
- __m64 v2 = ldq_u((uint64_t *)(s + 16));
- __m64 v3 = ldq_u((uint64_t *)(s + 24));
- __m64 v4 = ldq_u((uint64_t *)(s + 32));
- __m64 v5 = ldq_u((uint64_t *)(s + 40));
- __m64 v6 = ldq_u((uint64_t *)(s + 48));
- __m64 v7 = ldq_u((uint64_t *)(s + 56));
+ __m64 v0 = ldq_u ((__m64 *)(s + 0));
+ __m64 v1 = ldq_u ((__m64 *)(s + 8));
+ __m64 v2 = ldq_u ((__m64 *)(s + 16));
+ __m64 v3 = ldq_u ((__m64 *)(s + 24));
+ __m64 v4 = ldq_u ((__m64 *)(s + 32));
+ __m64 v5 = ldq_u ((__m64 *)(s + 40));
+ __m64 v6 = ldq_u ((__m64 *)(s + 48));
+ __m64 v7 = ldq_u ((__m64 *)(s + 56));
*(__m64 *)(d + 0) = v0;
*(__m64 *)(d + 8) = v1;
*(__m64 *)(d + 16) = v2;
@@ -3021,7 +3021,7 @@ pixman_blt_mmx (uint32_t *src_bits,
}
while (w >= 4)
{
- *(uint32_t *)d = ldl_u((uint32_t *)s);
+ *(uint32_t *)d = ldl_u ((uint32_t *)s);
w -= 4;
s += 4;
--
1.7.3.4