[Pixman] [PATCH 2/2] mmx: compile on MIPS for loongson SIMD optimizations

Wed Feb 15 15:40:48 PST 2012

Signed-off-by: Matt Turner <mattst88 at gmail.com>
---
Background: Lemote (http://en.wikipedia.org/wiki/Lemote) makes computers
with Loongson MIPS processors. They have optimized pixman with inline
assembly, but the code is GPL and is messy, to the point that I wouldn't
want to include it. http://dev.lemote.com/cgit/Pixman.Loongson.git/

I set about optimizing by doing the same thing I did with iwMMXt:
using pixman-mmx.c directly. I had to implement equivalent _mm_*-style
intrinsic functions since Lemote or whoever wrote gcc support decided
to create their own set of (very awful) intrinsics.

This patch is actually two squashed WIP patches, which can be seen in their
original forms here
	ssh://people.freedesktop.org/~mattst88/pixman loongson-wip

The first patch implements the _mm_* intrinsics using gcc's Loongson
intrinsics. But because those intrinsics are so poorly designed, very
inefficient machine code is generated. The second patch reimplements the _mm_*
intrinsics using inline assembly, which gives much better code and performance.

I don't think there's value in committing them separately.

Performance isn't improved in all cases. I think that's because the existing
code in pixman-fast-paths.c is very good about skipping compositing if src == 0
or 0xff. Some MMX functions are currently missing this, like over_8888_0565.

http://people.freedesktop.org/~mattst88/pixman-loongson-benchdata.txt

Still TODO: runtime checks and a better configure test. For runtime checks
I guess the only thing I can do is a /proc/cpuinfo check like what's implemented
in the DSPr2 patches.

 configure.ac               |   46 ++++++++++
 pixman/Makefile.am         |   12 +++
 pixman/loongson-mmintrin.h |  216 ++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-cpu.c        |    4 +-
 pixman/pixman-mmx.c        |   20 ++++-
 pixman/pixman-private.h    |    2 +-
 test/Makefile.sources      |    6 +-
 7 files changed, 298 insertions(+), 8 deletions(-)
 create mode 100644 pixman/loongson-mmintrin.h

diff --git a/configure.ac b/configure.ac
index 9f81b11..b5784ba 100644
--- a/configure.ac
+++ b/configure.ac
@@ -271,6 +271,52 @@ error Need Sun Studio 8 for visibility
 ])
 
 dnl ===========================================================================
+dnl Check for Loongson SIMD intrinsics. NOTE(review): the probe below calls _mm_srli_pi16 with only <loongson.h> included and __m64 declared as uint64_t, while pixman/loongson-mmintrin.h declares __m64 as double -- confirm the probe exercises what the build actually uses.
+
+have_loongson_SIMD=no
+AC_MSG_CHECKING(whether to use Loongson SIMD) 
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#ifndef __mips_loongson_vector_rev
+#error "Loongson SIMD intrinsics are only available on Loongson"
+#endif
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 4.4 for Loongson SIMD compilation"
+#endif
+#include <loongson.h>
+typedef uint64_t __m64;
+int main () {
+    union {
+        __m64 v;
+        char c[8];
+    } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+    int b = 4;
+    __m64 c = _mm_srli_pi16 (a.v, b);
+    return 0;
+}]])], have_loongson_SIMD=yes)
+
+
+AC_ARG_ENABLE(loongson,
+   [AC_HELP_STRING([--disable-loongson],
+                   [disable Loongson fast paths])],
+   [enable_loongson=$enableval], [enable_loongson=auto])
+
+if test $enable_loongson = no ; then
+   have_loongson_SIMD=disabled
+fi
+
+if test $have_loongson_SIMD = yes ; then
+   AC_DEFINE(USE_LOONGSON_SIMD, 1, [use Loongson SIMD])
+fi
+
+AC_MSG_RESULT($have_loongson_SIMD)
+if test $enable_loongson = yes && test $have_loongson_SIMD = no ; then
+   AC_MSG_ERROR([Loongson SIMD not detected])
+fi
+
+AM_CONDITIONAL(USE_LOONGSON_SIMD, test $have_loongson_SIMD = yes)
+
+dnl ===========================================================================
 dnl Check for MMX
 
 if test "x$MMX_CFLAGS" = "x" ; then
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 286b7cf..67e8b72 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -102,5 +102,17 @@ libpixman_1_la_LIBADD += libpixman-iwmmxt.la
 ASM_CFLAGS_IWMMXT=$(IWMMXT_CFLAGS)
 endif
 
+# Loongson SIMD: compile pixman-mmx.c into libpixman-loongson-simd.la (not installed) and link it into libpixman-1
+if USE_LOONGSON_SIMD
+noinst_LTLIBRARIES += libpixman-loongson-simd.la
+libpixman_loongson_simd_la_SOURCES = pixman-mmx.c
+libpixman_loongson_simd_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS)
+libpixman_loongson_simd_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-loongson-simd.la
+
+ASM_CFLAGS_ls=$(LS_CFLAGS)
+endif
+
 .c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
 	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
new file mode 100644
index 0000000..eb6a614
--- /dev/null
+++ b/pixman/loongson-mmintrin.h
@@ -0,0 +1,216 @@
+/* The gcc-provided Loongson intrinsic functions are too broken to be of
+ * any use; otherwise I'd use them.
+ *
+ * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
+ *   close enough that they could have implemented the _mm_*-style intrinsic
+ *   interface and had a ton of optimized code available to them. Instead they
+ *   implemented something much, much worse.
+ *
+ * - pshuf takes a dead first argument, causing extra instructions to be
+ *   generated.
+ *
+ * - There are no 64-bit shift or logical intrinsics, which means you have
+ *   to implement them with inline assembly, but this is a nightmare because
+ *   gcc doesn't understand that the integer vector datatypes are actually in
+ *   floating-point registers, so you end up with braindead code like
+ *
+ *	punpcklwd	$f9,$f9,$f5
+ *	    dmtc1	v0,$f8
+ *	punpcklwd	$f19,$f19,$f5
+ *	    dmfc1	t9,$f9
+ *	    dmtc1	v0,$f9
+ *	    dmtc1	t9,$f20
+ *	    dmfc1	s0,$f19
+ *	punpcklbh	$f20,$f20,$f2
+ *
+ *   where values just get copied back and forth between integer and floating-
+ *   point registers ad nauseam.
+ *
+ * Instead of trying to work around the problems with these intrinsics, I
+ * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
+ * assembly.
+ */
+
+/* vectors are stored in 64-bit floating-point registers; the inline asm in this header binds __m64 operands with the "f" constraint */
+typedef double __m64;
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void) /* returns the constant 0.0 as an __m64; no asm involved */
+{
+	return 0.0; /* assumes +0.0 is stored as all-zero bits (IEEE 754), so it can serve as a zero vector -- TODO confirm */
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2) /* stand-in for the x86 MMX intrinsic of the same name: emits paddush on __m1, __m2 (presumably an unsigned saturating 16-bit add -- confirm against the Loongson manual) */
+{
+	__m64 ret;
+	asm("paddush %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2) /* stand-in for the x86 MMX intrinsic of the same name: emits paddusb on __m1, __m2 (presumably an unsigned saturating 8-bit add -- confirm against the Loongson manual) */
+{
+	__m64 ret;
+	asm("paddusb %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2) /* bitwise AND of the two 64-bit values, using the `and` instruction on FP-register operands */
+{
+	__m64 ret;
+	asm("and %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si64 (int __i) /* moves the integer __i from a general-purpose register into an FP register and returns it as an __m64 */
+{
+	__m64 ret;
+	asm("dmtc1 %1, %0\n\t" /* NOTE(review): dmtc1 copies the whole 64-bit GPR, so for negative __i the upper 32 bits are presumably set, whereas x86 _mm_cvtsi32_si64 zeroes them -- confirm callers only rely on the low half */
+	   : "=&f" (ret)
+	   : "r" (__i)
+	);
+	return ret;
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si32 (__m64 __i) /* returns the low 32 bits of the vector __i as an int */
+{
+	int ret;
+	asm("mfc1 %0, %1\n\t" /* mfc1, not dmfc1: a 64-bit move would leave vector bits 32..63 in the register backing a 32-bit int, i.e. a value that is not properly sign-extended */
+	   : "=&r" (ret)
+	   : "f" (__i)
+	);
+	return ret;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+	/* intentionally empty: emits no instructions. Presumably kept only so shared MMX code that calls _mm_empty () still compiles -- confirm */
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2) /* stand-in for the x86 MMX intrinsic of the same name: emits pmullh on __m1, __m2 (presumably the low 16 bits of each 16-bit product -- confirm against the Loongson manual) */
+{
+	__m64 ret;
+	asm("pmullh %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2) /* bitwise OR of the two 64-bit values, using the `or` instruction on FP-register operands */
+{
+	__m64 ret;
+	asm("or %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2) /* stand-in for the x86 MMX intrinsic of the same name: emits packushb on __m1, __m2 (NOTE(review): presumably packs 16-bit lanes into unsigned-saturated bytes as on x86 -- confirm) */
+{
+	__m64 ret;
+	asm("packushb %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __m, int __n) /* stand-in for the x86 intrinsic of the same name: emits pshufh on __m with the selector __n (presumably reorders the four 16-bit lanes -- confirm against the Loongson manual) */
+{
+	__m64 ret, tmp; /* tmp is a scratch FP register; its value is discarded after the asm */
+	asm("dmtc1  %3, %1\n\t" /* pshufh reads its selector from an FP register, so copy __n across first */
+	    "pshufh %0, %2, %1\n\t"
+	    : "=&f" (ret), "=&f" (tmp) /* tmp is written by dmtc1 before %2 is read, so it must be early-clobber */
+	    : "f" (__m), "r" (__n)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int __count) /* shifts the whole 64-bit value __m left by __count bits using dsll on FP-register operands */
+{
+	__m64 ret, tmp; /* tmp is a scratch FP register; its value is discarded after the asm */
+	asm("dmtc1 %3, %1\n\t" /* the shift reads its count from an FP register, so copy __count across first */
+	    "dsll  %0, %2, %1\n\t"
+	   : "=&f" (ret), "=&f" (tmp) /* tmp is written by dmtc1 before %2 is read, so it must be early-clobber */
+	   : "f" (__m), "r" (__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int __count) /* stand-in for the x86 MMX intrinsic of the same name: emits psrlh on __m with __count (presumably a logical right shift of each 16-bit lane -- confirm against the Loongson manual) */
+{
+	__m64 ret, tmp; /* tmp is a scratch FP register; its value is discarded after the asm */
+	asm("dmtc1 %3, %1 \n\t" /* psrlh reads its count from an FP register, so copy __count across first */
+	    "psrlh %0, %2, %1\n\t"
+	   : "=&f" (ret), "=&f" (tmp) /* tmp is written by dmtc1 before %2 is read, so it must be early-clobber */
+	   : "f" (__m), "r" (__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int __count) /* shifts the whole 64-bit value __m right (logical) by __count bits using dsrl on FP-register operands */
+{
+	__m64 ret, tmp; /* tmp is a scratch FP register; its value is discarded after the asm */
+	asm("dmtc1 %3, %1 \n\t" /* the shift reads its count from an FP register, so copy __count across first */
+	    "dsrl  %0, %2, %1\n\t"
+	   : "=&f" (ret), "=&f" (tmp) /* tmp is written by dmtc1 before %2 is read, so it must be early-clobber */
+	   : "f" (__m), "r" (__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) /* stand-in for the x86 MMX intrinsic of the same name: emits punpckhbh on __m1, __m2 (presumably interleaves the high-half bytes of the two operands -- confirm against the Loongson manual) */
+{
+	__m64 ret;
+	asm("punpckhbh %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) /* stand-in for the x86 MMX intrinsic of the same name: emits punpcklbh on __m1, __m2 (presumably interleaves the low-half bytes of the two operands -- confirm against the Loongson manual) */
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2) /* bitwise XOR of the two 64-bit values, using the `xor` instruction on FP-register operands */
+{
+	__m64 ret;
+	asm("xor %0, %1, %2\n\t"
+	   : "=&f" (ret) /* "&" (early-clobber) keeps ret in a register distinct from both inputs */
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c
index 92942b2..8d98588 100644
--- a/pixman/pixman-cpu.c
+++ b/pixman/pixman-cpu.c
@@ -690,7 +690,9 @@ _pixman_choose_implementation (void)
     if (pixman_have_arm_iwmmxt ())
 	imp = _pixman_implementation_create_mmx (imp);
 #endif
-
+#ifdef USE_LOONGSON_SIMD
+    imp = _pixman_implementation_create_mmx (imp);
+#endif
 #ifdef USE_ARM_NEON
     if (pixman_have_arm_neon ())
 	imp = _pixman_implementation_create_arm_neon (imp);
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 25557a6..047492d 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -33,9 +33,13 @@
 #include <config.h>
 #endif
 
-#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_SIMD
 
+#ifdef USE_LOONGSON_SIMD
+#include <loongson-mmintrin.h>
+#else
 #include <mmintrin.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 
@@ -83,11 +87,14 @@ _mm_empty (void)
  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
  * If __m64 and uint64_t values can just be cast to each other directly,
  * then define USE_M64_CASTS.
+ * If __m64 is a double datatype, then define USE_M64_DOUBLE.
  */
 #ifdef _MSC_VER
 # define M64_MEMBER m64_u64
 #elif defined(__ICC)
 # define USE_CVT_INTRINSICS
+#elif defined(USE_LOONGSON_SIMD)
+# define USE_M64_DOUBLE
 #elif defined(__GNUC__)
 # define USE_M64_CASTS
 #elif defined(__SUNPRO_C)
@@ -105,7 +112,7 @@ _mm_empty (void)
 # endif
 #endif
 
-#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS)
+#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
 typedef uint64_t mmxdatafield;
 #else
 typedef __m64 mmxdatafield;
@@ -161,6 +168,8 @@ static const mmx_data_t c =
 #    define MC(x) to_m64 (c.mmx_ ## x)
 #elif defined(USE_M64_CASTS)
 #    define MC(x) ((__m64)c.mmx_ ## x)
+#elif defined(USE_M64_DOUBLE)
+#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
 #else
 #    define MC(x) c.mmx_ ## x
 #endif
@@ -175,6 +184,8 @@ to_m64 (uint64_t x)
 
     res.M64_MEMBER = x;
     return res;
+#elif defined USE_M64_DOUBLE
+    return *(__m64 *)&x;
 #else /* USE_M64_CASTS */
     return (__m64)x;
 #endif
@@ -188,6 +199,8 @@ to_uint64 (__m64 x)
 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
     uint64_t res = x.M64_MEMBER;
     return res;
+#elif defined USE_M64_DOUBLE
+    return *(uint64_t *)&x;
 #else /* USE_M64_CASTS */
     return (uint64_t)x;
 #endif
@@ -3154,6 +3167,7 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
+
     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
     PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
     PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
@@ -3253,4 +3267,4 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     return imp;
 }
 
-#endif /* USE_X86_MMX || USE_ARM_IWMMXT */
+#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_SIMD */
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 8560385..86ed295 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -539,7 +539,7 @@ _pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
 pixman_implementation_t *
 _pixman_implementation_create_noop (pixman_implementation_t *fallback);
 
-#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_SIMD
 pixman_implementation_t *
 _pixman_implementation_create_mmx (pixman_implementation_t *fallback);
 #endif
diff --git a/test/Makefile.sources b/test/Makefile.sources
index 99eb705..50ce15a 100644
--- a/test/Makefile.sources
+++ b/test/Makefile.sources
@@ -1,5 +1,8 @@
 # Tests (sorted by expected completion time)
 TESTPROGRAMS =			\
+	composite-traps-test	\
+	scaling-test		\
+	blitters-test		\
 	a1-trap-test		\
 	pdf-op-test		\
 	region-test		\
@@ -14,9 +17,6 @@ TESTPROGRAMS =			\
 	region-contains-test	\
 	alphamap		\
 	stress-test		\
-	composite-traps-test	\
-	blitters-test		\
-	scaling-test		\
 	affine-test		\
 	composite		\
 	$(NULL)
-- 
1.7.3.4