pixman: Branch 'master' - 3 commits

Mon Jun 15 11:12:27 PDT 2009

configure.ac             |    6 +--
 pixman/pixman-arm-neon.c |   94 +++++++++++++++++++++++++++++++----------------
 2 files changed, 65 insertions(+), 35 deletions(-)

New commits:
commit b1cb5922f785310ef790811b52e4e2b0c85dfccc
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date:   Mon Jun 15 16:09:32 2009 +0300

    Add RVCT support for straight blitter.

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 5fd82ab..467a0dd 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1837,7 +1837,6 @@ pixman_fill_neon (uint32_t *bits,
 #endif
 }
 
-#ifdef USE_GCC_INLINE_ASM
 
 // TODO: is there a more generic way of doing this being introduced?
 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
@@ -1849,11 +1848,16 @@ static inline void QuadwordCopy_neon(
 	uint32_t trailerCount // of bytes
 )
 {
+	uint8_t *tDst = dst, *tSrc = src;
+
 	// Uses aligned multi-register loads to maximise read bandwidth
 	// on uncached memory such as framebuffers
 	// The accesses do not have the aligned qualifiers, so that the copy
 	// may convert between aligned-uncached and unaligned-cached memory.
 	// It is assumed that the CPU can infer alignedness from the address.
+
+#ifdef USE_GCC_INLINE_ASM
+
 	asm volatile (
 	"	cmp       %[count], #8						\n"
 	"	blt 1f    @ skip oversized fragments		\n"
@@ -1889,7 +1893,7 @@ static inline void QuadwordCopy_neon(
 	"4: @ end										\n"
 
 	// Clobbered input registers marked as input/outputs
-	: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+	: [dst] "+r" (tDst), [src] "+r" (tSrc), [count] "+r" (count)
 
 	// No unclobbered inputs
 	:
@@ -1899,32 +1903,67 @@ static inline void QuadwordCopy_neon(
 	: "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
 	);
 
+#else
+
+	while(count >= 8) {
+		uint8x16x4_t t1 = vld4q_u8(tSrc);
+		uint8x16x4_t t2 = vld4q_u8(tSrc + sizeof(uint8x16x4_t));
+		tSrc += sizeof(uint8x16x4_t) * 2;
+		vst4q_u8(tDst, t1);
+		vst4q_u8(tDst + sizeof(uint8x16x4_t), t2);
+		tDst += sizeof(uint8x16x4_t) * 2;
+		count -= 8;
+	}
+
+	if(count & 4) {
+		uint8x16x4_t t1 = vld4q_u8(tSrc);
+		tSrc += sizeof(uint8x16x4_t);
+		vst4q_u8(tDst, t1);
+		tDst += sizeof(uint8x16x4_t);
+	}
+
+	if(count & 2) {
+		uint8x8x4_t t1 = vld4_u8(tSrc);
+		tSrc += sizeof(uint8x8x4_t);
+		vst4_u8(tDst, t1);
+		tDst += sizeof(uint8x8x4_t);
+	}
+
+	if(count & 1) {
+		uint8x16_t t1 = vld1q_u8(tSrc);
+		tSrc += sizeof(uint8x16_t);
+		vst1q_u8(tDst, t1);
+		tDst += sizeof(uint8x16_t);
+	}
+
+#endif  // !USE_GCC_INLINE_ASM
+
 	if(trailerCount) {
-		uint8_t *tDst = dst, *tSrc = src;
+		if(trailerCount & 8) {
+			uint8x8_t t1 = vld1_u8(tSrc);
+			tSrc += sizeof(uint8x8_t);
+			vst1_u8(tDst, t1);
+			tDst += sizeof(uint8x8_t);
+		}
 
-		while(trailerCount >= 4) {
+		if(trailerCount & 4) {
 			*((uint32_t*) tDst) = *((uint32_t*) tSrc);
 			tDst += 4;
 			tSrc += 4;
-			trailerCount -= 4;
 		}
 
-		if(trailerCount >= 2) {
+		if(trailerCount & 2) {
 			*((uint16_t*) tDst) = *((uint16_t*) tSrc);
 			tDst += 2;
 			tSrc += 2;
-			trailerCount -= 2;
 		}
 
-		if(trailerCount) {
+		if(trailerCount & 1) {
 			*tDst++ = *tSrc++;
-			trailerCount--;
 		}
 	}
 }
 
-#endif  // USE_GCC_INLINE_ASM
-
 static const FastPathInfo arm_neon_fast_path_array[] = 
 {
     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       fbCompositeSrcAdd_8888x8x8neon,        0 },
@@ -1999,12 +2038,9 @@ pixman_blt_neon (
 	int dst_x, int dst_y,
 	int width, int height)
 {
-
 	if(!width || !height)
 		return TRUE;
 
-#ifdef USE_GCC_INLINE_ASM
-
 	// accelerate only straight copies involving complete bytes
 	if(src_bpp != dst_bpp || (src_bpp & 7))
 		return FALSE;
@@ -2027,13 +2063,6 @@ pixman_blt_neon (
 	}
 
 	return TRUE;
-
-#else /* USE_GCC_INLINE_ASM */
-
-	// TODO: intrinsic version for armcc
-	return FALSE;
-
-#endif
 }
 
 static pixman_bool_t
commit b6a3868ced67eb363273bfbee0d850c4d06cca34
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date:   Mon Jun 15 16:02:04 2009 +0300

    Better CFLAGS handling for recent ARM platforms.

diff --git a/configure.ac b/configure.ac
index 768f6b7..c410e62 100644
--- a/configure.ac
+++ b/configure.ac
@@ -327,7 +327,7 @@ AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
 
 dnl ===========================================================================
 dnl Check for ARM SIMD instructions
-ARM_SIMD_CFLAGS=""
+ARM_SIMD_CFLAGS="-march=armv6"
 
 have_arm_simd=no
 AC_MSG_CHECKING(whether to use ARM SIMD assembler)
@@ -366,7 +366,7 @@ AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
 
 dnl ==========================================================================
 dnl Check for ARM NEON instructions
-ARM_NEON_CFLAGS="-mfpu=neon -mfloat-abi=softfp"
+ARM_NEON_CFLAGS="-mfpu=neon"
 
 have_arm_neon=no
 AC_MSG_CHECKING(whether to use ARM NEON)
@@ -470,8 +470,6 @@ AC_SUBST(GTK_LIBS)
 AC_SUBST(DEP_CFLAGS)
 AC_SUBST(DEP_LIBS)
 
-
-		  
 AC_OUTPUT([pixman-1.pc
            pixman-1-uninstalled.pc
            Makefile
commit 1217c11a02ef60a3955fd98f7cec48de4cb9561b
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date:   Wed May 27 15:31:59 2009 +0300

    Misc warning fixes.

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index abcd24f..5fd82ab 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -159,7 +159,7 @@ fbCompositeSrcAdd_8000x8000neon (
             srcLine += srcStride;
             w = width;
 
-            uint8_t *keep_dst;
+            uint8_t *keep_dst=0;
 
 #ifndef USE_GCC_INLINE_ASM
             uint8x8_t sval,dval,temp;
@@ -226,6 +226,9 @@ fbCompositeSrcAdd_8000x8000neon (
     }
     else
     {
+        const uint8_t nil = 0;
+        const uint8x8_t vnil = vld1_dup_u8(&nil);
+
         while (height--)
         {
             dst = dstLine;
@@ -233,8 +236,8 @@ fbCompositeSrcAdd_8000x8000neon (
             src = srcLine;
             srcLine += srcStride;
             w = width;
-            uint8x8_t sval, dval;
-            uint8_t *dst4, *dst2;
+            uint8x8_t sval=vnil, dval=vnil;
+            uint8_t *dst4=0, *dst2=0;
 
             if (w&4)
             {
@@ -306,7 +309,7 @@ fbCompositeSrc_8888x8888neon (
 	    srcLine += srcStride;
 	    w = width;
 
-            uint32_t *keep_dst;
+            uint32_t *keep_dst=0;
 
 #ifndef USE_GCC_INLINE_ASM
             uint8x8x4_t sval,dval,temp;
@@ -472,7 +475,7 @@ fbCompositeSrc_8888x8x8888neon (
             srcLine += srcStride;
             w = width;
 
-            uint32_t *keep_dst;
+            uint32_t *keep_dst=0;
 
 #ifndef USE_GCC_INLINE_ASM
             uint8x8x4_t sval,dval,temp;
@@ -674,7 +677,7 @@ fbCompositeSolidMask_nx8x0565neon (
         // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
         while (height--)
         {
-            uint16_t *keep_dst;
+            uint16_t *keep_dst=0;
 
             dst = dstLine;
             dstLine += dstStride;
@@ -810,7 +813,7 @@ fbCompositeSolidMask_nx8x0565neon (
     {
         while (height--)
         {
-            void *dst4, *dst2;
+            void *dst4=0, *dst2=0;
 
             dst = dstLine;
             dstLine += dstStride;
@@ -1012,7 +1015,7 @@ fbCompositeSolidMask_nx8x8888neon (
         // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
         while (height--)
         {
-            uint32_t *keep_dst;
+            uint32_t *keep_dst=0;
 
             dst = dstLine;
             dstLine += dstStride;
@@ -1251,8 +1254,8 @@ fbCompositeSrcAdd_8888x8x8neon (
             maskLine += maskStride;
             w = width;
 
-            uint8x8_t mval, dval, res;
-            uint8_t *dst4, *dst2;
+            uint8x8_t mval=sa, dval=sa, res;
+            uint8_t *dst4=0, *dst2=0;
 
             if (w&4)
             {