pixman: Branch 'master' - 3 commits
Jeff Muizelaar
jrmuizel at kemper.freedesktop.org
Mon Jun 15 11:12:27 PDT 2009
configure.ac | 6 +--
pixman/pixman-arm-neon.c | 94 +++++++++++++++++++++++++++++++----------------
2 files changed, 65 insertions(+), 35 deletions(-)
New commits:
commit b1cb5922f785310ef790811b52e4e2b0c85dfccc
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date: Mon Jun 15 16:09:32 2009 +0300
Add RVCT support for straight blitter.
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 5fd82ab..467a0dd 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1837,7 +1837,6 @@ pixman_fill_neon (uint32_t *bits,
#endif
}
-#ifdef USE_GCC_INLINE_ASM
// TODO: is there a more generic way of doing this being introduced?
#define NEON_SCANLINE_BUFFER_PIXELS (1024)
@@ -1849,11 +1848,16 @@ static inline void QuadwordCopy_neon(
uint32_t trailerCount // of bytes
)
{
+ uint8_t *tDst = dst, *tSrc = src;
+
// Uses aligned multi-register loads to maximise read bandwidth
// on uncached memory such as framebuffers
// The accesses do not have the aligned qualifiers, so that the copy
// may convert between aligned-uncached and unaligned-cached memory.
// It is assumed that the CPU can infer alignedness from the address.
+
+#ifdef USE_GCC_INLINE_ASM
+
asm volatile (
" cmp %[count], #8 \n"
" blt 1f @ skip oversized fragments \n"
@@ -1889,7 +1893,7 @@ static inline void QuadwordCopy_neon(
"4: @ end \n"
// Clobbered input registers marked as input/outputs
- : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+ : [dst] "+r" (tDst), [src] "+r" (tSrc), [count] "+r" (count)
// No unclobbered inputs
:
@@ -1899,32 +1903,67 @@ static inline void QuadwordCopy_neon(
: "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
);
+#else
+
+ while(count >= 8) {
+ uint8x16x4_t t1 = vld4q_u8(tSrc);
+ uint8x16x4_t t2 = vld4q_u8(tSrc + sizeof(uint8x16x4_t));
+ tSrc += sizeof(uint8x16x4_t) * 2;
+ vst4q_u8(tDst, t1);
+ vst4q_u8(tDst + sizeof(uint8x16x4_t), t2);
+ tDst += sizeof(uint8x16x4_t) * 2;
+ count -= 8;
+ }
+
+ if(count & 4) {
+ uint8x16x4_t t1 = vld4q_u8(tSrc);
+ tSrc += sizeof(uint8x16x4_t);
+ vst4q_u8(tDst, t1);
+ tDst += sizeof(uint8x16x4_t);
+ }
+
+ if(count & 2) {
+ uint8x8x4_t t1 = vld4_u8(tSrc);
+ tSrc += sizeof(uint8x8x4_t);
+ vst4_u8(tDst, t1);
+ tDst += sizeof(uint8x8x4_t);
+ }
+
+ if(count & 1) {
+ uint8x16_t t1 = vld1q_u8(tSrc);
+ tSrc += sizeof(uint8x16_t);
+ vst1q_u8(tDst, t1);
+ tDst += sizeof(uint8x16_t);
+ }
+
+#endif // !USE_GCC_INLINE_ASM
+
if(trailerCount) {
- uint8_t *tDst = dst, *tSrc = src;
+ if(trailerCount & 8) {
+ uint8x8_t t1 = vld1_u8(tSrc);
+ tSrc += sizeof(uint8x8_t);
+ vst1_u8(tDst, t1);
+ tDst += sizeof(uint8x8_t);
+ }
- while(trailerCount >= 4) {
+ if(trailerCount & 4) {
*((uint32_t*) tDst) = *((uint32_t*) tSrc);
tDst += 4;
tSrc += 4;
- trailerCount -= 4;
}
- if(trailerCount >= 2) {
+ if(trailerCount & 2) {
*((uint16_t*) tDst) = *((uint16_t*) tSrc);
tDst += 2;
tSrc += 2;
- trailerCount -= 2;
}
- if(trailerCount) {
+ if(trailerCount & 1) {
*tDst++ = *tSrc++;
- trailerCount--;
}
}
}
-#endif // USE_GCC_INLINE_ASM
-
static const FastPathInfo arm_neon_fast_path_array[] =
{
{ PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 },
@@ -1999,12 +2038,9 @@ pixman_blt_neon (
int dst_x, int dst_y,
int width, int height)
{
-
if(!width || !height)
return TRUE;
-#ifdef USE_GCC_INLINE_ASM
-
// accelerate only straight copies involving complete bytes
if(src_bpp != dst_bpp || (src_bpp & 7))
return FALSE;
@@ -2027,13 +2063,6 @@ pixman_blt_neon (
}
return TRUE;
-
-#else /* USE_GCC_INLINE_ASM */
-
- // TODO: intrinsic version for armcc
- return FALSE;
-
-#endif
}
static pixman_bool_t
commit b6a3868ced67eb363273bfbee0d850c4d06cca34
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date: Mon Jun 15 16:02:04 2009 +0300
Better CFLAGS handling for recent ARM platforms.
diff --git a/configure.ac b/configure.ac
index 768f6b7..c410e62 100644
--- a/configure.ac
+++ b/configure.ac
@@ -327,7 +327,7 @@ AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
dnl ===========================================================================
dnl Check for ARM SIMD instructions
-ARM_SIMD_CFLAGS=""
+ARM_SIMD_CFLAGS="-march=armv6"
have_arm_simd=no
AC_MSG_CHECKING(whether to use ARM SIMD assembler)
@@ -366,7 +366,7 @@ AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
dnl ==========================================================================
dnl Check for ARM NEON instructions
-ARM_NEON_CFLAGS="-mfpu=neon -mfloat-abi=softfp"
+ARM_NEON_CFLAGS="-mfpu=neon"
have_arm_neon=no
AC_MSG_CHECKING(whether to use ARM NEON)
@@ -470,8 +470,6 @@ AC_SUBST(GTK_LIBS)
AC_SUBST(DEP_CFLAGS)
AC_SUBST(DEP_LIBS)
-
-
AC_OUTPUT([pixman-1.pc
pixman-1-uninstalled.pc
Makefile
commit 1217c11a02ef60a3955fd98f7cec48de4cb9561b
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date: Wed May 27 15:31:59 2009 +0300
Misc warning fixes.
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index abcd24f..5fd82ab 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -159,7 +159,7 @@ fbCompositeSrcAdd_8000x8000neon (
srcLine += srcStride;
w = width;
- uint8_t *keep_dst;
+ uint8_t *keep_dst=0;
#ifndef USE_GCC_INLINE_ASM
uint8x8_t sval,dval,temp;
@@ -226,6 +226,9 @@ fbCompositeSrcAdd_8000x8000neon (
}
else
{
+ const uint8_t nil = 0;
+ const uint8x8_t vnil = vld1_dup_u8(&nil);
+
while (height--)
{
dst = dstLine;
@@ -233,8 +236,8 @@ fbCompositeSrcAdd_8000x8000neon (
src = srcLine;
srcLine += srcStride;
w = width;
- uint8x8_t sval, dval;
- uint8_t *dst4, *dst2;
+ uint8x8_t sval=vnil, dval=vnil;
+ uint8_t *dst4=0, *dst2=0;
if (w&4)
{
@@ -306,7 +309,7 @@ fbCompositeSrc_8888x8888neon (
srcLine += srcStride;
w = width;
- uint32_t *keep_dst;
+ uint32_t *keep_dst=0;
#ifndef USE_GCC_INLINE_ASM
uint8x8x4_t sval,dval,temp;
@@ -472,7 +475,7 @@ fbCompositeSrc_8888x8x8888neon (
srcLine += srcStride;
w = width;
- uint32_t *keep_dst;
+ uint32_t *keep_dst=0;
#ifndef USE_GCC_INLINE_ASM
uint8x8x4_t sval,dval,temp;
@@ -674,7 +677,7 @@ fbCompositeSolidMask_nx8x0565neon (
// Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
while (height--)
{
- uint16_t *keep_dst;
+ uint16_t *keep_dst=0;
dst = dstLine;
dstLine += dstStride;
@@ -810,7 +813,7 @@ fbCompositeSolidMask_nx8x0565neon (
{
while (height--)
{
- void *dst4, *dst2;
+ void *dst4=0, *dst2=0;
dst = dstLine;
dstLine += dstStride;
@@ -1012,7 +1015,7 @@ fbCompositeSolidMask_nx8x8888neon (
// Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
while (height--)
{
- uint32_t *keep_dst;
+ uint32_t *keep_dst=0;
dst = dstLine;
dstLine += dstStride;
@@ -1251,8 +1254,8 @@ fbCompositeSrcAdd_8888x8x8neon (
maskLine += maskStride;
w = width;
- uint8x8_t mval, dval, res;
- uint8_t *dst4, *dst2;
+ uint8x8_t mval=sa, dval=sa, res;
+ uint8_t *dst4=0, *dst2=0;
if (w&4)
{
More information about the xorg-commit mailing list