pixman: Branch 'master' - 4 commits

Siarhei Siamashka siamashka at kemper.freedesktop.org
Tue Oct 1 13:45:05 PDT 2013


 configure.ac             |   29 +-
 pixman/pixman-compiler.h |    2 
 pixman/pixman-vmx.c      |  539 +++++++++++++++++++++++++++++++++++++++--------
 test/Makefile.am         |    6 
 test/Makefile.sources    |    1 
 test/thread-test.c       |  197 +++++++++++++++++
 6 files changed, 675 insertions(+), 99 deletions(-)

New commits:
commit 7d05a7f4dc825f9c778e534fdabb749199c2e439
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Sat Sep 28 04:51:21 2013 +0300

    vmx: there is no need to handle unaligned destination anymore
    
    So the redundant variables, memory reads/writes and reshuffles
    can be safely removed. For example, this makes the inner loop
    of the 'vmx_combine_add_u_no_mask' function much simpler.
    
    Before:
    
        7a20:7d a8 48 ce lvx     v13,r8,r9
        7a24:7d 80 48 ce lvx     v12,r0,r9
        7a28:7d 28 50 ce lvx     v9,r8,r10
        7a2c:7c 20 50 ce lvx     v1,r0,r10
        7a30:39 4a 00 10 addi    r10,r10,16
        7a34:10 0d 62 eb vperm   v0,v13,v12,v11
        7a38:10 21 4a 2b vperm   v1,v1,v9,v8
        7a3c:11 2c 6a eb vperm   v9,v12,v13,v11
        7a40:10 21 4a 00 vaddubs v1,v1,v9
        7a44:11 a1 02 ab vperm   v13,v1,v0,v10
        7a48:10 00 0a ab vperm   v0,v0,v1,v10
        7a4c:7d a8 49 ce stvx    v13,r8,r9
        7a50:7c 00 49 ce stvx    v0,r0,r9
        7a54:39 29 00 10 addi    r9,r9,16
        7a58:42 00 ff c8 bdnz+   7a20 <.vmx_combine_add_u_no_mask+0x120>
    
    After:
    
        76c0:7c 00 48 ce lvx     v0,r0,r9
        76c4:7d a8 48 ce lvx     v13,r8,r9
        76c8:39 29 00 10 addi    r9,r9,16
        76cc:7c 20 50 ce lvx     v1,r0,r10
        76d0:10 00 6b 2b vperm   v0,v0,v13,v12
        76d4:10 00 0a 00 vaddubs v0,v0,v1
        76d8:7c 00 51 ce stvx    v0,r0,r10
        76dc:39 4a 00 10 addi    r10,r10,16
        76e0:42 00 ff e0 bdnz+   76c0 <.vmx_combine_add_u_no_mask+0x120>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 130d78e..c33631c 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -134,15 +134,11 @@ over (vector unsigned int src,
     source ## _mask = vec_lvsl (0, source);
 
 #define COMPUTE_SHIFT_MASKS(dest, source)				\
-    dest ## _mask = vec_lvsl (0, dest);					\
-    source ## _mask = vec_lvsl (0, source);				\
-    store_mask = vec_lvsr (0, dest);
+    source ## _mask = vec_lvsl (0, source);
 
 #define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
     mask ## _mask = vec_lvsl (0, mask);					\
-    dest ## _mask = vec_lvsl (0, dest);					\
-    source ## _mask = vec_lvsl (0, source);				\
-    store_mask = vec_lvsr (0, dest);
+    source ## _mask = vec_lvsl (0, source);
 
 /* notice you have to declare temp vars...
  * Note: tmp3 and tmp4 must remain untouched!
@@ -151,23 +147,17 @@ over (vector unsigned int src,
 #define LOAD_VECTORS(dest, source)			  \
     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
-    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
     v ## source = (typeof(v ## source))			  \
 	vec_perm (tmp1, tmp2, source ## _mask);		  \
-    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
-    v ## dest = (typeof(v ## dest))			  \
-	vec_perm (tmp3, tmp4, dest ## _mask);
+    v ## dest = (typeof(v ## dest))vec_ld (0, dest);
 
 #define LOAD_VECTORSC(dest, source, mask)		  \
     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
-    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
     v ## source = (typeof(v ## source))			  \
 	vec_perm (tmp1, tmp2, source ## _mask);		  \
-    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
     tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
-    v ## dest = (typeof(v ## dest))			  \
-	vec_perm (tmp3, tmp4, dest ## _mask);		  \
+    v ## dest = (typeof(v ## dest))vec_ld (0, dest);	  \
     tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
     v ## mask = (typeof(v ## mask))			  \
 	vec_perm (tmp1, tmp2, mask ## _mask);
@@ -178,11 +168,7 @@ over (vector unsigned int src,
                                 splat_alpha (v ## mask));
 
 #define STORE_VECTOR(dest)						\
-    edges = vec_perm (tmp4, tmp3, dest ## _mask);			\
-    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
-    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
-    vec_st ((vector unsigned int) tmp3, 15, dest);			\
-    vec_st ((vector unsigned int) tmp1, 0, dest);
+    vec_st ((vector unsigned int) v ## dest, 0, dest);
 
 static void
 vmx_combine_over_u_no_mask (uint32_t *      dest,
@@ -191,8 +177,7 @@ vmx_combine_over_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -242,8 +227,7 @@ vmx_combine_over_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -314,8 +298,7 @@ vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -363,8 +346,7 @@ vmx_combine_over_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -432,8 +414,7 @@ vmx_combine_in_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -478,8 +459,7 @@ vmx_combine_in_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -544,8 +524,7 @@ vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -592,8 +571,7 @@ vmx_combine_in_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -660,8 +638,7 @@ vmx_combine_out_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -708,8 +685,7 @@ vmx_combine_out_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -774,8 +750,7 @@ vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -823,8 +798,7 @@ vmx_combine_out_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -891,8 +865,7 @@ vmx_combine_atop_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -944,8 +917,7 @@ vmx_combine_atop_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1021,8 +993,7 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1074,8 +1045,7 @@ vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1151,8 +1121,7 @@ vmx_combine_xor_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1204,8 +1173,7 @@ vmx_combine_xor_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1281,8 +1249,7 @@ vmx_combine_add_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1328,8 +1295,7 @@ vmx_combine_add_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1397,8 +1363,7 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1448,8 +1413,7 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1507,8 +1471,7 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1564,8 +1527,7 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1619,8 +1581,7 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1675,8 +1636,7 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1733,8 +1693,7 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1791,8 +1750,7 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask, vsrca;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1858,8 +1816,7 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1922,8 +1879,7 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
@@ -1986,8 +1942,7 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
 
     while (width && ((uintptr_t)dest & 15))
     {
commit b6c5ba06f0c5c0bd8d186e7a4879fd3b33e7e13f
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Sat Sep 28 03:48:07 2013 +0300

    vmx: align destination to fix valgrind invalid memory writes
    
    The SIMD optimized inner loops in the VMX/Altivec code are trying
    to emulate unaligned accesses to the destination buffer. For each
    4 pixels (which fit into a 128-bit register) the current
    implementation:
      1. first performs two aligned reads, which cover the needed data
      2. reshuffles bytes to get the needed data in a single vector register
      3. does all the necessary calculations
      4. reshuffles bytes back to their original location in two registers
      5. performs two aligned writes back to the destination buffer
    
    Unfortunately, in the case where the destination buffer is unaligned
    and the width is an exact multiple of 4 pixels, some writes may cross
    the boundaries of the destination buffer. In a multithreaded
    environment this may corrupt data outside of the destination buffer
    if it is concurrently read and written by another thread.
    
    The valgrind report for blitters-test is full of:
    
    ==23085== Invalid write of size 8
    ==23085==    at 0x1004B0B4: vmx_combine_add_u (pixman-vmx.c:1089)
    ==23085==    by 0x100446EF: general_composite_rect (pixman-general.c:214)
    ==23085==    by 0x10002537: test_composite (blitters-test.c:363)
    ==23085==    by 0x1000369B: fuzzer_test_main._omp_fn.0 (utils.c:733)
    ==23085==    by 0x10004943: fuzzer_test_main (utils.c:728)
    ==23085==    by 0x10002C17: main (blitters-test.c:397)
    ==23085==  Address 0x5188218 is 0 bytes after a block of size 88 alloc'd
    ==23085==    at 0x4051DA0: memalign (vg_replace_malloc.c:581)
    ==23085==    by 0x4051E7B: posix_memalign (vg_replace_malloc.c:709)
    ==23085==    by 0x10004CFF: aligned_malloc (utils.c:833)
    ==23085==    by 0x10001DCB: create_random_image (blitters-test.c:47)
    ==23085==    by 0x10002263: test_composite (blitters-test.c:283)
    ==23085==    by 0x1000369B: fuzzer_test_main._omp_fn.0 (utils.c:733)
    ==23085==    by 0x10004943: fuzzer_test_main (utils.c:728)
    ==23085==    by 0x10002C17: main (blitters-test.c:397)
    
    This patch addresses the problem by first aligning the destination
    buffer at a 16 byte boundary in each combiner function. This trick
    is borrowed from the pixman SSE2 code.
    
    This allows the new thread-test to pass on PowerPC VMX/Altivec systems
    and also resolves the "make check" failure reported for POWER7 hardware:
        http://lists.freedesktop.org/archives/pixman/2013-August/002871.html

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index f629003..130d78e 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -194,6 +194,18 @@ vmx_combine_over_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -233,6 +245,22 @@ vmx_combine_over_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -289,6 +317,17 @@ vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -327,6 +366,20 @@ vmx_combine_over_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -382,6 +435,16 @@ vmx_combine_in_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, a);
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -418,6 +481,19 @@ vmx_combine_in_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -471,6 +547,17 @@ vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -508,6 +595,20 @@ vmx_combine_in_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -562,6 +663,17 @@ vmx_combine_out_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -599,6 +711,19 @@ vmx_combine_out_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -652,6 +777,17 @@ vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (~(*src++));
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -690,6 +826,20 @@ vmx_combine_out_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (~a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -744,6 +894,19 @@ vmx_combine_atop_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -784,6 +947,24 @@ vmx_combine_atop_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -843,6 +1024,19 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a = ALPHA_8 (s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -883,6 +1077,24 @@ vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_a = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -942,6 +1154,19 @@ vmx_combine_xor_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -982,6 +1207,24 @@ vmx_combine_xor_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1041,6 +1284,17 @@ vmx_combine_add_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
@@ -1077,6 +1331,19 @@ vmx_combine_add_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1133,6 +1400,17 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+
+	UN8x4_MUL_UN8x4 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1173,6 +1451,21 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1217,6 +1510,20 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ida = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf("%s\n",__PRETTY_FUNCTION__); */
@@ -1260,6 +1567,19 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t da = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1302,6 +1622,19 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1345,6 +1678,20 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1389,6 +1736,20 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, ~a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1433,6 +1794,22 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1484,6 +1861,22 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1532,6 +1925,22 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1580,6 +1989,19 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_ADD_UN8x4 (s, d);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
commit 0438435b9c915b61af21446b2cb2f77a2b98a3b9
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Sat Sep 28 01:10:24 2013 -0400

    test: Add new thread-test program
    
    This test program allocates an array of 16 * 7 uint32_ts and spawns 16
    threads that each use 7 of the allocated uint32_ts as a destination
    image for a large number of composite operations. Each thread then
    computes and returns a checksum for the image. Finally, the main
    thread computes a checksum of the checksums and verifies that it
    matches expectations.
    
    The purpose of this test is to catch errors where memory outside images
    is read and then written back. Such out-of-bounds accesses are broken
    when multiple threads are involved, because the threads will race to
    read and write the shared memory.
    
    V2:
    - Incorporate fixes from Siarhei for endianness and undefined behavior
      regarding argument evaluation
    - Make the images 7 pixels wide since the bug only happens when the
      composite width is greater than 4.
    - Compute a checksum of the checksums so that you don't have to
      update 16 values if something changes.
    
    V3: Remove stray dollar sign

diff --git a/test/Makefile.am b/test/Makefile.am
index 5d901d5..88dc36d 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1,8 +1,8 @@
 include $(top_srcdir)/test/Makefile.sources
 
-AM_CFLAGS = $(OPENMP_CFLAGS)
-AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS)
+AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS) $(PTHREAD_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
 
 libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
diff --git a/test/Makefile.sources b/test/Makefile.sources
index 2fabdb5..2ae5d9f 100644
--- a/test/Makefile.sources
+++ b/test/Makefile.sources
@@ -13,6 +13,7 @@ TESTPROGRAMS =			\
 	infinite-loop		\
 	trap-crasher		\
 	alpha-loop		\
+	thread-test		\
 	scaling-crash-test	\
 	scaling-helpers-test	\
 	gradient-crash-test	\
diff --git a/test/thread-test.c b/test/thread-test.c
new file mode 100644
index 0000000..f24c31d
--- /dev/null
+++ b/test/thread-test.c
@@ -0,0 +1,197 @@
+#include <config.h>
+
+#ifndef HAVE_PTHREADS
+
+int main ()
+{
+    printf ("Skipped thread-test - pthreads not supported\n");
+    return 0;
+}
+
+#else
+
+#include <stdlib.h>
+#include <pthread.h>
+#include "utils.h"
+
+typedef struct
+{
+    int       thread_no;
+    uint32_t *dst_buf;
+} info_t;
+
+static const pixman_op_t operators[] = 
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+};
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_b5g6r5,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_a4r4g4b4
+};
+
+#define N_ROUNDS 8192
+
+#define RAND_ELT(arr)							\
+    arr[prng_rand() % ARRAY_LENGTH (arr)]
+
+#define DEST_WIDTH (7)
+
+static void *
+thread (void *data)
+{
+    info_t *info = data;
+    uint32_t crc32 = 0x0;
+    uint32_t src_buf[64];
+    pixman_image_t *dst_img, *src_img;
+    int i;
+
+    prng_srand (info->thread_no);
+
+    for (i = 0; i < N_ROUNDS; ++i)
+    {
+	pixman_op_t op;
+	int rand1, rand2;
+
+	prng_randmemset (info->dst_buf, DEST_WIDTH * sizeof (uint32_t), 0);
+	prng_randmemset (src_buf, sizeof (src_buf), 0);
+
+	src_img = pixman_image_create_bits (
+	    RAND_ELT (formats), 4, 4, src_buf, 16);
+	dst_img = pixman_image_create_bits (
+	    RAND_ELT (formats), DEST_WIDTH, 1, info->dst_buf,
+	    DEST_WIDTH * sizeof (uint32_t));
+
+	image_endian_swap (src_img);
+	image_endian_swap (dst_img);
+	
+	rand2 = prng_rand() % 4;
+	rand1 = prng_rand() % 4;
+	op = RAND_ELT (operators);
+
+	pixman_image_composite32 (
+	    op,
+	    src_img, NULL, dst_img,
+	    rand1, rand2, 0, 0, 0, 0, DEST_WIDTH, 1);
+
+	crc32 = compute_crc32_for_image (crc32, dst_img);
+
+	pixman_image_unref (src_img);
+	pixman_image_unref (dst_img);
+    }
+
+    return (void *)(uintptr_t)crc32;
+}
+
+static inline uint32_t
+byteswap32 (uint32_t x)
+{
+    return ((x & ((uint32_t)0xFF << 24)) >> 24) |
+           ((x & ((uint32_t)0xFF << 16)) >>  8) |
+           ((x & ((uint32_t)0xFF <<  8)) <<  8) |
+           ((x & ((uint32_t)0xFF <<  0)) << 24);
+}
+
+int
+main (void)
+{
+    uint32_t dest[16 * DEST_WIDTH];
+    info_t info[16] = { { 0 } };
+    pthread_t threads[16];
+    void *retvals[16];
+    uint32_t crc32s[16], crc32;
+    int i;
+
+    for (i = 0; i < 16; ++i)
+    {
+	info[i].thread_no = i;
+	info[i].dst_buf = &dest[i * DEST_WIDTH];
+    }
+
+    for (i = 0; i < 16; ++i)
+	pthread_create (&threads[i], NULL, thread, &info[i]);
+
+    for (i = 0; i < 16; ++i)
+	pthread_join (threads[i], &retvals[i]);
+
+    for (i = 0; i < 16; ++i)
+    {
+	crc32s[i] = (uintptr_t)retvals[i];
+
+	if (is_little_endian())
+	    crc32s[i] = byteswap32 (crc32s[i]);
+    }
+
+    crc32 = compute_crc32 (0, crc32s, sizeof crc32s);
+
+#define EXPECTED 0xFD497D8D
+
+    if (crc32 != EXPECTED)
+    {
+	printf ("thread-test failed. Got checksum 0x%08X, expected 0x%08X\n",
+		crc32, EXPECTED);
+	return 1;
+    }
+
+    return 0;
+}
+
+#endif
+
commit 65829504073425362fc56995a1dcc8cc464b751a
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Sat Sep 28 01:03:55 2013 -0400

    Rename HAVE_PTHREAD_SETSPECIFIC to HAVE_PTHREADS
    
    The test for pthread_setspecific() can be used as a general test for
    whether pthreads are available, so rename the variable from
    HAVE_PTHREAD_SETSPECIFIC to HAVE_PTHREADS and run the test even when
    better support for thread local variables is available.
    
    However, the pthread arguments are still only added to CFLAGS and
    LDFLAGS when pthread_setspecific() is used for thread local variables.
    
    V2: AC_SUBST(PTHREAD_CFLAGS)

diff --git a/configure.ac b/configure.ac
index 263c63e..2dd4776 100644
--- a/configure.ac
+++ b/configure.ac
@@ -961,37 +961,38 @@ main ()
 ]]))
 
 AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
-    if test "z$support_for_pthread_setspecific" != "zyes"; then
+    if test "z$support_for_pthreads" != "zyes"; then
 	PIXMAN_LINK_WITH_ENV(
 		[$1], [pthread_test_program],
 		[PTHREAD_CFLAGS="$CFLAGS"
 		 PTHREAD_LIBS="$LIBS"
 		 PTHREAD_LDFLAGS="$LDFLAGS"
-		 support_for_pthread_setspecific=yes])
+		 support_for_pthreads=yes])
     fi
 ])
 
-if test $ac_cv_tls = none ; then
-    support_for_pthread_setspecific=no
+support_for_pthreads=no
 
-    AC_MSG_CHECKING(for pthread_setspecific)
+AC_MSG_CHECKING(for pthreads)
 
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
     
-    if test $support_for_pthread_setspecific = yes; then
-	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
-	AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+if test $support_for_pthreads = yes; then
+    AC_DEFINE([HAVE_PTHREADS], [], [Whether pthreads is supported])
+    if test $ac_cv_tls = none ; then
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
     fi
-
-    AC_MSG_RESULT($support_for_pthread_setspecific);
 fi
 
+AC_MSG_RESULT($support_for_pthreads)
+
 AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
-AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(HAVE_PTHREADS)
 AC_SUBST(PTHREAD_LDFLAGS)
 AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
 
 dnl =====================================
 dnl __attribute__((constructor))
diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h
index 9b190b4..2489adc 100644
--- a/pixman/pixman-compiler.h
+++ b/pixman/pixman-compiler.h
@@ -178,7 +178,7 @@
 #   define PIXMAN_GET_THREAD_LOCAL(name)				\
     (&name)
 
-#elif defined(HAVE_PTHREAD_SETSPECIFIC)
+#elif defined(HAVE_PTHREADS)
 
 #include <pthread.h>
 


More information about the xorg-commit mailing list