[Pixman] [PATCH 1/2] vmx: align destination to fix valgrind invalid memory writes

Siarhei Siamashka siarhei.siamashka at gmail.com
Fri Sep 27 19:30:14 PDT 2013


The SIMD optimized inner loops in the VMX/Altivec code are trying
to emulate unaligned accesses to the destination buffer. For each
4 pixels (which fit into a 128-bit register) the current
implementation:
  1. first performs two aligned reads, which cover the needed data
  2. reshuffles bytes to get the needed data in a single vector register
  3. does all the necessary calculations
  4. reshuffles bytes back to their original location in two registers
  5. performs two aligned writes back to the destination buffer

Unfortunately, in the case where the destination buffer is unaligned and
the width is a perfect multiple of 4 pixels, we may have some writes
crossing the boundaries of the destination buffer. In a multithreaded
environment this may potentially corrupt the data outside of the
destination buffer if it is concurrently read and written by some
other thread.

It is the primary suspect for the "make check" failure on power7 hardware:
    http://lists.freedesktop.org/archives/pixman/2013-August/002871.html

The valgrind report for blitters-test is full of:

==23085== Invalid write of size 8
==23085==    at 0x1004B0B4: vmx_combine_add_u (pixman-vmx.c:1089)
==23085==    by 0x100446EF: general_composite_rect (pixman-general.c:214)
==23085==    by 0x10002537: test_composite (blitters-test.c:363)
==23085==    by 0x1000369B: fuzzer_test_main._omp_fn.0 (utils.c:733)
==23085==    by 0x10004943: fuzzer_test_main (utils.c:728)
==23085==    by 0x10002C17: main (blitters-test.c:397)
==23085==  Address 0x5188218 is 0 bytes after a block of size 88 alloc'd
==23085==    at 0x4051DA0: memalign (vg_replace_malloc.c:581)
==23085==    by 0x4051E7B: posix_memalign (vg_replace_malloc.c:709)
==23085==    by 0x10004CFF: aligned_malloc (utils.c:833)
==23085==    by 0x10001DCB: create_random_image (blitters-test.c:47)
==23085==    by 0x10002263: test_composite (blitters-test.c:283)
==23085==    by 0x1000369B: fuzzer_test_main._omp_fn.0 (utils.c:733)
==23085==    by 0x10004943: fuzzer_test_main (utils.c:728)
==23085==    by 0x10002C17: main (blitters-test.c:397)

This patch addresses the problem by first aligning the destination
buffer at a 16 byte boundary in each combiner function. This trick
is borrowed from the pixman SSE2 code.
---
 pixman/pixman-vmx.c | 422 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 422 insertions(+)

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index f629003..130d78e 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -194,6 +194,18 @@ vmx_combine_over_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -233,6 +245,22 @@ vmx_combine_over_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -289,6 +317,17 @@ vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -327,6 +366,20 @@ vmx_combine_over_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -382,6 +435,16 @@ vmx_combine_in_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, a);
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -418,6 +481,19 @@ vmx_combine_in_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -471,6 +547,17 @@ vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -508,6 +595,20 @@ vmx_combine_in_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -562,6 +663,17 @@ vmx_combine_out_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -599,6 +711,19 @@ vmx_combine_out_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -652,6 +777,17 @@ vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (~(*src++));
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -690,6 +826,20 @@ vmx_combine_out_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (~a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -744,6 +894,19 @@ vmx_combine_atop_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -784,6 +947,24 @@ vmx_combine_atop_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -843,6 +1024,19 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a = ALPHA_8 (s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -883,6 +1077,24 @@ vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_a = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -942,6 +1154,19 @@ vmx_combine_xor_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -982,6 +1207,24 @@ vmx_combine_xor_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1041,6 +1284,17 @@ vmx_combine_add_u_no_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKS (dest, src);
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
@@ -1077,6 +1331,19 @@ vmx_combine_add_u_mask (uint32_t *      dest,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1133,6 +1400,17 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+
+	UN8x4_MUL_UN8x4 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1173,6 +1451,21 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1217,6 +1510,20 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ida = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf("%s\n",__PRETTY_FUNCTION__); */
@@ -1260,6 +1567,19 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t da = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1302,6 +1622,19 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1345,6 +1678,20 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1389,6 +1736,20 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, ~a);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1433,6 +1794,22 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1484,6 +1861,22 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1532,6 +1925,22 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1580,6 +1989,19 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
 
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_ADD_UN8x4 (s, d);
+
+	*dest++ = s;
+	width--;
+    }
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
-- 
1.8.1.5



More information about the Pixman mailing list