[Pixman] [PATCH] Faster C variant of over_n_8_8888 fast path

Sat Sep 11 05:31:10 PDT 2010

From: Siarhei Siamashka <siarhei.siamashka at nokia.com>

The main loop is split into handling 3 cases:
- opaque source
- translucent source without additive blending
- translucent source with additive blending

When using a normal premultiplied alpha format (for example, after
converting to it from a non-premultiplied one), no color component
can have a higher value than alpha. If any of the R, G or B color
components is increased artificially so that it becomes higher than
alpha, the blending operation becomes additive.

The distinction between additive and non-additive blending becomes
important if the target CPU does not have special instructions for
saturated addition. Non-additive blending cannot overflow a color
channel, so it does not require saturation and ordinary addition can
be used instead, improving performance.

With this patch, the over_n_8_8888 operation becomes ~1.5x faster
when non-additive alpha blending is used (this includes, for example,
the case of an opaque source combined with a translucent mask).

Performance of the inner loop goes up from ~77MPix/s to ~122MPix/s
on Core i7 860 with gcc 4.4 as tested using a simple microbenchmark.
---
 pixman/pixman-fast-path.c |  113 +++++++++++++++++++++++++++++++++------------
 1 files changed, 84 insertions(+), 29 deletions(-)

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index f03752f..1cb0796 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -290,6 +290,68 @@ fast_composite_in_8_8 (pixman_implementation_t *imp,
     }
 }
 
+/* A variant of 'over' which is faster for non-additive blending on platforms
+ * that lack saturated-add instructions: that path ends in a plain dest + src.
+ */
+static force_inline uint32_t
+over_a (uint32_t src, uint32_t dest, pixman_bool_t additive_blending)
+{
+    uint32_t a = ~src >> 24; /* top byte of ~src, i.e. 255 - (src >> 24) */
+    if (additive_blending)
+    {
+	UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src); /* macro defined elsewhere; presumably dest = dest * a / 255 + src, saturated -- confirm */
+	return dest;
+    }
+    else
+    {
+	UN8x4_MUL_UN8 (dest, a); /* macro defined elsewhere; presumably scales each dest channel by a / 255 -- confirm */
+	return dest + src; /* ordinary 32-bit add, no per-channel saturation */
+    }
+}
+
+static force_inline void /* row/pixel loop: writes solid src through an 8-bit mask into dst */
+fast_composite_over_n_8_8888_mainloop (uint32_t *    dst_line,
+                                       int           dst_stride,
+                                       uint8_t *     mask_line,
+                                       int           mask_stride,
+                                       int           width,
+                                       int           height,
+                                       uint32_t      src,
+                                       pixman_bool_t opaque_src, /* when set, a 0xff mask stores src directly */
+                                       pixman_bool_t additive_blending) /* forwarded unchanged to over_a */
+{
+    uint32_t    *dst, d;
+    uint8_t     *mask, m;
+    int32_t      w;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride; /* pointer arithmetic: stride counts uint32_t elements */
+	mask = mask_line;
+	mask_line += mask_stride; /* pointer arithmetic: stride counts uint8_t elements */
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff) /* full mask: src is used without going through in() */
+	    {
+		if (opaque_src)
+		    *dst = src;
+		else
+		    *dst = over_a (src, *dst, additive_blending);
+	    }
+	    else if (m) /* m == 0 falls through and leaves *dst untouched */
+	    {
+		d = in (src, m); /* in() is defined elsewhere; presumably scales src by m / 255 -- confirm */
+		*dst = over_a (d, *dst, additive_blending);
+	    }
+	    dst++;
+	}
+    }
+}
+
 static void
 fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
                               pixman_op_t              op,
@@ -305,46 +367,39 @@ fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
                               int32_t                  width,
                               int32_t                  height)
 {
-    uint32_t src, srca;
-    uint32_t    *dst_line, *dst, d;
-    uint8_t     *mask_line, *mask, m;
+    uint32_t src;
+    uint32_t *dst_line;
+    uint8_t  *mask_line;
     int dst_stride, mask_stride;
-    int32_t w;
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
-    srca = src >> 24;
     if (src == 0)
 	return;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    while (height--)
+    if (src >> 24 == 0xff)
     {
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-	w = width;
-
-	while (w--)
-	{
-	    m = *mask++;
-	    if (m == 0xff)
-	    {
-		if (srca == 0xff)
-		    *dst = src;
-		else
-		    *dst = over (src, *dst);
-	    }
-	    else if (m)
-	    {
-		d = in (src, m);
-		*dst = over (d, *dst);
-	    }
-	    dst++;
-	}
+	/* opaque source, and as a result also non-additive blending */
+	fast_composite_over_n_8_8888_mainloop (dst_line, dst_stride, mask_line,
+					       mask_stride, width, height, src,
+					       TRUE, FALSE);
+    }
+    else if (src >= (src << 8) && src >= (src << 16) && src >= (src << 24))
+    {
+	/* non-additive blending */
+	fast_composite_over_n_8_8888_mainloop (dst_line, dst_stride, mask_line,
+					       mask_stride, width, height, src,
+					       FALSE, FALSE);
+    }
+    else
+    {
+	/* additive blending (the slowest variant, hopefully very uncommon) */
+	fast_composite_over_n_8_8888_mainloop (dst_line, dst_stride, mask_line,
+					       mask_stride, width, height, src,
+					       FALSE, TRUE);
     }
 }
 
-- 
1.7.2.2