[cairo] Performance work update

Billy Biggs vektor at dumbterm.net
Sun Aug 14 21:42:04 PDT 2005


  I put together a quick set of benchmarks for some cases I care about:
compositing, line drawing, and image scaling (up and down).

    http://www.freedesktop.org/~vektor/cairo-benchmarks-0.1.tar.gz

  There is a setup.h file to control the number of iterations and enable
"realtime priority mode".  This mode sets the scheduler to SCHED_FIFO so
the benchmark cannot be preempted, effectively hanging the machine until
the run finishes.  This is dangerous but useful for more accurate cycle
measurements.

  I'd like to hear feedback on what operations people are finding slow
and maybe put together a more comprehensive set of benchmarks.  Please
email me if you have any ideas or code.

  Good news:

  - I did an implementation of fbCompositeSrc_8888x8888mmx which makes
    my "over" test twice as fast.  Not sure if this is needed in
    xserver but I'll put it there anyway after it's reviewed.

  - I also found that mmxCombineMaskU and mmxCombineOverU missed special
    cases for 0 alpha and 0xff alpha.  Adding those checks gave a 2x
    speedup for my line test.  This seems to be a trend -- I bet most of
    the other routines can be made a lot faster with this technique.

  Bad news:

  - I am getting some strange results with Xfake.  My "lines" test is
    slower (at least 50%) with the MMX code enabled than with it
    disabled.  This is not reproducible with pixman + fbmmx, where the
    MMX code is much faster.

  - Some of the xlib paths with certain servers are terribly slow.  With
    XFree86 4.3 on my Debian machine, the "over" test is 100x slower
    than the pixman version.  I think we may need a better strategy for
    dealing with this problem.

  Attached is my current patch to xserver/fb in case someone wants to help
investigate the lines problem.

  -Billy

-------------- next part --------------
Index: fbmmx.c
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbmmx.c,v
retrieving revision 1.9
diff -p -u -r1.9 fbmmx.c
--- fbmmx.c	12 Aug 2005 16:11:51 -0000	1.9
+++ fbmmx.c	15 Aug 2005 03:59:37 -0000
@@ -357,14 +357,12 @@ pack565 (__m64 pixel, __m64 target, int 
 static __inline__ __m64
 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 {
-    x = _mm_mullo_pi16 (x, a);                  
-    y = _mm_mullo_pi16 (y, b);                  
-    x = _mm_srli_pi16(x, 1);                    
-    y = _mm_srli_pi16(y, 1);                    
-    x = _mm_adds_pu16 (x, y);                    
-    x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); 
+    x = _mm_mullo_pi16 (x, a);
+    y = _mm_mullo_pi16 (y, b);
     x = _mm_adds_pu16 (x, MC(4x0080));
-    x = _mm_srli_pi16 (x, 7);
+    x = _mm_adds_pu16 (x, y);
+    x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8));
+    x = _mm_srli_pi16 (x, 8);
 
     return x;
 }
@@ -376,11 +374,17 @@ mmxCombineMaskU (CARD32 *src, const CARD
 {
     const CARD32 *end = mask + width;
     while (mask < end) {
-        __m64 a = load8888(*mask);
-        __m64 s = load8888(*src);
-        a = expand_alpha(a);
-        s = pix_multiply(s, a);
-        *src = store8888(s);
+        CARD32 mmask = *mask;
+        CARD32 ca = mmask >> 24;
+        if (ca == 0) {
+            *src = 0;
+        } else if (ca != 0xff) {
+            __m64 a = load8888(mmask);
+            __m64 s = load8888(*src);
+            a = expand_alpha(a);
+            s = pix_multiply(s, a);
+            *src = store8888(s);
+        }
         ++src;
         ++mask;
     }
@@ -394,10 +398,16 @@ mmxCombineOverU (CARD32 *dest, const CAR
     const CARD32 *end = dest + width;
 
     while (dest < end) {
-        __m64 s, sa;
-	s = load8888(*src);
-	sa = expand_alpha(s);
-	*dest = store8888(over(s, sa, load8888(*dest)));
+        CARD32 ss = *src;
+        CARD32 a = ss >> 24;
+        if (a == 0xff) {
+            *dest = ss;
+        } else if (a) {
+            __m64 s, sa;
+            s = load8888(ss);
+            sa = expand_alpha(s);
+            *dest = store8888(over(s, sa, load8888(*dest)));
+        }
         ++dest;
         ++src;
     }
@@ -586,7 +596,7 @@ mmxCombineSaturateU (CARD32 *dest, const
         CARD32 da = ~d >> 24;
 
         if (sa > da) {
-            __m64 msa = load8888(FbIntDiv(da, sa));
+            __m64 msa = load8888(FbIntDiv(da, sa)<<24);
             msa = expand_alpha(msa);
             ms = pix_multiply(ms, msa);
         }
Index: fbpict.c
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbpict.c,v
retrieving revision 1.39
diff -p -u -r1.39 fbpict.c
--- fbpict.c	12 Jul 2005 09:57:00 -0000	1.39
+++ fbpict.c	15 Aug 2005 03:59:37 -0000
@@ -1794,7 +1794,9 @@ fbComposite (CARD8      op,
 	    if (pSrc->format == pDst->format)
 	    {
 #ifdef USE_MMX
-		if (pSrc->pDrawable != pDst->pDrawable)
+		if (pSrc->pDrawable != pDst->pDrawable &&
+		    (PICT_FORMAT_BPP (pSrc->format) == 16 ||
+		     PICT_FORMAT_BPP (pSrc->format) == 32))
 		    func = fbCompositeCopyAreammx;
 		else
 #endif


More information about the cairo mailing list