More 24bit optimizations (was: a just another stupid newbie :)

Jaymz Julian jaymz@artificial-stupidity.net
Sun, 14 Dec 2003 14:47:30 +1100


--T4sUOijqQbZv57TR
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

The attached patch adds more optimizations than the last two versions. With
the composite manager tile size set to 128x128, I can get 20fps playback of a
352x240 video with it (with only a window manager running, though, not an
entire desktop environment). It also has the side effect of making KDE usable,
which is a nice bonus - I'm typing this in it, in fact :)

	-- jj


--T4sUOijqQbZv57TR
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="jj_24bit_combine_v3.patch"

Index: configure.ac
===================================================================
RCS file: /cvs/xserver/xserver/configure.ac,v
retrieving revision 3.41
diff -u -3 -p -r3.41 configure.ac
--- configure.ac	8 Dec 2003 01:55:06 -0000	3.41
+++ configure.ac	13 Dec 2003 18:50:51 -0000
@@ -333,6 +333,24 @@ if test "x$GCC" = "xyes"; then
 	XSERVER_CFLAGS="$GCC_WARNINGS $XSERVER_CFLAGS"
 fi
 
+AC_MSG_CHECKING(if unaligned word accesses behave as expected)
+AC_TRY_RUN([
+#include <stdio.h>
+
+int main()
+{
+        char j[8]={1,2,3,4,5,6,7,8};
+        int k= *(int *)(j+1);
+        if(k==0x02030405 || k==0x05040302)
+                return 0;
+        return 1;
+}
+],
+ [AC_DEFINE(WORKING_UNALIGNED_INT, 1, [unaligned word accesses behave as expected])
+  AC_MSG_RESULT(yes)], 
+ AC_MSG_RESULT(no),
+ AC_MSG_RESULT(assuming not on target machine))
+
 AC_OUTPUT([
 Makefile
 include/Makefile
Index: fb/fbcompose.c
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbcompose.c,v
retrieving revision 1.17
diff -u -3 -p -r1.17 fbcompose.c
--- fb/fbcompose.c	11 Sep 2003 05:12:50 -0000	1.17
+++ fb/fbcompose.c	13 Dec 2003 18:51:05 -0000
@@ -1649,15 +1649,20 @@ fbFetch_r8g8b8 (FbCompositeOperand *op)
     FbBits  *line = op->u.drawable.line; CARD32 offset = op->u.drawable.offset;
     CARD8   *pixel = ((CARD8 *) line) + (offset >> 3);
 #if IMAGE_BYTE_ORDER == MSBFirst
+    // FIXME: implement WORKING_UNALIGNED_INT for this endianness :)
     return (0xff000000 |
 	    (pixel[0] << 16) |
 	    (pixel[1] << 8) |
 	    (pixel[2]));
 #else
-    return (0xff000000 |
-	    (pixel[2] << 16) |
-	    (pixel[1] << 8) |
-	    (pixel[0]));
+	#ifdef WORKING_UNALIGNED_INT
+		return *(CARD32 *)pixel|0xff000000;
+	#else
+	    return (0xff000000 |
+		    (pixel[2] << 16) |
+		    (pixel[1] << 8) |
+		    (pixel[0]));
+	#endif
 #endif
 }
 
Index: fb/fbpict.c
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbpict.c,v
retrieving revision 1.18
diff -u -3 -p -r1.18 fbpict.c
--- fb/fbpict.c	5 Nov 2003 05:45:31 -0000	1.18
+++ fb/fbpict.c	13 Dec 2003 18:51:12 -0000
@@ -161,6 +161,33 @@ fbIn24 (CARD32 x, CARD8 y)
     (line) = ((type *) __bits__) + (stride) * ((y) + __yoff__) + (mul) * ((x) + __xoff__); \
 }
 
+#define genericCombine24(a,b,c,d) (((a)*(c)+(b)*(d)))
+#if IMAGE_BYTE_ORDER == LSBFirst
+	#define setupPackedReader(count,temp,where,workingWhere,workingVal) count=(int)where; \
+					temp=count&3; \
+					where-=temp; \
+					workingWhere=(CARD32 *)where; \
+					workingVal=*workingWhere++; \
+					count=4-temp; \
+					workingVal>>=(8*temp)
+	#define readPacked(where,x,y,z) {if(!(x)) { (x)=4; y=*z++; } where=(y)&0xff; (y)>>=8; (x)--;}
+	#define readPackedSource(where) readPacked(where,ws,workingSource,wsrc)
+	#define readPackedDest(where) readPacked(where,wd,workingiDest,widst)
+	#define writePacked(what) workingoDest>>=8; workingoDest|=(what<<24); ww--; if(!ww) { ww=4; *wodst++=workingoDest; } 
+#else
+	#warning "I havn't tested fbCompositeTrans_0888xnx0888() on big endian yet!"
+	#define setupPackedReader(count,temp,where,workingWhere,workingVal) count=(int)where; \
+					temp=count&3; \
+					where-=temp; \
+					workingWhere=(CARD32 *)where; \
+					workingVal=*workingWhere++; \
+					count=4-temp; \
+					workingVal<<=(8*temp)
+	#define readPacked(where,x,y,z) {if(!(x)) { (x)=4; y=*z++; } where=(y)>>24; (y)<<=8; (x)--;}
+	#define readPackedSource(where) readPacked(where,ws,workingSource,wsrc)
+	#define readPackedDest(where) readPacked(where,wd,workingiDest,widst)
+	#define writePacked(what) workingoDest<<=8; workingoDest|=what; ww--; if(!ww) { ww=4; *wodst++=workingoDest; } 
+#endif
 /*
  * Naming convention:
  *
@@ -298,6 +325,7 @@ fbCompositeSolidMask_nx8888x8888C (CARD8
     }
 }
 
+#define srcAlphaCombine24(a,b) genericCombine24(a,b,srca,srcia)
 void
 fbCompositeSolidMask_nx8x0888 (CARD8      op,
 			       PicturePtr pSrc,
@@ -312,52 +340,86 @@ fbCompositeSolidMask_nx8x0888 (CARD8    
 			       CARD16     width,
 			       CARD16     height)
 {
-    CARD32	src, srca;
-    CARD8	*dstLine, *dst;
+    CARD32	src, srca, srcia;
+    CARD8	*dstLine, *dst, *edst;
     CARD32	d;
     CARD8	*maskLine, *mask, m;
     FbStride	dstStride, maskStride;
     CARD16	w;
+	CARD32 rs,gs,bs,rd,gd,bd;
 
     fbComposeGetSolid(pSrc, src);
     
     srca = src >> 24;
+    srcia = 255-srca;
     if (src == 0)
 	return;
+
+	rs=src&0xff;
+	gs=(src>>8)&0xff;
+	bs=(src>>16)&0xff;
     
     fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 3);
     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
-    
-    while (height--)
-    {
-	dst = dstLine;
-	dstLine += dstStride;
-	mask = maskLine;
-	maskLine += maskStride;
-	w = width;
 
-	while (w--)
+    while (height--)
 	{
-	    m = *mask++;
-	    if (m == 0xff)
-	    {
-		if (srca == 0xff)
-		    d = src;
-		else
+		// fixme: cleanup unused
+		unsigned int wt,wd;
+		CARD32 workingiDest;
+		CARD32 *widst;
+		
+		edst=dst = dstLine;
+		dstLine += dstStride;
+		mask = maskLine;
+		maskLine += maskStride;
+		w = width;
+		
+#ifndef NO_MASKED_PACKED_READ
+		setupPackedReader(wd,wt,edst,widst,workingiDest);
+#endif
+				
+		while (w--)
 		{
-		    d = Fetch24(dst);
-		    d = fbOver24 (src, d);
+#ifndef NO_MASKED_PACKED_READ
+			readPackedDest(rd);
+			readPackedDest(gd);
+			readPackedDest(bd);
+#else
+			rd= *edst++;
+			gd= *edst++;
+			bd= *edst++;
+#endif
+			m = *mask++;
+			if (m == 0xff)
+			{
+				if (srca == 0xff)
+				{
+					*dst++=rs;
+					*dst++=gs;
+					*dst++=bs;
+				}
+				else
+				{
+					*dst++=(srcAlphaCombine24(rs, rd)>>8);
+					*dst++=(srcAlphaCombine24(gs, gd)>>8);
+					*dst++=(srcAlphaCombine24(bs, bd)>>8);
+				}
+			}
+			else if (m)
+			{
+				int na=(srca*(int)m)>>8;
+				int nia=255-na;
+				*dst++=(genericCombine24(rs, rd, na, nia)>>8);
+				*dst++=(genericCombine24(gs, gd, na, nia)>>8);
+				*dst++=(genericCombine24(bs, bd, na, nia)>>8);
+			}
+			else
+			{
+				dst+=3;
+			}
 		}
-		Store24(dst,d);
-	    }
-	    else if (m)
-	    {
-		d = fbOver24 (fbIn(src,m), Fetch24(dst));
-		Store24(dst,d);
-	    }
-	    dst += 3;
 	}
-    }
 }
 
 void
@@ -959,6 +1021,228 @@ fbCompositeTrans_0565xnx0565(CARD8      
     }
 }
 
+
+
+// macros for "i can't believe it's not fast" packed pixel handling
+#define alphamaskCombine24(a,b) genericCombine24(a,b,maskAlpha,maskiAlpha)
+void
+fbCompositeTrans_0888xnx0888(CARD8      op,
+			     PicturePtr pSrc,
+			     PicturePtr pMask,
+			     PicturePtr pDst,
+			     INT16      xSrc,
+			     INT16      ySrc,
+			     INT16      xMask,
+			     INT16      yMask,
+			     INT16      xDst,
+			     INT16      yDst,
+			     CARD16     width,
+			     CARD16     height)
+{
+    CARD8	*dstLine, *dst,*idst;
+    CARD8	*srcLine, *src;
+    FbStride	dstStride, srcStride;
+    CARD16	w;
+    FbBits	mask;
+    CARD16	maskAlpha,maskiAlpha;
+    
+    fbComposeGetSolid (pMask, mask);
+    maskAlpha = mask >> 24;
+	maskiAlpha= 255-maskAlpha;
+    
+    if (!maskAlpha)
+	return;
+    //if (maskAlpha == 0xff)
+    //{
+	//fbCompositeSrc_0888x0888 (op, pSrc, pMask, pDst,
+	//			  xSrc, ySrc, xMask, yMask, xDst, yDst, 
+	//			  width, height);
+	//return;
+    //}
+	
+    fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 3);
+    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 3);
+
+	{
+		unsigned int ws,wt,wd,ww;
+		CARD32 workingSource;
+		CARD32 *wsrc;
+		CARD32 rs,gs,bs;
+		CARD32 rd,gd,bd;
+
+		CARD32 workingiDest,workingoDest;
+		CARD32 *widst,*wodst;
+
+
+		// are xSrc and xDst at the same alignment?  if not, we need to be complicated :)
+		//if(0==0)
+		if( (((xSrc*3)&3)!=((xDst*3)&3)) || (srcStride&3)!=0 || (dstStride&3)!=0)
+		{
+			while (height--)
+			{
+				idst=dst = dstLine;
+				dstLine += dstStride;
+				src = srcLine;
+				srcLine += srcStride;
+				w = width*3;
+				
+				setupPackedReader(wd,wt,idst,widst,workingiDest);
+				ww=(int)dst;
+				wt=ww&3;
+				dst-=wt; 
+				wodst=(CARD32 *)dst; 
+				workingoDest=*wodst; 
+				ww=4-wt;
+#if IMAGE_BYTE_ORDER == LSBFirst
+				workingoDest<<=(8*(ww+1));
+#else
+				workingoDest>>=(8*(ww+1));
+#endif
+
+				// get to word aligned
+				switch(!(int)src&3)
+				{
+					case 1:
+						readPackedDest(rd);
+						rd=alphamaskCombine24(*src++, rd)>>8;
+						writePacked(rd);
+						w--; if(w) break;
+					case 2:
+						readPackedDest(rd);
+						rd=alphamaskCombine24(*src++, rd)>>8;
+						writePacked(rd);
+						w--; if(w) break;
+					case 3:
+						readPackedDest(rd);
+						rd=alphamaskCombine24(*src++, rd)>>8;
+						writePacked(rd);
+						w--; if(w) break;
+				}
+				wsrc=(CARD32 *)src;
+				while (w>3)
+				{
+					rs=*wsrc++;
+#if IMAGE_BYTE_ORDER == LSBFirst
+					readPackedDest(rd);
+					rd=alphamaskCombine24(rs&0xff, rd)>>8;
+					writePacked(rd);
+
+					readPackedDest(rd);
+					rd=alphamaskCombine24((rs>>8)&0xff, rd)>>8;
+					writePacked(rd);
+					
+					readPackedDest(rd);
+					rd=alphamaskCombine24((rs>>16)&0xff, rd)>>8;
+					writePacked(rd);
+					
+					readPackedDest(rd);
+					rd=alphamaskCombine24(rs>>24, rd)>>8;
+					writePacked(rd);
+#else
+					readPackedDest(rd);
+					rd=alphamaskCombine24(rs>>24, rd)>>8;
+					writePacked(rd);
+					
+					readPackedDest(rd);
+					rd=alphamaskCombine24((rs>>16)&0xff, rd)>>8;
+					writePacked(rd);
+					
+					readPackedDest(rd);
+					rd=alphamaskCombine24((rs>>8)&0xff, rd)>>8;
+					writePacked(rd);
+
+					readPackedDest(rd);
+					rd=alphamaskCombine24(rs&0xff, rd)>>8;
+					writePacked(rd);
+#endif
+					w-=4;
+				}
+				src=(CARD8 *)wsrc;
+				switch(w)
+				{
+					case 3:
+						readPackedDest(rd);
+						rd=alphamaskCombine24(*src++, rd)>>8;
+						writePacked(rd);
+					case 2:
+						readPackedDest(rd);
+						rd=alphamaskCombine24(*src++, rd)>>8;
+						writePacked(rd);
+					case 1:
+						readPackedDest(rd);
+						rd=alphamaskCombine24(*src++, rd)>>8;
+						writePacked(rd);
+				}
+				dst=(CARD8 *)wodst;
+				switch(ww)
+				{
+					case 1:
+						dst[2]=(workingoDest>>8)&0xff;
+					case 2:
+						dst[1]=(workingoDest>>16)&0xff;
+					case 3:
+						dst[0]=workingoDest>>24;
+				}
+			}
+		}
+		else
+		{
+			while (height--)
+			{
+				idst=dst = dstLine;
+				dstLine += dstStride;
+				src = srcLine;
+				srcLine += srcStride;
+				w = width*3;
+				// get to word aligned
+				switch(!(int)src&3)
+				{
+					case 1:
+						rd=alphamaskCombine24(*src++, *dst)>>8;
+						*dst++=rd;
+						w--; if(w) break;
+					case 2:
+						rd=alphamaskCombine24(*src++, *dst)>>8;
+						*dst++=rd;
+						w--; if(w) break;
+					case 3:
+						rd=alphamaskCombine24(*src++, *dst)>>8;
+						*dst++=rd;
+						w--; if(w) break;
+				}
+				wsrc=(CARD32 *)src;
+				widst=(CARD32 *)dst;
+				while(w>3)
+				{
+					bd=0;
+					rs = *wsrc++;
+					rd = *widst;
+					bd|=alphamaskCombine24(rs&0xff, rd&0xff)>>8;
+					bd|=alphamaskCombine24((rs>>8)&0xff, (rd>>8)&0xff)&0xff00;
+					bd|=(alphamaskCombine24((rs>>16)&0xff, (rd>>16)&0xff)<<8)&0xff0000;
+					bd|=(alphamaskCombine24(rs>>24, rd>>24)<<16)&0xff000000;
+					*widst++=bd;
+					w-=4;
+				}
+				src=(CARD8 *)wsrc;
+				dst=(CARD8 *)widst;
+				switch(w)
+				{
+					case 3:
+						rd=alphamaskCombine24(*src++, *dst)>>8;
+						*dst++=rd;
+					case 2:
+						rd=alphamaskCombine24(*src++, *dst)>>8;
+						*dst++=rd;
+					case 1:
+						rd=alphamaskCombine24(*src++, *dst)>>8;
+						*dst++=rd;
+				}
+			}
+		}
+	}
+}
+
 /*
  * Simple bitblt
  */
@@ -1168,6 +1452,11 @@ fbComposite (CARD8      op,
 		    if (pDst->format == pSrc->format)
 		        func = fbCompositeTrans_0565xnx0565;
 		    break;
+		case PICT_r8g8b8:
+		case PICT_b8g8r8:
+		    if (pDst->format == pSrc->format)
+		        func = fbCompositeTrans_0888xnx0888;
+		    break;
 		}
 		if (func != fbCompositeGeneral)
 		    maskRepeat = FALSE;
@@ -1281,7 +1570,7 @@ fbComposite (CARD8      op,
 	    x_src = pbox->x1 - xDst + xSrc;
 	    x_msk = pbox->x1 - xDst + xMask;
 	    x_dst = pbox->x1;
-	    if (maskRepeat)
+	    if (maskRepeat && pMask->pDrawable->height > 1)
 	    {
 		y_msk = mod (y_msk, pMask->pDrawable->height);
 		if (h_this > pMask->pDrawable->height - y_msk)
@@ -1296,7 +1585,7 @@ fbComposite (CARD8      op,
 	    while (w)
 	    {
 		w_this = w;
-		if (maskRepeat)
+		if (maskRepeat && pMask->pDrawable->width > 1)
 		{
 		    x_msk = mod (x_msk, pMask->pDrawable->width);
 		    if (w_this > pMask->pDrawable->width - x_msk)
Index: fb/fbpict.h
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbpict.h,v
retrieving revision 1.13
diff -u -3 -p -r1.13 fbpict.h
--- fb/fbpict.h	5 Nov 2003 05:45:31 -0000	1.13
+++ fb/fbpict.h	13 Dec 2003 18:51:15 -0000
@@ -969,6 +969,20 @@ fbCompositeTrans_0565xnx0565(CARD8      
 			     CARD16     width,
 			     CARD16     height);
 
+void 
+fbCompositeTrans_0888xnx0888(CARD8      op,
+			     PicturePtr pSrc,
+			     PicturePtr pMask,
+			     PicturePtr pDst,
+			     INT16      xSrc,
+			     INT16      ySrc,
+			     INT16      xMask,
+			     INT16      yMask,
+			     INT16      xDst,
+			     INT16      yDst,
+			     CARD16     width,
+			     CARD16     height);
+
 void
 fbCompositeSrcSrc_nxn  (CARD8	op,
 			PicturePtr pSrc,
Index: hw/kdrive/src/kaa.c
===================================================================
RCS file: /cvs/xserver/xserver/hw/kdrive/src/kaa.c,v
retrieving revision 1.21
diff -u -3 -p -r1.21 kaa.c
--- hw/kdrive/src/kaa.c	20 Nov 2003 07:49:46 -0000	1.21
+++ hw/kdrive/src/kaa.c	13 Dec 2003 18:51:27 -0000
@@ -693,7 +693,9 @@ kaaImageGlyphBlt (DrawablePtr	pDrawable,
     switch (dstBpp) {
     case 8:	glyph = fbGlyph8; break;
     case 16:    glyph = fbGlyph16; break;
+#ifndef FBNO24BIT
     case 24:    glyph = fbGlyph24; break;
+#endif
     case 32:    glyph = fbGlyph32; break;
     }
     

--T4sUOijqQbZv57TR--