More 24bit optimizations (was: a just another stupid newbie :)
Jaymz Julian
jaymz@artificial-stupidity.net
Sun, 14 Dec 2003 14:47:30 +1100
--T4sUOijqQbZv57TR
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
The attached patch adds more optimizations than the last two versions, and
when the composite manager tile size is set to 128x128, I can get 20fps playback
of a 352x240 video with it (with only a window manager running, though, not an
entire desktop environment). It also has the effect of making KDE usable, which
is a nice bonus - I'm typing this in it, in fact :)
-- jj
--T4sUOijqQbZv57TR
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="jj_24bit_combine_v3.patch"
Index: configure.ac
===================================================================
RCS file: /cvs/xserver/xserver/configure.ac,v
retrieving revision 3.41
diff -u -3 -p -r3.41 configure.ac
--- configure.ac 8 Dec 2003 01:55:06 -0000 3.41
+++ configure.ac 13 Dec 2003 18:50:51 -0000
@@ -333,6 +333,24 @@ if test "x$GCC" = "xyes"; then
XSERVER_CFLAGS="$GCC_WARNINGS $XSERVER_CFLAGS"
fi
+AC_MSG_CHECKING(if unaligned word accesses behave as expected)
+AC_TRY_RUN([
+#include <stdio.h>
+
+int main()
+{
+ char j[8]={1,2,3,4,5,6,7,8};
+ int k= *(int *)(j+1);
+ if(k==0x02030405 || k==0x05040302)
+ return 0;
+ return 1;
+}
+],
+ [AC_DEFINE(WORKING_UNALIGNED_INT, 1, [unaligned word accesses behave as expected])
+ AC_MSG_RESULT(yes)],
+ AC_MSG_RESULT(no),
+ AC_MSG_RESULT(assuming not on target machine))
+
AC_OUTPUT([
Makefile
include/Makefile
Index: fb/fbcompose.c
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbcompose.c,v
retrieving revision 1.17
diff -u -3 -p -r1.17 fbcompose.c
--- fb/fbcompose.c 11 Sep 2003 05:12:50 -0000 1.17
+++ fb/fbcompose.c 13 Dec 2003 18:51:05 -0000
@@ -1649,15 +1649,20 @@ fbFetch_r8g8b8 (FbCompositeOperand *op)
FbBits *line = op->u.drawable.line; CARD32 offset = op->u.drawable.offset;
CARD8 *pixel = ((CARD8 *) line) + (offset >> 3);
#if IMAGE_BYTE_ORDER == MSBFirst
+ // FIXME: implement WORKING_UNALIGNED_INT for this endianness :)
return (0xff000000 |
(pixel[0] << 16) |
(pixel[1] << 8) |
(pixel[2]));
#else
- return (0xff000000 |
- (pixel[2] << 16) |
- (pixel[1] << 8) |
- (pixel[0]));
+ #ifdef WORKING_UNALIGNED_INT
+ return *(CARD32 *)pixel|0xff000000;
+ #else
+ return (0xff000000 |
+ (pixel[2] << 16) |
+ (pixel[1] << 8) |
+ (pixel[0]));
+ #endif
#endif
}
Index: fb/fbpict.c
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbpict.c,v
retrieving revision 1.18
diff -u -3 -p -r1.18 fbpict.c
--- fb/fbpict.c 5 Nov 2003 05:45:31 -0000 1.18
+++ fb/fbpict.c 13 Dec 2003 18:51:12 -0000
@@ -161,6 +161,33 @@ fbIn24 (CARD32 x, CARD8 y)
(line) = ((type *) __bits__) + (stride) * ((y) + __yoff__) + (mul) * ((x) + __xoff__); \
}
+#define genericCombine24(a,b,c,d) (((a)*(c)+(b)*(d))) /* weighted sum a*c + b*d; the callers in this patch shift the result right by 8 */
+#if IMAGE_BYTE_ORDER == LSBFirst /* LSBFirst variants: bytes are consumed from the low end of each loaded word (see readPacked) */
+ #define setupPackedReader(count,temp,where,workingWhere,workingVal) count=(int)where; \
+ temp=count&3; \
+ where-=temp; \
+ workingWhere=(CARD32 *)where; \
+ workingVal=*workingWhere++; \
+ count=4-temp; \
+ workingVal>>=(8*temp) /* rewinds 'where' to a 4-byte boundary, preloads the first word, drops the temp leading bytes and leaves count = bytes still unread in that word. NOTE(review): multi-statement macro without a do/while(0) wrapper; (int)where would truncate where int is narrower than a pointer, although only the low two bits are used */
+ #define readPacked(where,x,y,z) {if(!(x)) { (x)=4; y=*z++; } where=(y)&0xff; (y)>>=8; (x)--;} /* pops one byte from word y into 'where'; when counter x is 0 the next word is loaded from *z++ first */
+ #define readPackedSource(where) readPacked(where,ws,workingSource,wsrc) /* relies on caller locals ws, workingSource and wsrc */
+ #define readPackedDest(where) readPacked(where,wd,workingiDest,widst) /* relies on caller locals wd, workingiDest and widst */
+ #define writePacked(what) workingoDest>>=8; workingoDest|=(what<<24); ww--; if(!ww) { ww=4; *wodst++=workingoDest; } /* pushes one byte into caller-local workingoDest and stores the word through wodst once ww counts down to 0. NOTE(review): 'what' is not parenthesized and the statements are not wrapped, so this is unsafe as the body of an unbraced if/else */
+#else /* other byte order: same macro names with the shift directions mirrored */
+ #warning "I havn't tested fbCompositeTrans_0888xnx0888() on big endian yet!"
+ #define setupPackedReader(count,temp,where,workingWhere,workingVal) count=(int)where; \
+ temp=count&3; \
+ where-=temp; \
+ workingWhere=(CARD32 *)where; \
+ workingVal=*workingWhere++; \
+ count=4-temp; \
+ workingVal<<=(8*temp) /* same bookkeeping as the LSBFirst variant, but the leading bytes are discarded by shifting left */
+ #define readPacked(where,x,y,z) {if(!(x)) { (x)=4; y=*z++; } where=(y)>>24; (y)<<=8; (x)--;} /* pops the top byte of word y into 'where'; reloads from *z++ when counter x is 0 */
+ #define readPackedSource(where) readPacked(where,ws,workingSource,wsrc) /* relies on caller locals ws, workingSource and wsrc */
+ #define readPackedDest(where) readPacked(where,wd,workingiDest,widst) /* relies on caller locals wd, workingiDest and widst */
+ #define writePacked(what) workingoDest<<=8; workingoDest|=what; ww--; if(!ww) { ww=4; *wodst++=workingoDest; } /* pushes one byte into the low end of workingoDest and stores the word through wodst once ww counts down to 0. NOTE(review): 'what' is not parenthesized and the statements are not wrapped */
+#endif /* IMAGE_BYTE_ORDER == LSBFirst */
/*
* Naming convention:
*
@@ -298,6 +325,7 @@ fbCompositeSolidMask_nx8888x8888C (CARD8
}
}
+#define srcAlphaCombine24(a,b) genericCombine24(a,b,srca,srcia)
void
fbCompositeSolidMask_nx8x0888 (CARD8 op,
PicturePtr pSrc,
@@ -312,52 +340,86 @@ fbCompositeSolidMask_nx8x0888 (CARD8
CARD16 width,
CARD16 height)
{
- CARD32 src, srca;
- CARD8 *dstLine, *dst;
+ CARD32 src, srca, srcia;
+ CARD8 *dstLine, *dst, *edst;
CARD32 d;
CARD8 *maskLine, *mask, m;
FbStride dstStride, maskStride;
CARD16 w;
+ CARD32 rs,gs,bs,rd,gd,bd;
fbComposeGetSolid(pSrc, src);
srca = src >> 24;
+ srcia = 255-srca;
if (src == 0)
return;
+
+ rs=src&0xff;
+ gs=(src>>8)&0xff;
+ bs=(src>>16)&0xff;
fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 3);
fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
-
- while (height--)
- {
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
- while (w--)
+ while (height--)
{
- m = *mask++;
- if (m == 0xff)
- {
- if (srca == 0xff)
- d = src;
- else
+ // fixme: cleanup unused
+ unsigned int wt,wd;
+ CARD32 workingiDest;
+ CARD32 *widst;
+
+ edst=dst = dstLine;
+ dstLine += dstStride;
+ mask = maskLine;
+ maskLine += maskStride;
+ w = width;
+
+#ifndef NO_MASKED_PACKED_READ
+ setupPackedReader(wd,wt,edst,widst,workingiDest);
+#endif
+
+ while (w--)
{
- d = Fetch24(dst);
- d = fbOver24 (src, d);
+#ifndef NO_MASKED_PACKED_READ
+ readPackedDest(rd);
+ readPackedDest(gd);
+ readPackedDest(bd);
+#else
+ rd= *edst++;
+ gd= *edst++;
+ bd= *edst++;
+#endif
+ m = *mask++;
+ if (m == 0xff)
+ {
+ if (srca == 0xff)
+ {
+ *dst++=rs;
+ *dst++=gs;
+ *dst++=bs;
+ }
+ else
+ {
+ *dst++=(srcAlphaCombine24(rs, rd)>>8);
+ *dst++=(srcAlphaCombine24(gs, gd)>>8);
+ *dst++=(srcAlphaCombine24(bs, bd)>>8);
+ }
+ }
+ else if (m)
+ {
+ int na=(srca*(int)m)>>8;
+ int nia=255-na;
+ *dst++=(genericCombine24(rs, rd, na, nia)>>8);
+ *dst++=(genericCombine24(gs, gd, na, nia)>>8);
+ *dst++=(genericCombine24(bs, bd, na, nia)>>8);
+ }
+ else
+ {
+ dst+=3;
+ }
}
- Store24(dst,d);
- }
- else if (m)
- {
- d = fbOver24 (fbIn(src,m), Fetch24(dst));
- Store24(dst,d);
- }
- dst += 3;
}
- }
}
void
@@ -959,6 +1021,228 @@ fbCompositeTrans_0565xnx0565(CARD8
}
}
+
+
+// macros for "i can't believe it's not fast" packed pixel handling
+#define alphamaskCombine24(a,b) genericCombine24(a,b,maskAlpha,maskiAlpha) /* a*maskAlpha + b*maskiAlpha, using the locals of the function below */
+void /* Blends a 3-bytes-per-pixel source into the destination one byte at a time, weighted by the alpha of a solid mask: dst = (src*maskAlpha + dst*(255-maskAlpha)) >> 8. */
+fbCompositeTrans_0888xnx0888(CARD8 op, /* op is not referenced in the body */
+ PicturePtr pSrc,
+ PicturePtr pMask, /* only its solid value is read, via fbComposeGetSolid */
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask, /* xMask and yMask are not referenced in the body */
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD8 *dstLine, *dst,*idst;
+ CARD8 *srcLine, *src;
+ FbStride dstStride, srcStride;
+ CARD16 w;
+ FbBits mask;
+ CARD16 maskAlpha,maskiAlpha;
+
+ fbComposeGetSolid (pMask, mask);
+ maskAlpha = mask >> 24;
+ maskiAlpha= 255-maskAlpha; /* the two weights sum to 255 while results are shifted right by 8, so a blended byte tops out at 254 */
+
+ if (!maskAlpha) /* zero mask alpha: return without touching the destination */
+ return;
+ //if (maskAlpha == 0xff)
+ //{
+ //fbCompositeSrc_0888x0888 (op, pSrc, pMask, pDst,
+ // xSrc, ySrc, xMask, yMask, xDst, yDst,
+ // width, height);
+ //return;
+ //}
+
+ fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 3); /* byte pointer to the first source row; 3 is the per-pixel byte multiplier */
+ fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 3); /* likewise for the destination */
+
+ {
+ unsigned int ws,wt,wd,ww; /* counters for the packed reader/writer macros; ws is only named by readPackedSource, which is not invoked here */
+ CARD32 workingSource; /* only named by readPackedSource, which is not invoked here */
+ CARD32 *wsrc;
+ CARD32 rs,gs,bs; /* gs and bs are not referenced in this function */
+ CARD32 rd,gd,bd; /* gd is not referenced in this function */
+
+ CARD32 workingiDest,workingoDest;
+ CARD32 *widst,*wodst;
+
+
+ // are xSrc and xDst at the same alignment? if not, we need to be complicated :)
+ //if(0==0)
+ if( (((xSrc*3)&3)!=((xDst*3)&3)) || (srcStride&3)!=0 || (dstStride&3)!=0) /* packed path unless both x*3 offsets agree modulo 4 and both strides are multiples of 4 */
+ {
+ while (height--)
+ {
+ idst=dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width*3; /* w counts bytes, not pixels. NOTE(review): w is CARD16, so this presumably wraps for width > 21845 - confirm */
+
+ setupPackedReader(wd,wt,idst,widst,workingiDest); /* rewinds idst to a word boundary and preloads the first destination word */
+ ww=(int)dst; /* the same alignment bookkeeping, done by hand for the output word */
+ wt=ww&3;
+ dst-=wt;
+ wodst=(CARD32 *)dst;
+ workingoDest=*wodst;
+ ww=4-wt;
+#if IMAGE_BYTE_ORDER == LSBFirst
+ workingoDest<<=(8*(ww+1)); /* NOTE(review): when dst is already word aligned wt is 0 and ww is 4, giving a shift count of 40, more than the width of a 32-bit value - confirm intent */
+#else
+ workingoDest>>=(8*(ww+1)); /* NOTE(review): same shift-count concern as the LSBFirst branch */
+#endif
+
+ // get to word aligned
+ switch(!(int)src&3) /* NOTE(review): unary ! binds tighter than &, so this is (!(int)src)&3, which is 0 for any non-null src and matches no case label - confirm whether ((int)src)&3 was intended */
+ {
+ case 1:
+ readPackedDest(rd);
+ rd=alphamaskCombine24(*src++, rd)>>8;
+ writePacked(rd);
+ w--; if(w) break; /* breaks while bytes remain and falls through only once w reaches 0. NOTE(review): looks inverted */
+ case 2:
+ readPackedDest(rd);
+ rd=alphamaskCombine24(*src++, rd)>>8;
+ writePacked(rd);
+ w--; if(w) break;
+ case 3:
+ readPackedDest(rd);
+ rd=alphamaskCombine24(*src++, rd)>>8;
+ writePacked(rd);
+ w--; if(w) break;
+ }
+ wsrc=(CARD32 *)src; /* from here the source is read one word (four bytes) at a time */
+ while (w>3)
+ {
+ rs=*wsrc++;
+#if IMAGE_BYTE_ORDER == LSBFirst
+ readPackedDest(rd); /* source bytes are taken from the low end of rs upwards */
+ rd=alphamaskCombine24(rs&0xff, rd)>>8;
+ writePacked(rd);
+
+ readPackedDest(rd);
+ rd=alphamaskCombine24((rs>>8)&0xff, rd)>>8;
+ writePacked(rd);
+
+ readPackedDest(rd);
+ rd=alphamaskCombine24((rs>>16)&0xff, rd)>>8;
+ writePacked(rd);
+
+ readPackedDest(rd);
+ rd=alphamaskCombine24(rs>>24, rd)>>8;
+ writePacked(rd);
+#else
+ readPackedDest(rd); /* source bytes are taken from the high end of rs downwards */
+ rd=alphamaskCombine24(rs>>24, rd)>>8;
+ writePacked(rd);
+
+ readPackedDest(rd);
+ rd=alphamaskCombine24((rs>>16)&0xff, rd)>>8;
+ writePacked(rd);
+
+ readPackedDest(rd);
+ rd=alphamaskCombine24((rs>>8)&0xff, rd)>>8;
+ writePacked(rd);
+
+ readPackedDest(rd);
+ rd=alphamaskCombine24(rs&0xff, rd)>>8;
+ writePacked(rd);
+#endif
+ w-=4;
+ }
+ src=(CARD8 *)wsrc; /* back to byte reads for the 0-3 leftover bytes */
+ switch(w) /* the cases fall through, so case N processes N bytes */
+ {
+ case 3:
+ readPackedDest(rd);
+ rd=alphamaskCombine24(*src++, rd)>>8;
+ writePacked(rd);
+ case 2:
+ readPackedDest(rd);
+ rd=alphamaskCombine24(*src++, rd)>>8;
+ writePacked(rd);
+ case 1:
+ readPackedDest(rd);
+ rd=alphamaskCombine24(*src++, rd)>>8;
+ writePacked(rd);
+ }
+ dst=(CARD8 *)wodst;
+ switch(ww) /* stores the bytes still held in the partly filled output word; the cases fall through. The same shifts are used for both byte orders */
+ {
+ case 1:
+ dst[2]=(workingoDest>>8)&0xff;
+ case 2:
+ dst[1]=(workingoDest>>16)&0xff;
+ case 3:
+ dst[0]=workingoDest>>24;
+ }
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ idst=dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width*3; /* w counts bytes, not pixels. NOTE(review): CARD16, presumably wraps for width > 21845 - confirm */
+ // get to word aligned
+ switch(!(int)src&3) /* NOTE(review): parses as (!(int)src)&3, which is 0 for any non-null src, so no case below is selected - confirm intent */
+ {
+ case 1:
+ rd=alphamaskCombine24(*src++, *dst)>>8;
+ *dst++=rd;
+ w--; if(w) break; /* NOTE(review): breaks while bytes remain; looks inverted */
+ case 2:
+ rd=alphamaskCombine24(*src++, *dst)>>8;
+ *dst++=rd;
+ w--; if(w) break;
+ case 3:
+ rd=alphamaskCombine24(*src++, *dst)>>8;
+ *dst++=rd;
+ w--; if(w) break;
+ }
+ wsrc=(CARD32 *)src;
+ widst=(CARD32 *)dst;
+ while(w>3) /* four bytes per iteration: each byte lane is blended separately and merged into the same lane of bd */
+ {
+ bd=0;
+ rs = *wsrc++;
+ rd = *widst;
+ bd|=alphamaskCombine24(rs&0xff, rd&0xff)>>8;
+ bd|=alphamaskCombine24((rs>>8)&0xff, (rd>>8)&0xff)&0xff00; /* bits 8-15 of the unshifted sum already sit in lane 1 */
+ bd|=(alphamaskCombine24((rs>>16)&0xff, (rd>>16)&0xff)<<8)&0xff0000;
+ bd|=(alphamaskCombine24(rs>>24, rd>>24)<<16)&0xff000000;
+ *widst++=bd;
+ w-=4;
+ }
+ src=(CARD8 *)wsrc;
+ dst=(CARD8 *)widst;
+ switch(w) /* 0-3 leftover bytes; the cases fall through, so case N processes N bytes */
+ {
+ case 3:
+ rd=alphamaskCombine24(*src++, *dst)>>8;
+ *dst++=rd;
+ case 2:
+ rd=alphamaskCombine24(*src++, *dst)>>8;
+ *dst++=rd;
+ case 1:
+ rd=alphamaskCombine24(*src++, *dst)>>8;
+ *dst++=rd;
+ }
+ }
+ }
+ }
+}
+
/*
* Simple bitblt
*/
@@ -1168,6 +1452,11 @@ fbComposite (CARD8 op,
if (pDst->format == pSrc->format)
func = fbCompositeTrans_0565xnx0565;
break;
+ case PICT_r8g8b8:
+ case PICT_b8g8r8:
+ if (pDst->format == pSrc->format)
+ func = fbCompositeTrans_0888xnx0888;
+ break;
}
if (func != fbCompositeGeneral)
maskRepeat = FALSE;
@@ -1281,7 +1570,7 @@ fbComposite (CARD8 op,
x_src = pbox->x1 - xDst + xSrc;
x_msk = pbox->x1 - xDst + xMask;
x_dst = pbox->x1;
- if (maskRepeat)
+ if (maskRepeat && pMask->pDrawable->height > 1)
{
y_msk = mod (y_msk, pMask->pDrawable->height);
if (h_this > pMask->pDrawable->height - y_msk)
@@ -1296,7 +1585,7 @@ fbComposite (CARD8 op,
while (w)
{
w_this = w;
- if (maskRepeat)
+ if (maskRepeat && pMask->pDrawable->width > 1)
{
x_msk = mod (x_msk, pMask->pDrawable->width);
if (w_this > pMask->pDrawable->width - x_msk)
Index: fb/fbpict.h
===================================================================
RCS file: /cvs/xserver/xserver/fb/fbpict.h,v
retrieving revision 1.13
diff -u -3 -p -r1.13 fbpict.h
--- fb/fbpict.h 5 Nov 2003 05:45:31 -0000 1.13
+++ fb/fbpict.h 13 Dec 2003 18:51:15 -0000
@@ -969,6 +969,20 @@ fbCompositeTrans_0565xnx0565(CARD8
CARD16 width,
CARD16 height);
+void /* prototype for the 3-bytes-per-pixel solid-mask blend that this patch adds to fb/fbpict.c */
+fbCompositeTrans_0888xnx0888(CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
void
fbCompositeSrcSrc_nxn (CARD8 op,
PicturePtr pSrc,
Index: hw/kdrive/src/kaa.c
===================================================================
RCS file: /cvs/xserver/xserver/hw/kdrive/src/kaa.c,v
retrieving revision 1.21
diff -u -3 -p -r1.21 kaa.c
--- hw/kdrive/src/kaa.c 20 Nov 2003 07:49:46 -0000 1.21
+++ hw/kdrive/src/kaa.c 13 Dec 2003 18:51:27 -0000
@@ -693,7 +693,9 @@ kaaImageGlyphBlt (DrawablePtr pDrawable,
switch (dstBpp) {
case 8: glyph = fbGlyph8; break;
case 16: glyph = fbGlyph16; break;
+#ifndef FBNO24BIT
case 24: glyph = fbGlyph24; break;
+#endif
case 32: glyph = fbGlyph32; break;
}
--T4sUOijqQbZv57TR--