[cairo] Very slow performance
Ned Konz
ned at squeakland.org
Thu Dec 23 15:46:20 PST 2004
On Thursday 23 December 2004 12:20 pm, Chris wrote:
> I profiled the entire set of libraries and one thing sticks out greatly:
>
> Each sample counts as 0.01 seconds.
> % self self total
> time seconds calls ms/call ms/call name
> 90.93 174.85 6092 28.70 28.70 pixman_fill_rect_32bbp
> 1.15 2.21 141053 0.02 0.06 IcRasterizeTrapezoid
> 0.77 1.49 1963553 0.00 0.00 IcBuildCompositeOperand
>
> At least in my test here, pixman_fill_rect_32bbp is by far the biggest
> user of time.
>
> Are there possibly some super-fast routines that would work here?
> Filling a rectangle is so common that I would think there are a ton of
> options here. Possibly using hardware acceleration since just about
> every computer has some sort of blitter available.
A little bit of searching turned up this routine from the Xorg code. It uses
MMX and has a carefully unrolled inner loop that can transfer 64 bytes in 8
cycles, plus loop overhead. Perhaps it could be applied here?
/*
 * fbSolidFillmmx -- solid-fill a rectangle, using 64-bit MMX stores for the
 * bulk of every row.
 *
 * Writes the pixel value `xor` into `height` rows of `width` pixels.  The
 * first row starts at the address computed from (x, y) together with the
 * bits / stride / xoff / yoff values that fbGetDrawable() yields for pDraw.
 *
 * Returns FALSE, without writing anything, when bpp is neither 16 nor 32, or
 * when bpp is 16 and the upper and lower 16 bits of `xor` differ.  Returns
 * TRUE after the fill otherwise.
 */
Bool
fbSolidFillmmx (DrawablePtr pDraw,
                int         x,
                int         y,
                int         width,
                int         height,
                FbBits      xor)
{
    FbStride    stride;
    int         bpp;
    ullong      fill;
    Vector8x8   vfill;
    CARD32      byte_width;
    CARD8      *byte_line;
    FbBits     *bits;
    int         xoff, yoff;

    CHECKPOINT();

    /* NOTE(review): fbGetDrawable is presumably a macro that assigns bits,
     * stride, bpp, xoff and yoff from pDraw -- confirm against its
     * definition. */
    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);

    /* The 32- and 64-bit stores below write `xor` unchanged, so at 16 bpp
     * both 16-bit halves of it have to carry the same pixel value. */
    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
        return FALSE;

    if (bpp != 16 && bpp != 32)
        return FALSE;

    if (bpp == 16)
    {
        /* Rescale stride to 16-bit units for the pointer arithmetic, then to
         * bytes for stepping byte_line (assumes stride arrives in FbBits
         * units -- TODO confirm). */
        stride = stride * sizeof (FbBits) / 2;
        byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
        byte_width = 2 * width;
        stride *= 2;
    }
    else
    {
        /* Same as above, with 32-bit units. */
        stride = stride * sizeof (FbBits) / 4;
        byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
        byte_width = 4 * width;
        stride *= 4;
    }

    /* Put `xor` in both the upper and the lower 32 bits of the 64-bit
     * pattern handed to the MMX stores. */
    fill = ((ullong)xor << 32) | xor;
    vfill = (Vector8x8)fill;

    while (height--)
    {
        CARD8 *d = byte_line;
        int    w = byte_width;

        byte_line += stride;

        /* 16-bit stores until d is 4-byte aligned. */
        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(CARD16 *)d = xor;
            w -= 2;
            d += 2;
        }

        /* 32-bit stores until d is 8-byte aligned.  The pointer is cast to
         * unsigned long, like in the loop above; unsigned int would narrow
         * the pointer on targets where pointers are wider than int. */
        while (w >= 4 && ((unsigned long)d & 7))
        {
            *(CARD32 *)d = xor;
            w -= 4;
            d += 4;
        }

        /* Bulk of the row: eight 8-byte stores, i.e. 64 bytes per pass. */
        while (w >= 64)
        {
            __asm__ __volatile (
                "movq %0, (%1)\n\t"
                "movq %0, 8(%1)\n\t"
                "movq %0, 16(%1)\n\t"
                "movq %0, 24(%1)\n\t"
                "movq %0, 32(%1)\n\t"
                "movq %0, 40(%1)\n\t"
                "movq %0, 48(%1)\n\t"
                "movq %0, 56(%1)\n\t"
                : /* no output */
                : "y" (vfill), "r" (d)
                : "memory");
            w -= 64;
            d += 64;
        }

        /* Remainder of the row: 32-bit stores, then at most one 16-bit
         * store. */
        while (w >= 4)
        {
            *(CARD32 *)d = xor;
            w -= 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(CARD16 *)d = xor;
            w -= 2;
            d += 2;
        }
    }

    /* NOTE(review): emms() presumably issues the EMMS instruction so x87
     * floating-point code can run after the MMX stores -- confirm against
     * its definition. */
    emms();
    return TRUE;
}
--
Ned Konz
http://bike-nomad.com/squeak/
More information about the cairo
mailing list