[cairo] Very slow performance

Thu Dec 23 15:46:20 PST 2004

On Thursday 23 December 2004 12:20 pm, Chris wrote:
> I profiled the entire set of libraries and one thing sticks out greatly:
>
> Each sample counts as 0.01 seconds.
>   %      self              self     total
> time   seconds    calls  ms/call  ms/call  name
> 90.93   174.85     6092    28.70    28.70  pixman_fill_rect_32bbp
>   1.15     2.21   141053    0.02     0.06  IcRasterizeTrapezoid
>   0.77     1.49  1963553    0.00     0.00  IcBuildCompositeOperand
>
> At least in my test here, pixman_fill_rect_32bbp is by far the biggest
> user of time.
>
> Are there possibly some super-fast routines that would work here?
> Filling a rectangle is so common that I would think there are a ton of
> options here.  Possibly using hardware acceleration since just about
> every computer has some sort of blitter available.

A little bit of searching turned up the routine below in the Xorg code. It uses
MMX and has a carefully unrolled inner loop that can transfer 64 bytes in
8 cycles, plus loop overhead. Perhaps it could be applied here?

/*
 * Fill a width x height pixel rectangle of a 16- or 32-bpp drawable with
 * the value `xor`, using 64-bit MMX stores for the bulk of every row.
 *
 *   pDraw          drawable; its bits, stride, bpp and x/y offsets are
 *                  obtained through fbGetDrawable()
 *   x, y           coordinates of the first pixel of the rectangle
 *   width, height  rectangle size in pixels
 *   xor            value stored into the destination
 *
 * Returns FALSE, without writing anything, when bpp is neither 16 nor 32,
 * or when bpp is 16 and the upper and lower 16-bit halves of `xor` differ.
 * Returns TRUE once the rectangle has been filled.
 */
Bool
fbSolidFillmmx (DrawablePtr pDraw,
		int         x,
		int         y,
		int         width,
		int         height,
		FbBits      xor)
{
    FbStride   stride;
    int        bpp;
    ullong     fill;
    Vector8x8  vfill;
    CARD32     byte_width;
    CARD8     *byte_line;
    FbBits    *bits;
    int        xoff, yoff;

    CHECKPOINT();

    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);

    /*
     * The wide stores below write `xor` as a whole, so at 16 bpp both
     * 16-bit halves have to carry the same pixel value.
     */
    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
	return FALSE;

    if (bpp != 16 && bpp != 32)
	return FALSE;

    /*
     * Locate the first byte of the rectangle and convert width and stride
     * to bytes.  stride is first rescaled from FbBits units to pixel units
     * for the pointer arithmetic, then to bytes for stepping between rows.
     *
     * NOTE(review): the offsets from fbGetDrawable() are subtracted here;
     * confirm the sign against how other fb code applies xoff/yoff.
     */
    if (bpp == 16)
    {
	stride = stride * sizeof (FbBits) / 2;
	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
	byte_width = 2 * width;
	stride *= 2;
    }
    else
    {
	stride = stride * sizeof (FbBits) / 4;
	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
	byte_width = 4 * width;
	stride *= 4;
    }

    /* Replicate the 32-bit value into both halves of a 64-bit MMX operand. */
    fill = ((ullong)xor << 32) | xor;
    vfill = (Vector8x8)fill;

    while (height--)
    {
	CARD8 *d = byte_line;
	int    w = byte_width;	/* bytes still to be written in this row */

	byte_line += stride;

	/* 16-bit stores until d is 4-byte aligned. */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(CARD16 *)d = xor;
	    w -= 2;
	    d += 2;
	}

	/*
	 * 32-bit stores until d is 8-byte aligned.  The pointer is cast to
	 * unsigned long, as in the loop above, rather than to unsigned int:
	 * casting a pointer to a narrower integer truncates it on LP64
	 * targets and draws a compiler diagnostic.
	 */
	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(CARD32 *)d = xor;
	    w -= 4;
	    d += 4;
	}

	/* Bulk of the row: 64 bytes per iteration as eight 8-byte stores. */
	while (w >= 64)
	{
	    __asm__ __volatile  (
		"movq %0, (%1)\n\t"
		"movq %0, 8(%1)\n\t"
		"movq %0, 16(%1)\n\t"
		"movq %0, 24(%1)\n\t"
		"movq %0, 32(%1)\n\t"
		"movq %0, 40(%1)\n\t"
		"movq %0, 48(%1)\n\t"
		"movq %0, 56(%1)\n\t"
		: /* no output */
		: "y" (vfill), "r" (d)
		: "memory");
	    w -= 64;
	    d += 64;
	}

	/* Tail: remaining 32-bit words, then at most one 16-bit pixel. */
	while (w >= 4)
	{
	    *(CARD32 *)d = xor;
	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(CARD16 *)d = xor;
	    w -= 2;
	    d += 2;
	}
    }

    /* emms() is defined elsewhere; presumably it resets the MMX state. */
    emms();
    return TRUE;
}

-- 
Ned Konz
http://bike-nomad.com/squeak/