[Liboil] a copy8x8_u8

Adam D. Moss adam at gimp.org
Wed Nov 16 12:07:55 PST 2005


Unrolling copy8x8_u8_ints yields a ~30% speedup here (I guess
gcc4 doesn't bother to unroll the loop itself).  Using uint64_t is,
surprisingly, a little slower than this.  I don't actually use this
function, I was just curious - perhaps no-one uses it, in which case
more implementations aren't justified.

static void
copy8x8_u8_ints_unrolled (uint8_t *d1, int ds, uint8_t *s1, int ss)
{
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
   d1+=ds; s1+=ss;
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
   d1+=ds; s1+=ss;
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
   d1+=ds; s1+=ss;
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
   d1+=ds; s1+=ss;
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
   d1+=ds; s1+=ss;
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
   d1+=ds; s1+=ss;
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
   d1+=ds; s1+=ss;
   ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
   ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
}
/* NOTE(review): presumably registers copy8x8_u8_ints_unrolled as an
 * alternative implementation of the copy8x8_u8 function class -- confirm
 * against the OIL_DEFINE_IMPL definition, which is not visible here. */
OIL_DEFINE_IMPL (copy8x8_u8_ints_unrolled, copy8x8_u8);



More information about the Liboil mailing list