[Liboil] a copy8x8_u8
Adam D. Moss
adam at gimp.org
Wed Nov 16 12:07:55 PST 2005
Unrolling copy8x8_u8_ints yields a ~30% speedup here (I guess
gcc4 doesn't bother to unroll the loop itself). Using uint64_t is surprisingly a little
slower than this. I don't actually use this function, I was just
curious - perhaps no-one uses it, so more implementations aren't
justified.
/*
 * Copy one 8-byte row as two 32-bit words, low word first.
 * Local to copy8x8_u8_ints_unrolled; undefined again right after it.
 */
#define COPY8X8_ROW_AS_WORDS(dst, src)                    \
  do {                                                    \
    uint32_t *row_out = (uint32_t *) (dst);               \
    const uint32_t *row_in = (const uint32_t *) (src);    \
    row_out[0] = row_in[0];                               \
    row_out[1] = row_in[1];                               \
  } while (0)

/*
 * copy8x8_u8_ints_unrolled:
 * @d1: destination, first byte of the first row
 * @ds: distance in bytes from one destination row to the next
 * @s1: source, first byte of the first row
 * @ss: distance in bytes from one source row to the next
 *
 * Copies an 8x8 block of bytes from @s1 to @d1.  The eight rows are
 * written out explicitly instead of looped over, and each row is moved
 * with two 32-bit loads/stores.
 *
 * NOTE(review): the rows are accessed through uint32_t pointers, so
 * every row start presumably has to be suitably aligned for 32-bit
 * access on strict-alignment targets, and the access relies on the
 * compiler tolerating this kind of type punning -- confirm against the
 * callers and the build flags.
 */
static void
copy8x8_u8_ints_unrolled (uint8_t *d1, int ds, uint8_t *s1, int ss)
{
  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 0 */
  d1 += ds;
  s1 += ss;

  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 1 */
  d1 += ds;
  s1 += ss;

  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 2 */
  d1 += ds;
  s1 += ss;

  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 3 */
  d1 += ds;
  s1 += ss;

  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 4 */
  d1 += ds;
  s1 += ss;

  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 5 */
  d1 += ds;
  s1 += ss;

  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 6 */
  d1 += ds;
  s1 += ss;

  /* Last row: the pointers are not advanced past it. */
  COPY8X8_ROW_AS_WORDS (d1, s1);        /* row 7 */
}

#undef COPY8X8_ROW_AS_WORDS
/* NOTE(review): presumably registers copy8x8_u8_ints_unrolled as one
 * candidate implementation of the copy8x8_u8 function class; the macro
 * is defined elsewhere -- confirm against the liboil headers. */
OIL_DEFINE_IMPL (copy8x8_u8_ints_unrolled, copy8x8_u8);
More information about the Liboil
mailing list