[Pixman] [PATCH 04/12] vmx: implement fast path vmx_blt

Siarhei Siamashka siarhei.siamashka at gmail.com
Tue Jul 14 01:40:51 PDT 2015


On Thu,  2 Jul 2015 13:04:09 +0300
Oded Gabbay <oded.gabbay at gmail.com> wrote:

> No changes were observed when running cairo trimmed benchmarks.

Maybe mention that the performance improvements can only be
observed after applying the other patch, "vmx: implement fast path
vmx_composite_copy_area"? Or even squash these two patches.
 
> Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
>
> ---
>  pixman/pixman-vmx.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 124 insertions(+)
> 
> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
> index b9acd6c..b42288b 100644
> --- a/pixman/pixman-vmx.c
> +++ b/pixman/pixman-vmx.c
> @@ -2708,6 +2708,128 @@ vmx_fill (pixman_implementation_t *imp,
>      return TRUE;
>  }
>  
> +static pixman_bool_t
> +vmx_blt (pixman_implementation_t *imp,
> +          uint32_t *               src_bits,
> +          uint32_t *               dst_bits,
> +          int                      src_stride,
> +          int                      dst_stride,
> +          int                      src_bpp,
> +          int                      dst_bpp,
> +          int                      src_x,
> +          int                      src_y,
> +          int                      dest_x,
> +          int                      dest_y,
> +          int                      width,
> +          int                      height)
> +{
> +    uint8_t *   src_bytes;
> +    uint8_t *   dst_bytes;
> +    int byte_width;
> +
> +    if (src_bpp != dst_bpp)
> +	return FALSE;
> +
> +    if (src_bpp == 16)
> +    {
> +	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
> +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
> +	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
> +	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
> +	byte_width = 2 * width;
> +	src_stride *= 2;
> +	dst_stride *= 2;
> +    }
> +    else if (src_bpp == 32)
> +    {
> +	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
> +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
> +	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
> +	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
> +	byte_width = 4 * width;
> +	src_stride *= 4;
> +	dst_stride *= 4;
> +    }
> +    else
> +    {
> +	return FALSE;
> +    }
> +
> +    while (height--)
> +    {
> +	int w;
> +	uint8_t *s = src_bytes;
> +	uint8_t *d = dst_bytes;
> +	src_bytes += src_stride;
> +	dst_bytes += dst_stride;
> +	w = byte_width;
> +
> +	while (w >= 2 && ((uintptr_t)d & 3))
> +	{
> +	    *(uint16_t *)d = *(uint16_t *)s;
> +	    w -= 2;
> +	    s += 2;
> +	    d += 2;
> +	}
> +
> +	while (w >= 4 && ((uintptr_t)d & 15))
> +	{
> +	    *(uint32_t *)d = *(uint32_t *)s;
> +
> +	    w -= 4;
> +	    s += 4;
> +	    d += 4;
> +	}
> +
> +	while (w >= 64)
> +	{
> +	    vector unsigned int vmx0, vmx1, vmx2, vmx3;
> +
> +	    vmx0 = load_128_unaligned ((uint32_t*) s);
> +	    vmx1 = load_128_unaligned ((uint32_t*)(s + 16));
> +	    vmx2 = load_128_unaligned ((uint32_t*)(s + 32));
> +	    vmx3 = load_128_unaligned ((uint32_t*)(s + 48));

On big-endian systems the unaligned loads are not implemented very
efficiently: we get roughly twice as many loads as necessary. This
whole loop compiles into the following code on a 32-bit big-endian
system with GCC 4.8.4:

    b170:	39 69 00 10 	addi    r11,r9,16
    b174:	7c 80 48 0c 	lvsl    v4,r0,r9
    b178:	38 a9 00 20 	addi    r5,r9,32
    b17c:	7c a0 48 ce 	lvx     v5,r0,r9
    b180:	38 c9 00 30 	addi    r6,r9,48
    b184:	7d 87 48 ce 	lvx     v12,r7,r9
    b188:	7c c0 58 0c 	lvsl    v6,r0,r11
    b18c:	39 29 00 40 	addi    r9,r9,64
    b190:	7c e0 58 ce 	lvx     v7,r0,r11
    b194:	7d a7 58 ce 	lvx     v13,r7,r11
    b198:	39 6a 00 10 	addi    r11,r10,16
    b19c:	7d 00 28 0c 	lvsl    v8,r0,r5
    b1a0:	7d 20 28 ce 	lvx     v9,r0,r5
    b1a4:	7c 27 28 ce 	lvx     v1,r7,r5
    b1a8:	38 aa 00 20 	addi    r5,r10,32
    b1ac:	7d 40 30 0c 	lvsl    v10,r0,r6
    b1b0:	7d 60 30 ce 	lvx     v11,r0,r6
    b1b4:	7c 07 30 ce 	lvx     v0,r7,r6
    b1b8:	38 ca 00 30 	addi    r6,r10,48
    b1bc:	11 85 61 2b 	vperm   v12,v5,v12,v4
    b1c0:	11 a7 69 ab 	vperm   v13,v7,v13,v6
    b1c4:	10 29 0a 2b 	vperm   v1,v9,v1,v8
    b1c8:	10 0b 02 ab 	vperm   v0,v11,v0,v10
    b1cc:	7d 80 51 ce 	stvx    v12,r0,r10
    b1d0:	39 4a 00 40 	addi    r10,r10,64
    b1d4:	7d a0 59 ce 	stvx    v13,r0,r11
    b1d8:	7c 20 29 ce 	stvx    v1,r0,r5
    b1dc:	7c 00 31 ce 	stvx    v0,r0,r6
    b1e0:	42 00 ff 90 	bdnz+   b170 <vmx_blt.part.4+0x260>

As we can see, there are 8 LVX instructions but only 4 STVX. It is
difficult even for a clever compiler to remove the redundant loads,
because of the special 15-byte offset used for the second load in
each pair. This is a very PowerPC-specific way of handling unaligned
loads.
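
Just to illustrate, the redundant loads could be avoided by sharing
the aligned loads between the four chunks of each 64-byte block,
using plain AltiVec intrinsics. A rough, untested sketch for the
big-endian case only (the temporary names are made up, and this is
just a sketch, not a replacement for load_128_unaligned in general):

    /* Inside "while (w >= 64)", in place of the declaration of
     * vmx0..vmx3 and the four load_128_unaligned calls (needs
     * <altivec.h>). The permute vector depends only on the low
     * 4 bits of 's', so it could even be hoisted out of the loop,
     * as 's' advances by 64 bytes per iteration.
     */
    vector unsigned char perm = vec_lvsl (0, s);
    vector unsigned int t0, t1, t2, t3, t4;
    vector unsigned int vmx0, vmx1, vmx2, vmx3;

    t0 = vec_ld (0,  (const unsigned int *) s);
    t1 = vec_ld (16, (const unsigned int *) s);
    t2 = vec_ld (32, (const unsigned int *) s);
    t3 = vec_ld (48, (const unsigned int *) s);
    /* the block containing s + 63: never reads past the last needed byte */
    t4 = vec_ld (63, (const unsigned int *) s);

    vmx0 = vec_perm (t0, t1, perm); /* bytes s      .. s + 15 */
    vmx1 = vec_perm (t1, t2, perm); /* bytes s + 16 .. s + 31 */
    vmx2 = vec_perm (t2, t3, perm); /* bytes s + 32 .. s + 47 */
    vmx3 = vec_perm (t3, t4, perm); /* bytes s + 48 .. s + 63 */

That would be 5 LVX and 4 VPERM per 64-byte block instead of 8 LVX,
4 LVSL and 4 VPERM, and it still works when 's' happens to be aligned
(the permute from LVSL then degenerates to the identity).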

We had a similar discussion about ARM WMMX unaligned loads several
years ago:
    http://lists.freedesktop.org/archives/pixman/2011-July/001337.html
    http://lists.freedesktop.org/archives/pixman/2011-August/001361.html

Nothing was done at that time though, so there is still room for
improvement for WMMX too :-)


> +	    save_128_aligned ((uint32_t*)(d), vmx0);
> +	    save_128_aligned ((uint32_t*)(d + 16), vmx1);
> +	    save_128_aligned ((uint32_t*)(d + 32), vmx2);
> +	    save_128_aligned ((uint32_t*)(d + 48), vmx3);
> +
> +	    s += 64;
> +	    d += 64;
> +	    w -= 64;
> +	}
> +
> +	while (w >= 16)
> +	{
> +	    save_128_aligned ((uint32_t*) d, load_128_unaligned ((uint32_t*) s));
> +
> +	    w -= 16;
> +	    d += 16;
> +	    s += 16;
> +	}
> +
> +	while (w >= 4)
> +	{
> +	    *(uint32_t *)d = *(uint32_t *)s;
> +
> +	    w -= 4;
> +	    s += 4;
> +	    d += 4;
> +	}
> +
> +	if (w >= 2)
> +	{
> +	    *(uint16_t *)d = *(uint16_t *)s;
> +	    w -= 2;
> +	    s += 2;
> +	    d += 2;
> +	}
> +    }
> +
> +    return TRUE;
> +}
> +
>  static void
>  vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
>                                 pixman_composite_info_t *info)
> @@ -2812,6 +2934,7 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
>  
>  static const pixman_fast_path_t vmx_fast_paths[] =
>  {
> +    /* PIXMAN_OP_OVER */
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
> @@ -2865,6 +2988,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
>      imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
>      imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
>  
> +    imp->blt = vmx_blt;
>      imp->fill = vmx_fill;
>  
>      return imp;

Still, because the "vmx_composite_copy_area" benchmarks show a
performance improvement in practice (even with the less-than-perfect
unaligned load handling on big-endian systems), I see no reason to
complain. So

Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>

(if the commit message is updated to include benchmark results)

-- 
Best regards,
Siarhei Siamashka

