[Pixman] [PATCH] sse2: Add a fast path for add_n_8888
Matt Turner
mattst88 at gmail.com
Wed Jan 2 07:47:06 PST 2013
On Wed, Jan 2, 2013 at 3:01 AM, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> This path is being exercised by inplace compositing of trapezoids, for
> instance as used in the firefox-asteroids cairo-trace.
>
> core2 @ 2.66GHz,
>
> reference memcpy speed = 4898.2MB/s (1224.6MP/s for 32bpp fills)
>
> before: add_n_8888 = L1: 4.36 L2: 4.27 M: 1.61 ( 0.13%) HT:
> 1.65 VT: 1.63 R: 1.63 RT: 1.59 ( 21Kops/s)
>
> after: add_n_8888 = L1:2969.09 L2:3926.11 M:603.30 ( 49.27%) HT:524.69
> VT:401.01 R:407.59 RT:210.34 ( 804Kops/s)
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> pixman/pixman-sse2.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 63 insertions(+)
>
> diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
> index 665eead..73eee68 100644
> --- a/pixman/pixman-sse2.c
> +++ b/pixman/pixman-sse2.c
> @@ -4519,9 +4519,70 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
>
> sse2_combine_add_u (imp, op, dst, src, NULL, width);
> }
> +}
> +
> +static void
> +sse2_composite_add_n_8888 (pixman_implementation_t *imp,
> + pixman_composite_info_t *info)
> +{
> + PIXMAN_COMPOSITE_ARGS (info);
> + uint32_t *dst_line, *dst, src;
> + int dst_stride;
> +
> + __m128i xmm_src;
> +
> + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
> +
> + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
> + if (src == 0)
> + return;
> +
> + if (src == ~0)
> + {
> + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
> + dest_x, dest_y, width, height, ~0);
> +
> + return;
> + }
> +
> + xmm_src = _mm_set_epi32 (src, src, src, src);
> + while (height--)
> + {
> + int w = width;
> + uint32_t d;
>
> + dst = dst_line;
> + dst_line += dst_stride;
> +
> + while (w && (unsigned long)dst & 15)
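Use uintptr_t instead of unsigned long for this cast: unsigned long is not guaranteed to be pointer-sized (it is 32 bits on 64-bit Windows). The rest of the patch looks good to me.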
> + {
> + d = *dst;
> + *dst++ =
> + _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
> + w--;
> + }
> +
> + while (w >= 4)
> + {
> + save_128_aligned
> + ((__m128i*)dst,
> + _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
> +
> + dst += 4;
> + w -= 4;
> + }
> +
> + while (w--)
> + {
> + d = *dst;
> + *dst++ =
> + _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
> + _mm_cvtsi32_si128 (d)));
> + }
> + }
> }
>
> +
> static pixman_bool_t
> pixman_blt_sse2 (uint32_t *src_bits,
> uint32_t *dst_bits,
> @@ -5814,6 +5875,8 @@ static const pixman_fast_path_t sse2_fast_paths[] =
> PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
> PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
> PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
> + PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
> + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
>
> /* PIXMAN_OP_SRC */
> PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
> --
> 1.7.10.4
More information about the Pixman
mailing list