[Pixman] [PATCH] sse2: Add a fast path for add_n_8_8888
Chris Wilson
chris at chris-wilson.co.uk
Wed Jan 2 08:36:52 PST 2013
This path is exercised by trapezoid compositing, for instance in the
firefox-asteroids cairo-trace, where it appears to be the last
remaining non-SSE2 kernel.
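For reference, the per-pixel operation this path implements is
dst = dst ADD (src IN mask): scale the solid source by the a8 mask
byte, then add to the destination with per-channel saturation. A
scalar sketch follows; the helper names are illustrative, not
pixman's own:

#include <stdint.h>

/* Rounded multiply of two 8-bit values in [0, 255]: a * b / 255. */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a * b + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* Saturating add of two 8-bit channels. */
static uint8_t
add_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a + b;
    return t > 0xff ? 0xff : (uint8_t)t;
}

/* One *8888 pixel: dst = dst ADD (src IN m), applied per channel. */
static uint32_t
add_n_8_8888_pixel (uint32_t dst, uint32_t src, uint8_t m)
{
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t s = (src >> shift) & 0xff;
        uint8_t d = (dst >> shift) & 0xff;
        result |= (uint32_t)add_un8 (d, mul_un8 (s, m)) << shift;
    }
    return result;
}

The SSE2 version below performs the same computation on unpacked
16-bit channels, four pixels at a time, with _mm_adds_epu16 providing
the saturating add.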
core2 @ 2.66GHz,
reference memcpy speed = 4898.2MB/s (1224.6MP/s for 32bpp fills)
before: add_n_8_8888 = L1: 4.53 L2: 4.39 M: 1.62 ( 0.20%) HT: 1.66 VT: 1.63 R: 1.62 RT: 1.55 ( 21Kops/s)
after: add_n_8_8888 = L1: 531.03 L2: 531.91 M:446.21 ( 54.56%) HT: 298.44 VT:258.56 R:161.87 RT: 63.80 ( 441Kops/s)
firefox-asteroids (xvfb): 6.108s -> 5.943s
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
pixman/pixman-sse2.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 97 insertions(+)
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 8ef6afb..589a100 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -4586,6 +4586,101 @@ sse2_composite_add_n_8888 (pixman_implementation_t *imp,
     }
 }
 
+static void
+sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+        return;
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+        w = width;
+
+        /* Leading pixels, one at a time, until dst is 16-byte aligned. */
+        while (w && ((uintptr_t)dst & 15))
+        {
+            uint8_t m = *mask++;
+            if (m)
+            {
+                *dst = pack_1x128_32
+                    (_mm_adds_epu16
+                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+                      unpack_32_1x128 (*dst)));
+            }
+            dst++;
+            w--;
+        }
+
+        /* Four pixels at a time; the four mask bytes are fetched as a
+         * single word so fully transparent spans are skipped cheaply.
+         */
+        while (w >= 4)
+        {
+            uint32_t m = *(uint32_t*)mask;
+            if (m)
+            {
+                __m128i xmm_mask_lo, xmm_mask_hi;
+                __m128i xmm_dst_lo, xmm_dst_hi;
+
+                __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
+                __m128i xmm_mask =
+                    _mm_unpacklo_epi8 (unpack_32_1x128 (m),
+                                       _mm_setzero_si128 ());
+
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                        &xmm_mask_lo, &xmm_mask_hi);
+
+                pix_multiply_2x128 (&xmm_src, &xmm_src,
+                                    &xmm_mask_lo, &xmm_mask_hi,
+                                    &xmm_mask_lo, &xmm_mask_hi);
+
+                /* dst = dst ADD (src IN mask), with unsigned saturation. */
+                xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+                xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+                save_128_aligned (
+                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            w -= 4;
+            dst += 4;
+            mask += 4;
+        }
+
+        /* Trailing pixels, one at a time. */
+        while (w)
+        {
+            uint8_t m = *mask++;
+            if (m)
+            {
+                *dst = pack_1x128_32
+                    (_mm_adds_epu16
+                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+                      unpack_32_1x128 (*dst)));
+            }
+            dst++;
+            w--;
+        }
+    }
+}
 
 static pixman_bool_t
 sse2_blt (pixman_implementation_t *imp,
@@ -5911,6 +6006,8 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
 
     /* PIXMAN_OP_SRC */
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
--
1.7.10.4
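Not part of the patch, but for completeness: a minimal standalone
caller that should be routed to the new fast path on an SSE2-capable
build (the dimensions and mask contents are arbitrary). Build against
pixman-1 (pkg-config --cflags --libs pixman-1):

#include <pixman.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
    /* Solid red source; pixman_color_t channels are 16-bit. */
    pixman_color_t red = { .red = 0xffff, .green = 0, .blue = 0,
                           .alpha = 0xffff };
    uint32_t *dst_bits = calloc (64 * 64, 4);
    uint8_t *mask_bits = malloc (64 * 64);

    memset (mask_bits, 0x7f, 64 * 64);

    pixman_image_t *src = pixman_image_create_solid_fill (&red);
    pixman_image_t *mask = pixman_image_create_bits (
        PIXMAN_a8, 64, 64, (uint32_t *) mask_bits, 64);
    pixman_image_t *dst = pixman_image_create_bits (
        PIXMAN_a8r8g8b8, 64, 64, dst_bits, 64 * 4);

    /* OP_ADD + solid source + a8 mask + a8r8g8b8 dest matches the new
     * PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8) entry. */
    pixman_image_composite32 (PIXMAN_OP_ADD, src, mask, dst,
                              0, 0, 0, 0, 0, 0, 64, 64);

    pixman_image_unref (src);
    pixman_image_unref (mask);
    pixman_image_unref (dst);
    free (dst_bits);
    free (mask_bits);
    return 0;
}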