[Pixman] [PATCH 06/12] vmx: implement fast path vmx_composite_over_n_8888_8888_ca

Thu Jul 2 03:04:11 PDT 2015

POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.

reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)

                Before           After           Change
              ---------------------------------------------
L1              61.92            244.91          +295.53%
L2              62.74            243.3           +287.79%
M               63.03            241.94          +283.85%
HT              59.91            144.22          +140.73%
VT              59.4             174.39          +193.59%
R               53.6             111.37          +107.78%
RT              37.99            46.38           +22.08%
Kops/s          436              506             +16.06%

cairo trimmed benchmarks :

Speedups
========
t-xfce4-terminal-a1  1540.37 -> 1226.14 :  1.26x
t-firefox-talos-gfx  1488.59 -> 1209.19 :  1.23x

Slowdowns
=========
        t-evolution  553.88  -> 581.63  :  1.05x
          t-poppler  364.99  -> 383.79  :  1.05x
t-firefox-scrolling  1223.65 -> 1304.34 :  1.07x

Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
---
 pixman/pixman-vmx.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index e69d530..966219f 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2871,6 +2871,114 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
 }
 
 static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    vector unsigned int vsrc, valpha, vmask, vdest;
+
+    vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src),
+			    (vector unsigned int) AVV(0));
+
+    valpha = expand_alpha_1x128(vsrc);
+
+    while (height--)
+    {
+	int w = width;
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+
+	while (w && (uintptr_t)pd & 15)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		vmask = unpack_32_1x128(m);
+		vdest = unpack_32_1x128(d);
+
+		*pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    /* pm is NOT necessarily 16-byte aligned */
+	    vmx_mask = load_128_unaligned (pm);
+
+	    pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0));
+
+	    /* if all bits in mask are zero, pack_cmp is not 0 */
+	    if (pack_cmp == 0)
+	    {
+		/* pd is 16-byte aligned */
+		vmx_dst = load_128_aligned (pd);
+
+		unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+				    &vmx_mask_lo, &vmx_mask_hi);
+
+		unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+				    &vmx_dst_lo, &vmx_dst_hi);
+
+		in_over_2x128 (&vsrc, &vsrc,
+			       &valpha, &valpha,
+			       &vmx_mask_lo, &vmx_mask_hi,
+			       &vmx_dst_lo, &vmx_dst_hi);
+
+		save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi));
+	    }
+
+	    pd += 4;
+	    pm += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		vmask = unpack_32_1x128(m);
+		vdest = unpack_32_1x128(d);
+
+		*pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+    }
+}
+
+static void
 vmx_composite_add_8_8 (pixman_implementation_t *imp,
             pixman_composite_info_t *info)
 {
@@ -2953,6 +3061,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, vmx_composite_copy_area),
     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, vmx_composite_copy_area),
 
-- 
2.4.3