[Pixman] [PATCH v2 07/11] vmx: implement fast path vmx_composite_over_n_8888_8888_ca
Oded Gabbay
oded.gabbay at gmail.com
Wed Jul 15 06:36:31 PDT 2015
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
            Before     After      Change
        ------------------------------------
L1           61.92     244.91    +295.53%
L2           62.74     243.3     +287.79%
M            63.03     241.94    +283.85%
HT           59.91     144.22    +140.73%
VT           59.4      174.39    +193.59%
R            53.6      111.37    +107.78%
RT           37.99      46.38     +22.08%
Kops/s      436        506        +16.06%
cairo trimmed benchmarks:
Speedups
========
t-xfce4-terminal-a1 1540.37 -> 1226.14 : 1.26x
t-firefox-talos-gfx 1488.59 -> 1209.19 : 1.23x
Slowdowns
=========
t-evolution 553.88 -> 581.63 : 1.05x
t-poppler 364.99 -> 383.79 : 1.05x
t-firefox-scrolling 1223.65 -> 1304.34 : 1.07x
The slowdowns occur in cases where the images are small and not aligned
to a 16-byte boundary. In that case, the function first processes the
unaligned head of each scanline one pixel at a time. For small images,
the overhead of these scalar operations can outweigh the savings we get
from using the vmx instructions on the aligned part of the image.
In the C fast-path implementation there is no special treatment of the
unaligned part, as it works in 4-byte quantities over the entire image.
Because llbb is a synthetic test, I would assume it has far fewer
alignment issues than "real-world" scenarios, such as the cairo
benchmarks, which are essentially recorded traces of real application
activity.
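
For illustration only (this snippet is not part of the patch), here is a
minimal standalone C sketch of the head / vector-body / tail split
described above; the buffer, width and the count_loop_work helper are
made up for the example. It does no compositing, it only counts how many
pixels of a scanline fall into the scalar loops versus the 16-byte
aligned vector loop, which is where the overhead on small, unaligned
images comes from:

/* Illustrative sketch (hypothetical helper, not pixman code): count how
 * many 32-bit pixels of a scanline are handled by the scalar head loop,
 * the 16-byte-aligned vector loop, and the scalar tail loop. */
#include <stdint.h>
#include <stdio.h>

static void
count_loop_work (uint32_t *dst, int width, int *head, int *vec, int *tail)
{
    int w = width;

    *head = *vec = *tail = 0;

    /* scalar head: one 32-bit pixel at a time until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst & 15))
    {
	(*head)++;
	dst++;
	w--;
    }

    /* vector body: 4 pixels (16 bytes) per iteration */
    while (w >= 4)
    {
	(*vec) += 4;
	dst += 4;
	w -= 4;
    }

    /* scalar tail: remaining 1-3 pixels */
    while (w)
    {
	(*tail)++;
	dst++;
	w--;
    }
}

int
main (void)
{
    /* a deliberately misaligned 8-pixel scanline: starts 4 bytes past a
     * 16-byte boundary, so 3 head + 4 vector + 1 tail pixels */
    static uint32_t buf[16] __attribute__((aligned(16)));
    int head, vec, tail;

    count_loop_work (buf + 1, 8, &head, &vec, &tail);
    printf ("head=%d vector=%d tail=%d\n", head, vec, tail);

    return 0;
}

Built with gcc, it prints "head=3 vector=4 tail=1" for that 8-pixel
scanline, i.e. half of the pixels never reach the vector loop.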
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
---
pixman/pixman-vmx.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 112 insertions(+)
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 641c487..47393dc 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2715,6 +2715,114 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
}
static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ vector unsigned int vsrc, valpha, vmask, vdest;
+
+ vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+ vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src),
+ (vector unsigned int) AVV(0));
+
+ valpha = expand_alpha_1x128(vsrc);
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (uintptr_t)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ vmask = unpack_32_1x128(m);
+ vdest = unpack_32_1x128(d);
+
+ *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ /* pm is NOT necessarily 16-byte aligned */
+ vmx_mask = load_128_unaligned (pm);
+
+ pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0));
+
+ /* if all bits in mask are zero, pack_cmp is not 0 */
+ if (pack_cmp == 0)
+ {
+ /* pd is 16-byte aligned */
+ vmx_dst = load_128_aligned (pd);
+
+ unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+ &vmx_mask_lo, &vmx_mask_hi);
+
+ unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ in_over_2x128 (&vsrc, &vsrc,
+ &valpha, &valpha,
+ &vmx_mask_lo, &vmx_mask_hi,
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ vmask = unpack_32_1x128(m);
+ vdest = unpack_32_1x128(d);
+
+ *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
+}
+
+static void
vmx_composite_add_8_8 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -2796,6 +2904,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
/* PIXMAN_OP_ADD */
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
--
2.4.3