pixman: Branch 'master' - 11 commits
Oded Gabbay
gabbayo at kemper.freedesktop.org
Wed Jul 29 00:50:45 PDT 2015
pixman/pixman-vmx.c | 1185 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 1159 insertions(+), 26 deletions(-)
New commits:
commit 8d9be3619a906855a3e3a1e052317833cb24cabe
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Wed Jul 1 14:34:07 2015 +0300
vmx: implement fast path iterator vmx_fetch_a8
no changes were observed when running cairo trimmed benchmarks.
Running "lowlevel-blt-bench src_8_8888" on POWER8, 8 cores,
3.4GHz, RHEL 7.1 ppc64le gave the following results:
reference memcpy speed = 25197.2MB/s (6299.3MP/s for 32bpp fills)
Before After Change
--------------------------------------------
L1 965.34 3936 +307.73%
L2 942.99 3436.29 +264.40%
M 902.24 2757.77 +205.66%
HT 448.46 784.99 +75.04%
VT 430.05 819.78 +90.62%
R 412.9 717.04 +73.66%
RT 168.93 220.63 +30.60%
Kops/s 1025 1303 +27.12%
It was benchmarked against commit id e2d211a from pixman/master
Siarhei Siamashka reported that on playstation3, it shows the following
results:
== before ==
src_8_8888 = L1: 194.37 L2: 198.46 M:155.90 (148.35%)
HT: 59.18 VT: 36.71 R: 38.93 RT: 12.79 ( 106Kops/s)
== after ==
src_8_8888 = L1: 373.96 L2: 391.10 M:245.81 (233.88%)
HT: 80.81 VT: 44.33 R: 48.10 RT: 14.79 ( 122Kops/s)
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 773ad76..a9bd024 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -3139,6 +3139,49 @@ vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
return iter->buffer;
}
+static uint32_t *
+vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ uint32_t *dst = iter->buffer;
+ uint8_t *src = iter->bits;
+ vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+
+ iter->bits += iter->stride;
+
+ while (w && (((uintptr_t)dst) & 15))
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ vmx0 = load_128_unaligned((uint32_t *) src);
+
+ unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
+ unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
+ unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+
+ save_128_aligned(dst, vmx6);
+ save_128_aligned((dst + 4), vmx5);
+ save_128_aligned((dst + 8), vmx4);
+ save_128_aligned((dst + 12), vmx3);
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
#define IMAGE_FLAGS \
(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
@@ -3148,6 +3191,9 @@ static const pixman_iter_info_t vmx_iters[] =
{ PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
_pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
},
+ { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+ _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
+ },
{ PIXMAN_null },
};
commit 47f74ca94637d79ee66c37a81eea0200e453fcc1
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Mon Jun 29 15:31:02 2015 +0300
vmx: implement fast path iterator vmx_fetch_x8r8g8b8
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
cairo trimmed benchmarks :
Speedups
========
t-firefox-asteroids 533.92 -> 489.94 : 1.09x
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 0950850..773ad76 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -3105,6 +3105,52 @@ static const pixman_fast_path_t vmx_fast_paths[] =
{ PIXMAN_OP_NONE },
};
+static uint32_t *
+vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ vector unsigned int ff000000 = mask_ff000000;
+ uint32_t *dst = iter->buffer;
+ uint32_t *src = (uint32_t *)iter->bits;
+
+ iter->bits += iter->stride;
+
+ while (w && ((uintptr_t)dst) & 0x0f)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
+#define IMAGE_FLAGS \
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
+ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t vmx_iters[] =
+{
+ { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+ _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
+ },
+ { PIXMAN_null },
+};
+
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
@@ -3147,5 +3193,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
imp->fill = vmx_fill;
+ imp->iter_info = vmx_iters;
+
return imp;
}
commit fcbb97d4458d717b9c15858aedcbee2d33c8ac5a
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Sun Jun 28 23:25:24 2015 +0300
vmx: implement fast path scaled nearest vmx_8888_8888_OVER
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 134.36 181.68 +35.22%
L2 135.07 180.67 +33.76%
M 134.6 180.51 +34.11%
HT 121.77 128.79 +5.76%
VT 120.49 145.07 +20.40%
R 93.83 102.3 +9.03%
RT 50.82 46.93 -7.65%
Kops/s 448 422 -5.80%
cairo trimmed benchmarks :
Speedups
========
t-firefox-asteroids 533.92 -> 497.92 : 1.07x
t-midori-zoomed 692.98 -> 651.24 : 1.06x
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 64e9125..0950850 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2954,6 +2954,129 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
}
}
+static force_inline void
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t src_width_fixed,
+ pixman_bool_t fully_transparent_src)
+{
+ uint32_t s, d;
+ const uint32_t* pm = NULL;
+
+ vector unsigned int vmx_dst_lo, vmx_dst_hi;
+ vector unsigned int vmx_src_lo, vmx_src_hi;
+ vector unsigned int vmx_alpha_lo, vmx_alpha_hi;
+
+ if (fully_transparent_src)
+ return;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((uintptr_t)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ *pd++ = core_combine_over_u_pixel_vmx (s, d);
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ vector unsigned int tmp;
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp2 = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp3 = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp4 = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ tmp[0] = tmp1;
+ tmp[1] = tmp2;
+ tmp[2] = tmp3;
+ tmp[3] = tmp4;
+
+ vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm);
+
+ if (is_opaque (vmx_src_hi))
+ {
+ save_128_aligned (pd, vmx_src_hi);
+ }
+ else if (!is_zero (vmx_src_hi))
+ {
+ vmx_dst_hi = load_128_aligned (pd);
+
+ unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0),
+ &vmx_src_lo, &vmx_src_hi);
+
+ unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0),
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ expand_alpha_2x128 (
+ vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi);
+
+ over_2x128 (&vmx_src_lo, &vmx_src_hi,
+ &vmx_alpha_lo, &vmx_alpha_hi,
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ /* rebuid the 4 pixel data and save*/
+ save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+ }
+
+ w -= 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ *pd++ = core_combine_over_u_pixel_vmx (s, d);
+ if (pm)
+ pm++;
+
+ w--;
+ }
+}
+
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
@@ -2974,6 +3097,11 @@ static const pixman_fast_path_t vmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
+
{ PIXMAN_OP_NONE },
};
commit ad612c4205f0ae46fc72a50e0c90ccd05487fcba
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Sun Jun 28 22:23:44 2015 +0300
vmx: implement fast path vmx_composite_src_x888_8888
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 1115.4 5006.49 +348.85%
L2 1112.26 4338.01 +290.02%
M 1110.54 2524.15 +127.29%
HT 745.41 1140.03 +52.94%
VT 749.03 1287.13 +71.84%
R 423.91 547.6 +29.18%
RT 205.79 194.98 -5.25%
Kops/s 1414 1361 -3.75%
cairo trimmed benchmarks :
Speedups
========
t-gnome-system-monitor 1402.62 -> 1212.75 : 1.16x
t-firefox-asteroids 533.92 -> 474.50 : 1.13x
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 47393dc..64e9125 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2689,6 +2689,62 @@ vmx_fill (pixman_implementation_t *imp,
}
static void
+vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int32_t w;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (uintptr_t)dst & 15)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;
+
+ vmx_src1 = load_128_unaligned (src);
+ vmx_src2 = load_128_unaligned (src + 4);
+ vmx_src3 = load_128_unaligned (src + 8);
+ vmx_src4 = load_128_unaligned (src + 12);
+
+ save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
+ save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
+ save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
+ save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+ }
+}
+
+static void
vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -2914,6 +2970,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
+ /* PIXMAN_OP_SRC */
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
+
{ PIXMAN_OP_NONE },
};
commit fafc1d403b8405727d3918bcb605cb98044af90a
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Sun Jun 28 10:14:20 2015 +0300
vmx: implement fast path vmx_composite_over_n_8888_8888_ca
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 61.92 244.91 +295.53%
L2 62.74 243.3 +287.79%
M 63.03 241.94 +283.85%
HT 59.91 144.22 +140.73%
VT 59.4 174.39 +193.59%
R 53.6 111.37 +107.78%
RT 37.99 46.38 +22.08%
Kops/s 436 506 +16.06%
cairo trimmed benchmarks :
Speedups
========
t-xfce4-terminal-a1 1540.37 -> 1226.14 : 1.26x
t-firefox-talos-gfx 1488.59 -> 1209.19 : 1.23x
Slowdowns
=========
t-evolution 553.88 -> 581.63 : 1.05x
t-poppler 364.99 -> 383.79 : 1.05x
t-firefox-scrolling 1223.65 -> 1304.34 : 1.07x
The slowdowns can be explained in cases where the images are small and
un-aligned to 16-byte boundary. In that case, the function will first
work on the un-aligned area, even in operations of 1 byte. In case of
small images, the overhead of such operations can be more than the
savings we get from using the vmx instructions that are done on the
aligned part of the image.
In the C fast-path implementation, there is no special treatment for the
un-aligned part, as it works in 4 byte quantities on the entire image.
Because llbb is a synthetic test, I would assume it has much less
alignment issues than "real-world" scenario, such as cairo benchmarks,
which are basically recorded traces of real application activity.
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 641c487..47393dc 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2715,6 +2715,114 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
}
static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ vector unsigned int vsrc, valpha, vmask, vdest;
+
+ vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+ vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src),
+ (vector unsigned int) AVV(0));
+
+ valpha = expand_alpha_1x128(vsrc);
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (uintptr_t)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ vmask = unpack_32_1x128(m);
+ vdest = unpack_32_1x128(d);
+
+ *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ /* pm is NOT necessarily 16-byte aligned */
+ vmx_mask = load_128_unaligned (pm);
+
+ pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0));
+
+ /* if all bits in mask are zero, pack_cmp is not 0 */
+ if (pack_cmp == 0)
+ {
+ /* pd is 16-byte aligned */
+ vmx_dst = load_128_aligned (pd);
+
+ unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+ &vmx_mask_lo, &vmx_mask_hi);
+
+ unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ in_over_2x128 (&vsrc, &vsrc,
+ &valpha, &valpha,
+ &vmx_mask_lo, &vmx_mask_hi,
+ &vmx_dst_lo, &vmx_dst_hi);
+
+ save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ vmask = unpack_32_1x128(m);
+ vdest = unpack_32_1x128(d);
+
+ *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
+}
+
+static void
vmx_composite_add_8_8 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -2796,6 +2904,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
/* PIXMAN_OP_ADD */
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
commit a3e914407e354df70b9200e263608f1fc2e686cf
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Thu Jun 18 15:05:49 2015 +0300
vmx: implement fast path composite_add_8888_8888
Copied impl. from sse2 file and edited to use vmx functions
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 16 cores, 3.4GHz, ppc64le :
reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 248.76 3284.48 +1220.34%
L2 264.09 2826.47 +970.27%
M 261.24 2405.06 +820.63%
HT 217.27 857.3 +294.58%
VT 213.78 980.09 +358.46%
R 176.61 442.95 +150.81%
RT 107.54 150.08 +39.56%
Kops/s 917 1125 +22.68%
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index e49e8aa..641c487 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2765,6 +2765,31 @@ vmx_composite_add_8_8 (pixman_implementation_t *imp,
}
}
+static void
+vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+
+ vmx_combine_add_u (imp, op, dst, src, NULL, width);
+ }
+}
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
@@ -2774,6 +2799,8 @@ static const pixman_fast_path_t vmx_fast_paths[] =
/* PIXMAN_OP_ADD */
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
{ PIXMAN_OP_NONE },
};
commit d5b5343c7df99082597e0c37aec937dcf5b6602d
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Thu Jun 18 14:56:47 2015 +0300
vmx: implement fast path composite_add_8_8
Copied impl. from sse2 file and edited to use vmx functions
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 16 cores, 3.4GHz, ppc64le :
reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 687.63 9140.84 +1229.33%
L2 715 7495.78 +948.36%
M 717.39 8460.14 +1079.29%
HT 569.56 1020.12 +79.11%
VT 520.3 1215.56 +133.63%
R 514.81 874.35 +69.84%
RT 341.28 305.42 -10.51%
Kops/s 1621 1579 -2.59%
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 9eae31c..e49e8aa 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2714,12 +2714,67 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
}
}
+static void
+vmx_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head */
+ while (w && (uintptr_t)dst & 3)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+
+ vmx_combine_add_u (imp, op,
+ (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+ /* Small tail */
+ dst += w & 0xfffc;
+ src += w & 0xfffc;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+ }
+}
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+
+ /* PIXMAN_OP_ADD */
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+
{ PIXMAN_OP_NONE },
};
commit 339eeaf095f949694d7f79a45171ac03a3b06f90
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Thu Jun 18 14:12:05 2015 +0300
vmx: implement fast path composite_over_8888_8888
Copied impl. from sse2 file and edited to use vmx functions
It was benchmarked against commit id 2be523b from pixman/master
POWER8, 16 cores, 3.4GHz, ppc64le :
reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 129.47 1054.62 +714.57%
L2 138.31 1011.02 +630.98%
M 139.99 1008.65 +620.52%
HT 122.11 468.45 +283.63%
VT 121.06 532.21 +339.62%
R 108.48 240.5 +121.70%
RT 77.87 116.7 +49.87%
Kops/s 758 981 +29.42%
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 61fdb80..9eae31c 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2688,8 +2688,38 @@ vmx_fill (pixman_implementation_t *imp,
return TRUE;
}
+static void
+vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ int dst_stride, src_stride;
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ dst = dst_line;
+ src = src_line;
+
+ while (height--)
+ {
+ vmx_combine_over_u (imp, op, dst, src, NULL, width);
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
{ PIXMAN_OP_NONE },
};
commit 0cc8a2e9714efcb7cdd7e2a94c9cba49c3e29e00
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Sun Jun 28 09:42:19 2015 +0300
vmx: implement fast path vmx_fill
Based on sse2 impl.
It was benchmarked against commit id e2d211a from pixman/master
Tested cairo trimmed benchmarks on POWER8, 8 cores, 3.4GHz,
RHEL 7.1 ppc64le :
speedups
========
t-swfdec-giant-steps 1383.09 -> 718.63 : 1.92x speedup
t-gnome-system-monitor 1403.53 -> 918.77 : 1.53x speedup
t-evolution 552.34 -> 415.24 : 1.33x speedup
t-xfce4-terminal-a1 1573.97 -> 1351.46 : 1.16x speedup
t-firefox-paintball 847.87 -> 734.50 : 1.15x speedup
t-firefox-asteroids 565.99 -> 492.77 : 1.15x speedup
t-firefox-canvas-swscroll 1656.87 -> 1447.48 : 1.14x speedup
t-midori-zoomed 724.73 -> 642.16 : 1.13x speedup
t-firefox-planet-gnome 975.78 -> 911.92 : 1.07x speedup
t-chromium-tabs 292.12 -> 274.74 : 1.06x speedup
t-firefox-chalkboard 690.78 -> 653.93 : 1.06x speedup
t-firefox-talos-gfx 1375.30 -> 1303.74 : 1.05x speedup
t-firefox-canvas-alpha 1016.79 -> 967.24 : 1.05x speedup
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 39d1a06..61fdb80 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2537,6 +2537,157 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
}
}
+static pixman_bool_t
+vmx_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t filler)
+{
+ uint32_t byte_width;
+ uint8_t *byte_line;
+
+ vector unsigned int vfiller;
+
+ if (bpp == 8)
+ {
+ uint8_t b;
+ uint16_t w;
+
+ stride = stride * (int) sizeof (uint32_t) / 1;
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
+
+ b = filler & 0xff;
+ w = (b << 8) | b;
+ filler = (w << 16) | w;
+ }
+ else if (bpp == 16)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
+
+ filler = (filler & 0xffff) * 0x00010001;
+ }
+ else if (bpp == 32)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ vfiller = create_mask_1x32_128(&filler);
+
+ while (height--)
+ {
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ if (w >= 1 && ((uintptr_t)d & 1))
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+
+ while (w >= 2 && ((uintptr_t)d & 3))
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((uintptr_t)d & 15))
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 128)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+ vec_st(vfiller, 0, (uint32_t *) d + 16);
+ vec_st(vfiller, 0, (uint32_t *) d + 20);
+ vec_st(vfiller, 0, (uint32_t *) d + 24);
+ vec_st(vfiller, 0, (uint32_t *) d + 28);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+
+ d += 64;
+ w -= 64;
+ }
+
+ if (w >= 32)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+
+ d += 16;
+ w -= 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ if (w >= 1)
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+ }
+
+ return TRUE;
+}
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
{ PIXMAN_OP_NONE },
@@ -2582,5 +2733,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+ imp->fill = vmx_fill;
+
return imp;
}
commit c12ee95089e7d281a29a24bf56b81f5c16dec6ee
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Sun Jun 28 09:42:08 2015 +0300
vmx: add helper functions
This patch adds the following helper functions for reuse of code,
hiding BE/LE differences and maintainability.
All of the functions were defined as static force_inline.
Names were copied from pixman-sse2.c so conversion of fast-paths between
sse2 and vmx would be easier from now on. Therefore, I tried to keep the
input/output of the functions to be as close as possible to the sse2
definitions.
The functions are:
- load_128_aligned : load 128-bit from a 16-byte aligned memory
address into a vector
- load_128_unaligned : load 128-bit from memory into a vector,
without guarantee of alignment for the
source pointer
- save_128_aligned : save 128-bit vector into a 16-byte aligned
memory address
- create_mask_16_128 : take a 16-bit value and fill with it
a new vector
- create_mask_1x32_128 : take a 32-bit pointer and fill a new
vector with the 32-bit value from that pointer
- create_mask_32_128 : take a 32-bit value and fill with it
a new vector
- unpack_32_1x128 : unpack 32-bit value into a vector
- unpacklo_128_16x8 : unpack the eight low 8-bit values of a vector
- unpackhi_128_16x8 : unpack the eight high 8-bit values of a vector
- unpacklo_128_8x16 : unpack the four low 16-bit values of a vector
- unpackhi_128_8x16 : unpack the four high 16-bit values of a vector
- unpack_128_2x128 : unpack the eight low 8-bit values of a vector
into one vector and the eight high 8-bit
values into another vector
- unpack_128_2x128_16 : unpack the four low 16-bit values of a vector
into one vector and the four high 16-bit
values into another vector
- unpack_565_to_8888 : unpack an RGB_565 vector to 8888 vector
- pack_1x128_32 : pack a vector and return the LSB 32-bit of it
- pack_2x128_128 : pack two vectors into one and return it
- negate_2x128 : xor two vectors with mask_00ff (separately)
- is_opaque : returns whether all the pixels contained in
the vector are opaque
- is_zero : returns whether the vector equals 0
- is_transparent : returns whether all the pixels
contained in the vector are transparent
- expand_pixel_8_1x128 : expand an 8-bit pixel into lower 8 bytes of a
vector
- expand_alpha_1x128 : expand alpha from vector and return the new
vector
- expand_alpha_2x128 : expand alpha from one vector and another alpha
from a second vector
- expand_alpha_rev_2x128 : expand a reversed alpha from one vector and
another reversed alpha from a second vector
- pix_multiply_2x128 : do pix_multiply for two vectors (separately)
- over_2x128 : perform over op. on two vectors
- in_over_2x128 : perform in-over op. on two vectors
v2: removed expand_pixel_32_1x128 as it was not used by any function and
its implementation was erroneous
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 880a19a..39d1a06 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -30,10 +30,19 @@
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
#include <altivec.h>
#define AVV(x...) {x}
+static vector unsigned int mask_00ff;
+static vector unsigned int mask_ff000000;
+static vector unsigned int mask_red;
+static vector unsigned int mask_green;
+static vector unsigned int mask_blue;
+static vector unsigned int mask_565_fix_rb;
+static vector unsigned int mask_565_fix_g;
+
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
@@ -233,6 +242,464 @@ do \
#define STORE_VECTOR(dest) \
vec_st ((vector unsigned int) v ## dest, 0, dest);
+/* load 4 pixels from a 16-byte boundary aligned address */
+static force_inline vector unsigned int
+load_128_aligned (const uint32_t* src)
+{
+ return *((vector unsigned int *) src);
+}
+
+/* load 4 pixels from an unaligned address */
+static force_inline vector unsigned int
+load_128_unaligned (const uint32_t* src)
+{
+ vector unsigned int vsrc;
+ DECLARE_SRC_MASK_VAR;
+
+ COMPUTE_SHIFT_MASK (src);
+ LOAD_VECTOR (src);
+
+ return vsrc;
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static force_inline void
+save_128_aligned (uint32_t* data,
+ vector unsigned int vdata)
+{
+ STORE_VECTOR(data)
+}
+
+static force_inline vector unsigned int
+create_mask_16_128 (uint16_t mask)
+{
+ uint16_t* src;
+ vector unsigned short vsrc;
+ DECLARE_SRC_MASK_VAR;
+
+ src = &mask;
+
+ COMPUTE_SHIFT_MASK (src);
+ LOAD_VECTOR (src);
+ return (vector unsigned int) vec_splat(vsrc, 0);
+}
+
+static force_inline vector unsigned int
+create_mask_1x32_128 (const uint32_t *src)
+{
+ vector unsigned int vsrc;
+ DECLARE_SRC_MASK_VAR;
+
+ COMPUTE_SHIFT_MASK (src);
+ LOAD_VECTOR (src);
+ return vec_splat(vsrc, 0);
+}
+
+static force_inline vector unsigned int
+create_mask_32_128 (uint32_t mask)
+{
+ return create_mask_1x32_128(&mask);
+}
+
+static force_inline vector unsigned int
+unpack_32_1x128 (uint32_t data)
+{
+ vector unsigned int vdata = {0, 0, 0, data};
+ vector unsigned short lo;
+
+ lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+ vec_mergel ((vector unsigned char) AVV(0),
+ (vector unsigned char) vdata);
+#else
+ vec_mergel ((vector unsigned char) vdata,
+ (vector unsigned char) AVV(0));
+#endif
+
+ return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned char lo;
+
+ /* unpack to short */
+ lo = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+ vec_mergel ((vector unsigned char) data2,
+ (vector unsigned char) data1);
+#else
+ vec_mergel ((vector unsigned char) data1,
+ (vector unsigned char) data2);
+#endif
+
+ return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned char hi;
+
+ /* unpack to short */
+ hi = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+ vec_mergeh ((vector unsigned char) data2,
+ (vector unsigned char) data1);
+#else
+ vec_mergeh ((vector unsigned char) data1,
+ (vector unsigned char) data2);
+#endif
+
+ return (vector unsigned int) hi;
+}
+
+static force_inline vector unsigned int
+unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned short lo;
+
+ /* unpack to char */
+ lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+ vec_mergel ((vector unsigned short) data2,
+ (vector unsigned short) data1);
+#else
+ vec_mergel ((vector unsigned short) data1,
+ (vector unsigned short) data2);
+#endif
+
+ return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned short hi;
+
+ /* unpack to char */
+ hi = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+ vec_mergeh ((vector unsigned short) data2,
+ (vector unsigned short) data1);
+#else
+ vec_mergeh ((vector unsigned short) data1,
+ (vector unsigned short) data2);
+#endif
+
+ return (vector unsigned int) hi;
+}
+
+static force_inline void
+unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
+ vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+ *data_lo = unpacklo_128_16x8(data1, data2);
+ *data_hi = unpackhi_128_16x8(data1, data2);
+}
+
+static force_inline void
+unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
+ vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+ *data_lo = unpacklo_128_8x16(data1, data2);
+ *data_hi = unpackhi_128_8x16(data1, data2);
+}
+
+static force_inline vector unsigned int
+unpack_565_to_8888 (vector unsigned int lo)
+{
+ vector unsigned int r, g, b, rb, t;
+
+ r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
+ g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
+ b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);
+
+ rb = vec_or (r, b);
+ t = vec_and (rb, mask_565_fix_rb);
+ t = vec_sr (t, create_mask_32_128(5));
+ rb = vec_or (rb, t);
+
+ t = vec_and (g, mask_565_fix_g);
+ t = vec_sr (t, create_mask_32_128(6));
+ g = vec_or (g, t);
+
+ return vec_or (rb, g);
+}
+
+static force_inline uint32_t
+pack_1x128_32 (vector unsigned int data)
+{
+ vector unsigned char vpack;
+
+ vpack = vec_packsu((vector unsigned short) data,
+ (vector unsigned short) AVV(0));
+
+ return vec_extract((vector unsigned int) vpack, 1);
+}
+
+static force_inline vector unsigned int
+pack_2x128_128 (vector unsigned int lo, vector unsigned int hi)
+{
+ vector unsigned char vpack;
+
+ vpack = vec_packsu((vector unsigned short) hi,
+ (vector unsigned short) lo);
+
+ return (vector unsigned int) vpack;
+}
+
+static force_inline void
+negate_2x128 (vector unsigned int data_lo,
+ vector unsigned int data_hi,
+ vector unsigned int* neg_lo,
+ vector unsigned int* neg_hi)
+{
+ *neg_lo = vec_xor (data_lo, mask_00ff);
+ *neg_hi = vec_xor (data_hi, mask_00ff);
+}
+
+static force_inline int
+is_opaque (vector unsigned int x)
+{
+ uint32_t cmp_result;
+ vector bool int ffs = vec_cmpeq(x, x);
+
+ cmp_result = vec_all_eq(x, ffs);
+
+ return (cmp_result & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (vector unsigned int x)
+{
+ uint32_t cmp_result;
+
+ cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
+
+ return cmp_result == 0xffff;
+}
+
+static force_inline int
+is_transparent (vector unsigned int x)
+{
+ uint32_t cmp_result;
+
+ cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
+ return (cmp_result & 0x8888) == 0x8888;
+}
+
+static force_inline vector unsigned int
+expand_pixel_8_1x128 (uint8_t data)
+{
+ vector unsigned int vdata;
+
+ vdata = unpack_32_1x128 ((uint32_t) data);
+
+#ifdef WORDS_BIGENDIAN
+ return vec_perm (vdata, vdata,
+ (vector unsigned char)AVV (
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+#else
+ return vec_perm (vdata, vdata,
+ (vector unsigned char)AVV (
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+#endif
+}
+
+static force_inline vector unsigned int
+expand_alpha_1x128 (vector unsigned int data)
+{
+#ifdef WORDS_BIGENDIAN
+ return vec_perm (data, data,
+ (vector unsigned char)AVV (
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+#else
+ return vec_perm (data, data,
+ (vector unsigned char)AVV (
+ 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+ 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+#endif
+}
+
+static force_inline void
+expand_alpha_2x128 (vector unsigned int data_lo,
+ vector unsigned int data_hi,
+ vector unsigned int* alpha_lo,
+ vector unsigned int* alpha_hi)
+{
+
+ *alpha_lo = expand_alpha_1x128(data_lo);
+ *alpha_hi = expand_alpha_1x128(data_hi);
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (vector unsigned int data_lo,
+ vector unsigned int data_hi,
+ vector unsigned int* alpha_lo,
+ vector unsigned int* alpha_hi)
+{
+#ifdef WORDS_BIGENDIAN
+ *alpha_lo = vec_perm (data_lo, data_lo,
+ (vector unsigned char)AVV (
+ 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+ 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+
+ *alpha_hi = vec_perm (data_hi, data_hi,
+ (vector unsigned char)AVV (
+ 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+ 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
+#else
+ *alpha_lo = vec_perm (data_lo, data_lo,
+ (vector unsigned char)AVV (
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+
+ *alpha_hi = vec_perm (data_hi, data_hi,
+ (vector unsigned char)AVV (
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
+#endif
+}
+
+static force_inline void
+pix_multiply_2x128 (vector unsigned int* data_lo,
+ vector unsigned int* data_hi,
+ vector unsigned int* alpha_lo,
+ vector unsigned int* alpha_hi,
+ vector unsigned int* ret_lo,
+ vector unsigned int* ret_hi)
+{
+ *ret_lo = pix_multiply(*data_lo, *alpha_lo);
+ *ret_hi = pix_multiply(*data_hi, *alpha_hi);
+}
+
+static force_inline void
+over_2x128 (vector unsigned int* src_lo,
+ vector unsigned int* src_hi,
+ vector unsigned int* alpha_lo,
+ vector unsigned int* alpha_hi,
+ vector unsigned int* dst_lo,
+ vector unsigned int* dst_hi)
+{
+ vector unsigned int t1, t2;
+
+ negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+ pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+ *dst_lo = (vector unsigned int)
+ vec_adds ((vector unsigned char) *src_lo,
+ (vector unsigned char) *dst_lo);
+
+ *dst_hi = (vector unsigned int)
+ vec_adds ((vector unsigned char) *src_hi,
+ (vector unsigned char) *dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (vector unsigned int* src_lo,
+ vector unsigned int* src_hi,
+ vector unsigned int* alpha_lo,
+ vector unsigned int* alpha_hi,
+ vector unsigned int* mask_lo,
+ vector unsigned int* mask_hi,
+ vector unsigned int* dst_lo,
+ vector unsigned int* dst_hi)
+{
+ vector unsigned int s_lo, s_hi;
+ vector unsigned int a_lo, a_hi;
+
+ pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+ pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+ over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
+{
+ uint8_t a;
+ vector unsigned int vmxs;
+
+ a = src >> 24;
+
+ if (a == 0xff)
+ {
+ return src;
+ }
+ else if (src)
+ {
+ vmxs = unpack_32_1x128 (src);
+ return pack_1x128_32(
+ over(vmxs, expand_alpha_1x128 (vmxs), unpack_32_1x128 (dst)));
+ }
+
+ return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+ uint32_t s = *ps;
+
+ if (pm)
+ {
+ vector unsigned int ms, mm;
+
+ mm = unpack_32_1x128 (*pm);
+ mm = expand_alpha_1x128 (mm);
+
+ ms = unpack_32_1x128 (s);
+ ms = pix_multiply (ms, mm);
+
+ s = pack_1x128_32 (ms);
+ }
+
+ return s;
+}
+
+static force_inline vector unsigned int
+combine4 (const uint32_t* ps, const uint32_t* pm)
+{
+ vector unsigned int vmx_src_lo, vmx_src_hi;
+ vector unsigned int vmx_msk_lo, vmx_msk_hi;
+ vector unsigned int s;
+
+ if (pm)
+ {
+ vmx_msk_lo = load_128_unaligned(pm);
+
+ if (is_transparent(vmx_msk_lo))
+ return (vector unsigned int) AVV(0);
+ }
+
+ s = load_128_unaligned(ps);
+
+ if (pm)
+ {
+ unpack_128_2x128(s, (vector unsigned int) AVV(0),
+ &vmx_src_lo, &vmx_src_hi);
+
+ unpack_128_2x128(vmx_msk_lo, (vector unsigned int) AVV(0),
+ &vmx_msk_lo, &vmx_msk_hi);
+
+ expand_alpha_2x128(vmx_msk_lo, vmx_msk_hi, &vmx_msk_lo, &vmx_msk_hi);
+
+ pix_multiply_2x128(&vmx_src_lo, &vmx_src_hi,
+ &vmx_msk_lo, &vmx_msk_hi,
+ &vmx_src_lo, &vmx_src_hi);
+
+ s = pack_2x128_128(vmx_src_lo, vmx_src_hi);
+ }
+
+ return s;
+}
+
static void
vmx_combine_over_u_no_mask (uint32_t * dest,
const uint32_t *src,
@@ -2080,6 +2547,15 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+ /* VMX constants */
+ mask_00ff = create_mask_16_128 (0x00ff);
+ mask_ff000000 = create_mask_32_128 (0xff000000);
+ mask_red = create_mask_32_128 (0x00f80000);
+ mask_green = create_mask_32_128 (0x0000fc00);
+ mask_blue = create_mask_32_128 (0x000000f8);
+ mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
+ mask_565_fix_g = create_mask_32_128 (0x0000c000);
+
/* Set up function pointers */
imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
commit 034149537be94862b43fb09699b8c2149bfe948d
Author: Oded Gabbay <oded.gabbay at gmail.com>
Date: Thu Jul 2 11:04:20 2015 +0300
vmx: add LOAD_VECTOR macro
This patch adds a macro for loading a single vector.
It also makes the other LOAD_VECTORx macros use this macro as a base so
code would be re-used.
In addition, I fixed minor coding style issues.
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka at gmail.com>
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index cef921f..880a19a 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -169,33 +169,29 @@ over (vector unsigned int src,
mask ## _mask = vec_lvsl (0, mask); \
source ## _mask = vec_lvsl (0, source);
-/* notice you have to declare temp vars...
- * Note: tmp3 and tmp4 must remain untouched!
- */
-
-#define LOAD_VECTORS(dest, source) \
-do { \
+#define LOAD_VECTOR(source) \
+do \
+{ \
vector unsigned char tmp1, tmp2; \
tmp1 = (typeof(tmp1))vec_ld (0, source); \
tmp2 = (typeof(tmp2))vec_ld (15, source); \
- v ## source = (typeof(v ## source)) \
+ v ## source = (typeof(v ## source)) \
vec_perm (tmp1, tmp2, source ## _mask); \
+} while (0)
+
+#define LOAD_VECTORS(dest, source) \
+do \
+{ \
+ LOAD_VECTOR(source); \
v ## dest = (typeof(v ## dest))vec_ld (0, dest); \
-} while (0);
+} while (0)
#define LOAD_VECTORSC(dest, source, mask) \
-do { \
- vector unsigned char tmp1, tmp2; \
- tmp1 = (typeof(tmp1))vec_ld (0, source); \
- tmp2 = (typeof(tmp2))vec_ld (15, source); \
- v ## source = (typeof(v ## source)) \
- vec_perm (tmp1, tmp2, source ## _mask); \
- tmp1 = (typeof(tmp1))vec_ld (0, mask); \
- v ## dest = (typeof(v ## dest))vec_ld (0, dest); \
- tmp2 = (typeof(tmp2))vec_ld (15, mask); \
- v ## mask = (typeof(v ## mask)) \
- vec_perm (tmp1, tmp2, mask ## _mask); \
-} while (0);
+do \
+{ \
+ LOAD_VECTORS(dest, source); \
+ LOAD_VECTOR(mask); \
+} while (0)
#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
#define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
@@ -213,14 +209,16 @@ do { \
#define COMPUTE_SHIFT_MASKC(dest, source, mask)
+# define LOAD_VECTOR(source) \
+ v ## source = *((typeof(v ## source)*)source);
+
# define LOAD_VECTORS(dest, source) \
- v ## source = *((typeof(v ## source)*)source); \
- v ## dest = *((typeof(v ## dest)*)dest);
+ LOAD_VECTOR(source); \
+ LOAD_VECTOR(dest); \
# define LOAD_VECTORSC(dest, source, mask) \
- v ## source = *((typeof(v ## source)*)source); \
- v ## dest = *((typeof(v ## dest)*)dest); \
- v ## mask = *((typeof(v ## mask)*)mask);
+ LOAD_VECTORS(dest, source); \
+ LOAD_VECTOR(mask); \
#define DECLARE_SRC_MASK_VAR
#define DECLARE_MASK_MASK_VAR
@@ -228,7 +226,7 @@ do { \
#endif /* WORDS_BIGENDIAN */
#define LOAD_VECTORSM(dest, source, mask) \
- LOAD_VECTORSC (dest, source, mask) \
+ LOAD_VECTORSC (dest, source, mask); \
v ## source = pix_multiply (v ## source, \
splat_alpha (v ## mask));
More information about the xorg-commit
mailing list