[Pixman] [PATCH 1/4] vmx: optimize scaled_nearest_scanline_vmx_8888_8888_OVER
Oded Gabbay
oded.gabbay at gmail.com
Sun Sep 6 08:27:08 PDT 2015
This patch optimizes scaled_nearest_scanline_vmx_8888_8888_OVER and all
the functions it calls (combine1, combine4 and
core_combine_over_u_pixel_vmx).
The optimization is done by removing use of expand_alpha_1x128 and
expand_alpha_2x128 in favor of splat_alpha and MUL/ADD macros from
pixman_combine32.h.
Running "lowlevel-blt-bench -n over_8888_8888" on POWER8, 8 cores,
3.4GHz, RHEL 7.2 ppc64le gave the following results:
reference memcpy speed = 24847.3MB/s (6211.8MP/s for 32bpp fills)
Before After Change
--------------------------------------------
L1 182.05 210.22 +15.47%
L2 180.6 208.92 +15.68%
M 180.52 208.22 +15.34%
HT 130.17 178.97 +37.49%
VT 145.82 184.22 +26.33%
R 104.51 129.38 +23.80%
RT 48.3 61.54 +27.41%
Kops/s 430 504 +17.21%
Signed-off-by: Oded Gabbay <oded.gabbay at gmail.com>
---
pixman/pixman-vmx.c | 80 ++++++++++++-----------------------------------------
1 file changed, 18 insertions(+), 62 deletions(-)
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index a9bd024..d9fc5d6 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -623,10 +623,9 @@ in_over_2x128 (vector unsigned int* src_lo,
static force_inline uint32_t
core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
{
- uint8_t a;
- vector unsigned int vmxs;
+ uint32_t a;
- a = src >> 24;
+ a = ALPHA_8(src);
if (a == 0xff)
{
@@ -634,9 +633,7 @@ core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
}
else if (src)
{
- vmxs = unpack_32_1x128 (src);
- return pack_1x128_32(
- over(vmxs, expand_alpha_1x128 (vmxs), unpack_32_1x128 (dst)));
+ UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src);
}
return dst;
@@ -646,19 +643,9 @@ static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
uint32_t s = *ps;
if (pm)
- {
- vector unsigned int ms, mm;
-
- mm = unpack_32_1x128 (*pm);
- mm = expand_alpha_1x128 (mm);
-
- ms = unpack_32_1x128 (s);
- ms = pix_multiply (ms, mm);
-
- s = pack_1x128_32 (ms);
- }
+ UN8x4_MUL_UN8 (s, ALPHA_8 (*pm));
return s;
}
@@ -666,38 +654,22 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
static force_inline vector unsigned int
combine4 (const uint32_t* ps, const uint32_t* pm)
{
- vector unsigned int vmx_src_lo, vmx_src_hi;
- vector unsigned int vmx_msk_lo, vmx_msk_hi;
- vector unsigned int s;
+ vector unsigned int src, msk;
if (pm)
{
- vmx_msk_lo = load_128_unaligned(pm);
+ msk = load_128_unaligned(pm);
- if (is_transparent(vmx_msk_lo))
+ if (is_transparent(msk))
return (vector unsigned int) AVV(0);
}
- s = load_128_unaligned(ps);
+ src = load_128_unaligned(ps);
if (pm)
- {
- unpack_128_2x128(s, (vector unsigned int) AVV(0),
- &vmx_src_lo, &vmx_src_hi);
-
- unpack_128_2x128(vmx_msk_lo, (vector unsigned int) AVV(0),
- &vmx_msk_lo, &vmx_msk_hi);
-
- expand_alpha_2x128(vmx_msk_lo, vmx_msk_hi, &vmx_msk_lo, &vmx_msk_hi);
+ src = pix_multiply(src, msk);
- pix_multiply_2x128(&vmx_src_lo, &vmx_src_hi,
- &vmx_msk_lo, &vmx_msk_hi,
- &vmx_src_lo, &vmx_src_hi);
-
- s = pack_2x128_128(vmx_src_lo, vmx_src_hi);
- }
-
- return s;
+ return src;
}
static void
@@ -2966,9 +2938,7 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
uint32_t s, d;
const uint32_t* pm = NULL;
- vector unsigned int vmx_dst_lo, vmx_dst_hi;
- vector unsigned int vmx_src_lo, vmx_src_hi;
- vector unsigned int vmx_alpha_lo, vmx_alpha_hi;
+ vector unsigned int vsrc, vdst;
if (fully_transparent_src)
return;
@@ -3015,31 +2985,17 @@ scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
tmp[2] = tmp3;
tmp[3] = tmp4;
- vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm);
+ vsrc = combine4 ((const uint32_t *) &tmp, pm);
- if (is_opaque (vmx_src_hi))
+ if (is_opaque (vsrc))
{
- save_128_aligned (pd, vmx_src_hi);
+ save_128_aligned (pd, vsrc);
}
- else if (!is_zero (vmx_src_hi))
+ else if (!is_zero (vsrc))
{
- vmx_dst_hi = load_128_aligned (pd);
-
- unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0),
- &vmx_src_lo, &vmx_src_hi);
-
- unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0),
- &vmx_dst_lo, &vmx_dst_hi);
-
- expand_alpha_2x128 (
- vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi);
-
- over_2x128 (&vmx_src_lo, &vmx_src_hi,
- &vmx_alpha_lo, &vmx_alpha_hi,
- &vmx_dst_lo, &vmx_dst_hi);
+ vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd));
- /* rebuid the 4 pixel data and save*/
- save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+ save_128_aligned (pd, vdst);
}
w -= 4;
--
2.4.3
More information about the Pixman
mailing list