[Pixman] [PATCH] avx2: Add fast path for over_reverse_n_8888
Matt Turner
mattst88 at gmail.com
Tue Jan 22 03:43:24 UTC 2019
lowlevel-blt-bench, over_reverse_n_8888, 100 iterations:
           Before              After
        Mean   StdDev       Mean   StdDev    Confidence   Change
L1 2372.6 2.50 4387.6 8.00 100.00% +84.9%
L2 2490.3 5.29 4326.5 20.79 100.00% +73.7%
M 2418.3 10.43 3718.0 38.55 100.00% +53.7%
HT 1555.8 13.35 2112.9 23.85 100.00% +35.8%
VT 1120.1 9.58 1403.7 15.43 100.00% +25.3%
R 958.5 17.66 1176.9 20.87 100.00% +22.8%
RT 407.3 6.79 450.1 7.22 100.00% +10.5%
At most 18 outliers rejected per test per set.
cairo-perf-trace with trimmed traces, 30 iterations:
           Before              After
        Mean   StdDev       Mean   StdDev    Confidence   Change
poppler 0.516 0.003 0.478 0.002 100.000% +8.1%
Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).
---
pixman/pixman-avx2.c | 94 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 94 insertions(+)
diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c
index faef552..6a67515 100644
--- a/pixman/pixman-avx2.c
+++ b/pixman/pixman-avx2.c
@@ -28,6 +28,18 @@ negate_2x256 (__m256i data_lo,
*neg_hi = _mm256_xor_si256 (data_hi, MASK_00FF_AVX2);
}
+/* Broadcast one 32-bit pixel to every dword of a ymm register, then
+ * zero-extend each 8-bit channel to a 16-bit field.  Because the pixel
+ * is broadcast first and _mm256_unpacklo_epi8 operates per 128-bit
+ * lane, both lanes end up holding identical unpacked channel data.
+ * NOTE(review): semantics differ from the SSE2 unpack_32_1x128 (which
+ * unpacks only the low half) — confirm callers rely on the replication. */
+static force_inline __m256i
+unpack_32_1x256 (uint32_t data)
+{
+ return _mm256_unpacklo_epi8 (_mm256_broadcastd_epi32 (_mm_cvtsi32_si128 (data)), _mm256_setzero_si256 ());
+}
+
+/* Expand a solid 32-bit pixel into the 16-bit-per-channel layout used
+ * by the 2x256 compositing helpers (AVX2 analogue of the SSE2
+ * expand_pixel_32_1x128).
+ * NOTE(review): since unpack_32_1x256 already replicates the unpacked
+ * pixel across dwords 0/1 and 2/3 of each lane, this shuffle with
+ * _MM_SHUFFLE (1, 0, 1, 0) looks like an identity kept for symmetry
+ * with the SSE2 code — confirm before simplifying. */
+static force_inline __m256i
+expand_pixel_32_1x256 (uint32_t data)
+{
+ return _mm256_shuffle_epi32 (unpack_32_1x256 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
static force_inline __m256i
pack_2x256_256 (__m256i lo, __m256i hi)
{
@@ -100,6 +112,13 @@ save_256_aligned (__m256i* dst,
_mm256_store_si256 (dst, data);
}
+/* Store 32 bytes to a destination that need not be 32-byte aligned;
+ * thin wrapper so call sites read parallel to save_256_aligned. */
+static force_inline void
+save_256_unaligned (__m256i* dst,
+ __m256i data)
+{
+ _mm256_storeu_si256 (dst, data);
+}
+
static force_inline int
is_opaque_256 (__m256i x)
{
@@ -429,12 +448,87 @@ avx2_composite_over_8888_8888 (pixman_implementation_t *imp,
src += src_stride;
}
}
+
+/* OVER_REVERSE fast path: composite the existing destination pixels
+ * OVER a solid source colour, for a8r8g8b8/a8b8g8r8 destinations with
+ * no mask.  The main loop processes 8 pixels (32 bytes) per iteration
+ * with unaligned ymm loads/stores; the remainder is handled one pixel
+ * at a time using the 128-bit helpers on the low lane of ymm_src. */
+static void
+avx2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, *dst;
+ __m256i ymm_src;
+ __m256i ymm_dst, ymm_dst_lo, ymm_dst_hi;
+ __m256i ymm_dsta_hi, ymm_dsta_lo;
+ int dst_stride;
+ int32_t w;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ /* A fully transparent solid source leaves the destination unchanged
+ * under OVER_REVERSE, so there is nothing to do. */
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ /* Solid colour expanded once, outside the loops: 16-bit channels,
+ * replicated so both ymm halves see the same pixel. */
+ ymm_src = expand_pixel_32_1x256 (src);
+
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ /* Vector loop: 8 destination pixels per iteration. */
+ while (w >= 8)
+ {
+ __m256i tmp_lo, tmp_hi;
+
+ ymm_dst = load_256_unaligned ((__m256i*)dst);
+
+ unpack_256_2x256 (ymm_dst, &ymm_dst_lo, &ymm_dst_hi);
+ expand_alpha_2x256 (ymm_dst_lo, ymm_dst_hi, &ymm_dsta_lo, &ymm_dsta_hi);
+
+ /* over_2x256 (defined elsewhere in this file) updates its last
+ * operand in place, so work on copies of the solid source;
+ * with dst as the "over" operand and dst's alpha, this
+ * presumably computes d + s * (1 - d.a), i.e. OVER_REVERSE —
+ * verify against the sse2 implementation. */
+ tmp_lo = ymm_src;
+ tmp_hi = ymm_src;
+
+ over_2x256 (&ymm_dst_lo, &ymm_dst_hi,
+ &ymm_dsta_lo, &ymm_dsta_hi,
+ &tmp_lo, &tmp_hi);
+
+ save_256_unaligned (
+ (__m256i*)dst, pack_2x256_256 (tmp_lo, tmp_hi));
+
+ w -= 8;
+ dst += 8;
+ }
+
+ /* Scalar tail: up to 7 remaining pixels, one at a time, reusing
+ * the low 128 bits of the expanded solid source. */
+ while (w)
+ {
+ __m128i vd;
+
+ vd = unpack_32_1x128 (*dst);
+
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ _mm256_castsi256_si128 (ymm_src)));
+ w--;
+ dst++;
+ }
+
+ }
+
+}
+
static const pixman_fast_path_t avx2_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, avx2_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, avx2_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, avx2_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, avx2_composite_over_8888_8888),
+
+ /* PIXMAN_OP_OVER_REVERSE: solid source, no mask, 32bpp ARGB/ABGR dest */
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, avx2_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, avx2_composite_over_reverse_n_8888),
+
{ PIXMAN_OP_NONE },
};
--
2.19.2
More information about the Pixman
mailing list