pixman: Branch 'master' - 4 commits
Siarhei Siamashka
siamashka at kemper.freedesktop.org
Mon Feb 28 06:03:26 PST 2011
pixman/pixman-arm-neon-asm.S | 197 +++++++++++++++++++
pixman/pixman-arm-neon.c | 45 ++++
pixman/pixman-fast-path.h | 432 +++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-sse2.c | 112 +++++++++++
test/Makefile.am | 2
test/scaling-helpers-test.c | 93 +++++++++
6 files changed, 881 insertions(+)
New commits:
commit 17feaa9c50bb8521b0366345efe181bd99754957
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date: Tue Feb 22 18:45:03 2011 +0200
ARM: NEON optimization for bilinear scaled 'src_8888_8888'
Initial NEON optimization for bilinear scaling. Can be probably
improved more.
Benchmark on ARM Cortex-A8:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s
after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 47daf45..c168e10 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2391,3 +2391,200 @@ generate_composite_function_nearest_scanline \
10, /* dst_r_basereg */ \
8, /* src_basereg */ \
15 /* mask_basereg */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+fname:
+.endm
+
+.macro bilinear_interpolate_last_pixel
+ mov TMP1, X, asr #16
+ mov TMP2, X, asr #16
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d0}, [TMP1]
+ vshr.u16 d30, d24, #8
+ vld1.32 {d1}, [TMP2]
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ /* 5 cycles bubble */
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ /* 5 cycles bubble */
+ vshrn.u32 d0, q0, #16
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0
+ /* 1 cycle bubble */
+ vst1.32 {d0[0]}, [OUT, :32]!
+.endm
+
+.macro bilinear_interpolate_two_pixels
+ mov TMP1, X, asr #16
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d0}, [TMP1]
+ vld1.32 {d1}, [TMP2]
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ mov TMP1, X, asr #16
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d20}, [TMP1]
+ vld1.32 {d21}, [TMP2]
+ vmull.u8 q11, d20, d28
+ vmlal.u8 q11, d21, d29
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d30, q0, #16
+ vshrn.u32 d31, q10, #16
+ vmovn.u16 d0, q15
+ vst1.32 {d0}, [OUT]!
+.endm
+
+.macro bilinear_interpolate_four_pixels
+ mov TMP1, X, asr #16
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d0}, [TMP1]
+ vld1.32 {d1}, [TMP2]
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ mov TMP1, X, asr #16
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d20}, [TMP1]
+ vld1.32 {d21}, [TMP2]
+ vmull.u8 q11, d20, d28
+ vmlal.u8 q11, d21, d29
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ mov TMP1, X, asr #16
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d4}, [TMP1]
+ vld1.32 {d5}, [TMP2]
+ vmull.u8 q3, d4, d28
+ vmlal.u8 q3, d5, d29
+ mov TMP1, X, asr #16
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d16}, [TMP1]
+ vld1.32 {d17}, [TMP2]
+ vmull.u8 q9, d16, d28
+ vmlal.u8 q9, d17, d29
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q2, d6, #8
+ vmlsl.u16 q2, d6, d30
+ vmlal.u16 q2, d7, d30
+ vshll.u16 q8, d18, #8
+ vmlsl.u16 q8, d18, d31
+ vmlal.u16 q8, d19, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q10, #16
+ vshrn.u32 d4, q2, #16
+ vshrn.u32 d5, q8, #16
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q2
+ vst1.32 {d0, d1}, [OUT]!
+.endm
+
+
+/*
+ * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out,
+ * const uint32_t * top,
+ * const uint32_t * bottom,
+ * int wt,
+ * int wb,
+ * pixman_fixed_t x,
+ * pixman_fixed_t ux,
+ * int width)
+ */
+
+pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3
+ TMP2 .req r4
+
+ mov ip, sp
+ push {r4, r5, r6, r7}
+ ldmia ip, {WB, X, UX, WIDTH}
+
+ cmp WIDTH, #0
+ ble 3f
+ vdup.u16 q12, X
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26
+ vadd.u16 q13, q13, q13
+
+ subs WIDTH, WIDTH, #4
+ blt 1f
+0:
+ bilinear_interpolate_four_pixels
+ subs WIDTH, WIDTH, #4
+ bge 0b
+1:
+ tst WIDTH, #2
+ beq 2f
+ bilinear_interpolate_two_pixels
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_interpolate_last_pixel
+3:
+ pop {r4, r5, r6, r7}
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq BOTTOM
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+.endfunc
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3e0c0d1..c7c0254 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -232,6 +232,47 @@ pixman_blt_neon (uint32_t *src_bits,
}
}
+void
+pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out,
+ const uint32_t * top,
+ const uint32_t * bottom,
+ int wt,
+ int wb,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ int width);
+
+static force_inline void
+scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top,
+ src_bottom, wt, wb,
+ vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FALSE, FALSE)
+
static const pixman_fast_path_t arm_neon_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565),
@@ -343,6 +384,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
{ PIXMAN_OP_NONE },
};
commit 350029396d911941591149cc82b5e68a78ad6747
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date: Mon Feb 21 20:18:02 2011 +0200
SSE2 optimization for bilinear scaled 'src_8888_8888'
A primitive naive implementation of bilinear scaling using SSE2 intrinsics,
which only handles one pixel at a time. It is approximately 2x faster than
pixman general compositing path. Single pass processing without intermediate
temporary buffer contributes to ~15% and loop unrolling contributes to ~20%
of this speedup.
Benchmark on Intel Core i7 (x86-64):
Using cairo-perf-trace:
before: image firefox-planet-gnome 12.566 12.610 0.23% 6/6
after: image firefox-planet-gnome 10.961 11.013 0.19% 5/6
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s
after: op=1, src=20028888, dst=20028888, speed=165.38 MPix/s
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 88287b4..696005f 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5567,6 +5567,114 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+static void
+bilinear_interpolate_line_sse2 (uint32_t * out,
+ const uint32_t * top,
+ const uint32_t * bottom,
+ int wt,
+ int wb,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ int width)
+{
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
+ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
+ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
+ const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
+ const __m128i xmm_zero = _mm_setzero_si128 ();
+ __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
+ uint32_t pix1, pix2, pix3, pix4;
+
+ #define INTERPOLATE_ONE_PIXEL(pix) \
+ do { \
+ __m128i xmm_wh, xmm_lo, xmm_hi, a; \
+ /* fetch 2x2 pixel block into sse2 register */ \
+ uint32_t tl = top [pixman_fixed_to_int (x)]; \
+ uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \
+ uint32_t bl = bottom [pixman_fixed_to_int (x)]; \
+ uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \
+ a = _mm_set_epi32 (tr, tl, br, bl); \
+ x += ux; \
+ /* vertical interpolation */ \
+ a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
+ xmm_wt), \
+ _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
+ xmm_wb)); \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc, \
+ _mm_xor_si128 (xmm_xorc, \
+ _mm_srli_epi16 (xmm_x, 8))); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
+ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
+ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
+ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ /* shift and pack the result */ \
+ a = _mm_srli_epi32 (a, 16); \
+ a = _mm_packs_epi32 (a, a); \
+ a = _mm_packus_epi16 (a, a); \
+ pix = _mm_cvtsi128_si32 (a); \
+ } while (0)
+
+ while ((width -= 4) >= 0)
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ INTERPOLATE_ONE_PIXEL (pix2);
+ INTERPOLATE_ONE_PIXEL (pix3);
+ INTERPOLATE_ONE_PIXEL (pix4);
+ *out++ = pix1;
+ *out++ = pix2;
+ *out++ = pix3;
+ *out++ = pix4;
+ }
+ if (width & 2)
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ INTERPOLATE_ONE_PIXEL (pix2);
+ *out++ = pix1;
+ *out++ = pix2;
+ }
+ if (width & 1)
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ *out = pix1;
+ }
+
+ #undef INTERPOLATE_ONE_PIXEL
+}
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
+ wt, wb, vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FALSE, FALSE)
+
static const pixman_fast_path_t sse2_fast_paths[] =
{
/* PIXMAN_OP_OVER */
@@ -5668,6 +5776,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
{ PIXMAN_OP_NONE },
};
commit 0df43b8ae5031dd83775d00b57b6bed809db0e89
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date: Mon Feb 21 02:07:09 2011 +0200
test: check correctness of 'bilinear_pad_repeat_get_scanline_bounds'
Individual correctness check for the new bilinear scaling related
supplementary function. This test program uses a bit wider range
of input arguments, not covered by other tests.
diff --git a/test/Makefile.am b/test/Makefile.am
index 057e9ce..9dc7219 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -13,6 +13,7 @@ TESTPROGRAMS = \
trap-crasher \
alpha-loop \
scaling-crash-test \
+ scaling-helpers-test \
gradient-crash-test \
alphamap \
stress-test \
@@ -33,6 +34,7 @@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
composite_SOURCES = composite.c utils.c utils.h
gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h
stress_test_SOURCES = stress-test.c utils.c utils.h
+scaling_helpers_test_SOURCES = scaling-helpers-test.c utils.c utils.h
# Benchmarks
diff --git a/test/scaling-helpers-test.c b/test/scaling-helpers-test.c
new file mode 100644
index 0000000..c186138
--- /dev/null
+++ b/test/scaling-helpers-test.c
@@ -0,0 +1,93 @@
+#include <config.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "utils.h"
+#include "pixman-fast-path.h"
+
+/* A trivial reference implementation for
+ * 'bilinear_pad_repeat_get_scanline_bounds'
+ */
+static void
+bilinear_pad_repeat_get_scanline_bounds_ref (int32_t source_image_width,
+ pixman_fixed_t vx_,
+ pixman_fixed_t unit_x,
+ int32_t * left_pad,
+ int32_t * left_tz,
+ int32_t * width,
+ int32_t * right_tz,
+ int32_t * right_pad)
+{
+ int w = *width;
+ *left_pad = 0;
+ *left_tz = 0;
+ *width = 0;
+ *right_tz = 0;
+ *right_pad = 0;
+ int64_t vx = vx_;
+ while (--w >= 0)
+ {
+ if (vx < 0)
+ {
+ if (vx + pixman_fixed_1 < 0)
+ *left_pad += 1;
+ else
+ *left_tz += 1;
+ }
+ else if (vx + pixman_fixed_1 >= pixman_int_to_fixed (source_image_width))
+ {
+ if (vx >= pixman_int_to_fixed (source_image_width))
+ *right_pad += 1;
+ else
+ *right_tz += 1;
+ }
+ else
+ {
+ *width += 1;
+ }
+ vx += unit_x;
+ }
+}
+
+int
+main (void)
+{
+ int i;
+ for (i = 0; i < 10000; i++)
+ {
+ int32_t left_pad1, left_tz1, width1, right_tz1, right_pad1;
+ int32_t left_pad2, left_tz2, width2, right_tz2, right_pad2;
+ pixman_fixed_t vx = lcg_rand_N(10000 << 16) - (3000 << 16);
+ int32_t width = lcg_rand_N(10000);
+ int32_t source_image_width = lcg_rand_N(10000) + 1;
+ pixman_fixed_t unit_x = lcg_rand_N(10 << 16) + 1;
+ width1 = width2 = width;
+
+ bilinear_pad_repeat_get_scanline_bounds_ref (source_image_width,
+ vx,
+ unit_x,
+ &left_pad1,
+ &left_tz1,
+ &width1,
+ &right_tz1,
+ &right_pad1);
+
+ bilinear_pad_repeat_get_scanline_bounds (source_image_width,
+ vx,
+ unit_x,
+ &left_pad2,
+ &left_tz2,
+ &width2,
+ &right_tz2,
+ &right_pad2);
+
+ assert (left_pad1 == left_pad2);
+ assert (left_tz1 == left_tz2);
+ assert (width1 == width2);
+ assert (right_tz1 == right_tz2);
+ assert (right_pad1 == right_pad2);
+ }
+
+ return 0;
+}
commit d506bf68fd0e9a1c5dd484daee70631699918387
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date: Mon Feb 21 01:29:02 2011 +0200
Main loop template for fast single pass bilinear scaling
Can be used for implementing SIMD optimized fast path
functions which work with bilinear scaled source images.
Similar to the template for nearest scaling main loop, the
following types of mask are supported:
1. no mask
2. non-scaled a8 mask with SAMPLES_COVER_CLIP flag
3. solid mask
PAD repeat is fully supported. NONE repeat is partially
supported (right now only works if source image has alpha
channel or when alpha channel of the source image does not
have any effect on the compositing operation).
diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index d081222..1885d47 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -587,4 +587,436 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+/*****************************************************************************/
+
+/*
+ * Identify 5 zones in each scanline for bilinear scaling. Depending on
+ * whether 2 pixels to be interpolated are fetched from the image itself,
+ * from the padding area around it or from both image and padding area.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t source_image_width,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ int32_t * left_pad,
+ int32_t * left_tz,
+ int32_t * width,
+ int32_t * right_tz,
+ int32_t * right_pad)
+{
+ int width1 = *width, left_pad1, right_pad1;
+ int width2 = *width, left_pad2, right_pad2;
+
+ pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+ &width1, &left_pad1, &right_pad1);
+ pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
+ unit_x, &width2, &left_pad2, &right_pad2);
+
+ *left_pad = left_pad2;
+ *left_tz = left_pad1 - left_pad2;
+ *right_tz = right_pad2 - right_pad1;
+ *right_pad = right_pad1;
+ *width -= *left_pad + *left_tz + *right_tz + *right_pad;
+}
+
+/*
+ * Main loop template for single pass bilinear scaling. It needs to be
+ * provided with 'scanline_func' which should do the compositing operation.
+ * The needed function has the following prototype:
+ *
+ * scanline_func (dst_type_t * dst,
+ * const mask_type_ * mask,
+ * const src_type_t * src_top,
+ * const src_type_t * src_bottom,
+ * int32_t width,
+ * int weight_top,
+ * int weight_bottom,
+ * pixman_fixed_t vx,
+ * pixman_fixed_t unit_x,
+ * pixman_fixed_t max_vx,
+ * pixman_bool_t zero_src)
+ *
+ * Where:
+ * dst - destination scanline buffer for storing results
+ * mask - mask buffer (or single value for solid mask)
+ * src_top, src_bottom - two source scanlines
+ * width - number of pixels to process
+ * weight_top - weight of the top row for interpolation
+ * weight_bottom - weight of the bottom row for interpolation
+ * vx - initial position for fetching the first pair of
+ * pixels from the source buffer
+ * unit_x - position increment needed to move to the next pair
+ * of pixels
+ * max_vx - image size as a fixed point value, can be used for
+ * implementing NORMAL repeat (when it is supported)
+ * zero_src - boolean hint variable, which is set to TRUE when
+ * all source pixels are fetched from zero padding
+ * zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
+ * but sometimes it may be less than that for NONE repeat when handling
+ * fuzzy antialiased top or bottom image edges. Also both top and
+ * bottom weight variables are guaranteed to have value in 0-255
+ * range and can fit into unsigned byte or be used with 8-bit SIMD
+ * multiplication instructions.
+ */
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \
+ dst_type_t, repeat_mode, have_mask, mask_is_solid) \
+static void \
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dst_x, \
+ int32_t dst_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type_t *dst_line; \
+ mask_type_t *mask_line; \
+ src_type_t *src_first_line; \
+ int y1, y2; \
+ pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \
+ pixman_vector_t v; \
+ pixman_fixed_t vx, vy; \
+ pixman_fixed_t unit_x, unit_y; \
+ int32_t left_pad, left_tz, right_tz, right_pad; \
+ \
+ dst_type_t *dst; \
+ mask_type_t solid_mask; \
+ const mask_type_t *mask = &solid_mask; \
+ int src_stride, mask_stride, dst_stride; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
+ if (have_mask) \
+ { \
+ if (mask_is_solid) \
+ { \
+ solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format); \
+ mask_stride = 0; \
+ } \
+ else \
+ { \
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \
+ mask_stride, mask_line, 1); \
+ } \
+ } \
+ /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
+ * transformed from destination space to source space */ \
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
+ \
+ /* reference point is the center of the pixel */ \
+ v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
+ v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
+ v.vector[2] = pixman_fixed_1; \
+ \
+ if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
+ return; \
+ \
+ unit_x = src_image->common.transform->matrix[0][0]; \
+ unit_y = src_image->common.transform->matrix[1][1]; \
+ \
+ v.vector[0] -= pixman_fixed_1 / 2; \
+ v.vector[1] -= pixman_fixed_1 / 2; \
+ \
+ vy = v.vector[1]; \
+ \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
+ PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x, \
+ &left_pad, &left_tz, &width, &right_tz, &right_pad); \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
+ { \
+ /* PAD repeat does not need special handling for 'transition zones' and */ \
+ /* they can be combined with 'padding zones' safely */ \
+ left_pad += left_tz; \
+ right_pad += right_tz; \
+ left_tz = right_tz = 0; \
+ } \
+ v.vector[0] += left_pad * unit_x; \
+ } \
+ \
+ while (--height >= 0) \
+ { \
+ int weight1, weight2; \
+ dst = dst_line; \
+ dst_line += dst_stride; \
+ vx = v.vector[0]; \
+ if (have_mask && !mask_is_solid) \
+ { \
+ mask = mask_line; \
+ mask_line += mask_stride; \
+ } \
+ \
+ y1 = pixman_fixed_to_int (vy); \
+ weight2 = (vy >> 8) & 0xff; \
+ if (weight2) \
+ { \
+ /* normal case, both row weights are in 0-255 range and fit unsigned byte */ \
+ y2 = y1 + 1; \
+ weight1 = 256 - weight2; \
+ } \
+ else \
+ { \
+ /* set both top and bottom row to the same scanline, and weights to 128+128 */ \
+ y2 = y1; \
+ weight1 = weight2 = 128; \
+ } \
+ vy += unit_y; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
+ { \
+ src_type_t *src1, *src2; \
+ src_type_t buf1[2]; \
+ src_type_t buf2[2]; \
+ repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height); \
+ repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \
+ src1 = src_first_line + src_stride * y1; \
+ src2 = src_first_line + src_stride * y2; \
+ \
+ if (left_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = src1[0]; \
+ buf2[0] = buf2[1] = src2[0]; \
+ scanline_func (dst, mask, \
+ buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \
+ dst += left_pad; \
+ if (have_mask && !mask_is_solid) \
+ mask += left_pad; \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (dst, mask, \
+ src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \
+ dst += width; \
+ if (have_mask && !mask_is_solid) \
+ mask += width; \
+ } \
+ if (right_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \
+ buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \
+ scanline_func (dst, mask, \
+ buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \
+ } \
+ } \
+ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ src_type_t *src1, *src2; \
+ src_type_t buf1[2]; \
+ src_type_t buf2[2]; \
+ /* handle top/bottom zero padding by just setting weights to 0 if needed */ \
+ if (y1 < 0) \
+ { \
+ weight1 = 0; \
+ y1 = 0; \
+ } \
+ if (y1 >= src_image->bits.height) \
+ { \
+ weight1 = 0; \
+ y1 = src_image->bits.height - 1; \
+ } \
+ if (y2 < 0) \
+ { \
+ weight2 = 0; \
+ y2 = 0; \
+ } \
+ if (y2 >= src_image->bits.height) \
+ { \
+ weight2 = 0; \
+ y2 = src_image->bits.height - 1; \
+ } \
+ src1 = src_first_line + src_stride * y1; \
+ src2 = src_first_line + src_stride * y2; \
+ \
+ if (left_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = 0; \
+ buf2[0] = buf2[1] = 0; \
+ scanline_func (dst, mask, \
+ buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \
+ dst += left_pad; \
+ if (have_mask && !mask_is_solid) \
+ mask += left_pad; \
+ } \
+ if (left_tz > 0) \
+ { \
+ buf1[0] = 0; \
+ buf1[1] = src1[0]; \
+ buf2[0] = 0; \
+ buf2[1] = src2[0]; \
+ scanline_func (dst, mask, \
+ buf1, buf2, left_tz, weight1, weight2, \
+ pixman_fixed_frac (vx), unit_x, 0, FALSE); \
+ dst += left_tz; \
+ if (have_mask && !mask_is_solid) \
+ mask += left_tz; \
+ vx += left_tz * unit_x; \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (dst, mask, \
+ src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \
+ dst += width; \
+ if (have_mask && !mask_is_solid) \
+ mask += width; \
+ vx += width * unit_x; \
+ } \
+ if (right_tz > 0) \
+ { \
+ buf1[0] = src1[src_image->bits.width - 1]; \
+ buf1[1] = 0; \
+ buf2[0] = src2[src_image->bits.width - 1]; \
+ buf2[1] = 0; \
+ scanline_func (dst, mask, \
+ buf1, buf2, right_tz, weight1, weight2, \
+ pixman_fixed_frac (vx), unit_x, 0, FALSE); \
+ dst += right_tz; \
+ if (have_mask && !mask_is_solid) \
+ mask += right_tz; \
+ } \
+ if (right_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = 0; \
+ buf2[0] = buf2[1] = 0; \
+ scanline_func (dst, mask, \
+ buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \
+ } \
+ } \
+ else \
+ { \
+ scanline_func (dst, mask, src_first_line + src_stride * y1, \
+ src_first_line + src_stride * y2, width, \
+ weight1, weight2, vx, unit_x, max_vx, FALSE); \
+ } \
+ } \
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \
+ dst_type_t, repeat_mode, have_mask, mask_is_solid) \
+ FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+ dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define SCALED_BILINEAR_FLAGS \
+ (FAST_PATH_SCALE_TRANSFORM | \
+ FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_BILINEAR_FILTER | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \
+ }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func) \
+ SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+
#endif
More information about the xorg-commit
mailing list