[Pixman] [PATCH 3/7] C variant of bilinear scaled 'src_8888_8888' fast path
Siarhei Siamashka
siarhei.siamashka at gmail.com
Tue Feb 22 13:23:44 PST 2011
From: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Because it performs the scaling in a single pass without temporary buffers, it
is a bit faster than the general path on x86 (and provides an even better speedup
on MIPS and ARM).
Benchmark on Intel Core i7:
Using cairo-perf-trace:
before: image firefox-planet-gnome 12.566 12.610 0.23% 6/6
after: image firefox-planet-gnome 12.019 12.054 0.15% 5/6
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s
after: op=1, src=20028888, dst=20028888, speed=82.61 MPix/s
Benchmark on ARM Cortex-A8:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s
after: op=1, src=20028888, dst=20028888, speed=10.72 MPix/s
Benchmark on MIPS 24K:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=5.12 MPix/s
after: op=1, src=20028888, dst=20028888, speed=6.96 MPix/s
Microbenchmark (scaling 500x500 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=5.26 MPix/s
after: op=1, src=20028888, dst=20028888, speed=7.00 MPix/s
---
pixman/pixman-fast-path.c | 144 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 144 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 92f0308..1e3094e 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1458,6 +1458,143 @@ FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
uint16_t, uint16_t, PAD)
static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr, /* top-left and top-right pixels */
+ uint32_t bl, uint32_t br, /* bottom-left and bottom-right pixels */
+ int distx, int wt, int wb) /* weight of the right column; weights of the top and bottom rows */
+{ /* Returns the weighted sum of four packed 4x8-bit pixels, computed per 8-bit field. NOTE(review): the final 16-bit shift assumes the four weights sum to 1 << 16, i.e. distx in 0..256 and wt + wb == 256 -- confirm against callers. */
+#if SIZEOF_LONG > 4 /* wide path: 64-bit accumulators carry two 8-bit fields per multiply */
+ uint64_t distxy, distxiy, distixy, distixiy;
+ uint64_t tl64, tr64, bl64, br64;
+ uint64_t f, r;
+
+ distxy = distx * wb; /* weight applied to br */
+ distxiy = distx * wt; /* weight applied to tr */
+ distixy = wb * (256 - distx); /* weight applied to bl */
+ distixiy = (256 - distx) * wt; /* weight applied to tl */
+
+ /* Alpha and Blue */
+ tl64 = tl & 0xff0000ff; /* keep only bits 0-7 and 24-31 */
+ tr64 = tr & 0xff0000ff;
+ bl64 = bl & 0xff0000ff;
+ br64 = br & 0xff0000ff;
+
+ f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; /* weighted sum of both kept fields at once */
+ r = f & 0x0000ff0000ff0000ull; /* retain bits 16-23 and 40-47 of the sum */
+
+ /* Red and Green */
+ tl64 = tl;
+ tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull); /* bits 16-23 moved up to 32-39, bits 8-15 left in place */
+
+ tr64 = tr;
+ tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+ bl64 = bl;
+ bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+ br64 = br;
+ br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+ f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy; /* same weighted sum for the repositioned fields */
+ r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull); /* bits 48-55 of the sum go to bits 32-39; bits 24-31 stay where they are */
+
+ return (uint32_t)(r >> 16); /* drop the low 16 bits and truncate to 32 bits */
+#else /* narrow path: 32-bit arithmetic, one 8-bit field per sum */
+ int distxy, distxiy, distixy, distixiy;
+ uint32_t f, r;
+
+ distxy = distx * wb; /* weight applied to br */
+ distxiy = distx * wt; /* weight applied to tr */
+ distixy = wb * (256 - distx); /* weight applied to bl */
+ distixiy = (256 - distx) * wt; /* weight applied to tl */
+
+ /* Blue */
+ r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+ + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy; /* weighted sum of the bits 0-7 field */
+
+ /* Green */
+ f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+ + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
+ r |= f & 0xff000000; /* keep only the top byte of the bits 8-15 field's sum */
+
+ tl >>= 16; /* bring bits 16-31 down so the same two masks can be reused below */
+ tr >>= 16;
+ bl >>= 16;
+ br >>= 16;
+ r >>= 16; /* the two fields accumulated so far now occupy bits 0-15 */
+
+ /* Red */
+ f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+ + (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy;
+ r |= f & 0x00ff0000; /* fills bits 16-23 of the result */
+
+ /* Alpha */
+ f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+ + (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy;
+ r |= f & 0xff000000; /* fills bits 24-31 of the result */
+
+ return r;
+#endif
+}
+
+static void
+bilinear_interpolate_line (uint32_t * buffer, /* output pixels, written sequentially */
+ const uint32_t * top_row,
+ const uint32_t * bottom_row,
+ int wt, /* passed through unchanged to bilinear_interpolation */
+ int wb, /* passed through unchanged to bilinear_interpolation */
+ pixman_fixed_t x, /* source coordinate used for the first output pixel */
+ pixman_fixed_t ux, /* increment added to x after each output pixel */
+ int width) /* number of output pixels to produce */
+{ /* Produces width pixels, each interpolated from two adjacent samples in top_row and the same two positions in bottom_row. */
+ while (--width >= 0) /* runs width times; no iterations when width <= 0 */
+ {
+ uint32_t tl, tr, bl, br;
+ int distx;
+
+ tl = top_row [pixman_fixed_to_int (x)]; /* left sample; presumably indexed by the integer part of x -- pixman_fixed_to_int is defined elsewhere */
+ tr = top_row [pixman_fixed_to_int (x) + 1]; /* NOTE(review): index + 1 is read without a bounds check in this function */
+ bl = bottom_row [pixman_fixed_to_int (x)];
+ br = bottom_row [pixman_fixed_to_int (x) + 1];
+
+ distx = (x >> 8) & 0xff; /* bits 8-15 of x; presumably the top 8 fraction bits of a 16.16 value -- confirm */
+
+ *buffer++ = bilinear_interpolation (tl, tr, bl, br, distx, wt, wb);
+
+ x += ux;
+ }
+}
+
+static force_inline void
+scaled_bilinear_scanline_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask, /* not used in this body */
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx, /* not used in this body */
+ pixman_bool_t zero_src) /* not used in this body */
+{ /* Forwards dst, both source rows, the weights, vx, unit_x and w to bilinear_interpolate_line. */
+ bilinear_interpolate_line (dst, src_top, src_bottom,
+ wt, wb, vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_SRC,
+ scaled_bilinear_scanline_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FALSE, FALSE) /* NOTE(review): presumably instantiates a main loop for the COVER case around the scanline function named above -- the macro is defined outside this patch, confirm there */
+FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_SRC,
+ scaled_bilinear_scanline_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FALSE, FALSE) /* same scanline function, PAD variant */
+FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_SRC,
+ scaled_bilinear_scanline_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FALSE, FALSE) /* same scanline function, NONE variant */
+
+static force_inline uint32_t
fetch_nearest (pixman_repeat_t src_repeat,
pixman_format_code_t format,
uint32_t *src, int x, int src_width)
@@ -1973,6 +2110,13 @@ static const pixman_fast_path_t c_fast_paths[] =
SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+
#define NEAREST_FAST_PATH(op,s,d) \
{ PIXMAN_OP_ ## op, \
PIXMAN_ ## s, SCALED_NEAREST_FLAGS, \
--
1.7.3.4
More information about the Pixman
mailing list