[Pixman] [PATCH 2/2] sse2: faster bilinear scaling with 7-bit precision (use _mm_madd_epi16)
Siarhei Siamashka
siarhei.siamashka at gmail.com
Mon Jun 25 16:44:03 PDT 2012
Reducing interpolation precision allows the use of PMADDWD instruction.
It is much faster:
8-bit: image firefox-fishtank 57.584 58.349 0.74% 3/3
7-bit: image firefox-fishtank 51.139 51.229 0.30% 3/3
8-bit: src_8888_8888 = L1: 228.71 L2: 226.52 M:224.82 ( 14.95%) HT:183.22 VT:154.02 R:171.72 RT:109.36
7-bit: src_8888_8888 = L1: 320.45 L2: 317.43 M:314.38 ( 20.77%) HT:215.13 VT:177.35 R:204.46 RT:121.93
---
pixman/pixman-private.h | 2 +-
pixman/pixman-sse2.c | 35 +++++++++++++++++++++++++----------
2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 020e026..18aa523 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -1,5 +1,5 @@
/* bilinear interpolation precision (must be <= 8) */
-#define BILINEAR_INTERPOLATION_BITS 8
+#define BILINEAR_INTERPOLATION_BITS 7
#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
#ifndef __ASSEMBLER__
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 301bce5..f665b37 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5369,8 +5369,10 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
#define BILINEAR_DECLARE_VARIABLES \
const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
- const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
- const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
+ const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
+ const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
+ const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
+ const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \
unit_x, unit_x, unit_x, unit_x); \
const __m128i xmm_zero = _mm_setzero_si128 (); \
@@ -5390,15 +5392,28 @@ do { \
xmm_wt), \
_mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
xmm_wb)); \
- /* calculate horizontal weights */ \
- xmm_wh = _mm_add_epi16 (xmm_addc, _mm_xor_si128 (xmm_xorc, \
+ if (BILINEAR_INTERPOLATION_BITS < 8) \
+ { \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \
_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
- xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
- /* horizontal interpolation */ \
- xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
- xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
- a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
- _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
+ a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
+ } \
+ else \
+ { \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \
+ _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
+ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
+ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
+ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ } \
/* shift and pack the result */ \
a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
a = _mm_packs_epi32 (a, a); \
--
1.7.3.4
More information about the Pixman
mailing list