[Pixman] [PATCH] mmx: add src_8888_0565
Matt Turner
mattst88 at gmail.com
Thu Apr 19 14:40:01 PDT 2012
Uses the pmadd technique described in
http://software.intel.com/sites/landingpage/legacy/mmx/MMX_App_24-16_Bit_Conversion.pdf
Loongson:
src_8888_0565 =  L1: 106.13  L2:  83.57  M: 33.46 ( 68.90%)  HT: 30.29  VT: 27.67  R: 26.11  RT: 15.06 ( 135Kops/s)
src_8888_0565 =  L1: 122.10  L2: 117.53  M: 37.97 ( 78.58%)  HT: 33.14  VT: 30.09  R: 29.01  RT: 15.76 ( 139Kops/s)

ARM/iwMMXt:
src_8888_0565 =  L1:  67.88  L2:  56.61  M: 31.20 ( 56.74%)  HT: 29.22  VT: 27.01  R: 25.39  RT: 19.29 ( 130Kops/s)
src_8888_0565 =  L1: 110.38  L2:  82.33  M: 40.92 ( 73.22%)  HT: 35.63  VT: 32.22  R: 30.07  RT: 18.40 ( 132Kops/s)
---
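A note on the trick, for reviewers who don't want to dig through the
app note: per pixel, the pmaddwd-based conversion is equivalent to the
scalar sketch below. Illustration only -- pack_565_scalar is a name
made up for this email, nothing in the patch -- and the MMX code of
course does four pixels per iteration.

#include <stdint.h>

/* Scalar model of what pack_4xpacked565 computes per pixel.
 * p is x8r8g8b8; the result is r5g6b5. */
uint16_t
pack_565_scalar (uint32_t p)
{
    uint32_t rb = p & 0x00f800f8;  /* MC (packed_565_rb): top 5 bits of R and B */
    uint16_t b  = rb & 0xffff;     /* low 16-bit word:  B & 0xf8 */
    uint16_t r  = rb >> 16;        /* high 16-bit word: R & 0xf8 */

    /* pmaddwd against 0x20000004 computes b * 0x0004 + r * 0x2000 in
     * each 32-bit lane: blue's five bits land at bits 9..5, red's
     * five bits at bits 20..16. */
    uint32_t t = b * 0x0004 + r * 0x2000;

    /* Green's top six bits already sit at bits 15..10, so OR them in;
     * a single shift right by 5 then drops all three fields into 565
     * position: R -> 15..11, G -> 10..5, B -> 4..0. */
    t |= p & 0x0000fc00;           /* MC (packed_565_g) */

    return (uint16_t) (t >> 5);
}

In the vector version below, t0 ends up with pixels 0 and 1 in the low
word of each 32-bit lane, while t1, shifted left by 11 (-5 + 16) rather
than right by 5, carries pixels 2 and 3 in the high words. The OR of
the two therefore holds the four words in order 0, 2, 1, 3, and the
final _mm_shuffle_pi16 with _MM_SHUFFLE (3, 1, 2, 0) puts them back in
order.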
 pixman/loongson-mmintrin.h |   11 ++++++
 pixman/pixman-mmx.c        |   85 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 0 deletions(-)
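The benchmark lines in the commit message are lowlevel-blt-bench
output, the first line on each platform taken before the patch and the
second after it; presumably something like this, run from a patched
build:

    ./test/lowlevel-blt-bench src_8888_0565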
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 76ae892..8295ba0 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -84,6 +84,17 @@ _mm_empty (void)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pmaddhw %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
 {
     __m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 7511216..d3dd978 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -174,9 +174,12 @@ typedef struct
     mmxdatafield mmx_4x0080;
     mmxdatafield mmx_565_rgb;
     mmxdatafield mmx_565_unpack_multiplier;
+    mmxdatafield mmx_565_pack_multiplier;
     mmxdatafield mmx_565_r;
     mmxdatafield mmx_565_g;
     mmxdatafield mmx_565_b;
+    mmxdatafield mmx_packed_565_rb;
+    mmxdatafield mmx_packed_565_g;
 #ifndef USE_LOONGSON_MMI
     mmxdatafield mmx_mask_0;
     mmxdatafield mmx_mask_1;
@@ -202,9 +205,12 @@ static const mmx_data_t c =
     MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
     MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
     MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
+    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
     MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
     MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
+    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
+    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
 #ifndef USE_LOONGSON_MMI
     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
@@ -562,6 +568,27 @@ pack_565 (__m64 pixel, __m64 target, int pos)
 #endif
 }
 
+static force_inline __m64
+pack_4xpacked565 (__m64 a, __m64 b)
+{
+    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
+    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
+
+    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
+    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
+
+    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
+    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
+
+    t0 = _mm_or_si64 (t0, g0);
+    t1 = _mm_or_si64 (t1, g1);
+
+    t0 = shift (t0, -5);
+    t1 = shift (t1, -5 + 16);
+
+    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+}
+
 #ifndef _MSC_VER
 static force_inline __m64
@@ -2086,6 +2113,60 @@ pixman_fill_mmx (uint32_t *bits,
 }
 
 static void
+mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *dst_line, *dst;
+    uint32_t *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        src = src_line;
+        src_line += src_stride;
+        w = width;
+
+        while (w && (unsigned long)dst & 7)
+        {
+            s = *src++;
+            *dst = CONVERT_8888_TO_0565 (s);
+            dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            __m64 vdest;
+            __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
+            __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
+
+            vdest = pack_4xpacked565 (vsrc0, vsrc1);
+
+            *(__m64 *)dst = vdest;
+
+            w -= 4;
+            src += 4;
+            dst += 4;
+        }
+
+        while (w)
+        {
+            s = *src++;
+            *dst = CONVERT_8888_TO_0565 (s);
+            dst++;
+            w--;
+        }
+    }
+}
+
+static void
 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
 {
@@ -3444,6 +3525,10 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD, a8,       null, a8,       mmx_composite_add_8_8       ),
     PIXMAN_STD_FAST_PATH (ADD, solid,    a8,   a8,       mmx_composite_add_n_8_8     ),
 
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5,   mmx_composite_src_x888_0565 ),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5,   mmx_composite_src_x888_0565 ),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5,   mmx_composite_src_x888_0565 ),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5,   mmx_composite_src_x888_0565 ),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mmx_composite_src_n_8_8888  ),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mmx_composite_src_n_8_8888  ),
    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8b8g8r8, mmx_composite_src_n_8_8888  ),
--
1.7.3.4