[Pixman] [PATCH 2/5] mmx: add and use expand_4xpacked565 function

Matt Turner mattst88 at gmail.com
Fri May 18 11:41:51 PDT 2012


Loongson:
add_0565_0565 =  L1:  14.39  L2:  13.98  M: 11.28 ( 15.22%)  HT: 10.11  VT:  9.74  R:  9.39  RT:  6.05 (  67Kops/s)
add_0565_0565 =  L1:  15.37  L2:  14.91  M: 11.83 ( 16.06%)  HT: 10.53  VT: 10.15  R:  9.74  RT:  6.19 (  68Kops/s)

ARM/iwMMXt:
add_0565_0565 =  L1:  11.12  L2:  10.40  M:  8.82 ( 10.65%)  HT:  7.98  VT:  7.41  R:  7.57  RT:  5.21 (  54Kops/s)
add_0565_0565 =  L1:  12.87  L2:  11.58  M: 10.11 ( 12.50%)  HT:  9.06  VT:  8.66  R:  7.70  RT:  5.62 (  58Kops/s)
---
 pixman/loongson-mmintrin.h |   21 +++++++++++++++++++++
 pixman/pixman-mmx.c        |   44 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 8295ba0..1a114fe 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -77,6 +77,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
 	return ret;
 }
 
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pcmpeqw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_empty (void)
 {
@@ -150,6 +161,16 @@ _mm_shuffle_pi16 (__m64 __m, int64_t __n)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psllh  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_si64 (__m64 __m, int64_t __count)
 {
 	__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 01a2bc9..d98d7dd 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -185,6 +185,9 @@ typedef struct
     mmxdatafield mmx_565_b;
     mmxdatafield mmx_packed_565_rb;
     mmxdatafield mmx_packed_565_g;
+    mmxdatafield mmx_expand_565_g;
+    mmxdatafield mmx_expand_565_b;
+    mmxdatafield mmx_expand_565_r;
 #ifndef USE_LOONGSON_MMI
     mmxdatafield mmx_mask_0;
     mmxdatafield mmx_mask_1;
@@ -216,6 +219,9 @@ static const mmx_data_t c =
     MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
     MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
     MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
+    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
+    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
+    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
 #ifndef USE_LOONGSON_MMI
     MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
     MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
@@ -518,6 +524,34 @@ expand565 (__m64 pixel, int pos)
     return _mm_srli_pi16 (pixel, 8);
 }
 
+/* Expand four 16-bit 565 pixels in an mmx register into two mmx registers,
+ * each holding two 32-bit pixels laid out as
+ *    AARRGGBBAARRGGBB
+ */
+static force_inline void
+expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1)
+{
+    __m64 t0, t1, alpha = _mm_cmpeq_pi32 (_mm_setzero_si64 (), _mm_setzero_si64 ());
+    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
+    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
+    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
+
+    /* Replicate high bits into empty low bits. */
+    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
+    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
+    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
+
+    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
+    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
+    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
+
+    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
+    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
+
+    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
+    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
+}
+
 static force_inline __m64
 expand8888 (__m64 in, int pos)
 {
@@ -3341,14 +3375,12 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
     while (w >= 4)
     {
 	__m64 vsrc = ldq_u ((__m64 *)src);
+	__m64 mm0, mm1;
 
-	__m64 mm0 = expand565 (vsrc, 0);
-	__m64 mm1 = expand565 (vsrc, 1);
-	__m64 mm2 = expand565 (vsrc, 2);
-	__m64 mm3 = expand565 (vsrc, 3);
+	expand_4xpacked565 (vsrc, &mm0, &mm1);
 
-	*(__m64 *)(dst + 0) = _mm_or_si64 (pack8888 (mm0, mm1), MC (ff000000));
-	*(__m64 *)(dst + 2) = _mm_or_si64 (pack8888 (mm2, mm3), MC (ff000000));
+	*(__m64 *)(dst + 0) = mm0;
+	*(__m64 *)(dst + 2) = mm1;
 
 	dst += 4;
 	src += 4;
-- 
1.7.3.4



More information about the Pixman mailing list