[Pixman] [PATCH 2/5] mmx: add and use expand_4xpacked565 function
Matt Turner
mattst88 at gmail.com
Fri May 18 11:41:51 PDT 2012
Loongson:
add_0565_0565 = L1: 14.39 L2: 13.98 M: 11.28 ( 15.22%) HT: 10.11 VT: 9.74 R: 9.39 RT: 6.05 ( 67Kops/s)
add_0565_0565 = L1: 15.37 L2: 14.91 M: 11.83 ( 16.06%) HT: 10.53 VT: 10.15 R: 9.74 RT: 6.19 ( 68Kops/s)
ARM/iwMMXt:
add_0565_0565 = L1: 11.12 L2: 10.40 M: 8.82 ( 10.65%) HT: 7.98 VT: 7.41 R: 7.57 RT: 5.21 ( 54Kops/s)
add_0565_0565 = L1: 12.87 L2: 11.58 M: 10.11 ( 12.50%) HT: 9.06 VT: 8.66 R: 7.70 RT: 5.62 ( 58Kops/s)
---
pixman/loongson-mmintrin.h | 21 +++++++++++++++++++++
pixman/pixman-mmx.c | 44 ++++++++++++++++++++++++++++++++++++++------
2 files changed, 59 insertions(+), 6 deletions(-)
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 8295ba0..1a114fe 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -77,6 +77,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
return ret;
}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("pcmpeqw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
@@ -150,6 +161,16 @@ _mm_shuffle_pi16 (__m64 __m, int64_t __n)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int64_t __count)
+{
+ __m64 ret;
+ asm("psllh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+ return ret;
+}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 01a2bc9..d98d7dd 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -185,6 +185,9 @@ typedef struct
mmxdatafield mmx_565_b;
mmxdatafield mmx_packed_565_rb;
mmxdatafield mmx_packed_565_g;
+ mmxdatafield mmx_expand_565_g;
+ mmxdatafield mmx_expand_565_b;
+ mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
mmxdatafield mmx_mask_0;
mmxdatafield mmx_mask_1;
@@ -216,6 +219,9 @@ static const mmx_data_t c =
MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
+ MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
+ MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
+ MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
@@ -518,6 +524,34 @@ expand565 (__m64 pixel, int pos)
return _mm_srli_pi16 (pixel, 8);
}
+/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
+ *
+ * AARRGGBBRRGGBB
+ */
+static force_inline void
+expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1)
+{
+ __m64 t0, t1, alpha = _mm_cmpeq_pi32 (_mm_setzero_si64 (), _mm_setzero_si64 ());
+ __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
+ __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
+ __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
+
+ /* Replicate high bits into empty low bits. */
+ r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
+ g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
+ b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
+
+ r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
+ g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
+ b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
+
+ t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
+ t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
+
+ *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
+ *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
+}
+
static force_inline __m64
expand8888 (__m64 in, int pos)
{
@@ -3341,14 +3375,12 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
while (w >= 4)
{
__m64 vsrc = ldq_u ((__m64 *)src);
+ __m64 mm0, mm1;
- __m64 mm0 = expand565 (vsrc, 0);
- __m64 mm1 = expand565 (vsrc, 1);
- __m64 mm2 = expand565 (vsrc, 2);
- __m64 mm3 = expand565 (vsrc, 3);
+ expand_4xpacked565 (vsrc, &mm0, &mm1);
- *(__m64 *)(dst + 0) = _mm_or_si64 (pack8888 (mm0, mm1), MC (ff000000));
- *(__m64 *)(dst + 2) = _mm_or_si64 (pack8888 (mm2, mm3), MC (ff000000));
+ *(__m64 *)(dst + 0) = mm0;
+ *(__m64 *)(dst + 2) = mm1;
dst += 4;
src += 4;
--
1.7.3.4
More information about the Pixman
mailing list