[Nouveau] [PATCH] xv: add MMX / SSE acceleration for YV12 -> YUYV repacking
Ilia Mirkin
imirkin at alum.mit.edu
Tue Mar 9 01:02:01 UTC 2021
This is used by the blit adaptor. Might as well try to accelerate it.
When testing with it hacked to take effect for nvc0, I saw NVPutImage
usage in the X process drop from 68% -> 43% (MMX) -> 24% (SSE), which is
approximately a 7x speed-up of the function, assuming other parts
remained equal.
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
I did some basic testing with a patch to force the texture path to do
this conversion rather than to NV12, testing all 3 cases. However, I need
to do better testing of edge cases, which I will do before pushing.
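For reference (not part of the patch), here is a minimal scalar sketch of
the repacking that the SIMD path accelerates: planar YV12 (full-res Y
plane plus quarter-res V and U planes) interleaved into packed YUYV, one
32-bit word per pair of horizontal pixels. It assumes little-endian YUYV
byte order (Y0 U Y1 V) and omits the vertical chroma filtering that
NVCopyData420 does on odd lines; the names are illustrative, not taken
from nouveau_xv.c.

#include <stdint.h>

static void
yv12_to_yuyv_row(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                 uint32_t *dst, int width /* in pixels, assumed even */)
{
	int i;

	for (i = 0; i < width / 2; i++) {
		/* little-endian YUYV: bytes in memory are Y0 U Y1 V */
		dst[i] = (uint32_t)y[2 * i]
		       | ((uint32_t)u[i]         <<  8)
		       | ((uint32_t)y[2 * i + 1] << 16)
		       | ((uint32_t)v[i]         << 24);
	}
}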
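The chroma averaging between two rows uses _mm_avg_pu8() when SSE is
available; with plain MMX there is no byte-average instruction, so the
patch widens to 16-bit lanes, adds, shifts right by one, and packs back.
A standalone sketch of that fallback (my own helper name, not in the
patch) is below; note it truncates, whereas pavgb rounds up
((a + b + 1) >> 1), so results can differ by one LSB.

#include <mmintrin.h>

static __m64
avg_pu8_mmx(__m64 a, __m64 b)
{
	__m64 zero = _mm_setzero_si64();
	/* widen the low and high halves of each vector to 16-bit lanes
	 * and add them together */
	__m64 lo = _mm_add_pi16(_mm_unpacklo_pi8(a, zero),
				_mm_unpacklo_pi8(b, zero));
	__m64 hi = _mm_add_pi16(_mm_unpackhi_pi8(a, zero),
				_mm_unpackhi_pi8(b, zero));
	/* halve each 16-bit sum, then pack back to unsigned bytes */
	return _mm_packs_pu16(_mm_srli_pi16(lo, 1),
			      _mm_srli_pi16(hi, 1));
}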
src/nouveau_xv.c | 94 ++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 80 insertions(+), 14 deletions(-)
diff --git a/src/nouveau_xv.c b/src/nouveau_xv.c
index b2d75c5..16aca93 100644
--- a/src/nouveau_xv.c
+++ b/src/nouveau_xv.c
@@ -25,7 +25,7 @@
#include "config.h"
#endif
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__MMX__)
#include <immintrin.h>
#endif
@@ -568,7 +568,7 @@ NVCopyData420(unsigned char *src1, unsigned char *src2, unsigned char *src3,
{
CARD32 *dst;
CARD8 *s1, *s2, *s3;
- int i, j;
+ int i, j, l, e;
#define su(X) (((j & 1) && j < (h-1)) ? ((unsigned)((signed int)s2[X] + \
(signed int)(s2 + srcPitch2)[X]) / 2) : (s2[X]))
@@ -576,29 +576,95 @@ NVCopyData420(unsigned char *src1, unsigned char *src2, unsigned char *src3,
(signed int)(s3 + srcPitch2)[X]) / 2) : (s3[X]))
w >>= 1;
+#ifdef __MMX__
+ l = w >> 3;
+ e = w & 7;
+#else
+ l = w >> 2;
+ e = w & 3;
+#endif
for (j = 0; j < h; j++) {
dst = (CARD32*)dst1;
s1 = src1; s2 = src2; s3 = src3;
i = w;
- while (i > 4) {
+ for (i = 0; i < l; i++) {
+#ifdef __MMX__
+ __m64 mm_v = *(__m64 *)&s2[0];
+ __m64 mm_u = *(__m64 *)&s3[0];
+
+ if (j & 1 && j < (h - 1)) {
+ __m64 mm_vnext = *(__m64 *)&(s2 + srcPitch2)[0];
+#ifdef __SSE__
+ mm_v = _mm_avg_pu8(mm_v, mm_vnext);
+#else /* __SSE__ */
+ __m64 zero = _m_from_int(0);
+ /* make 16-bit wide values */
+ __m64 mm_vnext16_1 = _mm_unpacklo_pi8(mm_vnext, zero);
+ __m64 mm_vnext16_2 = _mm_unpackhi_pi8(mm_vnext, zero);
+ __m64 mm_v16_1 = _mm_unpacklo_pi8(mm_v, zero);
+ __m64 mm_v16_2 = _mm_unpackhi_pi8(mm_v, zero);
+ /* add together */
+ mm_v16_1 = _mm_add_pi16(mm_v16_1, mm_vnext16_1);
+ mm_v16_2 = _mm_add_pi16(mm_v16_2, mm_vnext16_2);
+ /* divide by 2 */
+ mm_v16_1 = _mm_srli_pi16(mm_v16_1, 1);
+ mm_v16_2 = _mm_srli_pi16(mm_v16_2, 1);
+ /* put back into 8-bit values */
+ mm_v = _mm_packs_pu16(mm_v16_1, mm_v16_2);
+#endif
+
+ /* repeat for u */
+ __m64 mm_unext = *(__m64 *)&(s3 + srcPitch2)[0];
+#ifdef __SSE__
+ mm_u = _mm_avg_pu8(mm_u, mm_unext);
+#else /* __SSE__ */
+ /* make 16-bit wide values */
+ __m64 mm_unext16_1 = _mm_unpacklo_pi8(mm_unext, zero);
+ __m64 mm_unext16_2 = _mm_unpackhi_pi8(mm_unext, zero);
+ __m64 mm_u16_1 = _mm_unpacklo_pi8(mm_u, zero);
+ __m64 mm_u16_2 = _mm_unpackhi_pi8(mm_u, zero);
+ /* add together */
+ mm_u16_1 = _mm_add_pi16(mm_u16_1, mm_unext16_1);
+ mm_u16_2 = _mm_add_pi16(mm_u16_2, mm_unext16_2);
+ /* divide by 2 */
+ mm_u16_1 = _mm_srli_pi16(mm_u16_1, 1);
+ mm_u16_2 = _mm_srli_pi16(mm_u16_2, 1);
+ /* put back into 8-bit values */
+ mm_u = _mm_packs_pu16(mm_u16_1, mm_u16_2);
+#endif
+ }
+
+ __m64 mm_y1 = *(__m64 *)s1;
+ __m64 mm_y2 = *(__m64 *)&s1[8];
+
+ __m64 mm_uv1 = _mm_unpacklo_pi8(mm_u, mm_v);
+ __m64 mm_uv2 = _mm_unpackhi_pi8(mm_u, mm_v);
+
+ *(__m64 *)&dst[0] = _mm_unpacklo_pi8(mm_y1, mm_uv1);
+ *(__m64 *)&dst[2] = _mm_unpackhi_pi8(mm_y1, mm_uv1);
+ *(__m64 *)&dst[4] = _mm_unpacklo_pi8(mm_y2, mm_uv2);
+ *(__m64 *)&dst[6] = _mm_unpackhi_pi8(mm_y2, mm_uv2);
+
+ dst += 8; s2 += 8; s3 += 8; s1 += 16;
+#else /* __MMX__ */
#if X_BYTE_ORDER == X_BIG_ENDIAN
- dst[0] = (s1[0] << 24) | (s1[1] << 8) | (sv(0) << 16) | su(0);
- dst[1] = (s1[2] << 24) | (s1[3] << 8) | (sv(1) << 16) | su(1);
- dst[2] = (s1[4] << 24) | (s1[5] << 8) | (sv(2) << 16) | su(2);
- dst[3] = (s1[6] << 24) | (s1[7] << 8) | (sv(3) << 16) | su(3);
+ dst[0] = (s1[0] << 24) | (s1[1] << 8) | (sv(0) << 16) | su(0);
+ dst[1] = (s1[2] << 24) | (s1[3] << 8) | (sv(1) << 16) | su(1);
+ dst[2] = (s1[4] << 24) | (s1[5] << 8) | (sv(2) << 16) | su(2);
+ dst[3] = (s1[6] << 24) | (s1[7] << 8) | (sv(3) << 16) | su(3);
#else
- dst[0] = s1[0] | (s1[1] << 16) | (sv(0) << 8) | (su(0) << 24);
- dst[1] = s1[2] | (s1[3] << 16) | (sv(1) << 8) | (su(1) << 24);
- dst[2] = s1[4] | (s1[5] << 16) | (sv(2) << 8) | (su(2) << 24);
- dst[3] = s1[6] | (s1[7] << 16) | (sv(3) << 8) | (su(3) << 24);
+ dst[0] = s1[0] | (s1[1] << 16) | (sv(0) << 8) | (su(0) << 24);
+ dst[1] = s1[2] | (s1[3] << 16) | (sv(1) << 8) | (su(1) << 24);
+ dst[2] = s1[4] | (s1[5] << 16) | (sv(2) << 8) | (su(2) << 24);
+ dst[3] = s1[6] | (s1[7] << 16) | (sv(3) << 8) | (su(3) << 24);
#endif
- dst += 4; s2 += 4; s3 += 4; s1 += 8;
- i -= 4;
+ dst += 4; s2 += 4; s3 += 4; s1 += 8;
+#endif /* __MMX__ */
}
- while (i--) {
+ for (i = 0; i < e; i++) {
#if X_BYTE_ORDER == X_BIG_ENDIAN
dst[0] = (s1[0] << 24) | (s1[1] << 8) | (sv(0) << 16) | su(0);
#else
--
2.26.2