[Mesa-dev] [PATCH] mesa: Implement faster streaming memcpy

Wed Jul 8 14:07:19 PDT 2015

WARNING: No perf data, please keep reading though)

This implements the suggestion provided by the paper, "Fast USWC to WB Memory
Copy"
(https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers).
This is described throughout the paper, but the sample code lives in Figure 3-3.
That paper purports a roughly 40% performance gain in Mbyte/second over the
original implementation done by Matt.

Section 3.1.2 is the summary of why an intermediate cache buffer is used. It
claims that if you use the naive implementation, fill buffers are contended for.
To be honest, I can't quite fathom the underlying explanation, but I'll think
about it some more. Most importantly would be to get the perf data... This patch
does need performance data. I don't currently have a platform that this would
benefit (BYT or BSW), so I can't get anything useful. As soon as I get a
platform to test it on, I will - meanwhile, maybe whomever tested the original
patch the first time around come run this through?

Cc: Matt Turner <mattst88 at gmail.com>
Cc: Chad Versace <chad.versace at linux.intel.com>
Cc: Kristian Høgsberg <krh at bitplanet.net>
Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
 src/mesa/main/streaming-load-memcpy.c | 61 +++++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/mesa/main/streaming-load-memcpy.c b/src/mesa/main/streaming-load-memcpy.c
index d7147af..3cd310a 100644
--- a/src/mesa/main/streaming-load-memcpy.c
+++ b/src/mesa/main/streaming-load-memcpy.c
@@ -30,6 +30,8 @@
 #include "main/streaming-load-memcpy.h"
 #include <smmintrin.h>
 
+static uint8_t rsvd_space[4096];
+
 /* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming
  * read performance from uncached memory.
  */
@@ -59,23 +61,54 @@ _mesa_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
       len -= MIN2(bytes_before_alignment_boundary, len);
    }
 
-   while (len >= 64) {
-      __m128i *dst_cacheline = (__m128i *)d;
-      __m128i *src_cacheline = (__m128i *)s;
+   while (len > 64) {
+      __m128i *cached_buffer = (__m128i *)rsvd_space;
+      size_t streaming_len = len > 4096 ? 4096 : len;
+
+      __asm__ volatile("mfence" ::: "memory");
+
+      while (streaming_len >= 64) {
+         __m128i *src_cacheline = (__m128i *)s;
+
+         __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0);
+         __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1);
+         __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2);
+         __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);
+
+         _mm_store_si128(cached_buffer + 0, temp1);
+         _mm_store_si128(cached_buffer + 1, temp2);
+         _mm_store_si128(cached_buffer + 2, temp3);
+         _mm_store_si128(cached_buffer + 3, temp4);
+
+         s += 64;
+         streaming_len -= 64;
+         cached_buffer += 4;
+      }
+
+      cached_buffer = (__m128i *)rsvd_space;
+      streaming_len = len > 4096 ? 4096 : len;
+
+      __asm__ volatile("mfence" ::: "memory");
+
+      while (streaming_len >= 64) {
+         __m128i *dst_cacheline = (__m128i *)d;
+
+         __m128i temp1 = _mm_stream_load_si128(cached_buffer + 0);
+         __m128i temp2 = _mm_stream_load_si128(cached_buffer + 1);
+         __m128i temp3 = _mm_stream_load_si128(cached_buffer + 2);
+         __m128i temp4 = _mm_stream_load_si128(cached_buffer + 3);
 
-      __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0);
-      __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1);
-      __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2);
-      __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);
+         _mm_store_si128(dst_cacheline + 0, temp1);
+         _mm_store_si128(dst_cacheline + 1, temp2);
+         _mm_store_si128(dst_cacheline + 2, temp3);
+         _mm_store_si128(dst_cacheline + 3, temp4);
 
-      _mm_store_si128(dst_cacheline + 0, temp1);
-      _mm_store_si128(dst_cacheline + 1, temp2);
-      _mm_store_si128(dst_cacheline + 2, temp3);
-      _mm_store_si128(dst_cacheline + 3, temp4);
+         d += 64;
+         streaming_len -= 64;
+         cached_buffer += 4;
 
-      d += 64;
-      s += 64;
-      len -= 64;
+         len -= 64;
+      }
    }
 
    /* memcpy() the tail. */
-- 
2.4.5