[Mesa-dev] [PATCH 2/4] i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear

Tapani Pälli tapani.palli at intel.com
Thu Sep 13 10:46:52 UTC 2018


From: Scott D Phillips <scott.d.phillips at intel.com>

The reference for MOVNTDQA says:

    For WC memory type, the nontemporal hint may be implemented by
    loading a temporary internal buffer with the equivalent of an
    aligned cache line without filling this data to the cache.
    [...] Subsequent MOVNTDQA reads to unread portions of the WC
    cache line will receive data from the temporary internal
    buffer if data is available.

This hidden cache-line-sized temporary buffer can improve
read performance from WC (write-combining) maps.

v2: Add mfence at start of tiled_to_linear for streaming loads (Chris)
v3: add Android build support (Tapani)

Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
Reviewed-by: Matt Turner <mattst88 at gmail.com>
Acked-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/Android.mk           | 22 +++++++++
 src/mesa/drivers/dri/i965/Makefile.am          |  7 +++
 src/mesa/drivers/dri/i965/Makefile.sources     |  6 ++-
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 62 ++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/meson.build          | 18 ++++++--
 5 files changed, 110 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk
index 324d087220a..03c773a5686 100644
--- a/src/mesa/drivers/dri/i965/Android.mk
+++ b/src/mesa/drivers/dri/i965/Android.mk
@@ -51,6 +51,27 @@ I965_PERGEN_LIBS := \
 	libmesa_i965_gen10 \
 	libmesa_i965_gen11
 
+
+# ---------------------------------------
+# Build libmesa_intel_tiled_memcpy
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_intel_tiled_memcpy
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(intel_tiled_memcpy_FILES)
+
+ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+LOCAL_CFLAGS += \
+        -DUSE_SSE41 -msse4.1 -mstackrealign
+endif
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
 # ---------------------------------------
 # Build libmesa_i965_gen4
 # ---------------------------------------
@@ -289,6 +310,7 @@ LOCAL_SRC_FILES := \
 LOCAL_WHOLE_STATIC_LIBRARIES := \
 	$(MESA_DRI_WHOLE_STATIC_LIBRARIES) \
 	$(I965_PERGEN_LIBS) \
+	libmesa_intel_tiled_memcpy \
 	libmesa_intel_dev \
 	libmesa_intel_common \
 	libmesa_isl \
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 0afa7a2f216..d9e06930d38 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110
 
 noinst_LTLIBRARIES = \
 	libi965_dri.la \
+	libintel_tiled_memcpy.la \
 	$(I965_PERGEN_LIBS)
 
+libintel_tiled_memcpy_la_SOURCES = \
+	$(intel_tiled_memcpy_FILES)
+libintel_tiled_memcpy_la_CFLAGS = \
+	$(AM_CFLAGS) $(SSE41_CFLAGS)
+
 libi965_dri_la_SOURCES = \
 	$(i965_FILES) \
 	$(i965_oa_GENERATED_FILES)
@@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \
 	$(top_builddir)/src/intel/compiler/libintel_compiler.la \
 	$(top_builddir)/src/intel/blorp/libblorp.la \
 	$(I965_PERGEN_LIBS) \
+	libintel_tiled_memcpy.la \
 	$(LIBDRM_LIBS)
 
 BUILT_SOURCES = $(i965_oa_GENERATED_FILES)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index db6591ab90a..ce7633c53c4 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -110,11 +110,13 @@ i965_FILES = \
 	intel_tex_image.c \
 	intel_tex_obj.h \
 	intel_tex_validate.c \
-	intel_tiled_memcpy.c \
-	intel_tiled_memcpy.h \
 	intel_upload.c \
 	libdrm_macros.h
 
+intel_tiled_memcpy_FILES = \
+	intel_tiled_memcpy.c \
+	intel_tiled_memcpy.h
+
 i965_gen4_FILES = \
 	genX_blorp_exec.c \
 	genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 7c6bde990d6..fac5427d2ed 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -36,6 +36,10 @@
 #include "brw_context.h"
 #include "intel_tiled_memcpy.h"
 
+#if defined(USE_SSE41)
+#include "main/streaming-load-memcpy.h"
+#include <smmintrin.h>
+#endif
 #if defined(__SSSE3__)
 #include <tmmintrin.h>
 #elif defined(__SSE2__)
@@ -213,6 +217,31 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
    return dst;
 }
 
+#if defined(USE_SSE41)
+static ALWAYS_INLINE void *
+_memcpy_streaming_load(void *dest, const void *src, size_t count)
+{
+   if (count == 16) {
+      __m128i val = _mm_stream_load_si128((__m128i *)src);
+      _mm_store_si128((__m128i *)dest, val);
+      return dest;
+   } else if (count == 64) {
+      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
+      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
+      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
+      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
+      _mm_store_si128(((__m128i *)dest) + 0, val0);
+      _mm_store_si128(((__m128i *)dest) + 1, val1);
+      _mm_store_si128(((__m128i *)dest) + 2, val2);
+      _mm_store_si128(((__m128i *)dest) + 3, val3);
+      return dest;
+   } else {
+      assert(count < 64); /* and (count < 16) for ytiled */
+      return memcpy(dest, src, count);
+   }
+}
+#endif
+
 /**
  * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
  * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
@@ -677,6 +706,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    } else {
@@ -687,6 +722,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    }
@@ -719,6 +760,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    } else {
@@ -729,6 +776,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+      else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    }
@@ -868,6 +921,15 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
       unreachable("unsupported tiling");
    }
 
+#if defined(USE_SSE41)
+   if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) {
+      /* The hidden cacheline sized register used by movntdqa can apparently
+       * give you stale data, so do an mfence to invalidate it.
+       */
+      _mm_mfence();
+   }
+#endif
+
    /* Round out to tile boundaries. */
    xt0 = ALIGN_DOWN(xt1, tw);
    xt3 = ALIGN_UP  (xt2, tw);
diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build
index b95e2d76489..94d4b154ce6 100644
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -129,12 +129,15 @@ files_i965 = files(
   'intel_tex_image.c',
   'intel_tex_obj.h',
   'intel_tex_validate.c',
-  'intel_tiled_memcpy.c',
-  'intel_tiled_memcpy.h',
   'intel_upload.c',
   'libdrm_macros.h',
 )
 
+files_intel_tiled_memcpy = files(
+  'intel_tiled_memcpy.c',
+  'intel_tiled_memcpy.h',
+)
+
 i965_gen_libs = []
 foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110']
   i965_gen_libs += static_library(
@@ -176,6 +179,15 @@ i965_oa_sources = custom_target(
   ],
 )
 
+intel_tiled_memcpy = static_library(
+  'intel_tiled_memcpy',
+  [files_intel_tiled_memcpy],
+  include_directories : [
+    inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+  ],
+  c_args : [c_vis_args, no_override_init_args, '-msse2', sse41_args],
+)
+
 libi965 = static_library(
   'i965',
   [files_i965, i965_oa_sources, ir_expression_operation_h,
@@ -187,7 +199,7 @@ libi965 = static_library(
   cpp_args : [cpp_vis_args, c_sse2_args],
   link_with : [
     i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
-    libblorp,
+    libblorp, intel_tiled_memcpy,
   ],
   dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
 )
-- 
2.14.4



More information about the mesa-dev mailing list