[Mesa-dev] [PATCH 2/3] i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear

Tapani Pälli tapani.palli at intel.com
Mon Sep 24 11:19:38 UTC 2018


From: Scott D Phillips <scott.d.phillips at intel.com>

The reference for MOVNTDQA says:

    For WC memory type, the nontemporal hint may be implemented by
    loading a temporary internal buffer with the equivalent of an
    aligned cache line without filling this data to the cache.
    [...] Subsequent MOVNTDQA reads to unread portions of the WC
    cache line will receive data from the temporary internal
    buffer if data is available.

This hidden cache-line-sized temporary buffer can improve
read performance from WC (write-combining) maps.

v2: Add mfence at start of tiled_to_linear for streaming loads (Chris)
v3: add Android build support (Tapani)
v4: squash 'fix i915: Fix streaming loads for intel_tiled_memcpy',
    separate sse41 into its own static library (Tapani)

Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk> (v2)
Reviewed-by: Matt Turner <mattst88 at gmail.com> (v2)
Acked-by: Kenneth Graunke <kenneth at whitecape.org> (v2)
Signed-off-by: Tapani Pälli <tapani.palli at intel.com>
---
 src/mesa/drivers/dri/i965/Android.mk          |  38 +++++
 src/mesa/drivers/dri/i965/Makefile.am         |  14 ++
 src/mesa/drivers/dri/i965/Makefile.sources    |  10 +-
 .../drivers/dri/i965/intel_tiled_memcpy.c     | 160 +++++++++---------
 .../drivers/dri/i965/intel_tiled_memcpy.h     |  77 ++++++++-
 .../dri/i965/intel_tiled_memcpy_normal.c      |  59 +++++++
 .../dri/i965/intel_tiled_memcpy_sse41.c       |  61 +++++++
 .../dri/i965/intel_tiled_memcpy_sse41.h       |  59 +++++++
 src/mesa/drivers/dri/i965/meson.build         |  38 ++++-
 9 files changed, 426 insertions(+), 90 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/intel_tiled_memcpy_normal.c
 create mode 100644 src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.c
 create mode 100644 src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.h

diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk
index 324d087220a..e125eb6d394 100644
--- a/src/mesa/drivers/dri/i965/Android.mk
+++ b/src/mesa/drivers/dri/i965/Android.mk
@@ -51,6 +51,42 @@ I965_PERGEN_LIBS := \
 	libmesa_i965_gen10 \
 	libmesa_i965_gen11
 
+
+# ---------------------------------------
+# Build libmesa_intel_tiled_memcpy
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_intel_tiled_memcpy
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(intel_tiled_memcpy_FILES)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+# ---------------------------------------
+# Build libmesa_intel_tiled_memcpy_sse41
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_intel_tiled_memcpy_sse41
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(intel_tiled_memcpy_sse41_FILES)
+
+ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+LOCAL_CFLAGS += \
+	-DUSE_SSE41 -msse4.1 -mstackrealign
+endif
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
 # ---------------------------------------
 # Build libmesa_i965_gen4
 # ---------------------------------------
@@ -289,6 +325,8 @@ LOCAL_SRC_FILES := \
 LOCAL_WHOLE_STATIC_LIBRARIES := \
 	$(MESA_DRI_WHOLE_STATIC_LIBRARIES) \
 	$(I965_PERGEN_LIBS) \
+	libmesa_intel_tiled_memcpy \
+	libmesa_intel_tiled_memcpy_sse41 \
 	libmesa_intel_dev \
 	libmesa_intel_common \
 	libmesa_isl \
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 0afa7a2f216..dc19da2c4a6 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -92,8 +92,20 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110
 
 noinst_LTLIBRARIES = \
 	libi965_dri.la \
+	libintel_tiled_memcpy.la \
+	libintel_tiled_memcpy_sse41.la \
 	$(I965_PERGEN_LIBS)
 
+libintel_tiled_memcpy_la_SOURCES = \
+	$(intel_tiled_memcpy_FILES)
+libintel_tiled_memcpy_la_CFLAGS = \
+	$(AM_CFLAGS)
+
+libintel_tiled_memcpy_sse41_la_SOURCES = \
+	$(intel_tiled_memcpy_sse41_FILES)
+libintel_tiled_memcpy_sse41_la_CFLAGS = \
+	$(AM_CFLAGS) $(SSE41_CFLAGS)
+
 libi965_dri_la_SOURCES = \
 	$(i965_FILES) \
 	$(i965_oa_GENERATED_FILES)
@@ -104,6 +116,8 @@ libi965_dri_la_LIBADD = \
 	$(top_builddir)/src/intel/compiler/libintel_compiler.la \
 	$(top_builddir)/src/intel/blorp/libblorp.la \
 	$(I965_PERGEN_LIBS) \
+	libintel_tiled_memcpy.la \
+	libintel_tiled_memcpy_sse41.la \
 	$(LIBDRM_LIBS)
 
 BUILT_SOURCES = $(i965_oa_GENERATED_FILES)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index db6591ab90a..0ab0e42fb18 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -110,11 +110,17 @@ i965_FILES = \
 	intel_tex_image.c \
 	intel_tex_obj.h \
 	intel_tex_validate.c \
-	intel_tiled_memcpy.c \
-	intel_tiled_memcpy.h \
 	intel_upload.c \
 	libdrm_macros.h
 
+intel_tiled_memcpy_FILES = \
+	intel_tiled_memcpy_normal.c \
+	intel_tiled_memcpy.h
+
+intel_tiled_memcpy_sse41_FILES = \
+	intel_tiled_memcpy_sse41.c \
+	intel_tiled_memcpy_sse41.h
+
 i965_gen4_FILES = \
 	genX_blorp_exec.c \
 	genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 76a92b4d41f..b6bf96706f8 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -566,6 +566,31 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    }
 }
 
+#if defined(INLINE_SSE41)
+static ALWAYS_INLINE void *
+_memcpy_streaming_load(void *dest, const void *src, size_t count)
+{
+   if (count == 16) {
+      __m128i val = _mm_stream_load_si128((__m128i *)src);
+      _mm_storeu_si128((__m128i *)dest, val);
+      return dest;
+   } else if (count == 64) {
+      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
+      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
+      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
+      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
+      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
+      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
+      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
+      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
+      return dest;
+   } else {
+      assert(count < 64); /* and (count < 16) for ytiled */
+      return memcpy(dest, src, count);
+   }
+}
+#endif
+
 static mem_copy_fn
 choose_copy_function(mem_copy_fn_type copy_type)
 {
@@ -574,6 +599,10 @@ choose_copy_function(mem_copy_fn_type copy_type)
       return memcpy;
    case INTEL_COPY_RGBA8:
       return rgba8_copy;
+#if defined(INLINE_SSE41)
+   case INTEL_COPY_STREAMING_LOAD:
+      return _memcpy_streaming_load;
+#endif
    default:
       assert(!"unreachable");
    }
@@ -696,6 +725,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (mem_copy == _memcpy_streaming_load)
+         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    } else {
@@ -706,6 +741,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (mem_copy == _memcpy_streaming_load)
+         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    }
@@ -740,6 +781,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (copy_type == INTEL_COPY_STREAMING_LOAD)
+         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    } else {
@@ -750,6 +797,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
                                  rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (copy_type == INTEL_COPY_STREAMING_LOAD)
+         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
       else
          unreachable("not reached");
    }
@@ -768,14 +821,14 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
  * 'dst' is the address of (0, 0) in the destination tiled texture.
  * 'src' is the address of (xt1, yt1) in the source linear texture.
  */
-void
-linear_to_tiled(uint32_t xt1, uint32_t xt2,
-                uint32_t yt1, uint32_t yt2,
-                char *dst, const char *src,
-                uint32_t dst_pitch, int32_t src_pitch,
-                bool has_swizzling,
-                enum isl_tiling tiling,
-                mem_copy_fn_type copy_type)
+static void
+intel_linear_to_tiled(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      uint32_t dst_pitch, int32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type)
 {
    tile_copy_fn tile_copy;
    uint32_t xt0, xt3;
@@ -859,14 +912,14 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
  * 'dst' is the address of (xt1, yt1) in the destination linear texture.
  * 'src' is the address of (0, 0) in the source tiled texture.
  */
-void
-tiled_to_linear(uint32_t xt1, uint32_t xt2,
-                uint32_t yt1, uint32_t yt2,
-                char *dst, const char *src,
-                int32_t dst_pitch, uint32_t src_pitch,
-                bool has_swizzling,
-                enum isl_tiling tiling,
-                mem_copy_fn_type copy_type)
+static void
+intel_tiled_to_linear(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      int32_t dst_pitch, uint32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type)
 {
    tile_copy_fn tile_copy;
    uint32_t xt0, xt3;
@@ -889,6 +942,15 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
       unreachable("unsupported tiling");
    }
 
+#if defined(INLINE_SSE41)
+   if (copy_type == INTEL_COPY_STREAMING_LOAD) {
+      /* The hidden cacheline sized register used by movntdqa can apparently
+       * give you stale data, so do an mfence to invalidate it.
+       */
+      _mm_mfence();
+   }
+#endif
+
    /* Round out to tile boundaries. */
    xt0 = ALIGN_DOWN(xt1, tw);
    xt3 = ALIGN_UP  (xt2, tw);
@@ -938,69 +1000,3 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
       }
    }
 }
-
-
-/**
- * Determine which copy function to use for the given format combination
- *
- * The only two possible copy functions which are ever returned are a
- * direct memcpy and a RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
- * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
- * symmetric), it doesn't matter whether the copy is from the tiled image
- * to the untiled or vice versa.  The copy function required is the same in
- * either case so this function can be used.
- *
- * \param[in]  tiledFormat The format of the tiled image
- * \param[in]  format      The GL format of the client data
- * \param[in]  type        The GL type of the client data
- * \param[out] mem_copy    Will be set to one of either the standard
- *                         library's memcpy or a different copy function
- *                         that performs an RGBA to BGRA conversion
- * \param[out] cpp         Number of bytes per channel
- *
- * \return true if the format and type combination are valid
- */
-bool
-intel_get_memcpy_type(mesa_format tiledFormat, GLenum format, GLenum type,
-                      mem_copy_fn_type *copy_type, uint32_t *cpp)
-{
-   *copy_type = INTEL_COPY_INVALID;
-
-   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
-       !(format == GL_RGBA || format == GL_BGRA))
-      return false; /* Invalid type/format combination */
-
-   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
-       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
-      *cpp = 1;
-      *copy_type = INTEL_COPY_MEMCPY;
-   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
-              (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
-      *cpp = 4;
-      if (format == GL_BGRA) {
-         *copy_type = INTEL_COPY_MEMCPY;
-      } else if (format == GL_RGBA) {
-         *copy_type = INTEL_COPY_RGBA8;
-      }
-   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
-              (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
-      *cpp = 4;
-      if (format == GL_BGRA) {
-         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
-          * use the same function.
-          */
-         *copy_type = INTEL_COPY_RGBA8;
-      } else if (format == GL_RGBA) {
-         *copy_type = INTEL_COPY_MEMCPY;
-      }
-   }
-
-   if (*copy_type == INTEL_COPY_INVALID)
-      return false;
-
-   return true;
-}
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
index 70934410298..90aadf9e090 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
@@ -38,11 +38,21 @@
 typedef enum {
   INTEL_COPY_MEMCPY = 0,
   INTEL_COPY_RGBA8,
+  INTEL_COPY_STREAMING_LOAD,
   INTEL_COPY_INVALID,
 } mem_copy_fn_type;
 
 typedef void *(*mem_copy_fn)(void *dest, const void *src, size_t n);
 
+typedef void (*tiled_to_linear_fn)
+   (uint32_t xt1, uint32_t xt2,
+    uint32_t yt1, uint32_t yt2,
+    char *dst, const char *src,
+    int32_t dst_pitch, uint32_t src_pitch,
+    bool has_swizzling,
+    enum isl_tiling tiling,
+    mem_copy_fn_type copy_type);
+
 void
 linear_to_tiled(uint32_t xt1, uint32_t xt2,
                 uint32_t yt1, uint32_t yt2,
@@ -61,8 +71,69 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
                 enum isl_tiling tiling,
                 mem_copy_fn_type copy_type);
 
-bool intel_get_memcpy_type(mesa_format tiledFormat, GLenum format,
-                           GLenum type, mem_copy_fn_type *copy_type,
-                           uint32_t *cpp);
+/**
+ * Determine which copy function to use for the given format combination
+ *
+ * The only two possible copy functions which are ever returned are a
+ * direct memcpy and a RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
+ * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
+ * symmetric), it doesn't matter whether the copy is from the tiled image
+ * to the untiled or vice versa.  The copy function required is the same in
+ * either case so this function can be used.
+ *
+ * \param[in]  tiledFormat The format of the tiled image
+ * \param[in]  format      The GL format of the client data
+ * \param[in]  type        The GL type of the client data
+ * \param[out] copy_type   Will be set to INTEL_COPY_MEMCPY for a plain
+ *                         copy, INTEL_COPY_RGBA8 for an RGBA <-> BGRA
+ *                         converting copy, or INTEL_COPY_INVALID otherwise
+ * \param[out] cpp         Number of bytes per pixel
+ *
+ * \return true if the format and type combination are valid
+ */
+static MAYBE_UNUSED bool
+intel_get_memcpy_type(mesa_format tiledFormat, GLenum format, GLenum type,
+                      mem_copy_fn_type *copy_type, uint32_t *cpp)
+{
+   *copy_type = INTEL_COPY_INVALID;
+
+   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
+       !(format == GL_RGBA || format == GL_BGRA))
+      return false; /* Invalid type/format combination */
+
+   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
+       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
+      *cpp = 1;
+      *copy_type = INTEL_COPY_MEMCPY;
+   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
+              (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
+      *cpp = 4;
+      if (format == GL_BGRA) {
+         *copy_type = INTEL_COPY_MEMCPY;
+      } else if (format == GL_RGBA) {
+         *copy_type = INTEL_COPY_RGBA8;
+      }
+   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
+              (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
+      *cpp = 4;
+      if (format == GL_BGRA) {
+         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
+          * use the same function.
+          */
+         *copy_type = INTEL_COPY_RGBA8;
+      } else if (format == GL_RGBA) {
+         *copy_type = INTEL_COPY_MEMCPY;
+      }
+   }
+
+   if (*copy_type == INTEL_COPY_INVALID)
+      return false;
+
+   return true;
+}
 
 #endif /* INTEL_TILED_MEMCPY */
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy_normal.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_normal.c
new file mode 100644
index 00000000000..c246067541b
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_normal.c
@@ -0,0 +1,59 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright 2012 Intel Corporation
+ * Copyright 2013 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chad Versace <chad.versace at linux.intel.com>
+ *    Frank Henigman <fjhenigman at google.com>
+ */
+
+
+#include "intel_tiled_memcpy.c"
+
+void
+linear_to_tiled(uint32_t xt1, uint32_t xt2,
+                uint32_t yt1, uint32_t yt2,
+                char *dst, const char *src,
+                uint32_t dst_pitch, int32_t src_pitch,
+                bool has_swizzling,
+                enum isl_tiling tiling,
+                mem_copy_fn_type copy_type)
+{
+   intel_linear_to_tiled(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
+
+void
+tiled_to_linear(uint32_t xt1, uint32_t xt2,
+                uint32_t yt1, uint32_t yt2,
+                char *dst, const char *src,
+                int32_t dst_pitch, uint32_t src_pitch,
+                bool has_swizzling,
+                enum isl_tiling tiling,
+                mem_copy_fn_type copy_type)
+{
+   intel_tiled_to_linear(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.c
new file mode 100644
index 00000000000..bc33ea11839
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.c
@@ -0,0 +1,61 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright 2012 Intel Corporation
+ * Copyright 2013 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chad Versace <chad.versace at linux.intel.com>
+ *    Frank Henigman <fjhenigman at google.com>
+ */
+
+#define INLINE_SSE41
+
+#include "intel_tiled_memcpy_sse41.h"
+#include "intel_tiled_memcpy.c"
+
+void
+linear_to_tiled_sse41(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      uint32_t dst_pitch, int32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type)
+{
+   intel_linear_to_tiled(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
+
+void
+tiled_to_linear_sse41(uint32_t xt1, uint32_t xt2,
+                     uint32_t yt1, uint32_t yt2,
+                     char *dst, const char *src,
+                     int32_t dst_pitch, uint32_t src_pitch,
+                     bool has_swizzling,
+                     enum isl_tiling tiling,
+                     mem_copy_fn_type copy_type)
+{
+   intel_tiled_to_linear(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.h
new file mode 100644
index 00000000000..5ddd6d01bb8
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.h
@@ -0,0 +1,59 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright 2012 Intel Corporation
+ * Copyright 2013 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chad Versace <chad.versace at linux.intel.com>
+ *    Frank Henigman <fjhenigman at google.com>
+ */
+
+#ifndef INTEL_TILED_MEMCPY_SSE41_H
+#define INTEL_TILED_MEMCPY_SSE41_H
+
+#include <stdint.h>
+#include "main/mtypes.h"
+#include "isl/isl.h"
+
+#include "intel_tiled_memcpy.h"
+
+void
+linear_to_tiled_sse41(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      uint32_t dst_pitch, int32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type);
+
+void
+tiled_to_linear_sse41(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      int32_t dst_pitch, uint32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type);
+
+#endif /* INTEL_TILED_MEMCPY_SSE41_H */
diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build
index b95e2d76489..bf366a6c157 100644
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -129,12 +129,20 @@ files_i965 = files(
   'intel_tex_image.c',
   'intel_tex_obj.h',
   'intel_tex_validate.c',
-  'intel_tiled_memcpy.c',
-  'intel_tiled_memcpy.h',
   'intel_upload.c',
   'libdrm_macros.h',
 )
 
+files_intel_tiled_memcpy = files(
+  'intel_tiled_memcpy_normal.c',
+  'intel_tiled_memcpy.h',
+)
+
+files_intel_tiled_memcpy_sse41 = files(
+  'intel_tiled_memcpy_sse41.c',
+  'intel_tiled_memcpy_sse41.h',
+)
+
 i965_gen_libs = []
 foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110']
   i965_gen_libs += static_library(
@@ -176,6 +184,30 @@ i965_oa_sources = custom_target(
   ],
 )
 
+intel_tiled_memcpy = static_library(
+  'intel_tiled_memcpy',
+  [files_intel_tiled_memcpy],
+  include_directories : [
+    inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+  ],
+  c_args : [c_vis_args, no_override_init_args, '-msse2'],
+)
+
+if with_sse41
+intel_tiled_memcpy_sse41 = static_library(
+  'intel_tiled_memcpy_sse41',
+  [files_intel_tiled_memcpy_sse41],
+  include_directories : [
+    inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+  ],
+  link_args : [ '-Wl,--exclude-libs=ALL' ],
+  c_args : [c_vis_args, no_override_init_args, '-Wl,--exclude-libs=ALL', '-msse2', sse41_args],
+)
+else
+intel_tiled_memcpy_sse41 = []
+endif
+
+
 libi965 = static_library(
   'i965',
   [files_i965, i965_oa_sources, ir_expression_operation_h,
@@ -187,7 +219,7 @@ libi965 = static_library(
   cpp_args : [cpp_vis_args, c_sse2_args],
   link_with : [
     i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
-    libblorp,
+    libblorp, intel_tiled_memcpy, intel_tiled_memcpy_sse41
   ],
   dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
 )
-- 
2.17.1



More information about the mesa-dev mailing list