[Mesa-dev] [PATCH 11/22] i965: Support WC uploads for Y-tiling
Chris Wilson
chris at chris-wilson.co.uk
Sat Aug 5 09:40:03 UTC 2017
Y-tiling makes a mess of the CPU's write-combining buffers (WCBs): the
linear_to_ytiled routines hop to a different cacheline with almost every
write, forcing an eviction and flush between pixels and effectively
reducing the upload to UC performance (i.e. terrible). This patch takes
the simple approach of tiling into a temporary cached page and then
copying that page to WC sequentially, so we get a very simple and
efficient upload into WC whilst keeping the tiler itself in cache.
---
src/mesa/drivers/dri/i965/intel_tex_subimage.c | 13 +-
src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 161 ++++++++++++++++++++++++-
src/mesa/drivers/dri/i965/intel_tiled_memcpy.h | 1 +
3 files changed, 160 insertions(+), 15 deletions(-)
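
For reference, the gist of the staging approach as a minimal standalone
sketch (upload_ytile_via_staging is illustrative, not a function in this
patch; it uses a scalar inner loop and ignores swizzling and rgba8
conversion, which the real code handles):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch: build one 4 KiB Y-tile in a cached staging
     * page, then stream the whole page to the WC mapping in a single
     * sequential copy so the write-combining buffers stay happy.
     */
    static void upload_ytile_via_staging(char *wc_dst, const char *src,
                                         int32_t src_pitch)
    {
       enum { span = 16, tile_h = 32, col_bytes = span * tile_h };
       uint8_t tmp[4096] __attribute__((aligned(16)));

       for (uint32_t y = 0; y < tile_h; y++, src += src_pitch)
          for (uint32_t x = 0; x < 8; x++) /* 8 spans per 128-byte row */
             memcpy(tmp + y * span + x * col_bytes, src + x * span, span);

       memcpy(wc_dst, tmp, sizeof(tmp)); /* sequential burst into WC */
    }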
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 56c6cbf7b8..e7486300ab 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -83,9 +83,6 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
struct brw_context *brw = brw_context(ctx);
struct intel_texture_image *image = intel_texture_image(texImage);
- /* The miptree's buffer. */
- struct brw_bo *bo;
-
mem_copy_fn mem_copy;
/* This fastpath is restricted to specific texture types:
@@ -136,20 +133,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
if (brw->gen < 5 && brw->has_swizzling)
return false;
- bo = image->mt->bo;
-
- /* Uploading into Y-tiling surfaces using WC is slow as each sequential
- * write falls outside of the WCB, completely nerfing the WC performance.
- */
- if (!bo->cache_coherent && image->mt->surf.tiling == ISL_TILING_Y0)
- return false;
-
/* Since we are going to write raw data to the miptree, we need to resolve
* any pending fast color clears before we start.
*/
assert(image->mt->surf.logical_level0_px.depth == 1);
assert(image->mt->surf.logical_level0_px.array_len == 1);
+ struct brw_bo *bo = image->mt->bo;
if (brw_batch_references(&brw->batch, bo)) {
perf_debug("Flushing before mapping a referenced bo.\n");
intel_batchbuffer_flush(brw);
@@ -209,6 +199,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
map,
pixels - (ptrdiff_t) level_y * src_pitch - (ptrdiff_t) level_x * cpp,
image->mt->surf.row_pitch, src_pitch,
+ bo->cache_coherent,
brw->has_swizzling,
image->mt->surf.tiling,
mem_copy
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 65dd950c08..7c9fefeda9 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -324,7 +324,7 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t y0, uint32_t y1,
- char *dst, const char *src,
+ char * __restrict__ dst, const char * __restrict__ src,
int32_t src_pitch,
uint32_t swizzle_bit,
mem_copy_fn mem_copy,
@@ -378,6 +378,104 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
}
}
+#if defined(__SSSE3__)
+static inline void
+linear_to_ytiled0(char * __restrict__ dst, const char * __restrict__ src,
+ int32_t src_pitch,
+ uint32_t swizzle_bit,
+ mem_copy_fn mem_copy)
+{
+ /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
+ * as the tile). Thus the destination offset for (x,y) is the sum of:
+ * (x % column_width) // position within column
+ * (x / column_width) * bytes_per_column // column number * bytes per column
+ * y * column_width
+    *
+    * Here the offsets are computed inline: span x of row y is written at
+    * y * column_width + x * bytes_per_column (xor'ed with the swizzle).
+    */
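+
+   /* For example, with column_width = 16 and ytile_height = 32
+    * (bytes_per_column = 512), source byte (x = 20, y = 3) lands at
+    * (20 % 16) + (20 / 16) * 512 + 3 * 16 = 4 + 512 + 48 = 564.
+    */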
+ const uint32_t column_width = ytile_span;
+ const uint32_t bytes_per_column = column_width * ytile_height;
+
+ uint32_t x, y;
+
+ for (y = 0; y < ytile_height; y++) {
+ __m128i xmm[8];
+ uint32_t swizzle = 0;
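+      /* bytes_per_column is 512, so stepping a column flips address
+       * bit 9, and bit-6 swizzling XORs that back in via swizzle_bit.
+       * y * column_width stays below 512, so rows never disturb it.
+       */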
+
+ if (mem_copy == memcpy) {
+ for (x = 0; x < 8; x++)
+ xmm[x] = _mm_loadu_si128((__m128i *)src + x);
+
+ for (x = 0; x < 8; x++) {
+ _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]);
+ swizzle ^= swizzle_bit;
+ }
+ } else {
+ for (x = 0; x < 4; x++)
+ xmm[x] = _mm_shuffle_epi8( _mm_loadu_si128((__m128i *)src + x),
+ *(__m128i *) rgba8_permutation);
+
+ for (x = 0; x < 4; x++) {
+ _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]);
+ swizzle ^= swizzle_bit;
+ }
+
+ for (x = 4; x < 8; x++)
+ xmm[x] = _mm_shuffle_epi8( _mm_loadu_si128((__m128i *)src + x),
+ *(__m128i *) rgba8_permutation);
+
+ for (x = 4; x < 8; x++) {
+ _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]);
+ swizzle ^= swizzle_bit;
+ }
+ }
+
+ src += src_pitch;
+ }
+}
+
+/**
+ * Copy texture data from linear to Y tile layout through WC.
+ *
+ * \copydoc tile_copy_fn
+ */
+static inline void
+linear_to_ytiled_wc(char * __restrict__ dst,
+ const char * __restrict__ src,
+ int32_t src_pitch,
+ mem_copy_fn mem_copy)
+{
+ /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
+ * as the tile). Thus the destination offset for (x,y) is the sum of:
+ * (x % column_width) // position within column
+ * (x / column_width) * bytes_per_column // column number * bytes per column
+ * y * column_width
+    *
+    * Here each span is written straight into the temporary tile at
+    * y * column_width + x * bytes_per_column, with no swizzling.
+    */
+ const uint32_t column_width = ytile_span;
+ const uint32_t bytes_per_column = column_width * ytile_height;
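+   /* Stage one full Y-tile (128 bytes x 32 rows = 4 KiB) in cache. */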
+ uint8_t tmp[4096] __attribute__((aligned(16)));
+
+ uint32_t x, y;
+
+ for (y = 0; y < ytile_height; y++) {
+ __m128i xmm[8];
+
+ for (x = 0; x < 8; x++)
+ xmm[x] = _mm_loadu_si128((__m128i *)src + x);
+ for (x = 0; x < 8; x++)
+ _mm_store_si128((__m128i *)(tmp + y * column_width + x*bytes_per_column), xmm[x]);
+
+ src += src_pitch;
+ }
+
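+   /* One sequential 4 KiB write lets the WC buffers fill and flush as
+    * whole cachelines instead of partial lines.
+    */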
+ mem_copy(dst, tmp, sizeof(tmp));
+}
+#endif
+
/**
* Copy texture data from X tile layout to linear.
*
@@ -547,6 +645,14 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
mem_copy_fn mem_copy)
{
if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
+#ifdef __SSSE3__
+ if (mem_copy == memcpy)
+ return linear_to_ytiled0(dst, src, src_pitch, swizzle_bit,
+ memcpy);
+ else if (mem_copy == rgba8_copy_aligned_dst)
+ return linear_to_ytiled0(dst, src, src_pitch, swizzle_bit,
+ rgba8_copy_aligned_dst);
+#else
if (mem_copy == memcpy)
return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
@@ -554,6 +660,7 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_dst);
+#endif /* __SSSE3__ */
else
return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit,
@@ -575,6 +682,45 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
}
/**
+ * Copy texture data from linear to Y tile layout using WC, faster.
+ *
+ * Same as \ref linear_to_ytiled but optimised for WC.
+ *
+ * \copydoc tile_copy_fn
+ */
+static FLATTEN void
+linear_to_ytiled_wc_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+ uint32_t y0, uint32_t y1,
+ char *dst, const char *src,
+ int32_t src_pitch,
+ uint32_t swizzle_bit,
+ mem_copy_fn mem_copy)
+{
+#if defined(__SSSE3__)
+ if (x3 - x0 == ytile_width && y1 - y0 == ytile_height) {
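+      /* Pass memcpy as a literal where known so the inlined copy can
+       * become a constant-size memcpy intrinsic.
+       */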
+ if (mem_copy == memcpy)
+ return linear_to_ytiled_wc(dst, src, src_pitch, memcpy);
+ else
+ return linear_to_ytiled_wc(dst, src, src_pitch, mem_copy);
+ } else
+#endif
+ {
+ if (mem_copy == memcpy)
+ return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ memcpy, memcpy);
+ else if (mem_copy == rgba8_copy)
+ return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ rgba8_copy, rgba8_copy_aligned_dst);
+ else
+ return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ mem_copy, mem_copy);
+ }
+}
+
+/**
* Copy texture data from X tile layout to linear, faster.
*
* Same as \ref xtile_to_linear but faster, because it passes constant
@@ -680,6 +826,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
uint32_t yt1, uint32_t yt2,
char *dst, const char *src,
uint32_t dst_pitch, int32_t src_pitch,
+ bool cache_coherent,
bool has_swizzling,
enum isl_tiling tiling,
mem_copy_fn mem_copy)
@@ -700,7 +847,12 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
tw = ytile_width;
th = ytile_height;
span = ytile_span;
- tile_copy = linear_to_ytiled_faster;
+ if (cache_coherent)
+ tile_copy = linear_to_ytiled_faster;
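+      /* The WC staging path does not handle bit-6 swizzling. */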
+ else if (has_swizzling)
+ unreachable("unsupported tiling");
+ else
+ tile_copy = linear_to_ytiled_wc_faster;
} else {
unreachable("unsupported tiling");
}
@@ -718,14 +870,15 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
* Looping x inside y is the faster memory access pattern.
*/
for (yt = yt0; yt < yt3; yt += th) {
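+      /* The y extent is x-invariant, so compute it once per tile row. */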
+ uint32_t y0 = MAX2(yt1, yt);
+ uint32_t y1 = MIN2(yt2, yt + th);
+
for (xt = xt0; xt < xt3; xt += tw) {
/* The area to update is [x0,x3) x [y0,y1).
* May not want the whole tile, hence the min and max.
*/
uint32_t x0 = MAX2(xt1, xt);
- uint32_t y0 = MAX2(yt1, yt);
uint32_t x3 = MIN2(xt2, xt + tw);
- uint32_t y1 = MIN2(yt2, yt + th);
/* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
* the middle interval is the longest span-aligned part.
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
index e9c43920a1..9d6c71d1cf 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
@@ -42,6 +42,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
uint32_t yt1, uint32_t yt2,
char *dst, const char *src,
uint32_t dst_pitch, int32_t src_pitch,
+ bool cache_coherent,
bool has_swizzling,
enum isl_tiling tiling,
mem_copy_fn mem_copy);
--
2.13.3