[Mesa-dev] [PATCH 4/4] i965/tiled_memcpy: Add support for Yf and Ys tiling/detiling

Wed Feb 28 04:22:45 UTC 2018

Yf and Ys are a family of tilings similar to Y. The actual address
bit interleavings for Yf* and Ys* depend upon the bits-per-pixel
value of the surface, where 128-, 32-, and 8-bpp tiles are square
and 64- and 16-bpp tiles have a 2:1 aspect ratio.

The address bit layout of Yf and Ys are the same in the low
12 bits (4-kbytes); however Ys tiles are actually 64-kbytes in
size, but can be handled as if they were composed of 4-kbyte
sub-tiles with a different overall tile ordering.

Because all of Y, Yf*, and Ys* have the same least significant
6 bits of address bit layout, the same tiling/detiling routine can
be used between them. The inner loop that writes or reads cache
lines at a time is the same, while the outer loop (which walks in
a mostly linear order) uses different address increment values for
each tiling format to adjust the tile addresses.
---
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 219 ++++++++++++++++++++-----
 1 file changed, 179 insertions(+), 40 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index a78e2b97d45..2b040a69524 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -57,6 +57,12 @@ static const uint32_t xtile_span = 64;
 static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
+static const uint32_t std_ytile128_width = 256;
+static const uint32_t std_ytile128_height = 16;
+static const uint32_t std_ytile32_width = 128;
+static const uint32_t std_ytile32_height = 32;
+static const uint32_t std_ytile8_width = 64;
+static const uint32_t std_ytile8_height = 64;
 
 static inline uint32_t
 ror(uint32_t n, uint32_t d)
@@ -253,6 +259,48 @@ ytile_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)
    return src + (((y >> 5) * (src_pitch >> 7) + (x >> 7)) << 12);
 }
 
+static char *
+yf128_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)
+{
+   return src + (((y >> 4) * (src_pitch >> 8) + (x >> 8)) << 12);
+}
+
+static char *
+yf32_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)
+{
+   return src + (((y >> 5) * (src_pitch >> 7) + (x >> 7)) << 12);
+}
+
+static char *
+yf8_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)
+{
+   return src + (((y >> 6) * (src_pitch >> 6) + (x >> 6)) << 12);
+}
+
+static char *
+ys128_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)
+{
+   return src + (((y & 0x10) << 8) + ((y & 0x20) << 9) + ((x & 0x100) << 5) +
+                 ((x & 200) << 6) +
+                 (((y >> 6) * (src_pitch >> 10) + (x >> 10)) << 16));
+}
+
+static char *
+ys32_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)
+{
+   return src + (((y & 0x20) << 7) + ((y & 0x40) << 8) + ((x & 0x80) << 6) +
+                 ((x & 100) << 7) +
+                 (((y >> 7) * (src_pitch >> 9) + (x >> 9)) << 16));
+}
+
+static char *
+ys8_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)
+{
+   return src + (((y & 0x40) << 6) + ((y & 0x80) << 7) + ((x & 0x40) << 7) +
+                 ((x & 80) << 8) +
+                 (((y >> 8) * (src_pitch >> 8) + (x >> 8)) << 16));
+}
+
 /**
  * Copy texture data from linear to X tile layout.
  *
@@ -302,7 +350,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 }
 
 /**
- * Copy texture data from linear to Y tile layout.
+ * Copy texture data from linear to Y tile layout. This function tiles a
+ * single 4KB portion of the tiling (even for the 64KB tiling variants)
  *
  * \copydoc tile_copy_fn
  */
@@ -312,28 +361,71 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                  char *dst, const char *src,
                  int32_t src_pitch,
                  uint32_t swizzle_bit,
-                 UNUSED enum isl_tiling tiling,
-                 UNUSED int cpp,
+                 enum isl_tiling tiling,
+                 int cpp,
                  mem_copy_fn mem_copy,
                  mem_copy_fn mem_copy_align16)
 {
-   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
-    * as the tile).  Thus the destination offset for (x,y) is the sum of:
-    *   (x % column_width)                    // position within column
-    *   (x / column_width) * bytes_per_column // column number * bytes per column
-    *   y * column_width
+   /* The Y tilings are a family of different tilings with the following
+    * linear-to-tiled address mapping for the low 12-bits of the tiled
+    * addresses:
+    *
+    * Tiling        bpp        11 10  9  8  7  6  5  4  3  2  1  0
+    * ------------------------------------------------------------
+    * TileYF/TileYS 64 & 128   u7 v3 u6 v2 u5 u4 v1 v0 u3 u2 u1 u0
+    * TileYF/TileYS 16 &  32   u6 v4 u5 v3 u4 v2 v1 v0 u3 u2 u1 u0
+    * TileYF/TileYS        8   u5 v5 u4 v4 v3 v2 v1 v0 u3 u2 u1 u0
+    * TileY                    u6 u5 u4 v4 v3 v2 v1 v0 u3 u2 u1 u0
     *
-    * The copy destination offset for each range copied is the sum of
-    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
+    * The low 6-bits (addressing 64B, one cache line) of the tiling is common
+    * across all variants.
     */
    const uint32_t column_width = ytile_span;
-   const uint32_t bytes_per_column = column_width * ytile_height;
 
    uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
    uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
 
-   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
-   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
+   uint32_t xinc_16, x_mask;
+   uint32_t yinc_1, yinc_4, y_mask;
+
+   uint32_t xo0, xo1, xo2;
+   uint32_t yo0, yo1, yo2;
+
+#define YF_128_X(x) (((x) & 0xF) | (((x) & 0x30) << 2) | (((x) & 0x40) << 3) | (((x) & 0x80) << 4))
+#define YF_32_X(x)  (((x) & 0xF) | (((x) & 0x10) << 3) | (((x) & 0x20) << 4) | (((x) & 0x40) << 5))
+#define YF_8_X(x)   (((x) & 0xF) | (((x) & 0x10) << 5) | (((x) & 0x20) << 6))
+#define Y0_X(x)     (((x) & 0xF) | (((x) & 0x70) << 5))
+
+#define YF_128_Y(y) ((((y) & 0x03) << 4) | (((y) & 0x04) << 6) | (((y) & 0x08) << 7))
+#define YF_32_Y(y)  ((((y) & 0x07) << 4) | (((y) & 0x08) << 5) | (((y) & 0x10) << 6))
+#define YF_8_Y(y)   ((((y) & 0x1F) << 4) | (((y) & 0x20) << 5))
+#define Y0_Y(y)     ((((y) & 0x1F) << 4))
+
+#define TILING_INIT(TILING)                           \
+   do {                                               \
+      x_mask = TILING ## _X(~0);                      \
+      y_mask = TILING ## _Y(~0);                      \
+      xo0 = TILING ## _X(x0);                         \
+      xo1 = TILING ## _X(x1);                         \
+      xo2 = TILING ## _X(x2);                         \
+      yo0 = TILING ## _Y(y0);                         \
+      yo1 = TILING ## _Y(y1);                         \
+      yo2 = TILING ## _Y(y2);                         \
+      xinc_16 = (TILING ## _X(16) | ~x_mask) & 0xFFF; \
+      yinc_1 = (TILING ## _Y(1) | ~y_mask) & 0xFFF;   \
+      yinc_4 = (TILING ## _Y(4) | ~y_mask) & 0xFFF;   \
+   } while (0)
+
+   if (tiling == ISL_TILING_Y0)
+      TILING_INIT(Y0);
+   else if (cpp == 16 || cpp == 8)
+      TILING_INIT(YF_128);
+   else if (cpp == 4 || cpp == 2)
+      TILING_INIT(YF_32);
+   else if (cpp == 1)
+      TILING_INIT(YF_8);
+   else
+      unreachable("not reached");
 
    /* Bit 9 of the destination offset control swizzling.
     * Only the X offset contributes to bit 9 of the total offset,
@@ -342,13 +434,15 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
     */
    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
+   if (tiling != ISL_TILING_Y0)
+      swizzle0 = swizzle1 = swizzle_bit = 0;
 
-   uint32_t x, yo;
+   uint32_t x, y, yo;
 
    src += (ptrdiff_t)y0 * src_pitch;
 
    if (y0 != y1) {
-      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
+      for (y = y0, yo = yo0; y < y1; y++, yo = (yo + yinc_1) & y_mask) {
          uint32_t xo = xo1;
          uint32_t swizzle = swizzle1;
 
@@ -359,7 +453,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           */
          for (x = x1; x < x2; x += ytile_span) {
             mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
-            xo += bytes_per_column;
+            xo = (xo + xinc_16) & x_mask;
             swizzle ^= swizzle_bit;
          }
 
@@ -369,7 +463,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       }
    }
 
-   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
+   for (y = y1, yo = yo1; y < y2; y += 4, yo = (yo + yinc_4) & y_mask) {
       uint32_t xo = xo1;
       uint32_t swizzle = swizzle1;
 
@@ -388,7 +482,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
          mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
          mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
-         xo += bytes_per_column;
+         xo = (xo + xinc_16) & x_mask;
          swizzle ^= swizzle_bit;
       }
 
@@ -403,7 +497,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    }
 
    if (y2 != y3) {
-      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
+      for (y = y2, yo = yo2; y < y3; y++, yo = (yo + yinc_1) & y_mask) {
          uint32_t xo = xo1;
          uint32_t swizzle = swizzle1;
 
@@ -414,7 +508,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           */
          for (x = x1; x < x2; x += ytile_span) {
             mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
-            xo += bytes_per_column;
+            xo = (xo + xinc_16) & x_mask;
             swizzle ^= swizzle_bit;
          }
 
@@ -478,28 +572,35 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                  char *dst, const char *src,
                  int32_t dst_pitch,
                  uint32_t swizzle_bit,
-                 UNUSED enum isl_tiling tiling,
-                 UNUSED int cpp,
+                 enum isl_tiling tiling,
+                 int cpp,
                  mem_copy_fn mem_copy,
                  mem_copy_fn mem_copy_align16)
 {
-   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
-    * as the tile).  Thus the destination offset for (x,y) is the sum of:
-    *   (x % column_width)                    // position within column
-    *   (x / column_width) * bytes_per_column // column number * bytes per column
-    *   y * column_width
-    *
-    * The copy destination offset for each range copied is the sum of
-    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
+   /* See comments in linear_to_ytiled about the theory of operation for Y
+    * tilings and the definition of the TILING_INIT macro used here.
     */
    const uint32_t column_width = ytile_span;
-   const uint32_t bytes_per_column = column_width * ytile_height;
 
    uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
    uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
 
-   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
-   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
+   uint32_t xinc_16, x_mask;
+   uint32_t yinc_1, yinc_4, y_mask;
+
+   uint32_t xo0, xo1, xo2;
+   uint32_t yo0, yo1, yo2;
+
+   if (tiling == ISL_TILING_Y0)
+      TILING_INIT(Y0);
+   else if (cpp == 16 || cpp == 8)
+      TILING_INIT(YF_128);
+   else if (cpp == 4 || cpp == 2)
+      TILING_INIT(YF_32);
+   else if (cpp == 1)
+      TILING_INIT(YF_8);
+   else
+      unreachable("not reached");
 
    /* Bit 9 of the destination offset control swizzling.
     * Only the X offset contributes to bit 9 of the total offset,
@@ -508,13 +609,15 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
     */
    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
+   if (tiling != ISL_TILING_Y0)
+      swizzle0 = swizzle1 = swizzle_bit = 0;
 
-   uint32_t x, yo;
+   uint32_t x, y, yo;
 
    dst += (ptrdiff_t)y0 * dst_pitch;
 
    if (y0 != y1) {
-      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
+      for (y = y0, yo = yo0; y < y1; y++, yo = (yo + yinc_1) & y_mask) {
          uint32_t xo = xo1;
          uint32_t swizzle = swizzle1;
 
@@ -525,7 +628,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           */
          for (x = x1; x < x2; x += ytile_span) {
             mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
-            xo += bytes_per_column;
+            xo = (xo + xinc_16) & x_mask;
             swizzle ^= swizzle_bit;
          }
 
@@ -535,7 +638,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       }
    }
 
-   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
+   for (y = y1, yo = yo1; y < y2; y += 4, yo = (yo + yinc_4) & y_mask) {
       uint32_t xo = xo1;
       uint32_t swizzle = swizzle1;
 
@@ -554,7 +657,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
          mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
          mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
-         xo += bytes_per_column;
+         xo = (xo + xinc_16) & x_mask;
          swizzle ^= swizzle_bit;
       }
 
@@ -569,7 +672,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    }
 
    if (y2 != y3) {
-      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
+      for (y = y2, yo = yo2; y < y3; y++, yo = (yo + yinc_1) & y_mask) {
          uint32_t xo = xo1;
          uint32_t swizzle = swizzle1;
 
@@ -580,7 +683,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
           */
          for (x = x1; x < x2; x += ytile_span) {
             mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
-            xo += bytes_per_column;
+            xo = (xo + xinc_16) & x_mask;
             swizzle ^= swizzle_bit;
          }
 
@@ -814,6 +917,24 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
       span = ytile_span;
       tile_copy = linear_to_ytiled_faster;
       tile_addr = ytile_addr;
+   } else if (isl_tiling_is_std_y(tiling)) {
+      if (cpp == 16 || cpp == 8) {
+         tw = std_ytile128_width;
+         th = std_ytile128_height;
+         tile_addr = tiling == ISL_TILING_Ys ? ys128_addr : yf128_addr;
+      } else if (cpp == 4 || cpp == 2) {
+         tw = std_ytile32_width;
+         th = std_ytile32_height;
+         tile_addr = tiling == ISL_TILING_Ys ? ys32_addr : yf32_addr;
+      } else if (cpp == 1) {
+         tw = std_ytile8_width;
+         th = std_ytile8_height;
+         tile_addr = tiling == ISL_TILING_Ys ? ys8_addr : yf8_addr;
+      } else {
+         unreachable("not reached");
+      }
+      span = ytile_span;
+      tile_copy = linear_to_ytiled_faster;
    } else {
       unreachable("unsupported tiling");
    }
@@ -911,6 +1032,24 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
       span = ytile_span;
       tile_copy = ytiled_to_linear_faster;
       tile_addr = ytile_addr;
+   } else if (isl_tiling_is_std_y(tiling)) {
+      if (cpp == 16 || cpp == 8) {
+         tw = std_ytile128_width;
+         th = std_ytile128_height;
+         tile_addr = tiling == ISL_TILING_Ys ? ys128_addr : yf128_addr;
+      } else if (cpp == 4 || cpp == 2) {
+         tw = std_ytile32_width;
+         th = std_ytile32_height;
+         tile_addr = tiling == ISL_TILING_Ys ? ys32_addr : yf32_addr;
+      } else if (cpp == 1) {
+         tw = std_ytile8_width;
+         th = std_ytile8_height;
+         tile_addr = tiling == ISL_TILING_Ys ? ys8_addr : yf8_addr;
+      } else {
+         unreachable("not reached");
+      }
+      span = ytile_span;
+      tile_copy = linear_to_ytiled_faster;
    } else {
       unreachable("unsupported tiling");
    }
-- 
2.14.3