<div dir="auto"><div><br><div class="gmail_extra"><br><div class="gmail_quote">On Feb 27, 2018 11:22 PM, "Scott D Phillips" <<a href="mailto:scott.d.phillips@intel.com">scott.d.phillips@intel.com</a>> wrote:<br type="attribution"><blockquote class="quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Yf and Ys are a family of tilings similar to Y. The actual address<br>
bit interleavings for Yf* and Ys* depend upon the bits-per-pixel<br>
value of the surface, where 128-, 32-, and 8-bpp tiles are square<br>
and 64- and 16-bpp tiles have a 2:1 aspect ratio.<br>
<br>
The address bit layouts of Yf and Ys are the same in the low<br>
12 bits (4-kbytes); however Ys tiles are actually 64-kbytes in<br>
size, but can be handled as if they were composed of 4-kbyte<br>
sub-tiles with a different overall tile ordering.<br>
<br>
Because all of Y, Yf*, and Ys* have the same least significant<br>
6 bits of address bit layout, the same tiling/detiling routine can<br>
be used between them. The inner loop that writes or reads cache<br>
lines at a time is the same, while the outer loop (which walks in<br>
a mostly linear order) uses different address increment values for<br>
each tiling format to adjust the tile addresses.<br>
---<br>
src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c | 219 ++++++++++++++++++++-----<br>
1 file changed, 179 insertions(+), 40 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c<br>
index a78e2b97d45..2b040a69524 100644<br>
--- a/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c<br>
+++ b/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c<br>
@@ -57,6 +57,12 @@ static const uint32_t xtile_span = 64;<br>
static const uint32_t ytile_width = 128;<br>
static const uint32_t ytile_height = 32;<br>
static const uint32_t ytile_span = 16;<br>
+static const uint32_t std_ytile128_width = 256;<br>
+static const uint32_t std_ytile128_height = 16;<br>
+static const uint32_t std_ytile32_width = 128;<br>
+static const uint32_t std_ytile32_height = 32;<br>
+static const uint32_t std_ytile8_width = 64;<br>
+static const uint32_t std_ytile8_height = 64;<br>
<br>
static inline uint32_t<br>
ror(uint32_t n, uint32_t d)<br>
@@ -253,6 +259,48 @@ ytile_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)<br>
return src + (((y >> 5) * (src_pitch >> 7) + (x >> 7)) << 12);<br>
}<br>
<br>
+static char *<br>
+yf128_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)<br>
+{<br>
+ return src + (((y >> 4) * (src_pitch >> 8) + (x >> 8)) << 12);<br>
+}<br>
+<br>
+static char *<br>
+yf32_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)<br>
+{<br>
+ return src + (((y >> 5) * (src_pitch >> 7) + (x >> 7)) << 12);<br>
+}<br>
+<br>
+static char *<br>
+yf8_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)<br>
+{<br>
+ return src + (((y >> 6) * (src_pitch >> 6) + (x >> 6)) << 12);<br>
+}<br>
+<br>
+static char *<br>
+ys128_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)<br>
+{<br>
+ return src + (((y & 0x10) << 8) + ((y & 0x20) << 9) + ((x & 0x100) << 5) +<br>
+ ((x & 200) << 6) +<br></blockquote></div></div></div><div dir="auto"><br></div><div dir="auto">0x200?</div><div dir="auto"><br></div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ (((y >> 6) * (src_pitch >> 10) + (x >> 10)) << 16));<br>
+}<br>
+<br>
+static char *<br>
+ys32_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)<br>
+{<br>
+ return src + (((y & 0x20) << 7) + ((y & 0x40) << 8) + ((x & 0x80) << 6) +<br>
+ ((x & 100) << 7) +<br></blockquote></div></div></div><div dir="auto"><br></div><div dir="auto">0x100? Looks like there are more of these below (e.g. the 80 mask in ys8_addr, presumably 0x80).</div><div dir="auto"><br></div><div dir="auto"><div class="gmail_extra"><div class="gmail_quote"><blockquote class="quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ (((y >> 7) * (src_pitch >> 9) + (x >> 9)) << 16));<br>
+}<br>
+<br>
+static char *<br>
+ys8_addr(uint32_t x, uint32_t y, char *src, uint32_t src_pitch)<br>
+{<br>
+ return src + (((y & 0x40) << 6) + ((y & 0x80) << 7) + ((x & 0x40) << 7) +<br>
+ ((x & 80) << 8) +<br>
+ (((y >> 8) * (src_pitch >> 8) + (x >> 8)) << 16));<br>
+}<br>
+<br>
/**<br>
* Copy texture data from linear to X tile layout.<br>
*<br>
@@ -302,7 +350,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
}<br>
<br>
/**<br>
- * Copy texture data from linear to Y tile layout.<br>
+ * Copy texture data from linear to Y tile layout. This function tiles a<br>
+ * single 4KB portion of the tiling (even for the 64KB tiling variants)<br>
*<br>
* \copydoc tile_copy_fn<br>
*/<br>
@@ -312,28 +361,71 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
char *dst, const char *src,<br>
int32_t src_pitch,<br>
uint32_t swizzle_bit,<br>
- UNUSED enum isl_tiling tiling,<br>
- UNUSED int cpp,<br>
+ enum isl_tiling tiling,<br>
+ int cpp,<br>
mem_copy_fn mem_copy,<br>
mem_copy_fn mem_copy_align16)<br>
{<br>
- /* Y tiles consist of columns that are 'ytile_span' wide (and the same height<br>
- * as the tile). Thus the destination offset for (x,y) is the sum of:<br>
- * (x % column_width) // position within column<br>
- * (x / column_width) * bytes_per_column // column number * bytes per column<br>
- * y * column_width<br>
+ /* The Y tilings are a family of different tilings with the following<br>
+ * linear-to-tiled address mapping for the low 12-bits of the tiled<br>
+ * addresses:<br>
+ *<br>
+ * Tiling bpp 11 10 9 8 7 6 5 4 3 2 1 0<br>
+ * ------------------------------<wbr>------------------------------<br>
+ * TileYF/TileYS 64 & 128 u7 v3 u6 v2 u5 u4 v1 v0 u3 u2 u1 u0<br>
+ * TileYF/TileYS 16 & 32 u6 v4 u5 v3 u4 v2 v1 v0 u3 u2 u1 u0<br>
+ * TileYF/TileYS 8 u5 v5 u4 v4 v3 v2 v1 v0 u3 u2 u1 u0<br>
+ * TileY u6 u5 u4 v4 v3 v2 v1 v0 u3 u2 u1 u0<br>
*<br>
- * The copy destination offset for each range copied is the sum of<br>
- * an X offset 'xo0' or 'xo' and a Y offset 'yo.'<br>
+ * The low 6-bits (addressing 64B, one cache line) of the tiling is common<br>
+ * across all variants.<br>
*/<br>
const uint32_t column_width = ytile_span;<br>
- const uint32_t bytes_per_column = column_width * ytile_height;<br>
<br>
uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));<br>
uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));<br>
<br>
- uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;<br>
- uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;<br>
+ uint32_t xinc_16, x_mask;<br>
+ uint32_t yinc_1, yinc_4, y_mask;<br>
+<br>
+ uint32_t xo0, xo1, xo2;<br>
+ uint32_t yo0, yo1, yo2;<br>
+<br>
+#define YF_128_X(x) (((x) & 0xF) | (((x) & 0x30) << 2) | (((x) & 0x40) << 3) | (((x) & 0x80) << 4))<br>
+#define YF_32_X(x) (((x) & 0xF) | (((x) & 0x10) << 3) | (((x) & 0x20) << 4) | (((x) & 0x40) << 5))<br>
+#define YF_8_X(x) (((x) & 0xF) | (((x) & 0x10) << 5) | (((x) & 0x20) << 6))<br>
+#define Y0_X(x) (((x) & 0xF) | (((x) & 0x70) << 5))<br>
+<br>
+#define YF_128_Y(y) ((((y) & 0x03) << 4) | (((y) & 0x04) << 6) | (((y) & 0x08) << 7))<br>
+#define YF_32_Y(y) ((((y) & 0x07) << 4) | (((y) & 0x08) << 5) | (((y) & 0x10) << 6))<br>
+#define YF_8_Y(y) ((((y) & 0x1F) << 4) | (((y) & 0x20) << 5))<br>
+#define Y0_Y(y) ((((y) & 0x1F) << 4))<br>
+<br>
+#define TILING_INIT(TILING) \<br>
+ do { \<br>
+ x_mask = TILING ## _X(~0); \<br>
+ y_mask = TILING ## _Y(~0); \<br>
+ xo0 = TILING ## _X(x0); \<br>
+ xo1 = TILING ## _X(x1); \<br>
+ xo2 = TILING ## _X(x2); \<br>
+ yo0 = TILING ## _Y(y0); \<br>
+ yo1 = TILING ## _Y(y1); \<br>
+ yo2 = TILING ## _Y(y2); \<br>
+ xinc_16 = (TILING ## _X(16) | ~x_mask) & 0xFFF; \<br>
+ yinc_1 = (TILING ## _Y(1) | ~y_mask) & 0xFFF; \<br>
+ yinc_4 = (TILING ## _Y(4) | ~y_mask) & 0xFFF; \<br>
+ } while (0)<br>
+<br>
+ if (tiling == ISL_TILING_Y0)<br>
+ TILING_INIT(Y0);<br>
+ else if (cpp == 16 || cpp == 8)<br>
+ TILING_INIT(YF_128);<br>
+ else if (cpp == 4 || cpp == 2)<br>
+ TILING_INIT(YF_32);<br>
+ else if (cpp == 1)<br>
+ TILING_INIT(YF_8);<br>
+ else<br>
+ unreachable("not reached");<br>
<br>
/* Bit 9 of the destination offset control swizzling.<br>
* Only the X offset contributes to bit 9 of the total offset,<br>
@@ -342,13 +434,15 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
*/<br>
uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;<br>
uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;<br>
+ if (tiling != ISL_TILING_Y0)<br>
+ swizzle0 = swizzle1 = swizzle_bit = 0;<br>
<br>
- uint32_t x, yo;<br>
+ uint32_t x, y, yo;<br>
<br>
src += (ptrdiff_t)y0 * src_pitch;<br>
<br>
if (y0 != y1) {<br>
- for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {<br>
+ for (y = y0, yo = yo0; y < y1; y++, yo = (yo + yinc_1) & y_mask) {<br>
uint32_t xo = xo1;<br>
uint32_t swizzle = swizzle1;<br>
<br>
@@ -359,7 +453,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
*/<br>
for (x = x1; x < x2; x += ytile_span) {<br>
mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);<br>
- xo += bytes_per_column;<br>
+ xo = (xo + xinc_16) & x_mask;<br>
swizzle ^= swizzle_bit;<br>
}<br>
<br>
@@ -369,7 +463,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
}<br>
}<br>
<br>
- for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {<br>
+ for (y = y1, yo = yo1; y < y2; y += 4, yo = (yo + yinc_4) & y_mask) {<br>
uint32_t xo = xo1;<br>
uint32_t swizzle = swizzle1;<br>
<br>
@@ -388,7 +482,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);<br>
mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);<br>
mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);<br>
- xo += bytes_per_column;<br>
+ xo = (xo + xinc_16) & x_mask;<br>
swizzle ^= swizzle_bit;<br>
}<br>
<br>
@@ -403,7 +497,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
}<br>
<br>
if (y2 != y3) {<br>
- for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {<br>
+ for (y = y2, yo = yo2; y < y3; y++, yo = (yo + yinc_1) & y_mask) {<br>
uint32_t xo = xo1;<br>
uint32_t swizzle = swizzle1;<br>
<br>
@@ -414,7 +508,7 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
*/<br>
for (x = x1; x < x2; x += ytile_span) {<br>
mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);<br>
- xo += bytes_per_column;<br>
+ xo = (xo + xinc_16) & x_mask;<br>
swizzle ^= swizzle_bit;<br>
}<br>
<br>
@@ -478,28 +572,35 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
char *dst, const char *src,<br>
int32_t dst_pitch,<br>
uint32_t swizzle_bit,<br>
- UNUSED enum isl_tiling tiling,<br>
- UNUSED int cpp,<br>
+ enum isl_tiling tiling,<br>
+ int cpp,<br>
mem_copy_fn mem_copy,<br>
mem_copy_fn mem_copy_align16)<br>
{<br>
- /* Y tiles consist of columns that are 'ytile_span' wide (and the same height<br>
- * as the tile). Thus the destination offset for (x,y) is the sum of:<br>
- * (x % column_width) // position within column<br>
- * (x / column_width) * bytes_per_column // column number * bytes per column<br>
- * y * column_width<br>
- *<br>
- * The copy destination offset for each range copied is the sum of<br>
- * an X offset 'xo0' or 'xo' and a Y offset 'yo.'<br>
+ /* See comments in linear_to_ytiled about the theory of operation for Y<br>
+ * tilings and the definition of the TILING_INIT macro used here.<br>
*/<br>
const uint32_t column_width = ytile_span;<br>
- const uint32_t bytes_per_column = column_width * ytile_height;<br>
<br>
uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));<br>
uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));<br>
<br>
- uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;<br>
- uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;<br>
+ uint32_t xinc_16, x_mask;<br>
+ uint32_t yinc_1, yinc_4, y_mask;<br>
+<br>
+ uint32_t xo0, xo1, xo2;<br>
+ uint32_t yo0, yo1, yo2;<br>
+<br>
+ if (tiling == ISL_TILING_Y0)<br>
+ TILING_INIT(Y0);<br>
+ else if (cpp == 16 || cpp == 8)<br>
+ TILING_INIT(YF_128);<br>
+ else if (cpp == 4 || cpp == 2)<br>
+ TILING_INIT(YF_32);<br>
+ else if (cpp == 1)<br>
+ TILING_INIT(YF_8);<br>
+ else<br>
+ unreachable("not reached");<br>
<br>
/* Bit 9 of the destination offset control swizzling.<br>
* Only the X offset contributes to bit 9 of the total offset,<br>
@@ -508,13 +609,15 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
*/<br>
uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;<br>
uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;<br>
+ if (tiling != ISL_TILING_Y0)<br>
+ swizzle0 = swizzle1 = swizzle_bit = 0;<br>
<br>
- uint32_t x, yo;<br>
+ uint32_t x, y, yo;<br>
<br>
dst += (ptrdiff_t)y0 * dst_pitch;<br>
<br>
if (y0 != y1) {<br>
- for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {<br>
+ for (y = y0, yo = yo0; y < y1; y++, yo = (yo + yinc_1) & y_mask) {<br>
uint32_t xo = xo1;<br>
uint32_t swizzle = swizzle1;<br>
<br>
@@ -525,7 +628,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
*/<br>
for (x = x1; x < x2; x += ytile_span) {<br>
mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);<br>
- xo += bytes_per_column;<br>
+ xo = (xo + xinc_16) & x_mask;<br>
swizzle ^= swizzle_bit;<br>
}<br>
<br>
@@ -535,7 +638,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
}<br>
}<br>
<br>
- for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {<br>
+ for (y = y1, yo = yo1; y < y2; y += 4, yo = (yo + yinc_4) & y_mask) {<br>
uint32_t xo = xo1;<br>
uint32_t swizzle = swizzle1;<br>
<br>
@@ -554,7 +657,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);<br>
mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);<br>
mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);<br>
- xo += bytes_per_column;<br>
+ xo = (xo + xinc_16) & x_mask;<br>
swizzle ^= swizzle_bit;<br>
}<br>
<br>
@@ -569,7 +672,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
}<br>
<br>
if (y2 != y3) {<br>
- for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {<br>
+ for (y = y2, yo = yo2; y < y3; y++, yo = (yo + yinc_1) & y_mask) {<br>
uint32_t xo = xo1;<br>
uint32_t swizzle = swizzle1;<br>
<br>
@@ -580,7 +683,7 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
*/<br>
for (x = x1; x < x2; x += ytile_span) {<br>
mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);<br>
- xo += bytes_per_column;<br>
+ xo = (xo + xinc_16) & x_mask;<br>
swizzle ^= swizzle_bit;<br>
}<br>
<br>
@@ -814,6 +917,24 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,<br>
span = ytile_span;<br>
tile_copy = linear_to_ytiled_faster;<br>
tile_addr = ytile_addr;<br>
+ } else if (isl_tiling_is_std_y(tiling)) {<br>
+ if (cpp == 16 || cpp == 8) {<br>
+ tw = std_ytile128_width;<br>
+ th = std_ytile128_height;<br>
+ tile_addr = tiling == ISL_TILING_Ys ? ys128_addr : yf128_addr;<br>
+ } else if (cpp == 4 || cpp == 2) {<br>
+ tw = std_ytile32_width;<br>
+ th = std_ytile32_height;<br>
+ tile_addr = tiling == ISL_TILING_Ys ? ys32_addr : yf32_addr;<br>
+ } else if (cpp == 1) {<br>
+ tw = std_ytile8_width;<br>
+ th = std_ytile8_height;<br>
+ tile_addr = tiling == ISL_TILING_Ys ? ys8_addr : yf8_addr;<br>
+ } else {<br>
+ unreachable("not reached");<br>
+ }<br>
+ span = ytile_span;<br>
+ tile_copy = linear_to_ytiled_faster;<br>
} else {<br>
unreachable("unsupported tiling");<br>
}<br>
@@ -911,6 +1032,24 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,<br>
span = ytile_span;<br>
tile_copy = ytiled_to_linear_faster;<br>
tile_addr = ytile_addr;<br>
+ } else if (isl_tiling_is_std_y(tiling)) {<br>
+ if (cpp == 16 || cpp == 8) {<br>
+ tw = std_ytile128_width;<br>
+ th = std_ytile128_height;<br>
+ tile_addr = tiling == ISL_TILING_Ys ? ys128_addr : yf128_addr;<br>
+ } else if (cpp == 4 || cpp == 2) {<br>
+ tw = std_ytile32_width;<br>
+ th = std_ytile32_height;<br>
+ tile_addr = tiling == ISL_TILING_Ys ? ys32_addr : yf32_addr;<br>
+ } else if (cpp == 1) {<br>
+ tw = std_ytile8_width;<br>
+ th = std_ytile8_height;<br>
+ tile_addr = tiling == ISL_TILING_Ys ? ys8_addr : yf8_addr;<br>
+ } else {<br>
+ unreachable("not reached");<br>
+ }<br>
+ span = ytile_span;<br>
+ tile_copy = linear_to_ytiled_faster;<br>
} else {<br>
unreachable("unsupported tiling");<br>
}<br>
<font color="#888888">--<br>
2.14.3<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</font></blockquote></div><br></div></div></div>