<div dir="ltr">I'm a bit concerned about how complex things are getting. I've been looking at this for the past hour or so and I don't have anything yet. I'll keep thinking.<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Jan 23, 2018 at 6:42 AM, Scott D Phillips <span dir="ltr"><<a href="mailto:scott.d.phillips@intel.com" target="_blank">scott.d.phillips@intel.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">TileY's low 6 address bits are: v1 v0 u3 u2 u1 u0<br>
Thus a cache line in the tiled surface is composed of a 2D area of<br>
the linear surface that is 16 bytes wide by 4 rows tall.<br>
<br>
Add a special case for the part of the copied area that is 4-line<br>
aligned and a multiple of 4 lines tall, so that entire cache lines<br>
are written at a time.<br>
<br>
On Apollolake, this increases tiling throughput to write-combining<br>
(WC) maps by 83.1412% +/- 1.81446%.<br>
---<br>
src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c | 63 ++++++++++++++++++++++----<br>
1 file changed, 55 insertions(+), 8 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c<br>
index e2b7b3496d..e45f3fec1e 100644<br>
--- a/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c<br>
+++ b/src/mesa/drivers/dri/i965/<wbr>intel_tiled_memcpy.c<br>
@@ -287,8 +287,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
*/<br>
static inline void<br>
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
- uint32_t y0, uint32_t y1,<br>
- char *dst, const char *src,<br>
+ uint32_t y0, uint32_t y3,<br>
+ char *dst, const char *src0,<br>
int32_t src_pitch,<br>
uint32_t swizzle_bit,<br>
mem_copy_fn mem_copy,<br>
@@ -306,6 +306,9 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
const uint32_t column_width = ytile_span;<br>
const uint32_t bytes_per_column = column_width * ytile_height;<br>
<br>
+ uint32_t y1 = ALIGN_UP(y0, 4);<br>
+ uint32_t y2 = ALIGN_DOWN(y3, 4);<br>
+<br>
uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;<br>
uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;<br>
<br>
@@ -319,26 +322,70 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
<br>
uint32_t x, yo;<br>
<br>
- src += (ptrdiff_t)y0 * src_pitch;<br>
+ const char *src = src0 + (ptrdiff_t)y1 * src_pitch;<br>
<br>
- for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {<br>
+ for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {<br>
uint32_t xo = xo1;<br>
uint32_t swizzle = swizzle1;<br>
<br>
- mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);<br>
+ if (x0 != x1) {<br>
+ mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);<br>
+ mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);<br>
+ mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);<br>
+ mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);<br>
+ }<br>
<br>
/* Step by spans/columns. As it happens, the swizzle bit flips<br>
* at each step so we don't need to calculate it explicitly.<br>
*/<br>
for (x = x1; x < x2; x += ytile_span) {<br>
- mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);<br>
+ mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);<br>
+ mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);<br>
+ mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);<br>
+ mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);<br>
xo += bytes_per_column;<br>
swizzle ^= swizzle_bit;<br>
}<br>
<br>
- mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);<br>
+ if (x2 != x3) {<br>
+ mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);<br>
+ mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);<br>
+ mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);<br>
+ mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);<br>
+ }<br>
<br>
- src += src_pitch;<br>
+ src += 4 * src_pitch;<br>
+ }<br>
+<br>
+ if (y0 != y1 || y2 != y3) {<br>
+ src = src0 + (ptrdiff_t)y0 * src_pitch;<br>
+<br>
+ for (yo = y0 * column_width; yo < y3 * column_width; yo += column_width) {<br>
+ uint32_t xo = xo1;<br>
+ uint32_t swizzle = swizzle1;<br>
+<br>
+ if (yo >= y1 * column_width && yo < y2 * column_width) {<br>
+ if (y2 == y3)<br>
+ break;<br>
+ yo = y2 * column_width;<br>
+ src = src0 + y2 * src_pitch;<br>
+ }<br>
+<br>
+ mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);<br>
+<br>
+ /* Step by spans/columns. As it happens, the swizzle bit flips<br>
+ * at each step so we don't need to calculate it explicitly.<br>
+ */<br>
+ for (x = x1; x < x2; x += ytile_span) {<br>
+ mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);<br>
+ xo += bytes_per_column;<br>
+ swizzle ^= swizzle_bit;<br>
+ }<br>
+<br>
+ mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);<br>
+<br>
+ src += src_pitch;<br>
+ }<br>
}<br>
}<br>
<span class="HOEnZb"><font color="#888888"><br>
--<br>
2.16.1<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div>