[Mesa-dev] [PATCH 11/22] i965: Support WC uploads for Y-tiling
Chris Wilson
chris at chris-wilson.co.uk
Sat Aug 5 09:40:03 UTC 2017
Y-tiling makes a mess of the CPU's write-combining buffers (WCBs): the
linear_to_ytiled routines hop to a different cacheline with almost every
write, forcing an eviction and flush between pixels and effectively
reducing the upload to UC performance (i.e. terrible). This patch takes
the simple approach of tiling into a temporary cached page and then
copying that page to WC sequentially, so we get a very simple and
efficient upload into WC whilst keeping the tiler itself in cache.
---
src/mesa/drivers/dri/i965/intel_tex_subimage.c | 13 +-
src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 161 ++++++++++++++++++++++++-
src/mesa/drivers/dri/i965/intel_tiled_memcpy.h | 1 +
3 files changed, 160 insertions(+), 15 deletions(-)
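
For reference, the gist of the staging approach as a minimal standalone
sketch (upload_ytile_via_staging is illustrative, not a function in this
patch; it uses a scalar inner loop and ignores swizzling and rgba8
conversion, which the real code handles):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch: build one 4 KiB Y-tile in a cached staging
     * page, then stream the whole page to the WC mapping in a single
     * sequential copy so the write-combining buffers stay happy.
     */
    static void upload_ytile_via_staging(char *wc_dst, const char *src,
                                         int32_t src_pitch)
    {
       enum { span = 16, tile_h = 32, col_bytes = span * tile_h };
       uint8_t tmp[4096] __attribute__((aligned(16)));

       for (uint32_t y = 0; y < tile_h; y++, src += src_pitch)
          for (uint32_t x = 0; x < 8; x++) /* 8 spans per 128-byte row */
             memcpy(tmp + y * span + x * col_bytes, src + x * span, span);

       memcpy(wc_dst, tmp, sizeof(tmp)); /* sequential burst into WC */
    }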
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 56c6cbf7b8..e7486300ab 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -83,9 +83,6 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
struct brw_context *brw = brw_context(ctx);
struct intel_texture_image *image = intel_texture_image(texImage);
- /* The miptree's buffer. */
- struct brw_bo *bo;
-
mem_copy_fn mem_copy;
/* This fastpath is restricted to specific texture types:
@@ -136,20 +133,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
if (brw->gen < 5 && brw->has_swizzling)
return false;
- bo = image->mt->bo;
-
- /* Uploading into Y-tiling surfaces using WC is slow as each sequential
- * write falls outside of the WCB, completely nerfing the WC performance.
- */
- if (!bo->cache_coherent && image->mt->surf.tiling == ISL_TILING_Y0)
- return false;
-
/* Since we are going to write raw data to the miptree, we need to resolve
* any pending fast color clears before we start.
*/
assert(image->mt->surf.logical_level0_px.depth == 1);
assert(image->mt->surf.logical_level0_px.array_len == 1);
+ struct brw_bo *bo = image->mt->bo;
if (brw_batch_references(&brw->batch, bo)) {
perf_debug("Flushing before mapping a referenced bo.\n");
intel_batchbuffer_flush(brw);
@@ -209,6 +199,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
map,
pixels - (ptrdiff_t) level_y * src_pitch - (ptrdiff_t) level_x * cpp,
image->mt->surf.row_pitch, src_pitch,
+ bo->cache_coherent,
brw->has_swizzling,
image->mt->surf.tiling,
mem_copy
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 65dd950c08..7c9fefeda9 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -324,7 +324,7 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t y0, uint32_t y1,
- char *dst, const char *src,
+ char * __restrict__ dst, const char * __restrict__ src,
int32_t src_pitch,
uint32_t swizzle_bit,
mem_copy_fn mem_copy,
@@ -378,6 +378,104 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
}
}
+#if defined(__SSSE3__)
+static inline void
+linear_to_ytiled0(char * __restrict__ dst, const char * __restrict__ src,
+ int32_t src_pitch,
+ uint32_t swizzle_bit,
+ mem_copy_fn mem_copy)
+{
+ /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
+ * as the tile). Thus the destination offset for (x,y) is the sum of:
+ * (x % column_width) // position within column
+ * (x / column_width) * bytes_per_column // column number * bytes per column
+ * y * column_width
+    *
+    * Here the offsets are computed inline: span x of row y is written at
+    * y * column_width + x * bytes_per_column (xor'ed with the swizzle).
+    */
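+
+   /* For example, with column_width = 16 and ytile_height = 32
+    * (bytes_per_column = 512), source byte (x = 20, y = 3) lands at
+    * (20 % 16) + (20 / 16) * 512 + 3 * 16 = 4 + 512 + 48 = 564.
+    */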
+ const uint32_t column_width = ytile_span;
+ const uint32_t bytes_per_column = column_width * ytile_height;
+
+ uint32_t x, y;
+
+ for (y = 0; y < ytile_height; y++) {
+ __m128i xmm[8];
+ uint32_t swizzle = 0;
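+      /* bytes_per_column is 512, so stepping a column flips address
+       * bit 9, and bit-6 swizzling XORs that back in via swizzle_bit.
+       * y * column_width stays below 512, so rows never disturb it.
+       */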
+
+ if (mem_copy == memcpy) {
+ for (x = 0; x < 8; x++)
+ xmm[x] = _mm_loadu_si128((__m128i *)src + x);
+
+ for (x = 0; x < 8; x++) {
+ _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]);
+ swizzle ^= swizzle_bit;
+ }
+ } else {
+ for (x = 0; x < 4; x++)
+ xmm[x] = _mm_shuffle_epi8( _mm_loadu_si128((__m128i *)src + x),
+ *(__m128i *) rgba8_permutation);
+
+ for (x = 0; x < 4; x++) {
+ _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]);
+ swizzle ^= swizzle_bit;
+ }
+
+ for (x = 4; x < 8; x++)
+ xmm[x] = _mm_shuffle_epi8( _mm_loadu_si128((__m128i *)src + x),
+ *(__m128i *) rgba8_permutation);
+
+ for (x = 4; x < 8; x++) {
+ _mm_store_si128((__m128i *)(dst + ((y * column_width + x*bytes_per_column) ^ swizzle)), xmm[x]);
+ swizzle ^= swizzle_bit;
+ }
+ }
+
+ src += src_pitch;
+ }
+}
+
+/**
+ * Copy texture data from linear to Y tile layout through WC.
+ *
+ * \copydoc tile_copy_fn
+ */
+static inline void
+linear_to_ytiled_wc(char * __restrict__ dst,
+ const char * __restrict__ src,
+ int32_t src_pitch,
+ mem_copy_fn mem_copy)
+{
+ /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
+ * as the tile). Thus the destination offset for (x,y) is the sum of:
+ * (x % column_width) // position within column
+ * (x / column_width) * bytes_per_column // column number * bytes per column
+ * y * column_width
+    *
+    * Here each span is written straight into the temporary tile at
+    * y * column_width + x * bytes_per_column, with no swizzling.
+    */
+ const uint32_t column_width = ytile_span;
+ const uint32_t bytes_per_column = column_width * ytile_height;
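+   /* Stage one full Y-tile (128 bytes x 32 rows = 4 KiB) in cache. */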
+ uint8_t tmp[4096] __attribute__((aligned(16)));
+
+ uint32_t x, y;
+
+ for (y = 0; y < ytile_height; y++) {
+ __m128i xmm[8];
+
+ for (x = 0; x < 8; x++)
+ xmm[x] = _mm_loadu_si128((__m128i *)src + x);
+ for (x = 0; x < 8; x++)
+ _mm_store_si128((__m128i *)(tmp + y * column_width + x*bytes_per_column), xmm[x]);
+
+ src += src_pitch;
+ }
+
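+   /* One sequential 4 KiB write lets the WC buffers fill and flush as
+    * whole cachelines instead of partial lines.
+    */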
+ mem_copy(dst, tmp, sizeof(tmp));
+}
+#endif
+
/**
* Copy texture data from X tile layout to linear.
*
@@ -547,6 +645,14 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
mem_copy_fn mem_copy)
{
if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
+#ifdef __SSSE3__
+ if (mem_copy == memcpy)
+ return linear_to_ytiled0(dst, src, src_pitch, swizzle_bit,
+ memcpy);
+ else if (mem_copy == rgba8_copy_aligned_dst)
+ return linear_to_ytiled0(dst, src, src_pitch, swizzle_bit,
+ rgba8_copy_aligned_dst);
+#else
if (mem_copy == memcpy)
return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
@@ -554,6 +660,7 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_dst);
+#endif /* __SSSE3__ */
else
return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, src_pitch, swizzle_bit,
@@ -575,6 +682,45 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
}
/**
+ * Copy texture data from linear to Y tile layout using WC, faster.
+ *
+ * Same as \ref linear_to_ytiled but optimised for WC.
+ *
+ * \copydoc tile_copy_fn
+ */
+static FLATTEN void
+linear_to_ytiled_wc_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+ uint32_t y0, uint32_t y1,
+ char *dst, const char *src,
+ int32_t src_pitch,
+ uint32_t swizzle_bit,
+ mem_copy_fn mem_copy)
+{
+#if defined(__SSSE3__)
+ if (x3 - x0 == ytile_width && y1 - y0 == ytile_height) {
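+      /* Pass memcpy as a literal where known so the inlined copy can
+       * become a constant-size memcpy intrinsic.
+       */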
+ if (mem_copy == memcpy)
+ return linear_to_ytiled_wc(dst, src, src_pitch, memcpy);
+ else
+ return linear_to_ytiled_wc(dst, src, src_pitch, mem_copy);
+ } else
+#endif
+ {
+ if (mem_copy == memcpy)
+ return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ memcpy, memcpy);
+ else if (mem_copy == rgba8_copy)
+ return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ rgba8_copy, rgba8_copy_aligned_dst);
+ else
+ return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
+ dst, src, src_pitch, swizzle_bit,
+ mem_copy, mem_copy);
+ }
+}
+
+/**
* Copy texture data from X tile layout to linear, faster.
*
* Same as \ref xtile_to_linear but faster, because it passes constant
@@ -680,6 +826,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
uint32_t yt1, uint32_t yt2,
char *dst, const char *src,
uint32_t dst_pitch, int32_t src_pitch,
+ bool cache_coherent,
bool has_swizzling,
enum isl_tiling tiling,
mem_copy_fn mem_copy)
@@ -700,7 +847,12 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
tw = ytile_width;
th = ytile_height;
span = ytile_span;
- tile_copy = linear_to_ytiled_faster;
+ if (cache_coherent)
+ tile_copy = linear_to_ytiled_faster;
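+      /* The WC staging path does not handle bit-6 swizzling. */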
+ else if (has_swizzling)
+ unreachable("unsupported tiling");
+ else
+ tile_copy = linear_to_ytiled_wc_faster;
} else {
unreachable("unsupported tiling");
}
@@ -718,14 +870,15 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
* Looping x inside y is the faster memory access pattern.
*/
for (yt = yt0; yt < yt3; yt += th) {
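+      /* The y extent is x-invariant, so compute it once per tile row. */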
+ uint32_t y0 = MAX2(yt1, yt);
+ uint32_t y1 = MIN2(yt2, yt + th);
+
for (xt = xt0; xt < xt3; xt += tw) {
/* The area to update is [x0,x3) x [y0,y1).
* May not want the whole tile, hence the min and max.
*/
uint32_t x0 = MAX2(xt1, xt);
- uint32_t y0 = MAX2(yt1, yt);
uint32_t x3 = MIN2(xt2, xt + tw);
- uint32_t y1 = MIN2(yt2, yt + th);
/* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
* the middle interval is the longest span-aligned part.
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
index e9c43920a1..9d6c71d1cf 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
@@ -42,6 +42,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
uint32_t yt1, uint32_t yt2,
char *dst, const char *src,
uint32_t dst_pitch, int32_t src_pitch,
+ bool cache_coherent,
bool has_swizzling,
enum isl_tiling tiling,
mem_copy_fn mem_copy);
--
2.13.3