[Mesa-dev] [PATCH v2 24/25] radeonsi: disable SDMA clears and copies for sparse buffers

Tue Mar 28 09:12:14 UTC 2017

From: Nicolai Hähnle <nicolai.haehnle at amd.com>

VM faults cannot be disabled for SDMA on VI and older chips (<= VI).

We could still use SDMA by asking the winsys which parts of the
buffers are committed. This is left as a potential future improvement.
---
 src/gallium/drivers/radeonsi/cik_sdma.c  | 7 +++++--
 src/gallium/drivers/radeonsi/si_cp_dma.c | 1 +
 src/gallium/drivers/radeonsi/si_dma.c    | 7 +++++--
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index bee35cd..90f4f21 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -73,21 +73,22 @@ static void cik_sdma_clear_buffer(struct pipe_context *ctx,
 				  struct pipe_resource *dst,
 				  uint64_t offset,
 				  uint64_t size,
 				  unsigned clear_value)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct radeon_winsys_cs *cs = sctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = r600_resource(dst);
 
-	if (!cs || offset % 4 != 0 || size % 4 != 0) {
+	if (!cs || offset % 4 != 0 || size % 4 != 0 ||
+	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
 		ctx->clear_buffer(ctx, dst, offset, size, &clear_value, 4);
 		return;
 	}
 
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
 	 * that range. */
 	util_range_add(&rdst->valid_buffer_range, offset, offset + size);
 
 	offset += rdst->gpu_address;
@@ -519,21 +520,23 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
 static void cik_sdma_copy(struct pipe_context *ctx,
 			  struct pipe_resource *dst,
 			  unsigned dst_level,
 			  unsigned dstx, unsigned dsty, unsigned dstz,
 			  struct pipe_resource *src,
 			  unsigned src_level,
 			  const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
-	if (!sctx->b.dma.cs)
+	if (!sctx->b.dma.cs ||
+	    src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
+	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
 		goto fallback;
 
 	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
 		cik_sdma_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
 		return;
 	}
 
 	if (cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
 				  src, src_level, src_box))
 		return;
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 812fcbc..f75ce05 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -195,20 +195,21 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
 	 * that range. */
 	util_range_add(&rdst->valid_buffer_range, offset,
 		       offset + dma_clear_size);
 
 	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
 	 * doesn't happen. We don't want an infinite recursion: */
 	if (sctx->b.dma.cs &&
+	    !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
 	    (offset % 4 == 0) &&
 	    /* CP DMA is very slow. Always use SDMA for big clears. This
 	     * alone improves DeusEx:MD performance by 70%. */
 	    (size > 128 * 1024 ||
 	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
 	      * This happens to move most buffer clears to SDMA, including
 	      * DCC and CMASK clears, because pipe->clear clears them before
 	      * si_emit_framebuffer_state (in a draw call) adds them.
 	      * For example, DeusEx:MD has 21 buffer clears per frame and all
 	      * of them are moved to SDMA thanks to this. */
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 9dbee3a..b236161 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -82,21 +82,22 @@ static void si_dma_clear_buffer(struct pipe_context *ctx,
 				struct pipe_resource *dst,
 				uint64_t offset,
 				uint64_t size,
 				unsigned clear_value)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct radeon_winsys_cs *cs = sctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = r600_resource(dst);
 
-	if (!cs || offset % 4 != 0 || size % 4 != 0) {
+	if (!cs || offset % 4 != 0 || size % 4 != 0 ||
+	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
 		ctx->clear_buffer(ctx, dst, offset, size, &clear_value, 4);
 		return;
 	}
 
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
 	 * that range. */
 	util_range_add(&rdst->valid_buffer_range, offset, offset + size);
 
 	offset += rdst->gpu_address;
@@ -226,21 +227,23 @@ static void si_dma_copy(struct pipe_context *ctx,
 			const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode;
 	unsigned src_w, dst_w;
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL ||
+	    src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
+	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
 		goto fallback;
 	}
 
 	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
 		si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
 		return;
 	}
 
 	/* XXX: Using the asynchronous DMA engine for multi-dimensional
 	 * operations seems to cause random GPU lockups for various people.
-- 
2.9.3