[Mesa-dev] [PATCH 8/8] gallium/radeon: derive buffer placement and flags only once per buffer

Thu Aug 18 19:46:44 UTC 2016

From: Marek Olšák <marek.olsak at amd.com>

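Invalidated buffers don't have to derive their placement (domains) and
flags again: r600_init_resource now computes them only on the first call
for a resource and caches the result in r600_resource.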
---
 src/gallium/drivers/radeon/r600_buffer_common.c | 147 +++++++++++++-----------
 src/gallium/drivers/radeon/r600_pipe_common.h   |   2 +
 2 files changed, 80 insertions(+), 69 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 4480293..113a7dc 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -98,91 +98,108 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	/* Setting the CS to NULL will prevent doing checks we have done already. */
 	return ctx->ws->buffer_map(resource->buf, NULL, usage);
 }
 
 bool r600_init_resource(struct r600_common_screen *rscreen,
 			struct r600_resource *res,
 			uint64_t size, unsigned alignment)
 {
 	struct r600_texture *rtex = (struct r600_texture*)res;
 	struct pb_buffer *old_buf, *new_buf;
-	enum radeon_bo_flag flags = 0;
-
-	switch (res->b.b.usage) {
-	case PIPE_USAGE_STREAM:
-		flags = RADEON_FLAG_GTT_WC;
-		/* fall through */
-	case PIPE_USAGE_STAGING:
-		/* Transfers are likely to occur more often with these resources. */
-		res->domains = RADEON_DOMAIN_GTT;
-		break;
-	case PIPE_USAGE_DYNAMIC:
-		/* Older kernels didn't always flush the HDP cache before
-		 * CS execution
-		 */
-		if (rscreen->info.drm_major == 2 &&
-		    rscreen->info.drm_minor < 40) {
+
+	if (!res->initialized) {
+		res->flags = 0;
+
+		switch (res->b.b.usage) {
+		case PIPE_USAGE_STREAM:
+			res->flags = RADEON_FLAG_GTT_WC;
+			/* fall through */
+		case PIPE_USAGE_STAGING:
+			/* Transfers are likely to occur more often with these
+			 * resources. */
 			res->domains = RADEON_DOMAIN_GTT;
-			flags |= RADEON_FLAG_GTT_WC;
+			break;
+		case PIPE_USAGE_DYNAMIC:
+			/* Older kernels didn't always flush the HDP cache before
+			 * CS execution
+			 */
+			if (rscreen->info.drm_major == 2 &&
+			    rscreen->info.drm_minor < 40) {
+				res->domains = RADEON_DOMAIN_GTT;
+				res->flags |= RADEON_FLAG_GTT_WC;
+				break;
+			}
+			res->flags |= RADEON_FLAG_CPU_ACCESS;
+			/* fall through */
+		case PIPE_USAGE_DEFAULT:
+		case PIPE_USAGE_IMMUTABLE:
+		default:
+			/* Not listing GTT here improves performance in some
+			 * apps. */
+			res->domains = RADEON_DOMAIN_VRAM;
+			res->flags |= RADEON_FLAG_GTT_WC;
 			break;
 		}
-		flags |= RADEON_FLAG_CPU_ACCESS;
-		/* fall through */
-	case PIPE_USAGE_DEFAULT:
-	case PIPE_USAGE_IMMUTABLE:
-	default:
-		/* Not listing GTT here improves performance in some apps. */
-		res->domains = RADEON_DOMAIN_VRAM;
-		flags |= RADEON_FLAG_GTT_WC;
-		break;
-	}
 
-	if (res->b.b.target == PIPE_BUFFER &&
-	    res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
-			      PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
-		/* Use GTT for all persistent mappings with older kernels,
-		 * because they didn't always flush the HDP cache before CS
-		 * execution.
-		 *
-		 * Write-combined CPU mappings are fine, the kernel ensures all CPU
-		 * writes finish before the GPU executes a command stream.
+		if (res->b.b.target == PIPE_BUFFER &&
+		    res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
+				      PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
+			/* Use GTT for all persistent mappings with older
+			 * kernels, because they didn't always flush the HDP
+			 * cache before CS execution.
+			 *
+			 * Write-combined CPU mappings are fine, the kernel
+			 * ensures all CPU writes finish before the GPU
+			 * executes a command stream.
+			 */
+			if (rscreen->info.drm_major == 2 &&
+			    rscreen->info.drm_minor < 40)
+				res->domains = RADEON_DOMAIN_GTT;
+			else if (res->domains & RADEON_DOMAIN_VRAM)
+				res->flags |= RADEON_FLAG_CPU_ACCESS;
+		}
+
+		/* Tiled textures are unmappable. Always put them in VRAM. */
+		if (res->b.b.target != PIPE_BUFFER &&
+		    rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
+			res->domains = RADEON_DOMAIN_VRAM;
+			res->flags &= ~RADEON_FLAG_CPU_ACCESS;
+			res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
+				 RADEON_FLAG_GTT_WC;
+		}
+
+		/* If VRAM is just stolen system memory, allow both VRAM and
+		 * GTT, whichever has free space. If a buffer is evicted from
+		 * VRAM to GTT, it will stay there.
 		 */
-		if (rscreen->info.drm_major == 2 &&
-		    rscreen->info.drm_minor < 40)
-			res->domains = RADEON_DOMAIN_GTT;
-		else if (res->domains & RADEON_DOMAIN_VRAM)
-			flags |= RADEON_FLAG_CPU_ACCESS;
-	}
+		if (!rscreen->info.has_dedicated_vram &&
+		    res->domains == RADEON_DOMAIN_VRAM)
+			res->domains = RADEON_DOMAIN_VRAM_GTT;
 
-	/* Tiled textures are unmappable. Always put them in VRAM. */
-	if (res->b.b.target != PIPE_BUFFER &&
-	    rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
-		res->domains = RADEON_DOMAIN_VRAM;
-		flags &= ~RADEON_FLAG_CPU_ACCESS;
-		flags |= RADEON_FLAG_NO_CPU_ACCESS |
-			 RADEON_FLAG_GTT_WC;
-	}
+		if (rscreen->debug_flags & DBG_NO_WC)
+			res->flags &= ~RADEON_FLAG_GTT_WC;
 
-	/* If VRAM is just stolen system memory, allow both VRAM and GTT,
-	 * whichever has free space. If a buffer is evicted from VRAM to GTT,
-	 * it will stay there.
-	 */
-	if (!rscreen->info.has_dedicated_vram &&
-	    res->domains == RADEON_DOMAIN_VRAM)
-		res->domains = RADEON_DOMAIN_VRAM_GTT;
+		/* Set expected VRAM and GART usage for the buffer. */
+		res->vram_usage = 0;
+		res->gart_usage = 0;
 
-	if (rscreen->debug_flags & DBG_NO_WC)
-		flags &= ~RADEON_FLAG_GTT_WC;
+		if (res->domains & RADEON_DOMAIN_VRAM)
+			res->vram_usage = size;
+		else if (res->domains & RADEON_DOMAIN_GTT)
+			res->gart_usage = size;
+
+		res->initialized = true;
+	}
 
 	/* Allocate a new resource. */
 	new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
-					     res->domains, flags);
+					     res->domains, res->flags);
 	if (!new_buf) {
 		return false;
 	}
 
 	/* Replace the pointer such that if res->buf wasn't NULL, it won't be
 	 * NULL. This should prevent crashes with multiple contexts using
 	 * the same buffer where one of the contexts invalidates it while
 	 * the others are using it. */
 	old_buf = res->buf;
 	res->buf = new_buf; /* should be atomic */
@@ -190,29 +207,20 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 	if (rscreen->info.has_virtual_memory)
 		res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf);
 	else
 		res->gpu_address = 0;
 
 	pb_reference(&old_buf, NULL);
 
 	util_range_set_empty(&res->valid_buffer_range);
 	res->TC_L2_dirty = false;
 
-	/* Set expected VRAM and GART usage for the buffer. */
-	res->vram_usage = 0;
-	res->gart_usage = 0;
-
-	if (res->domains & RADEON_DOMAIN_VRAM)
-		res->vram_usage = size;
-	else if (res->domains & RADEON_DOMAIN_GTT)
-		res->gart_usage = size;
-
 	/* Print debug information. */
 	if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
 		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
 			res->gpu_address, res->gpu_address + res->buf->size,
 			res->buf->size);
 	}
 	return true;
 }
 
 static void r600_buffer_destroy(struct pipe_screen *screen,
@@ -496,20 +504,21 @@ r600_alloc_buffer_struct(struct pipe_screen *screen,
 {
 	struct r600_resource *rbuffer;
 
 	rbuffer = MALLOC_STRUCT(r600_resource);
 
 	rbuffer->b.b = *templ;
 	pipe_reference_init(&rbuffer->b.b.reference, 1);
 	rbuffer->b.b.screen = screen;
 	rbuffer->b.vtbl = &r600_buffer_vtbl;
 	rbuffer->buf = NULL;
+	rbuffer->initialized = false;
 	rbuffer->TC_L2_dirty = false;
 	rbuffer->is_shared = false;
 	util_range_init(&rbuffer->valid_buffer_range);
 	return rbuffer;
 }
 
 struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
 					 const struct pipe_resource *templ,
 					 unsigned alignment)
 {
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 5375044..358d5f4 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -169,20 +169,22 @@ struct r600_resource {
 
 	/* Winsys objects. */
 	struct pb_buffer		*buf;
 	uint64_t			gpu_address;
 	/* Memory usage if the buffer placement is optimal. */
 	uint64_t			vram_usage;
 	uint64_t			gart_usage;
 
 	/* Resource state. */
 	enum radeon_bo_domain		domains;
+	enum radeon_bo_flag		flags;
+	bool				initialized;
 
 	/* The buffer range which is initialized (with a write transfer,
 	 * streamout, DMA, or as a random access target). The rest of
 	 * the buffer is considered invalid and can be mapped unsynchronized.
 	 *
 	 * This allows unsychronized mapping of a buffer range which hasn't
 	 * been used yet. It's for applications which forget to use
 	 * the unsynchronized map flag and expect the driver to figure it out.
          */
 	struct util_range		valid_buffer_range;
-- 
2.7.4