Mesa (master): radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer

Marek Olšák mareko at kemper.freedesktop.org
Tue Oct 17 20:03:30 UTC 2017


Module: Mesa
Branch: master
Commit: 2f4705afde707e8eb41b9414c25df91aa1ea2fb3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2f4705afde707e8eb41b9414c25df91aa1ea2fb3

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sun Oct  8 03:44:07 2017 +0200

radeonsi: if there's just const buffer 0, set it in place of CONST/SSBO pointer

SI_SGPR_CONST_AND_SHADER_BUFFERS now contains the pointer to const buffer 0
if there is no other buffer there.

Benefits:
- the constbuf descriptor is neither uploaded by the driver nor loaded by the shader

It's assumed that all constant addresses are within bounds. Non-constant
addresses are clamped against the last declared CONST variable.
This only works if the state tracker ensures the bound constant buffer
matches what the shader needs.

Once we get 32-bit pointers, we can only do this for user constant buffers
where the driver is in charge of the upload so that it can guarantee a 32-bit
address.

The real performance benefit might not be measurable.

These apps get 100% theoretical benefit in all shaders (except where noted):
- antichamber
- batman arkham origins
- borderlands 2
- borderlands pre-sequel
- brutal legend
- civilization BE
- CS:GO
- deadcore
- dota 2 -- most shaders
- europa universalis
- grid autosport -- most shaders
- left 4 dead 2
- legend of grimrock
- life is strange
- payday 2
- portal
- rocket league
- serious sam 3 bfe
- talos principle
- team fortress 2
- thea
- unigine heaven
- unigine valley -- also sanctuary and tropics
- wasteland 2
- xcom: enemy unknown & enemy within
- tesseract
- unity (engine)

Changed stats only:
    SGPRS: 2059998 -> 2086238 (1.27 %)
    VGPRS: 1626888 -> 1626904 (0.00 %)
    Spilled SGPRs: 7902 -> 7865 (-0.47 %)
    Code Size: 60924520 -> 60982660 (0.10 %) bytes
    Max Waves: 374539 -> 374526 (-0.00 %)

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

---

 src/gallium/drivers/radeonsi/si_descriptors.c | 23 +++++++--
 src/gallium/drivers/radeonsi/si_shader.c      | 72 +++++++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_shader.h      |  2 +-
 src/gallium/drivers/radeonsi/si_state.h       |  3 ++
 4 files changed, 87 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 0c1fca8718..da6efa8394 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -126,6 +126,7 @@ static void si_init_descriptors(struct si_descriptors *desc,
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
 	desc->shader_userdata_offset = shader_userdata_index * 4;
+	desc->slot_index_to_bind_directly = -1;
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
@@ -148,6 +149,20 @@ static bool si_upload_descriptors(struct si_context *sctx,
 	if (!upload_size)
 		return true;
 
+	/* If there is just one active descriptor, bind it directly. */
+	if ((int)desc->first_active_slot == desc->slot_index_to_bind_directly &&
+	    desc->num_active_slots == 1) {
+		uint32_t *descriptor = &desc->list[desc->slot_index_to_bind_directly *
+						   desc->element_dw_size];
+
+		/* The buffer is already in the buffer list. */
+		r600_resource_reference(&desc->buffer, NULL);
+		desc->gpu_list = NULL;
+		desc->gpu_address = si_desc_extract_buffer_address(descriptor);
+		si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+		return true;
+	}
+
 	uint32_t *ptr;
 	int buffer_offset;
 	u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
@@ -2531,14 +2546,15 @@ void si_init_all_descriptors(struct si_context *sctx)
 		bool gfx9_gs = false;
 		unsigned num_sampler_slots = SI_NUM_IMAGES / 2 + SI_NUM_SAMPLERS;
 		unsigned num_buffer_slots = SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS;
+		struct si_descriptors *desc;
 
 		if (sctx->b.chip_class >= GFX9) {
 			gfx9_tcs = i == PIPE_SHADER_TESS_CTRL;
 			gfx9_gs = i == PIPE_SHADER_GEOMETRY;
 		}
 
-		si_init_buffer_resources(&sctx->const_and_shader_buffers[i],
-					 si_const_and_shader_buffer_descriptors(sctx, i),
+		desc = si_const_and_shader_buffer_descriptors(sctx, i);
+		si_init_buffer_resources(&sctx->const_and_shader_buffers[i], desc,
 					 num_buffer_slots,
 					 gfx9_tcs ? GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS :
 					 gfx9_gs ? GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS :
@@ -2547,8 +2563,9 @@ void si_init_all_descriptors(struct si_context *sctx)
 					 RADEON_USAGE_READ,
 					 RADEON_PRIO_SHADER_RW_BUFFER,
 					 RADEON_PRIO_CONST_BUFFER);
+		desc->slot_index_to_bind_directly = si_get_constbuf_slot(0);
 
-		struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, i);
+		desc = si_sampler_and_image_descriptors(sctx, i);
 		si_init_descriptors(desc,
 				    gfx9_tcs ? GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES :
 				    gfx9_gs ? GFX9_SGPR_GS_SAMPLERS_AND_IMAGES :
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index f72e5af31f..c3fe13deea 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1973,6 +1973,7 @@ static LLVMValueRef fetch_constant(
 	unsigned swizzle)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct si_shader_selector *sel = ctx->shader->selector;
 	const struct tgsi_ind_register *ireg = &reg->Indirect;
 	unsigned buf, idx;
 
@@ -1996,9 +1997,60 @@ static LLVMValueRef fetch_constant(
 		return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
 	}
 
+	idx = reg->Register.Index * 4 + swizzle;
+	if (reg->Register.Indirect) {
+		addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
+	} else {
+		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
+	}
+
+	/* Fast path when user data SGPRs point to constant buffer 0 directly. */
+	if (sel->info.const_buffers_declared == 1 &&
+	    sel->info.shader_buffers_declared == 0) {
+		LLVMValueRef ptr =
+			LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
+
+		/* This enables use of s_load_dword and flat_load_dword for const buffer 0
+		 * loads, and up to x4 load opcode merging. However, it leads to horrible
+		 * code reducing SIMD wave occupancy from 8 to 2 in many cases.
+		 *
+		 * Using s_buffer_load_dword (x1) seems to be the best option right now.
+		 */
+#if 0 /* keep this codepath disabled */
+		if (!reg->Register.Indirect) {
+			addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
+			LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
+			return bitcast(bld_base, type, result);
+		}
+#endif
+
+		/* Do the bounds checking with a descriptor, because
+		 * doing computation and manual bounds checking of 64-bit
+		 * addresses generates horrible VALU code with very high
+		 * VGPR usage and very low SIMD occupancy.
+		 */
+		ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
+		ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
+
+		LLVMValueRef desc_elems[] = {
+			LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
+			LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
+			LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
+			LLVMConstInt(ctx->i32,
+				S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+				S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+				S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+				S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+				S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
+		};
+		LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
+		LLVMValueRef result = buffer_load_const(ctx, desc, addr);
+		return bitcast(bld_base, type, result);
+	}
+
 	assert(reg->Register.Dimension);
 	buf = reg->Dimension.Index;
-	idx = reg->Register.Index * 4 + swizzle;
 
 	if (reg->Dimension.Indirect) {
 		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
@@ -2012,12 +2064,6 @@ static LLVMValueRef fetch_constant(
 	} else
 		bufp = load_const_buffer_desc(ctx, buf);
 
-	if (reg->Register.Indirect) {
-		addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
-	} else {
-		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
-	}
-
 	return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
 }
 
@@ -4255,10 +4301,18 @@ static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
 					    struct si_function_info *fninfo,
 					    bool assign_params)
 {
+	LLVMTypeRef const_shader_buf_type;
+
+	if (ctx->shader->selector->info.const_buffers_declared == 1 &&
+	    ctx->shader->selector->info.shader_buffers_declared == 0)
+		const_shader_buf_type = ctx->f32;
+	else
+		const_shader_buf_type = ctx->v4i32;
+
 	unsigned const_and_shader_buffers =
 		add_arg(fninfo, ARG_SGPR,
-			si_const_array(ctx->v4i32,
-				       SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
+			si_const_array(const_shader_buf_type, 0));
+
 	unsigned samplers_and_images =
 		add_arg(fninfo, ARG_SGPR,
 			si_const_array(ctx->v8i32,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ba80f550e4..ebe956e709 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -161,7 +161,7 @@ enum {
 	SI_SGPR_RW_BUFFERS_HI,
 	SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
 	SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI,
-	SI_SGPR_CONST_AND_SHADER_BUFFERS,
+	SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
 	SI_SGPR_CONST_AND_SHADER_BUFFERS_HI,
 	SI_SGPR_SAMPLERS_AND_IMAGES,
 	SI_SGPR_SAMPLERS_AND_IMAGES_HI,
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index eb1901b5a9..7eb0aa3c92 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -279,6 +279,9 @@ struct si_descriptors {
 	ubyte shader_userdata_offset;
 	/* The size of one descriptor. */
 	ubyte element_dw_size;
+	/* If there is only one slot enabled, bind it directly instead of
+	 * uploading descriptors. -1 if disabled. */
+	signed char slot_index_to_bind_directly;
 };
 
 struct si_buffer_resources {




More information about the mesa-commit mailing list