[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Sat Jun 10 20:05:30 UTC 2017


mov_dpp is slightly faster than ds_bpermute, and it appears to have been
supported since at least LLVM 3.9.

v2: Since this supersedes bpermute, remove the bpermute code.
Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
---
 src/amd/common/ac_llvm_build.c           | 47 ++++++++++++++++++++------------
 src/amd/common/ac_llvm_build.h           |  2 +-
 src/amd/common/ac_nir_to_llvm.c          |  8 +++---
 src/gallium/drivers/radeonsi/si_pipe.c   |  2 +-
 src/gallium/drivers/radeonsi/si_pipe.h   |  2 +-
 src/gallium/drivers/radeonsi/si_shader.c |  4 +--
 6 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 237e9291d41..99d41bf52d6 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -783,41 +783,52 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
  */
 LLVMValueRef
 ac_build_ddxy(struct ac_llvm_context *ctx,
-	      bool has_ds_bpermute,
+	      bool has_mov_dpp,
 	      uint32_t mask,
 	      int idx,
 	      LLVMValueRef lds,
 	      LLVMValueRef val)
 {
-	LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
+	LLVMValueRef thread_id, tl, trbl, args[5];
 	LLVMValueRef result;
 
-	thread_id = ac_get_thread_id(ctx);
+	if (has_mov_dpp) {
+		uint32_t tl_ctrl = 0, trbl_ctrl = 0;
 
-	tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-			      LLVMConstInt(ctx->i32, mask, false), "");
-
-	trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-				LLVMConstInt(ctx->i32, idx, false), "");
+		for (unsigned i = 0; i < 4; ++i) {
+			tl_ctrl |= (i & mask) << (2 * i);
+			trbl_ctrl |= ((i & mask) + idx) << (2 * i);
+		}
 
-	if (has_ds_bpermute) {
-		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
-				       LLVMConstInt(ctx->i32, 4, false), "");
-		args[1] = val;
+		args[0] = val;
+		args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
+		args[2] = LLVMConstInt(ctx->i32, 0xf, false);
+		args[3] = LLVMConstInt(ctx->i32, 0xf, false);
+		args[4] = LLVMConstInt(ctx->i1, 1, false);
 		tl = ac_build_intrinsic(ctx,
-					"llvm.amdgcn.ds.bpermute", ctx->i32,
-					args, 2,
+					"llvm.amdgcn.mov.dpp.i32", ctx->i32,
+					args, 5,
 					AC_FUNC_ATTR_READNONE |
 					AC_FUNC_ATTR_CONVERGENT);
 
-		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
-				       LLVMConstInt(ctx->i32, 4, false), "");
+		args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
 		trbl = ac_build_intrinsic(ctx,
-					  "llvm.amdgcn.ds.bpermute", ctx->i32,
-					  args, 2,
+					  "llvm.amdgcn.mov.dpp.i32", ctx->i32,
+					  args, 5,
 					  AC_FUNC_ATTR_READNONE |
 					  AC_FUNC_ATTR_CONVERGENT);
 	} else {
+		LLVMValueRef tl_tid, trbl_tid;
+
+		thread_id = ac_get_thread_id(ctx);
+
+		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+				LLVMConstInt(ctx->i32, mask, false), "");
+
+		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+					LLVMConstInt(ctx->i32, idx, false), "");
+
+
 		LLVMValueRef store_ptr, load_ptr0, load_ptr1;
 
 		store_ptr = ac_build_gep0(ctx, lds, thread_id);
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index ebb78fbd79b..14260b05018 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -161,7 +161,7 @@ ac_get_thread_id(struct ac_llvm_context *ctx);
 
 LLVMValueRef
 ac_build_ddxy(struct ac_llvm_context *ctx,
-	      bool has_ds_bpermute,
+	      bool has_mov_dpp,
 	      uint32_t mask,
 	      int idx,
 	      LLVMValueRef lds,
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 49117d21bd2..2385c60d316 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -164,7 +164,7 @@ struct nir_to_llvm_context {
 	uint8_t num_output_clips;
 	uint8_t num_output_culls;
 
-	bool has_ds_bpermute;
+	bool has_mov_dpp;
 
 	bool is_gs_copy_shader;
 	LLVMValueRef gs_next_vertex;
@@ -1434,7 +1434,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,
 	LLVMValueRef result;
 	ctx->has_ddxy = true;
 
-	if (!ctx->lds && !ctx->has_ds_bpermute)
+	if (!ctx->lds && !ctx->has_mov_dpp)
 		ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module,
 						       LLVMArrayType(ctx->i32, 64),
 						       "ddxy_lds", LOCAL_ADDR_SPACE);
@@ -1454,7 +1454,7 @@ static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,
 	else
 		idx = 2;
 
-	result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute,
+	result = ac_build_ddxy(&ctx->ac, ctx->has_mov_dpp,
 			      mask, idx, ctx->lds,
 			      src0);
 	return result;
@@ -5858,7 +5858,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 	ac_llvm_context_init(&ctx.ac, ctx.context);
 	ctx.ac.module = ctx.module;
 
-	ctx.has_ds_bpermute = ctx.options->chip_class >= VI;
+	ctx.has_mov_dpp = ctx.options->chip_class >= VI;
 
 	memset(shader_info, 0, sizeof(*shader_info));
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index cb372267cde..7e83d5e5ac4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -944,7 +944,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 		 sscreen->b.info.pfp_fw_version >= 121 &&
 		 sscreen->b.info.me_fw_version >= 87);
 
-	sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
+	sscreen->has_mov_dpp = sscreen->b.chip_class >= VI;
 	sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
 					    sscreen->b.family <= CHIP_POLARIS12) ||
 					   sscreen->b.family == CHIP_VEGA10 ||
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 108929c10c6..ef139fb0cd3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -79,7 +79,7 @@ struct si_screen {
 	unsigned			tess_offchip_block_dw_size;
 	bool				has_distributed_tess;
 	bool				has_draw_indirect_multi;
-	bool				has_ds_bpermute;
+	bool				has_mov_dpp;
 	bool				has_msaa_sample_loc_bug;
 
 	/* Whether shaders are monolithic (1-part) or separate (3-part). */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 2c92269a575..2eed45d79a5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3442,7 +3442,7 @@ static void si_llvm_emit_ddxy(
 	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
 
 	val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
-	val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
+	val = ac_build_ddxy(&ctx->ac, ctx->screen->has_mov_dpp,
 			    mask, idx, ctx->lds, val);
 	emit_data->output[emit_data->chan] = val;
 }
@@ -4454,7 +4454,7 @@ static void create_function(struct si_shader_context *ctx)
 	assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
 	shader->info.num_input_vgprs -= num_prolog_vgprs;
 
-	if (!ctx->screen->has_ds_bpermute &&
+	if (!ctx->screen->has_mov_dpp &&
 	    bld_base->info &&
 	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
 	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
-- 
2.13.0



More information about the mesa-dev mailing list