[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Sat Jun 10 19:52:50 UTC 2017


mov_dpp is slightly faster than ds_bpermute, and it appears to have been
supported since at least LLVM 3.9.

Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
---
 src/amd/common/ac_llvm_build.c | 78 +++++++++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 24 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 237e9291d41..62a00f214de 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -789,44 +789,74 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
 	      LLVMValueRef lds,
 	      LLVMValueRef val)
 {
-	LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
+	LLVMValueRef thread_id, tl, trbl, args[5];
 	LLVMValueRef result;
 
-	thread_id = ac_get_thread_id(ctx);
-
-	tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-			      LLVMConstInt(ctx->i32, mask, false), "");
+	/* bpermute is VI+, mov_dpp is VI+ too */
+	if (has_ds_bpermute) {
+		uint32_t tl_ctrl = 0, trbl_ctrl = 0;
 
-	trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-				LLVMConstInt(ctx->i32, idx, false), "");
+		for (unsigned i = 0; i < 4; ++i) {
+			tl_ctrl |= (i & mask) << (2 * i);
+			trbl_ctrl |= ((i & mask) + idx) << (2 * i);
+		}
 
-	if (has_ds_bpermute) {
-		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
-				       LLVMConstInt(ctx->i32, 4, false), "");
-		args[1] = val;
+		args[0] = val;
+		args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
+		args[2] = LLVMConstInt(ctx->i32, 0xf, false);
+		args[3] = LLVMConstInt(ctx->i32, 0xf, false);
+		args[4] = LLVMConstInt(ctx->i1, 1, false);
 		tl = ac_build_intrinsic(ctx,
-					"llvm.amdgcn.ds.bpermute", ctx->i32,
-					args, 2,
+					"llvm.amdgcn.mov.dpp.i32", ctx->i32,
+					args, 5,
 					AC_FUNC_ATTR_READNONE |
 					AC_FUNC_ATTR_CONVERGENT);
 
-		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
-				       LLVMConstInt(ctx->i32, 4, false), "");
+		args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
 		trbl = ac_build_intrinsic(ctx,
-					  "llvm.amdgcn.ds.bpermute", ctx->i32,
-					  args, 2,
+					  "llvm.amdgcn.mov.dpp.i32", ctx->i32,
+					  args, 5,
 					  AC_FUNC_ATTR_READNONE |
 					  AC_FUNC_ATTR_CONVERGENT);
 	} else {
-		LLVMValueRef store_ptr, load_ptr0, load_ptr1;
+		LLVMValueRef tl_tid, trbl_tid;
+
+		thread_id = ac_get_thread_id(ctx);
+
+		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+				LLVMConstInt(ctx->i32, mask, false), "");
+
+		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+					LLVMConstInt(ctx->i32, idx, false), "");
+
+		if (has_ds_bpermute) {
+			args[0] = LLVMBuildMul(ctx->builder, tl_tid,
+					LLVMConstInt(ctx->i32, 4, false), "");
+			args[1] = val;
+			tl = ac_build_intrinsic(ctx,
+						"llvm.amdgcn.ds.bpermute", ctx->i32,
+						args, 2,
+						AC_FUNC_ATTR_READNONE |
+						AC_FUNC_ATTR_CONVERGENT);
+
+			args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
+					LLVMConstInt(ctx->i32, 4, false), "");
+			trbl = ac_build_intrinsic(ctx,
+						"llvm.amdgcn.ds.bpermute", ctx->i32,
+						args, 2,
+						AC_FUNC_ATTR_READNONE |
+						AC_FUNC_ATTR_CONVERGENT);
+		} else {
+			LLVMValueRef store_ptr, load_ptr0, load_ptr1;
 
-		store_ptr = ac_build_gep0(ctx, lds, thread_id);
-		load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
-		load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
+			store_ptr = ac_build_gep0(ctx, lds, thread_id);
+			load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
+			load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
 
-		LLVMBuildStore(ctx->builder, val, store_ptr);
-		tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
-		trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+			LLVMBuildStore(ctx->builder, val, store_ptr);
+			tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
+			trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+		}
 	}
 
 	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
-- 
2.13.0



More information about the mesa-dev mailing list