[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Sat Jun 10 19:52:50 UTC 2017
mov_dpp is slightly faster than ds_bpermute, and the intrinsic appears
to have been supported since at least LLVM 3.9.
Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
---
src/amd/common/ac_llvm_build.c | 78 +++++++++++++++++++++++++++++-------------
1 file changed, 54 insertions(+), 24 deletions(-)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 237e9291d41..62a00f214de 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -789,44 +789,74 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
LLVMValueRef lds,
LLVMValueRef val)
{
- LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
+ LLVMValueRef thread_id, tl, trbl, args[5];
LLVMValueRef result;
- thread_id = ac_get_thread_id(ctx);
-
- tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
- LLVMConstInt(ctx->i32, mask, false), "");
+ /* bpermute is VI+, mov_dpp is VI+ too */
+ if (has_ds_bpermute) {
+ uint32_t tl_ctrl = 0, trbl_ctrl = 0;
- trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, idx, false), "");
+ for (unsigned i = 0; i < 4; ++i) {
+ tl_ctrl |= (i & mask) << (2 * i);
+ trbl_ctrl |= ((i & mask) + idx) << (2 * i);
+ }
- if (has_ds_bpermute) {
- args[0] = LLVMBuildMul(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
- args[1] = val;
+ args[0] = val;
+ args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
+ args[2] = LLVMConstInt(ctx->i32, 0xf, false);
+ args[3] = LLVMConstInt(ctx->i32, 0xf, false);
+ args[4] = LLVMConstInt(ctx->i1, 1, false);
tl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
+ "llvm.amdgcn.mov.dpp.i32", ctx->i32,
+ args, 5,
AC_FUNC_ATTR_READNONE |
AC_FUNC_ATTR_CONVERGENT);
- args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
+ args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
trbl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
+ "llvm.amdgcn.mov.dpp.i32", ctx->i32,
+ args, 5,
AC_FUNC_ATTR_READNONE |
AC_FUNC_ATTR_CONVERGENT);
} else {
- LLVMValueRef store_ptr, load_ptr0, load_ptr1;
+ LLVMValueRef tl_tid, trbl_tid;
+
+ thread_id = ac_get_thread_id(ctx);
+
+ tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+ LLVMConstInt(ctx->i32, mask, false), "");
+
+ trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+ LLVMConstInt(ctx->i32, idx, false), "");
+
+ if (has_ds_bpermute) {
+ args[0] = LLVMBuildMul(ctx->builder, tl_tid,
+ LLVMConstInt(ctx->i32, 4, false), "");
+ args[1] = val;
+ tl = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.bpermute", ctx->i32,
+ args, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+
+ args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
+ LLVMConstInt(ctx->i32, 4, false), "");
+ trbl = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.bpermute", ctx->i32,
+ args, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+ } else {
+ LLVMValueRef store_ptr, load_ptr0, load_ptr1;
- store_ptr = ac_build_gep0(ctx, lds, thread_id);
- load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
- load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
+ store_ptr = ac_build_gep0(ctx, lds, thread_id);
+ load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
+ load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
- LLVMBuildStore(ctx->builder, val, store_ptr);
- tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
- trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+ LLVMBuildStore(ctx->builder, val, store_ptr);
+ tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
+ trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
+ }
}
tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
--
2.13.0
More information about the mesa-dev
mailing list