[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.
Nicolai Hähnle
nhaehnle at gmail.com
Mon Jun 12 09:56:32 UTC 2017
On 10.06.2017 21:52, Bas Nieuwenhuizen wrote:
> Slightly faster than bpermute, and seems supported since at least
> LLVM 3.9.
>
> Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
> ---
> src/amd/common/ac_llvm_build.c | 78 +++++++++++++++++++++++++++++-------------
> 1 file changed, 54 insertions(+), 24 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 237e9291d41..62a00f214de 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -789,44 +789,74 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
> LLVMValueRef lds,
> LLVMValueRef val)
> {
> - LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
> + LLVMValueRef thread_id, tl, trbl, args[5];
> LLVMValueRef result;
>
> - thread_id = ac_get_thread_id(ctx);
> -
> - tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> - LLVMConstInt(ctx->i32, mask, false), "");
> + /* bpermute is VI+, mov_dpp is VI+ too */
> + if (has_ds_bpermute) {
> + uint32_t tl_ctrl = 0, trbl_ctrl = 0;
>
> - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, idx, false), "");
> + for (unsigned i = 0; i < 4; ++i) {
> + tl_ctrl |= (i & mask) << (2 * i);
> + trbl_ctrl |= ((i & mask) + idx) << (2 * i);
> + }
>
> - if (has_ds_bpermute) {
> - args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> - args[1] = val;
> + args[0] = val;
> + args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
> + args[2] = LLVMConstInt(ctx->i32, 0xf, false);
> + args[3] = LLVMConstInt(ctx->i32, 0xf, false);
> + args[4] = LLVMConstInt(ctx->i1, 1, false);
> tl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> + "llvm.amdgcn.mov.dpp.i32", ctx->i32,
> + args, 5,
> AC_FUNC_ATTR_READNONE |
> AC_FUNC_ATTR_CONVERGENT);
>
> - args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> + args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
> trbl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> + "llvm.amdgcn.mov.dpp.i32", ctx->i32,
> + args, 5,
> AC_FUNC_ATTR_READNONE |
> AC_FUNC_ATTR_CONVERGENT);
> } else {
> - LLVMValueRef store_ptr, load_ptr0, load_ptr1;
> + LLVMValueRef tl_tid, trbl_tid;
> +
> + thread_id = ac_get_thread_id(ctx);
> +
> + tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> + LLVMConstInt(ctx->i32, mask, false), "");
> +
> + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> + LLVMConstInt(ctx->i32, idx, false), "");
> +
> + if (has_ds_bpermute) {
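This inner if (has_ds_bpermute) branch is dead now: we are already in
the else branch of the outer has_ds_bpermute check above, so the
condition can never be true here, and only the LDS fallback is
reachable.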
Apart from this, the new code looks good. Does LLVM already optimize
that down to two VALU instructions by pulling the DPP into the v_add?
With the dead code removed:
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
> + args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> + LLVMConstInt(ctx->i32, 4, false), "");
> + args[1] = val;
> + tl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.bpermute", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> +
> + args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> + LLVMConstInt(ctx->i32, 4, false), "");
> + trbl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.bpermute", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> + } else {
> + LLVMValueRef store_ptr, load_ptr0, load_ptr1;
>
> - store_ptr = ac_build_gep0(ctx, lds, thread_id);
> - load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
> - load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
> + store_ptr = ac_build_gep0(ctx, lds, thread_id);
> + load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
> + load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
>
> - LLVMBuildStore(ctx->builder, val, store_ptr);
> - tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
> - trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
> + LLVMBuildStore(ctx->builder, val, store_ptr);
> + tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
> + trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
> + }
> }
>
> tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev mailing list