[Mesa-dev] [PATCH] ac: Use mov_dpp for derivatives.

Nicolai Hähnle nhaehnle at gmail.com
Mon Jun 12 09:56:32 UTC 2017


On 10.06.2017 21:52, Bas Nieuwenhuizen wrote:
> Slightly faster than bpermute, and seems supported since at least
> LLVM 3.9.
> 
> Signed-off-by: Bas Nieuwenhuizen <basni at google.com>
> ---
>   src/amd/common/ac_llvm_build.c | 78 +++++++++++++++++++++++++++++-------------
>   1 file changed, 54 insertions(+), 24 deletions(-)
> 
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 237e9291d41..62a00f214de 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -789,44 +789,74 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
>   	      LLVMValueRef lds,
>   	      LLVMValueRef val)
>   {
> -	LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, args[2];
> +	LLVMValueRef thread_id, tl, trbl, args[5];
>   	LLVMValueRef result;
>   
> -	thread_id = ac_get_thread_id(ctx);
> -
> -	tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> -			      LLVMConstInt(ctx->i32, mask, false), "");
> +	/* bpermute is VI+, mov_dpp is VI+ too */
> +	if (has_ds_bpermute) {
> +		uint32_t tl_ctrl = 0, trbl_ctrl = 0;
>   
> -	trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> -				LLVMConstInt(ctx->i32, idx, false), "");
> +		for (unsigned i = 0; i < 4; ++i) {
> +			tl_ctrl |= (i & mask) << (2 * i);
> +			trbl_ctrl |= ((i & mask) + idx) << (2 * i);
> +		}
>   
> -	if (has_ds_bpermute) {
> -		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> -				       LLVMConstInt(ctx->i32, 4, false), "");
> -		args[1] = val;
> +		args[0] = val;
> +		args[1] = LLVMConstInt(ctx->i32, tl_ctrl, false);
> +		args[2] = LLVMConstInt(ctx->i32, 0xf, false);
> +		args[3] = LLVMConstInt(ctx->i32, 0xf, false);
> +		args[4] = LLVMConstInt(ctx->i1, 1, false);
>   		tl = ac_build_intrinsic(ctx,
> -					"llvm.amdgcn.ds.bpermute", ctx->i32,
> -					args, 2,
> +					"llvm.amdgcn.mov.dpp.i32", ctx->i32,
> +					args, 5,
>   					AC_FUNC_ATTR_READNONE |
>   					AC_FUNC_ATTR_CONVERGENT);
>   
> -		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> -				       LLVMConstInt(ctx->i32, 4, false), "");
> +		args[1] = LLVMConstInt(ctx->i32, trbl_ctrl, false);
>   		trbl = ac_build_intrinsic(ctx,
> -					  "llvm.amdgcn.ds.bpermute", ctx->i32,
> -					  args, 2,
> +					  "llvm.amdgcn.mov.dpp.i32", ctx->i32,
> +					  args, 5,
>   					  AC_FUNC_ATTR_READNONE |
>   					  AC_FUNC_ATTR_CONVERGENT);
>   	} else {
> -		LLVMValueRef store_ptr, load_ptr0, load_ptr1;
> +		LLVMValueRef tl_tid, trbl_tid;
> +
> +		thread_id = ac_get_thread_id(ctx);
> +
> +		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> +				LLVMConstInt(ctx->i32, mask, false), "");
> +
> +		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> +					LLVMConstInt(ctx->i32, idx, false), "");
> +
> +		if (has_ds_bpermute) {

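This inner `if (has_ds_bpermute)` branch is dead now: it sits inside the
`else` of the outer `if (has_ds_bpermute)`, so the condition can never be
true here and the ds.bpermute path below is unreachable.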

Apart from this, the new code looks good. Does LLVM already optimize 
that down to two VALU instructions by pulling the DPP into the v_add?

With the dead code removed:

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>


> +			args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> +					LLVMConstInt(ctx->i32, 4, false), "");
> +			args[1] = val;
> +			tl = ac_build_intrinsic(ctx,
> +						"llvm.amdgcn.ds.bpermute", ctx->i32,
> +						args, 2,
> +						AC_FUNC_ATTR_READNONE |
> +						AC_FUNC_ATTR_CONVERGENT);
> +
> +			args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> +					LLVMConstInt(ctx->i32, 4, false), "");
> +			trbl = ac_build_intrinsic(ctx,
> +						"llvm.amdgcn.ds.bpermute", ctx->i32,
> +						args, 2,
> +						AC_FUNC_ATTR_READNONE |
> +						AC_FUNC_ATTR_CONVERGENT);
> +		} else {
> +			LLVMValueRef store_ptr, load_ptr0, load_ptr1;
>   
> -		store_ptr = ac_build_gep0(ctx, lds, thread_id);
> -		load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
> -		load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
> +			store_ptr = ac_build_gep0(ctx, lds, thread_id);
> +			load_ptr0 = ac_build_gep0(ctx, lds, tl_tid);
> +			load_ptr1 = ac_build_gep0(ctx, lds, trbl_tid);
>   
> -		LLVMBuildStore(ctx->builder, val, store_ptr);
> -		tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
> -		trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
> +			LLVMBuildStore(ctx->builder, val, store_ptr);
> +			tl = LLVMBuildLoad(ctx->builder, load_ptr0, "");
> +			trbl = LLVMBuildLoad(ctx->builder, load_ptr1, "");
> +		}
>   	}
>   
>   	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list