[Mesa-dev] [PATCH 1/2] ac: add subgroup scan functions for SI

Nicolai Hähnle nhaehnle at gmail.com
Mon May 7 13:25:40 UTC 2018


On 04.05.2018 20:32, Daniel Schürmann wrote:
> This functionality is currently disabled due to a bug in LLVM.
> The idea of this implementation is taken from the ROCm Device Libs:
> https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/ockl/src/wfredscan.cl
> ---
>   src/amd/common/ac_llvm_build.c | 98 ++++++++++++++++++++++++++++------
>   1 file changed, 82 insertions(+), 16 deletions(-)
> 
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index c9b2e36b63..8300c9d6bd 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -2967,26 +2967,66 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
>   	}
>   }
>   
> -/* TODO: add inclusive and excluse scan functions for SI chip class.  */
>   static LLVMValueRef
>   ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
>   {
>   	LLVMValueRef result, tmp;
>   	result = src;
> -	tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
> -	result = ac_build_alu_op(ctx, result, tmp, op);
> -	tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
> -	result = ac_build_alu_op(ctx, result, tmp, op);
> -	tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
> -	result = ac_build_alu_op(ctx, result, tmp, op);
> -	tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
> -	result = ac_build_alu_op(ctx, result, tmp, op);
> -	tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
> -	result = ac_build_alu_op(ctx, result, tmp, op);
> -	tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
> -	result = ac_build_alu_op(ctx, result, tmp, op);
> -	tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
> -	result = ac_build_alu_op(ctx, result, tmp, op);
> +	if (ctx->chip_class >= VI) {
> +		tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +	} else {
> +		LLVMValueRef tid = ac_get_thread_id(ctx);
> +		LLVMValueRef active;
> +		tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
> +				       LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""),
> +				       ctx->i32_0, "");
> +		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
> +				       ctx->i32_0, "");
> +		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
> +				       ctx->i32_0, "");
> +		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
> +				       ctx->i32_0, "");
> +		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
> +				       ctx->i32_0, "");
> +		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +		tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
> +				       ctx->i32_0, "");
> +		tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
> +		result = ac_build_alu_op(ctx, result, tmp, op);
> +	}
>   	return result;
>   }
>   
> @@ -3015,7 +3055,33 @@ ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op
>   	result = LLVMBuildBitCast(ctx->builder,
>   								ac_build_set_inactive(ctx, src, identity),
>   								LLVMTypeOf(identity), "");
> -	result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
> +	if (ctx->chip_class >= VI) {
> +		result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
> +	} else {
> +		LLVMValueRef active, tmp1, tmp2;
> +		LLVMValueRef tid = ac_get_thread_id(ctx);
> +		tmp1 = ac_build_ds_swizzle(ctx, result, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
> +		tmp2 = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
> +				       LLVMConstInt(ctx->i32, 0x4, 0), "");
> +		tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
> +		tmp2 = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
> +				       LLVMConstInt(ctx->i32, 0x8, 0), "");
> +		tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
> +		tmp2 = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
> +				       LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
> +				       LLVMConstInt(ctx->i32, 0x10, 0), "");
> +		tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
> +		tmp2 = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
> +		tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
> +		active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
> +		result = LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
> +	}

This is one of those times where being inflexible in how we use LDS 
hurts us. Instead of chaining several ds_swizzle + select steps, it 
would be much faster to just write everything to LDS and read back 
from shifted offsets.

Anyway, it does look correct, so:

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>


>   	result = ac_build_scan(ctx, op, result, identity);
>   
>   	return ac_build_wwm(ctx, result);
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list