[Mesa-dev] [PATCH 12/15] ac: add support for SPV_AMD_shader_ballot
Connor Abbott
connora at valvesoftware.com
Tue Aug 8 01:32:38 UTC 2017
From: Connor Abbott <cwabbott0 at gmail.com>
Using the new WWM and DPP intrinsics introduced in LLVM 6.0. This adds
everything needed to implement SPV_AMD_shader_ballot, including the
Groups capability, to ac_llvm_build.c. That way, it can be shared by a
potential GL_AMD_shader_ballot implementation in the future. Currently,
the implementation only uses the DPP instructions that are available on
VI+, so SI and CI won't be able to use the extension, but it should be
possible (albeit a little tricky) to use ds_swizzle to get support for
SI and CI.
---
src/amd/common/ac_llvm_build.c | 703 +++++++++++++++++++++++++++++++++++++++++
src/amd/common/ac_llvm_build.h | 115 +++++++
2 files changed, 818 insertions(+)
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index d4b48d1..c75bf00 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -347,6 +347,709 @@ ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
return LLVMBuildOr(ctx->builder, all, none, "");
}
+/* Two-operand combiners matching the ac_reduce_op callback type.  The
+ * scan/reduce builders take one of these to fold per-lane values together.
+ * Each one emits plain LLVM IR; there is nothing cross-lane about them.
+ */
+LLVMValueRef ac_reduce_iadd(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMBuilderRef b = ctx->builder;
+	return LLVMBuildAdd(b, lhs, rhs, "");
+}
+
+LLVMValueRef ac_reduce_fadd(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMBuilderRef b = ctx->builder;
+	return LLVMBuildFAdd(b, lhs, rhs, "");
+}
+
+/* Float min/max go through llvm.minnum/llvm.maxnum, mangled with the
+ * operand type (e.g. "llvm.minnum.f32").
+ */
+LLVMValueRef ac_reduce_fmin(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMTypeRef type = LLVMTypeOf(lhs);
+	LLVMValueRef args[] = { lhs, rhs };
+	char type_name[8], intr_name[32];
+
+	ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
+	snprintf(intr_name, sizeof(intr_name), "llvm.minnum.%s", type_name);
+	return ac_build_intrinsic(ctx, intr_name, type, args, 2,
+				  AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE);
+}
+
+LLVMValueRef ac_reduce_fmax(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMTypeRef type = LLVMTypeOf(lhs);
+	LLVMValueRef args[] = { lhs, rhs };
+	char type_name[8], intr_name[32];
+
+	ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
+	snprintf(intr_name, sizeof(intr_name), "llvm.maxnum.%s", type_name);
+	return ac_build_intrinsic(ctx, intr_name, type, args, 2,
+				  AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE);
+}
+
+/* Integer min/max are expressed as icmp + select and rely on the backend
+ * to pattern-match them into the corresponding min/max instructions.
+ */
+LLVMValueRef ac_reduce_imin(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMValueRef lhs_smaller =
+		LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, "");
+	return LLVMBuildSelect(ctx->builder, lhs_smaller, lhs, rhs, "");
+}
+
+LLVMValueRef ac_reduce_imax(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMValueRef lhs_larger =
+		LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, "");
+	return LLVMBuildSelect(ctx->builder, lhs_larger, lhs, rhs, "");
+}
+
+LLVMValueRef ac_reduce_umin(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMValueRef lhs_smaller =
+		LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, "");
+	return LLVMBuildSelect(ctx->builder, lhs_smaller, lhs, rhs, "");
+}
+
+LLVMValueRef ac_reduce_umax(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs)
+{
+	LLVMValueRef lhs_larger =
+		LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, "");
+	return LLVMBuildSelect(ctx->builder, lhs_larger, lhs, rhs, "");
+}
+
+/* Encodings for the dpp_ctrl immediate of the VI+ DPP (Data Parallel
+ * Primitives) instruction modifier, as passed to llvm.amdgcn.update.dpp.
+ * Entries with a leading underscore are base values whose low bits carry
+ * an argument; build those with the dpp_quad_perm()/dpp_row_sl()/
+ * dpp_row_sr() helpers below.  "sl"/"sr" are shift left/right, "rr" is
+ * rotate right, "wf" operates across the whole wavefront and "row"
+ * within a 16-lane row; bcast15/bcast31 broadcast a single lane's value
+ * (NOTE(review): exact lane semantics per the GCN ISA docs - confirm).
+ */
+enum dpp_ctrl {
+	_dpp_quad_perm = 0x000,
+	_dpp_row_sl = 0x100,
+	_dpp_row_sr = 0x110,
+	_dpp_row_rr = 0x120,
+	dpp_wf_sl1 = 0x130,
+	dpp_wf_rl1 = 0x134,
+	dpp_wf_sr1 = 0x138,
+	dpp_wf_rr1 = 0x13C,
+	dpp_row_mirror = 0x140,
+	dpp_row_half_mirror = 0x141,
+	dpp_row_bcast15 = 0x142,
+	dpp_row_bcast31 = 0x143
+};
+
+/* Build a quad-permute control: each 2-bit field selects which lane of
+ * the quad (group of 4 lanes) feeds the corresponding output lane.
+ */
+static inline enum dpp_ctrl
+dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+	assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
+	return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
+}
+
+/* Row shift-left by 1..15 lanes. */
+static inline enum dpp_ctrl
+dpp_row_sl(unsigned amount)
+{
+	assert(amount > 0 && amount < 16);
+	return _dpp_row_sl | amount;
+}
+
+/* Row shift-right by 1..15 lanes. */
+static inline enum dpp_ctrl
+dpp_row_sr(unsigned amount)
+{
+	assert(amount > 0 && amount < 16);
+	return _dpp_row_sr | amount;
+}
+
+/* Emit one llvm.amdgcn.update.dpp.i32 call for a single 32-bit value.
+ * "src" is the value moved by dpp_ctrl; "old" is presumably what lanes
+ * masked off by row_mask/bank_mask read (per the intrinsic's contract -
+ * confirm against the LLVM AMDGPU docs).  Use ac_build_dpp() for types
+ * wider than 32 bits.
+ */
+static LLVMValueRef
+_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
+	      enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
+	      bool bound_ctrl)
+{
+	return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
+				  LLVMTypeOf(old), (LLVMValueRef[]) {
+					old, src,
+					LLVMConstInt(ctx->i32, dpp_ctrl, 0),
+					LLVMConstInt(ctx->i32, row_mask, 0),
+					LLVMConstInt(ctx->i32, bank_mask, 0),
+					LLVMConstInt(ctx->i1, bound_ctrl, 0) },
+				  6, AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE |
+				  AC_FUNC_ATTR_CONVERGENT);
+}
+
+/* Type-agnostic DPP move: bitcasts src/old to integers, and for types
+ * wider than 32 bits splits them into a <N x i32> vector, applies
+ * _ac_build_dpp per 32-bit component, and reassembles the result, which
+ * is bitcast back to the original type of src.
+ */
+static LLVMValueRef
+ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
+	     enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
+	     bool bound_ctrl)
+{
+	LLVMTypeRef src_type = LLVMTypeOf(src);
+	src = ac_to_integer(ctx, src);
+	old = ac_to_integer(ctx, old);
+	unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+	LLVMValueRef ret;
+	if (bits == 32) {
+		/* Common case: one intrinsic call, no splitting needed. */
+		ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
+				    bank_mask, bound_ctrl);
+	} else {
+		/* Only multiples of 32 bits are supported. */
+		assert(bits % 32 == 0);
+		LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+		LLVMValueRef src_vector =
+			LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+		LLVMValueRef old_vector =
+			LLVMBuildBitCast(ctx->builder, old, vec_type, "");
+		ret = LLVMGetUndef(vec_type);
+		for (unsigned i = 0; i < bits / 32; i++) {
+			src = LLVMBuildExtractElement(ctx->builder, src_vector,
+						      LLVMConstInt(ctx->i32, i,
+								   0), "");
+			old = LLVMBuildExtractElement(ctx->builder, old_vector,
+						      LLVMConstInt(ctx->i32, i,
+								   0), "");
+			LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
+							      dpp_ctrl,
+							      row_mask,
+							      bank_mask,
+							      bound_ctrl);
+			ret = LLVMBuildInsertElement(ctx->builder, ret,
+						     ret_comp,
+						     LLVMConstInt(ctx->i32, i,
+								  0), "");
+		}
+	}
+	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Emit one llvm.amdgcn.readlane call: read a single 32-bit value from
+ * the given lane.  Use ac_build_readlane() for wider types.
+ */
+static LLVMValueRef
+_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
+		   LLVMValueRef lane)
+{
+	return ac_build_intrinsic(ctx, "llvm.amdgcn.readlane",
+				  LLVMTypeOf(src), (LLVMValueRef []) {
+					src, lane },
+				  2, AC_FUNC_ATTR_NOUNWIND |
+				  AC_FUNC_ATTR_READNONE |
+				  AC_FUNC_ATTR_CONVERGENT);
+}
+
+/* Type-agnostic readlane: same 32-bit splitting scheme as ac_build_dpp.
+ * Returns src's value on the given lane, in src's original type.
+ */
+static LLVMValueRef
+ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
+		  LLVMValueRef lane)
+{
+	LLVMTypeRef src_type = LLVMTypeOf(src);
+	src = ac_to_integer(ctx, src);
+	unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+	LLVMValueRef ret;
+	if (bits == 32) {
+		ret = _ac_build_readlane(ctx, src, lane);
+	} else {
+		/* Only multiples of 32 bits are supported. */
+		assert(bits % 32 == 0);
+		LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+		LLVMValueRef src_vector =
+			LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+		ret = LLVMGetUndef(vec_type);
+		for (unsigned i = 0; i < bits / 32; i++) {
+			src = LLVMBuildExtractElement(ctx->builder, src_vector,
+						      LLVMConstInt(ctx->i32, i,
+								   0), "");
+			LLVMValueRef ret_comp = _ac_build_readlane(ctx, src,
+								   lane);
+			ret = LLVMBuildInsertElement(ctx->builder, ret,
+						     ret_comp,
+						     LLVMConstInt(ctx->i32, i,
+								  0), "");
+		}
+	}
+	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Emit one llvm.amdgcn.ds.swizzle call on a 32-bit value with the given
+ * immediate swizzle mask.  Use ac_build_ds_swizzle() for wider types.
+ */
+static LLVMValueRef
+_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
+		     unsigned mask)
+{
+	return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
+				  LLVMTypeOf(src), (LLVMValueRef []) {
+					src, LLVMConstInt(ctx->i32, mask, 0) },
+				  2, AC_FUNC_ATTR_NOUNWIND |
+				  AC_FUNC_ATTR_READNONE |
+				  AC_FUNC_ATTR_CONVERGENT);
+}
+
+/* Type-agnostic ds_swizzle: same 32-bit splitting scheme as
+ * ac_build_dpp/ac_build_readlane; result is in src's original type.
+ */
+static LLVMValueRef
+ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
+		    unsigned mask)
+{
+	LLVMTypeRef src_type = LLVMTypeOf(src);
+	src = ac_to_integer(ctx, src);
+	unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+	LLVMValueRef ret;
+	if (bits == 32) {
+		ret = _ac_build_ds_swizzle(ctx, src, mask);
+	} else {
+		/* Only multiples of 32 bits are supported. */
+		assert(bits % 32 == 0);
+		LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+		LLVMValueRef src_vector =
+			LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+		ret = LLVMGetUndef(vec_type);
+		for (unsigned i = 0; i < bits / 32; i++) {
+			src = LLVMBuildExtractElement(ctx->builder, src_vector,
+						      LLVMConstInt(ctx->i32, i,
+								   0), "");
+			LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
+								     mask);
+			ret = LLVMBuildInsertElement(ctx->builder, ret,
+						     ret_comp,
+						     LLVMConstInt(ctx->i32, i,
+								  0), "");
+		}
+	}
+	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Wrap src in llvm.amdgcn.set.inactive.<type>, so that inactive lanes
+ * read "inactive" instead of a stale value.  Used together with
+ * ac_build_wwm(): the scans below run in whole-wavefront mode and need
+ * inactive lanes seeded with the operation's identity.
+ */
+static LLVMValueRef
+ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
+		      LLVMValueRef inactive)
+{
+	char name[32], type[8];
+	LLVMTypeRef src_type = LLVMTypeOf(src);
+	src = ac_to_integer(ctx, src);
+	inactive = ac_to_integer(ctx, inactive);
+	ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
+	snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
+	LLVMValueRef ret =
+		ac_build_intrinsic(ctx, name,
+				   LLVMTypeOf(src), (LLVMValueRef []) {
+					src, inactive }, 2,
+				   AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE |
+				   AC_FUNC_ATTR_CONVERGENT);
+	/* Cast back from the integer type used for the intrinsic. */
+	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Wrap src in llvm.amdgcn.wwm.<type> (whole wavefront mode), marking a
+ * value whose computation must run with all lanes enabled.
+ */
+static LLVMValueRef
+ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
+{
+	char name[32], type[8];
+	ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
+	snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
+	return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
+				  (LLVMValueRef []) { src }, 1,
+				  AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE);
+}
+
+/* Inclusive scan over a 64-lane wavefront using the DPP ladder from the
+ * GPUOpen cross-lane article.  "reduce" combines two values, "identity"
+ * is its neutral element (fed in as the DPP "old" value for lanes the
+ * shift leaves empty).  Note the first three steps shift the original
+ * src while the later steps shift the running value; the row/bank masks
+ * restrict which lanes are updated at each step.  Callers are
+ * responsible for set_inactive/wwm wrapping (see the nonuniform
+ * variant).
+ */
+LLVMValueRef
+ac_build_subgroup_inclusive_scan(struct ac_llvm_context *ctx,
+				 LLVMValueRef src,
+				 ac_reduce_op reduce,
+				 LLVMValueRef identity)
+{
+	/* See http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/
+	 *
+	 * Note that each dpp/reduce pair is supposed to be compiled down to
+	 * one instruction by LLVM, at least for 32-bit values.
+	 *
+	 * TODO: use @llvm.amdgcn.ds.swizzle on SI and CI
+	 */
+	LLVMValueRef value = src;
+	value = reduce(ctx, value,
+		       ac_build_dpp(ctx, identity, src,
+				    dpp_row_sr(1), 0xf, 0xf, false));
+	value = reduce(ctx, value,
+		       ac_build_dpp(ctx, identity, src,
+				    dpp_row_sr(2), 0xf, 0xf, false));
+	value = reduce(ctx, value,
+		       ac_build_dpp(ctx, identity, src,
+				    dpp_row_sr(3), 0xf, 0xf, false));
+	value = reduce(ctx, value,
+		       ac_build_dpp(ctx, identity, value,
+				    dpp_row_sr(4), 0xf, 0xe, false));
+	value = reduce(ctx, value,
+		       ac_build_dpp(ctx, identity, value,
+				    dpp_row_sr(8), 0xf, 0xc, false));
+	value = reduce(ctx, value,
+		       ac_build_dpp(ctx, identity, value,
+				    dpp_row_bcast15, 0xa, 0xf, false));
+	value = reduce(ctx, value,
+		       ac_build_dpp(ctx, identity, value,
+				    dpp_row_bcast31, 0xc, 0xf, false));
+	return value;
+}
+
+/* Inclusive scan valid in non-uniform control flow: pin the input with
+ * an optimization barrier, seed inactive lanes with the identity, scan,
+ * and mark the result as whole-wavefront-mode.
+ */
+LLVMValueRef
+ac_build_subgroup_inclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+					    LLVMValueRef value,
+					    ac_reduce_op reduce,
+					    LLVMValueRef identity)
+{
+	ac_build_optimization_barrier(ctx, &value);
+	value = ac_build_set_inactive(ctx, value, identity);
+	value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity);
+	return ac_build_wwm(ctx, value);
+}
+
+
+/* Reduce a value across the wavefront: inclusive scan, then broadcast
+ * lane 63's value (the total) to all lanes.  Assumes a 64-wide wave.
+ */
+LLVMValueRef
+ac_build_subgroup_reduce(struct ac_llvm_context *ctx, LLVMValueRef value,
+			 ac_reduce_op reduce, LLVMValueRef identity)
+{
+
+	value = ac_build_set_inactive(ctx, value, identity);
+	value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity);
+	/* Lane 63 holds the reduction of all 64 lanes after the scan. */
+	value = ac_build_readlane(ctx, value, LLVMConstInt(ctx->i32, 63, 0));
+	return ac_build_wwm(ctx, value);
+}
+
+/* Same, but safe in non-uniform control flow (adds the optimization
+ * barrier that keeps LLVM from moving the computation).
+ */
+LLVMValueRef
+ac_build_subgroup_reduce_nonuniform(struct ac_llvm_context *ctx,
+				    LLVMValueRef value,
+				    ac_reduce_op reduce,
+				    LLVMValueRef identity)
+{
+	ac_build_optimization_barrier(ctx, &value);
+	return ac_build_subgroup_reduce(ctx, value, reduce, identity);
+}
+
+/* Exclusive scan: shift the whole wavefront right by one lane (lane 0
+ * receives the identity via the DPP "old" operand), then do an
+ * inclusive scan of the shifted values.
+ */
+LLVMValueRef
+ac_build_subgroup_exclusive_scan(struct ac_llvm_context *ctx,
+				 LLVMValueRef value,
+				 ac_reduce_op reduce,
+				 LLVMValueRef identity)
+{
+	value = ac_build_dpp(ctx, identity, value, dpp_wf_sr1, 0xf, 0xf, false);
+	return ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity);
+}
+
+/* Exclusive scan valid in non-uniform control flow; see the inclusive
+ * nonuniform variant for the barrier/set_inactive/wwm pattern.
+ */
+LLVMValueRef
+ac_build_subgroup_exclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+					    LLVMValueRef value,
+					    ac_reduce_op reduce,
+					    LLVMValueRef identity)
+{
+	ac_build_optimization_barrier(ctx, &value);
+	value = ac_build_set_inactive(ctx, value, identity);
+	value = ac_build_subgroup_exclusive_scan(ctx, value, reduce, identity);
+	return ac_build_wwm(ctx, value);
+}
+
+/* SPV_AMD_shader_ballot swizzle within each quad: swizzle_mask packs
+ * four 2-bit lane selectors (bits 0-1 pick the source lane for lane 0,
+ * bits 2-3 for lane 1, etc.), which map directly onto a DPP
+ * quad_perm.  bound_ctrl:0 makes reads from disabled lanes return 0
+ * rather than the "old" value (hence the undef old operand).
+ */
+LLVMValueRef
+ac_build_swizzle_quad(struct ac_llvm_context *ctx, LLVMValueRef src,
+		      unsigned swizzle_mask)
+{
+	ac_build_optimization_barrier(ctx, &src);
+	/* TODO: use @llvm.amdgcn.ds.swizzle on SI and CI */
+	return ac_build_dpp(ctx, LLVMGetUndef(LLVMTypeOf(src)), src,
+			    dpp_quad_perm(swizzle_mask & 0x3,
+					  (swizzle_mask >> 2) & 0x3,
+					  (swizzle_mask >> 4) & 0x3,
+					  (swizzle_mask >> 6) & 0x3),
+			    0xf, 0xf, /*bound_ctrl:0*/ true);
+}
+
+/* Masked swizzle: pass the immediate through to ds_swizzle unchanged. */
+LLVMValueRef
+ac_build_swizzle_masked(struct ac_llvm_context *ctx, LLVMValueRef src,
+			unsigned swizzle_mask)
+{
+	ac_build_optimization_barrier(ctx, &src);
+	/* TODO: For some special mask values, we could use DPP instead on VI+.
+	 * We might be able to use DPP entirely, but it would be a little
+	 * tricky.
+	 */
+	return ac_build_ds_swizzle(ctx, src, swizzle_mask);
+}
+
+/* Emulated writelane: every lane keeps src, except the lane whose
+ * thread ID equals "lane", which takes "write".
+ */
+LLVMValueRef
+ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src,
+		   LLVMValueRef write, LLVMValueRef lane)
+{
+	/* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
+	 */
+	LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
+					  ac_get_thread_id(ctx), "");
+	return LLVMBuildSelect(ctx->builder, pred, write, src, "");
+}
+
+/* Count, per lane, the bits of a 64-bit mask belonging to lanes below
+ * this one: split the mask into two i32 halves and chain
+ * llvm.amdgcn.mbcnt.lo into llvm.amdgcn.mbcnt.hi (the second argument
+ * is an accumulator, seeded with 0).
+ */
+LLVMValueRef
+ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
+{
+	LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
+						 LLVMVectorType(ctx->i32, 2),
+						 "");
+	LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
+						       ctx->i32_0, "");
+	LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
+						       ctx->i32_1, "");
+	LLVMValueRef val =
+		ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
+				   (LLVMValueRef []) { mask_lo, ctx->i32_0 },
+				   2, AC_FUNC_ATTR_READNONE);
+	val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
+				 (LLVMValueRef []) { mask_hi, val },
+				 2, AC_FUNC_ATTR_READNONE);
+	return val;
+}
+
+/* return true for exactly one thread in the subgroup/wavefront */
+
+static LLVMValueRef
+ac_build_subgroup_elect(struct ac_llvm_context *ctx)
+{
+	/* Ballot of constant-true gives the set of active lanes. */
+	LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
+	/* mbcnt(EXEC) returns the number of active threads with ID less than
+	 * ours, so the lowest thread will return 0.
+	 */
+	LLVMValueRef active_tid = ac_build_mbcnt(ctx, active_set);
+	return LLVMBuildICmp(ctx->builder, LLVMIntEQ, active_tid, ctx->i32_0,
+			     "");
+}
+
+/* Cheaper elect for uniform control flow: with all lanes active, lane 0
+ * is always present, so just compare the thread ID against 0.
+ */
+static LLVMValueRef
+ac_build_subgroup_elect_uniform(struct ac_llvm_context *ctx)
+{
+	return LLVMBuildICmp(ctx->builder, LLVMIntEQ, ac_get_thread_id(ctx),
+			     ctx->i32_0, "");
+}
+
+/* NOTE(review): address space 3 is LDS (workgroup-local memory) in the
+ * amdgcn target - confirm against the AMDGPU address-space docs. */
+#define LOCAL_ADDR_SPACE 3
+
+/* Allocate an LDS scratch array with one element of "type" per
+ * wavefront in the workgroup, used to communicate per-wavefront partial
+ * results in the group-wide reductions below.
+ */
+static LLVMValueRef
+get_shared_temp(struct ac_llvm_context *ctx,
+		LLVMTypeRef type,
+		unsigned max_workgroup_size)
+{
+	/* TODO only make one variable and share it */
+	return LLVMAddGlobalInAddressSpace(
+		ctx->module,
+		LLVMArrayType(type, DIV_ROUND_UP(max_workgroup_size, 64)),
+		"reduce_temp", LOCAL_ADDR_SPACE);
+}
+
+/* given an array of values, emit code to reduce them to a single value using a
+ * given operator. Note that this isn't cross-thread at all; it's just normal
+ * LLVM code.
+ *
+ * Recursively splits the LLVM array value in two and combines the
+ * halves with "reduce", building a balanced reduction tree.
+ */
+static LLVMValueRef
+reduce_array(struct ac_llvm_context *ctx, LLVMValueRef array,
+	     ac_reduce_op reduce)
+{
+	unsigned size = LLVMGetArrayLength(LLVMTypeOf(array));
+	assert(size > 0);
+	/* Base case: a single element needs no combining. */
+	if (size == 1)
+		return LLVMBuildExtractValue(ctx->builder, array, 0, "");
+
+	LLVMTypeRef elem_type = LLVMGetElementType(LLVMTypeOf(array));
+
+	/* Copy the first half into a smaller array and reduce it. */
+	unsigned left_size = size / 2;
+	LLVMValueRef left = LLVMGetUndef(LLVMArrayType(elem_type, left_size));
+	for (unsigned i = 0; i < left_size; i++) {
+		LLVMValueRef val = LLVMBuildExtractValue(ctx->builder, array,
+							 i, "");
+		left = LLVMBuildInsertValue(ctx->builder, left, val, i, "");
+	}
+	left = reduce_array(ctx, left, reduce);
+
+	/* Same for the (possibly one-larger) second half. */
+	unsigned right_size = size - left_size;
+	LLVMValueRef right = LLVMGetUndef(LLVMArrayType(elem_type, right_size));
+	for (unsigned i = 0; i < right_size; i++) {
+		LLVMValueRef val = LLVMBuildExtractValue(ctx->builder, array,
+							 i + left_size, "");
+		right = LLVMBuildInsertValue(ctx->builder, right, val, i, "");
+	}
+	right = reduce_array(ctx, right, reduce);
+
+	return reduce(ctx, left, right);
+}
+
+/* Cross-wavefront part of a workgroup-wide reduction/scan.  "value" is
+ * the already wavefront-reduced value of this wavefront,
+ * "wavefront_id" its index in the workgroup.  Each wavefront elects one
+ * lane to store its value into LDS, a barrier makes the stores visible,
+ * then every thread loads all per-wavefront values and combines them
+ * with reduce_array().  With exclusive_scan set, values from wavefronts
+ * at or above ours are replaced with the identity before combining.
+ * "uniform" selects the cheap elect that assumes all lanes are active.
+ */
+static LLVMValueRef
+_ac_build_group_reduce(struct ac_llvm_context *ctx,
+		       LLVMValueRef value, ac_reduce_op reduce,
+		       LLVMValueRef identity, bool exclusive_scan,
+		       bool uniform,
+		       unsigned max_workgroup_size,
+		       LLVMValueRef wavefront_id)
+{
+	/* A single wavefront needs no LDS round-trip: the cross-wavefront
+	 * contribution is the identity (exclusive) or the value itself.
+	 */
+	if (max_workgroup_size <= 64) {
+		if (exclusive_scan)
+			return identity;
+		else
+			return value;
+	}
+
+	/* Allocate some temporary storage, one value for each wavefront. */
+	LLVMValueRef shared = get_shared_temp(ctx, LLVMTypeOf(value),
+					      max_workgroup_size);
+
+	LLVMValueRef func =
+		LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
+	LLVMBasicBlockRef if_block =
+		LLVMAppendBasicBlockInContext(ctx->context, func, "");
+	LLVMBasicBlockRef merge_block =
+		LLVMAppendBasicBlockInContext(ctx->context, func, "");
+
+	/* gather the subgroup-reduced values from each buffer into shared */
+
+	LLVMBuildCondBr(ctx->builder,
+			(uniform ? ac_build_subgroup_elect_uniform :
+			 ac_build_subgroup_elect)(ctx),
+			if_block, merge_block);
+	/* if (subgroup_elect()) */
+	{
+		LLVMPositionBuilderAtEnd(ctx->builder, if_block);
+		LLVMValueRef ptr = ac_build_gep0(ctx, shared, wavefront_id);
+		LLVMBuildStore(ctx->builder, value, ptr);
+		LLVMBuildBr(ctx->builder, merge_block);
+	}
+
+	LLVMPositionBuilderAtEnd(ctx->builder, merge_block);
+
+	/* Workgroup barrier so every wavefront's store is visible. */
+	ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0,
+			   AC_FUNC_ATTR_CONVERGENT);
+
+	/* For each wavefront, load every other wavefront's values from the
+	 * previous stage.
+	 */
+	LLVMValueRef array = LLVMBuildLoad(ctx->builder, shared, "");
+
+	if (exclusive_scan) {
+		/* mask out values from wavefronts greater than or equal to
+		 * ours, to implement exclusive scan
+		 */
+		for (unsigned i = 0; 64 * i < max_workgroup_size; i++) {
+			LLVMValueRef wf_value =
+				LLVMBuildExtractValue(ctx->builder, array, i,
+						      "");
+			LLVMValueRef pred =
+				LLVMBuildICmp(ctx->builder, LLVMIntULT,
+					      LLVMConstInt(ctx->i32, i, 0),
+					      wavefront_id,
+					      "");
+			wf_value = LLVMBuildSelect(ctx->builder, pred,
+						   wf_value, identity, "");
+			array = LLVMBuildInsertValue(ctx->builder, array,
+						     wf_value, i, "");
+		}
+	}
+
+	/* finally, manually reduce the values from each wavefront without any
+	 * cross-thread tricks.
+	 */
+	return reduce_array(ctx, array, reduce);
+}
+
+/* Workgroup-wide reduction, uniform control flow: reduce within the
+ * wavefront, then combine the per-wavefront results through LDS.
+ */
+LLVMValueRef
+ac_build_group_reduce(struct ac_llvm_context *ctx,
+		      LLVMValueRef value, ac_reduce_op reduce,
+		      LLVMValueRef identity,
+		      unsigned max_workgroup_size,
+		      LLVMValueRef wavefront_id)
+{
+	value = ac_build_subgroup_reduce(ctx, value, reduce, identity);
+	return _ac_build_group_reduce(ctx, value, reduce, identity, false,
+				      true, max_workgroup_size, wavefront_id);
+}
+
+/* Workgroup-wide reduction valid in non-uniform control flow. */
+LLVMValueRef
+ac_build_group_reduce_nonuniform(struct ac_llvm_context *ctx,
+				 LLVMValueRef value, ac_reduce_op reduce,
+				 LLVMValueRef identity,
+				 unsigned max_workgroup_size,
+				 LLVMValueRef wavefront_id)
+{
+	value = ac_build_subgroup_reduce_nonuniform(ctx, value, reduce,
+						    identity);
+	return _ac_build_group_reduce(ctx, value, reduce, identity, false,
+				      false, max_workgroup_size, wavefront_id);
+}
+
+/* Workgroup-wide exclusive scan, uniform control flow: one inclusive
+ * scan provides both the per-wavefront scan (after a 1-lane shift) and,
+ * via lane 63, the wavefront total fed to the cross-wavefront stage.
+ */
+LLVMValueRef
+ac_build_group_exclusive_scan(struct ac_llvm_context *ctx,
+			      LLVMValueRef value, ac_reduce_op reduce,
+			      LLVMValueRef identity,
+			      unsigned max_workgroup_size,
+			      LLVMValueRef wavefront_id)
+{
+	/* Do the exclusive scan per-wavefront, and at the same time calculate
+	 * the fully-reduced value for doing the overall exclusive scan.
+	 */
+	value = ac_build_set_inactive(ctx, value, identity);
+	value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity);
+	LLVMValueRef reduced = ac_build_readlane(ctx, value,
+						 LLVMConstInt(ctx->i32, 63,
+							      0));
+	/* Shift right one lane to turn the inclusive scan exclusive. */
+	value = ac_build_dpp(ctx, identity, value, dpp_wf_sr1, 0xf, 0xf,
+			     false);
+	reduced = ac_build_wwm(ctx, reduced);
+	value = ac_build_wwm(ctx, value);
+	reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true,
+					 true, max_workgroup_size,
+					 wavefront_id);
+	return reduce(ctx, value, reduced);
+}
+
+/* Same as above, but valid in non-uniform control flow. */
+LLVMValueRef
+ac_build_group_exclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+					 LLVMValueRef value,
+					 ac_reduce_op reduce,
+					 LLVMValueRef identity,
+					 unsigned max_workgroup_size,
+					 LLVMValueRef wavefront_id)
+{
+	ac_build_optimization_barrier(ctx, &value);
+	/* Do the exclusive scan per-wavefront, and at the same time calculate
+	 * the fully-reduced value for doing the overall exclusive scan.
+	 */
+	value = ac_build_set_inactive(ctx, value, identity);
+	value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity);
+	LLVMValueRef reduced = ac_build_readlane(ctx, value,
+						 LLVMConstInt(ctx->i32, 63,
+							      0));
+	value = ac_build_dpp(ctx, identity, value, dpp_wf_sr1, 0xf, 0xf,
+			     false);
+	reduced = ac_build_wwm(ctx, reduced);
+	value = ac_build_wwm(ctx, value);
+	reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true,
+					 false, max_workgroup_size,
+					 wavefront_id);
+	return reduce(ctx, value, reduced);
+}
+
+/* Workgroup-wide inclusive scan, uniform control flow.  Like the
+ * exclusive variant minus the 1-lane shift; the cross-wavefront stage
+ * still runs in exclusive mode so each wavefront only adds the totals
+ * of the wavefronts below it.
+ */
+LLVMValueRef
+ac_build_group_inclusive_scan(struct ac_llvm_context *ctx,
+			      LLVMValueRef value, ac_reduce_op reduce,
+			      LLVMValueRef identity,
+			      unsigned max_workgroup_size,
+			      LLVMValueRef wavefront_id)
+{
+	/* Do the inclusive scan per-wavefront, and at the same time calculate
+	 * the fully-reduced value for doing the overall inclusive scan.
+	 */
+	value = ac_build_set_inactive(ctx, value, identity);
+	value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity);
+	LLVMValueRef reduced = ac_build_readlane(ctx, value,
+						 LLVMConstInt(ctx->i32, 63,
+							      0));
+	reduced = ac_build_wwm(ctx, reduced);
+	value = ac_build_wwm(ctx, value);
+	reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true,
+					 true, max_workgroup_size,
+					 wavefront_id);
+	return reduce(ctx, value, reduced);
+}
+
+/* Same as above, but valid in non-uniform control flow. */
+LLVMValueRef
+ac_build_group_inclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+					 LLVMValueRef value,
+					 ac_reduce_op reduce,
+					 LLVMValueRef identity,
+					 unsigned max_workgroup_size,
+					 LLVMValueRef wavefront_id)
+{
+	ac_build_optimization_barrier(ctx, &value);
+	/* Do the inclusive scan per-wavefront, and at the same time calculate
+	 * the fully-reduced value for doing the overall inclusive scan.
+	 */
+	value = ac_build_set_inactive(ctx, value, identity);
+	value = ac_build_subgroup_inclusive_scan(ctx, value, reduce, identity);
+	LLVMValueRef reduced = ac_build_readlane(ctx, value,
+						 LLVMConstInt(ctx->i32, 63,
+							      0));
+	reduced = ac_build_wwm(ctx, reduced);
+	value = ac_build_wwm(ctx, value);
+	reduced = _ac_build_group_reduce(ctx, reduced, reduce, identity, true,
+					 false, max_workgroup_size,
+					 wavefront_id);
+	return reduce(ctx, value, reduced);
+}
+
LLVMValueRef
ac_build_gather_values_extended(struct ac_llvm_context *ctx,
LLVMValueRef *values,
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 1d9850b..463f3a9 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -84,6 +84,19 @@ void ac_build_optimization_barrier(struct ac_llvm_context *ctx,
LLVMValueRef *pvgpr);
+/* Cross-lane data movement helpers (see ac_llvm_build.c for details). */
+LLVMValueRef
+ac_build_swizzle_quad(struct ac_llvm_context *ctx, LLVMValueRef src,
+		      unsigned swizzle_mask);
+
+LLVMValueRef
+ac_build_swizzle_masked(struct ac_llvm_context *ctx, LLVMValueRef src,
+			unsigned swizzle_mask);
+
+LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src,
+				LLVMValueRef write, LLVMValueRef lane);
+
+/* Per-lane popcount of the mask bits below the current lane. */
+LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask);
+
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value);
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value);
@@ -92,6 +105,108 @@ LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value);
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value);
+/* Callback combining two per-lane values for the scan/reduce builders. */
+typedef LLVMValueRef (*ac_reduce_op)(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+				     LLVMValueRef rhs);
+
+/* Predefined combiners usable as ac_reduce_op, in the same order as
+ * their definitions in ac_llvm_build.c.
+ */
+LLVMValueRef ac_reduce_iadd(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+LLVMValueRef ac_reduce_fadd(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+LLVMValueRef ac_reduce_fmin(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+LLVMValueRef ac_reduce_fmax(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+LLVMValueRef ac_reduce_imin(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+LLVMValueRef ac_reduce_imax(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+LLVMValueRef ac_reduce_umin(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+LLVMValueRef ac_reduce_umax(struct ac_llvm_context *ctx, LLVMValueRef lhs,
+			    LLVMValueRef rhs);
+
+/* Wavefront-level reduce/scan.  The plain variants assume uniform
+ * control flow; the _nonuniform variants are safe when lanes may be
+ * inactive.  "identity" is the neutral element of "reduce".
+ */
+LLVMValueRef ac_build_subgroup_reduce(struct ac_llvm_context *ctx,
+				      LLVMValueRef value,
+				      ac_reduce_op reduce,
+				      LLVMValueRef identity);
+
+LLVMValueRef ac_build_subgroup_inclusive_scan(struct ac_llvm_context *ctx,
+					      LLVMValueRef value,
+					      ac_reduce_op reduce,
+					      LLVMValueRef identity);
+
+LLVMValueRef ac_build_subgroup_exclusive_scan(struct ac_llvm_context *ctx,
+					      LLVMValueRef value,
+					      ac_reduce_op reduce,
+					      LLVMValueRef identity);
+
+LLVMValueRef ac_build_subgroup_reduce_nonuniform(struct ac_llvm_context *ctx,
+						 LLVMValueRef value,
+						 ac_reduce_op reduce,
+						 LLVMValueRef identity);
+
+LLVMValueRef ac_build_subgroup_inclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+							 LLVMValueRef value,
+							 ac_reduce_op reduce,
+							 LLVMValueRef identity);
+
+LLVMValueRef ac_build_subgroup_exclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+							 LLVMValueRef value,
+							 ac_reduce_op reduce,
+							 LLVMValueRef identity);
+
+/* Workgroup-level reduce/scan across wavefronts (uses LDS when
+ * max_workgroup_size > 64).  wavefront_id is this wavefront's index
+ * within the workgroup.
+ */
+LLVMValueRef ac_build_group_reduce(struct ac_llvm_context *ctx,
+				   LLVMValueRef value,
+				   ac_reduce_op reduce,
+				   LLVMValueRef identity,
+				   unsigned max_workgroup_size,
+				   LLVMValueRef wavefront_id);
+
+LLVMValueRef ac_build_group_inclusive_scan(struct ac_llvm_context *ctx,
+					   LLVMValueRef value,
+					   ac_reduce_op reduce,
+					   LLVMValueRef identity,
+					   unsigned max_workgroup_size,
+					   LLVMValueRef wavefront_id);
+
+LLVMValueRef ac_build_group_exclusive_scan(struct ac_llvm_context *ctx,
+					   LLVMValueRef value,
+					   ac_reduce_op reduce,
+					   LLVMValueRef identity,
+					   unsigned max_workgroup_size,
+					   LLVMValueRef wavefront_id);
+
+LLVMValueRef ac_build_group_reduce_nonuniform(struct ac_llvm_context *ctx,
+					      LLVMValueRef value,
+					      ac_reduce_op reduce,
+					      LLVMValueRef identity,
+					      unsigned max_workgroup_size,
+					      LLVMValueRef wavefront_id);
+
+LLVMValueRef ac_build_group_inclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+						      LLVMValueRef value,
+						      ac_reduce_op reduce,
+						      LLVMValueRef identity,
+						      unsigned max_workgroup_size,
+						      LLVMValueRef wavefront_id);
+
+LLVMValueRef ac_build_group_exclusive_scan_nonuniform(struct ac_llvm_context *ctx,
+						      LLVMValueRef value,
+						      ac_reduce_op reduce,
+						      LLVMValueRef identity,
+						      unsigned max_workgroup_size,
+						      LLVMValueRef wavefront_id);
+
LLVMValueRef
ac_build_gather_values_extended(struct ac_llvm_context *ctx,
LLVMValueRef *values,
--
2.9.4
More information about the mesa-dev
mailing list