[Mesa-dev] [PATCH 01/15] nir: define intrinsics needed for AMD_shader_ballot
Connor Abbott
connora at valvesoftware.com
Tue Aug 8 01:32:27 UTC 2017
From: Connor Abbott <cwabbott0 at gmail.com>
---
src/compiler/nir/nir.h | 7 +++
src/compiler/nir/nir_intrinsics.h | 124 +++++++++++++++++++++++++++++++++++++-
src/compiler/nir/nir_print.c | 1 +
3 files changed, 129 insertions(+), 3 deletions(-)
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 24934f0..4b5d78e 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1076,6 +1076,12 @@ typedef enum {
*/
NIR_INTRINSIC_INTERP_MODE = 9,
+ /*
+ * Constant mask, invocation, swizzle, etc. used for a subgroup operation.
+ * The precise meaning depends on the intrinsic.
+ */
+ NIR_INTRINSIC_SUBGROUP_DATA = 10,
+
NIR_INTRINSIC_NUM_INDEX_FLAGS,
} nir_intrinsic_index_flag;
@@ -1144,6 +1150,7 @@ INTRINSIC_IDX_ACCESSORS(desc_set, DESC_SET, unsigned)
INTRINSIC_IDX_ACCESSORS(binding, BINDING, unsigned)
INTRINSIC_IDX_ACCESSORS(component, COMPONENT, unsigned)
INTRINSIC_IDX_ACCESSORS(interp_mode, INTERP_MODE, unsigned)
+INTRINSIC_IDX_ACCESSORS(subgroup_data, SUBGROUP_DATA, unsigned)
/**
* \group texture information
diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h
index 72c4296..32f52aa 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -93,12 +93,24 @@ INTRINSIC(get_buffer_size, 1, ARR(1), true, 1, 0, 0, xx, xx, xx,
* the comment for NIR_INTRINSIC_CONVERGENT in nir.h for details.
*/
#define CONVERGENT(name, num_srcs, src0_components, src1_components, \
- dest_components) \
- INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \
+ src2_components, dest_components) \
+ INTRINSIC(name, num_srcs, ARR(src0_components, src1_components, \
+ src2_components), \
true, dest_components, 0, 0, xx, xx, xx, \
NIR_INTRINSIC_CAN_REORDER | NIR_INTRINSIC_CAN_ELIMINATE | \
NIR_INTRINSIC_CONVERGENT)
+/*
+ * Similar to CONVERGENT, except optimizations can assume that the intrinsic is
+ * only called in uniform control flow.
+ */
+#define UNIFORM_CONTROL(name, num_srcs, src0_components, src1_components, \
+ dest_components) \
+ INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \
+ true, dest_components, 0, 0, xx, xx, xx, \
+ NIR_INTRINSIC_CAN_REORDER | NIR_INTRINSIC_CAN_ELIMINATE | \
+ NIR_INTRINSIC_UNIFORM_CONTROL)
+
BARRIER(barrier)
BARRIER(discard)
@@ -126,7 +138,7 @@ INTRINSIC(shader_clock, 0, ARR(0), true, 2, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_
* GLSL functions from ARB_shader_ballot.
*/
CROSS_THREAD(ballot, 1, 1, 0, 1)
-CONVERGENT(read_invocation, 2, 0, 1, 0)
+CONVERGENT(read_invocation, 2, 0, 1, 0, 0)
CROSS_THREAD(read_first_invocation, 1, 0, 0, 0)
/*
@@ -148,6 +160,105 @@ CROSS_THREAD(vote_any, 1, 1, 0, 1)
CROSS_THREAD(vote_all, 1, 1, 0, 1)
CROSS_THREAD(vote_eq, 1, 1, 0, 1)
+/* AMD_shader_ballot intrinsics */
+
+/*
+ * This is like CROSS_THREAD, but for intrinsics that communicate across an
+ * entire workgroup instead of just the subgroup.
+ */
+
+#define CROSS_SUBGROUP(name, num_srcs, src0_components, src1_components, \
+ dest_components) \
+ INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \
+ true, dest_components, 0, 0, xx, xx, xx, \
+ NIR_INTRINSIC_CAN_ELIMINATE)
+
+#define CROSS_SUBGROUP_UNIFORM(name, num_srcs, src0_components, \
+ src1_components, dest_components) \
+ INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \
+ true, dest_components, 0, 0, xx, xx, xx, \
+ NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_UNIFORM_CONTROL)
+
+#define REDUCE_OP(name, postfix) \
+ UNIFORM_CONTROL(subgroup_##name##postfix, 1, 0, 0, 0) \
+ CROSS_SUBGROUP(subgroup_##name##postfix##_nonuniform, 1, 0, 0, 0) \
+ CROSS_SUBGROUP_UNIFORM(group_##name##postfix, 1, 0, 0, 0) \
+ CROSS_SUBGROUP(group_##name##postfix##_nonuniform, 1, 0, 0, 0) \
+
+#define GROUP_SUBGROUP_REDUCE(name) \
+ REDUCE_OP(name, ) \
+ REDUCE_OP(name, _inclusive_scan) \
+ REDUCE_OP(name, _exclusive_scan)
+
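+/*
+ * For example, GROUP_SUBGROUP_REDUCE(iadd) below defines subgroup_iadd,
+ * subgroup_iadd_nonuniform, group_iadd and group_iadd_nonuniform, plus the
+ * corresponding _inclusive_scan and _exclusive_scan variants of each.
+ */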
+GROUP_SUBGROUP_REDUCE(iadd)
+GROUP_SUBGROUP_REDUCE(fadd)
+GROUP_SUBGROUP_REDUCE(fmin)
+GROUP_SUBGROUP_REDUCE(umin)
+GROUP_SUBGROUP_REDUCE(imin)
+GROUP_SUBGROUP_REDUCE(fmax)
+GROUP_SUBGROUP_REDUCE(umax)
+GROUP_SUBGROUP_REDUCE(imax)
+
+#undef GROUP_SUBGROUP_REDUCE
+#undef REDUCE_OP
+
+/* Analogous to vote_any and vote_all, but these work across an entire
+ * workgroup. Also, these versions can only be called in uniform control flow.
+ */
+CROSS_SUBGROUP_UNIFORM(group_any, 1, 1, 0, 1)
+CROSS_SUBGROUP_UNIFORM(group_all, 1, 1, 0, 1)
+
+/* Similarly, this is analogous to read_invocation but works across an entire
+ * workgroup and can only be called in uniform control flow. Unlike
+ * read_invocation, the second argument is a 3-dimensional local ID instead of
+ * a simple linear index.
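+ *
+ * Roughly: every invocation returns the value of the first argument as
+ * evaluated by the invocation whose local invocation ID equals the second
+ * argument.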
+ */
+CROSS_SUBGROUP_UNIFORM(group_broadcast, 2, 1, 3, 1)
+
+#define CROSS_THREAD_WITH_DATA(name, src_components, dest_components) \
+ INTRINSIC(name, 1, ARR(src_components), \
+ true, dest_components, 0, 1, SUBGROUP_DATA, xx, xx, \
+ NIR_INTRINSIC_CAN_REORDER | NIR_INTRINSIC_CAN_ELIMINATE | \
+ NIR_INTRINSIC_CROSS_THREAD)
+
+/* The index is interpreted as a swizzle value, with bits [2*i:2*i+1]
+ * giving the lane within the quad that should be stored in the i'th lane of
+ * the quad. Inactive lanes return 0. That is, the computation is:
+ *
+ * for (i = 0; i < SubgroupSize; i += 4) {
+ *    for (j = 0; j < 4; j++) {
+ *       out[i + j] = is_active[i + swizzle[2*j:2*j+1]] ?
+ *                    in[i + swizzle[2*j:2*j+1]] : 0;
+ *    }
+ * }
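+ *
+ * For example, a swizzle value of 0x1b (0b00011011) reverses each quad:
+ * lane 0 reads lane 3, lane 1 reads lane 2, and so on.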
+ */
+CROSS_THREAD_WITH_DATA(quad_swizzle_amd, 0, 0)
+
+/*
+ * Implements AMD's masked swizzle. The mask is interpreted as three 5-bit
+ * integers. The computation, taken from AMD_shader_ballot, is:
+ *
+ * for (i = 0; i < SubgroupSize; i++) {
+ *    j = (((i & 0x1f) & data[0:4]) | data[5:9]) ^ data[10:14];
+ *    j |= i & 0x20;
+ *    out[i] = is_active[j] ? in[j] : 0;
+ * }
+ */
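+/* For example, a mask of 0x41f (data[0:4] = 0x1f, data[5:9] = 0x0,
+ * data[10:14] = 0x1) swaps each pair of adjacent lanes within the subgroup.
+ */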
+CROSS_THREAD_WITH_DATA(masked_swizzle_amd, 0, 0)
+
+/*
+ * The opposite of read_invocation: every invocation returns the first
+ * argument, except the invocation given by the third argument, which returns
+ * the second argument instead. The second and third arguments must be
+ * dynamically uniform within the subgroup.
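+ *
+ * Roughly (with value = src1 and invocation = src2, both uniform):
+ *
+ *    out[i] = (i == invocation) ? value : src0[i];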
+ */
+CONVERGENT(write_invocation, 3, 0, 0, 1, 0)
+
+/*
+ * Implements the AMD mbcnt instruction. Returns:
+ *
+ * bitCount(gl_SubgroupLtMask & src0)
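+ *
+ * For example, passing the result of ballot(true) as src0 yields the number
+ * of currently active invocations with an index lower than this one.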
+ */
+INTRINSIC(mbcnt_amd, 1, ARR(1), true, 1, 0, 0, xx, xx, xx,
+ NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+
/**
* Basic Geometry Shader intrinsics.
*
@@ -387,6 +498,13 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx)
SYSTEM_VALUE(blend_const_color_rgba8888_unorm, 1, 0, xx, xx, xx)
SYSTEM_VALUE(blend_const_color_aaaa8888_unorm, 1, 0, xx, xx, xx)
+#define CROSS_THREAD_UNIFORM(name, dest_components, src_components) \
+ INTRINSIC(name, 1, ARR(src_components), true, dest_components, 0, 0, \
+ xx, xx, xx, \
+ NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER | \
+ NIR_INTRINSIC_CROSS_THREAD | NIR_INTRINSIC_CONVERGENT)
+
+
/**
* Barycentric coordinate intrinsics.
*
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index f4811fe..b9cb686 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -595,6 +595,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
[NIR_INTRINSIC_BINDING] = "binding",
[NIR_INTRINSIC_COMPONENT] = "component",
[NIR_INTRINSIC_INTERP_MODE] = "interp_mode",
+ [NIR_INTRINSIC_SUBGROUP_DATA] = "subgroup_data",
};
for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) {
if (!info->index_map[idx])
--
2.9.4