Mesa (main): nir: add new SSA instruction scheduler grouping loads into indirection groups
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Mon Nov 8 21:48:27 UTC 2021
Module: Mesa
Branch: main
Commit: 33b4eb149ea79d9dd4b80ddda079ad027e5a40bf
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=33b4eb149ea79d9dd4b80ddda079ad027e5a40bf
Author: Marek Olšák <marek.olsak at amd.com>
Date: Tue Oct 26 20:00:58 2021 -0400
nir: add new SSA instruction scheduler grouping loads into indirection groups
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Reviewed-by: Timur Kristóf <timur.kristof at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13604>
---
src/compiler/nir/meson.build | 1 +
src/compiler/nir/nir.h | 8 +
src/compiler/nir/nir_group_loads.c | 484 +++++++++++++++++++++++++++++++++++++
3 files changed, 493 insertions(+)
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 26dfc2de516..a309538e2bf 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -114,6 +114,7 @@ files_libnir = files(
'nir_gather_info.c',
'nir_gather_ssa_types.c',
'nir_gather_xfb_info.c',
+ 'nir_group_loads.c',
'nir_gs_count_vertices.c',
'nir_inline_functions.c',
'nir_inline_uniforms.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 9325f821ba4..780959b6906 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4660,6 +4660,14 @@ void nir_gs_count_vertices_and_primitives(const nir_shader *shader,
int *out_prmcnt,
unsigned num_streams);
+typedef enum {
+ nir_group_all, /* group all loads of the same indirection level */
+ nir_group_same_resource_only, /* group only loads that use the same resource */
+} nir_load_grouping;
+
+void nir_group_loads(nir_shader *shader, nir_load_grouping grouping,
+ unsigned max_distance); /* max distance between first and last instruction of a group */
+
bool nir_shrink_vec_array_vars(nir_shader *shader, nir_variable_mode modes);
bool nir_split_array_vars(nir_shader *shader, nir_variable_mode modes);
bool nir_split_var_copies(nir_shader *shader);
diff --git a/src/compiler/nir/nir_group_loads.c b/src/compiler/nir/nir_group_loads.c
new file mode 100644
index 00000000000..e290012437c
--- /dev/null
+++ b/src/compiler/nir/nir_group_loads.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright © 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* This is a new block-level load instruction scheduler where loads are grouped
+ * according to their indirection level within a basic block. An indirection
+ * is when a result of one load is used as a source of another load. The result
+ * is that disjoint ALU opcode groups and load (texture) opcode groups are
+ * created where each next load group is the next level of indirection.
+ * It's done by finding the first and last load with the same indirection
+ * level, and moving all unrelated instructions between them after the last
+ * load except for load sources, which are moved before the first load.
+ * It naturally suits hardware that has limits on texture indirections, but
+ * other hardware can benefit too. Only texture, image, and SSBO load and
+ * atomic instructions are grouped.
+ *
+ * There is an option to group only those loads that use the same resource
+ * variable. This increases the chance to get more cache hits than if the loads
+ * were spread out.
+ *
+ * The increased register usage is offset by the increase in observed memory
+ * bandwidth due to more cache hits (dependent on hw behavior), which in turn
+ * decreases the subgroup lifetime and allows registers to be deallocated
+ * and reused sooner. In some bandwidth-bound cases, low register usage doesn't
+ * benefit at all. Doubling the register usage and using those registers to
+ * amplify observed bandwidth can improve performance a lot.
+ *
+ * It's recommended to run a hw-specific instruction scheduler after this to
+ * prevent spilling.
+ */
+
+#include "nir.h"
+
+static bool
+is_memory_load(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_tex:
+      /* Every texture instruction counts, texture_size included, because
+       * it has the same latency as cache hits.
+       */
+      return true;
+
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      /* Intrinsics reported as writing external memory are rejected. */
+      if (nir_intrinsic_writes_external_memory(intrin))
+         return false;
+
+      /* TODO: nir_intrinsics.py could do this */
+      const char *opname = nir_intrinsic_infos[intrin->intrinsic].name;
+
+      /* Anything with "shared" in its name is rejected as well. */
+      if (strstr(opname, "shared"))
+         return false;
+
+      /* What remains must be an SSBO or image intrinsic. load_ubo is
+       * ignored because it's usually cheap.
+       */
+      return strstr(opname, "ssbo") != NULL ||
+             strstr(opname, "image") != NULL;
+   }
+
+   default:
+      return false;
+   }
+}
+
+static nir_instr *
+get_intrinsic_resource(nir_intrinsic_instr *intr)
+{
+   bool grouped;
+
+   /* The cases below double as the list of intrinsics that are grouped.
+    * load_ubo is left out because it's usually cheap.
+    */
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_image_deref_load:
+   case nir_intrinsic_bindless_image_load:
+   case nir_intrinsic_image_sparse_load:
+   case nir_intrinsic_image_deref_sparse_load:
+   case nir_intrinsic_bindless_image_sparse_load:
+   /* image_size is grouped too because it has the same latency as cache
+    * hits.
+    */
+   case nir_intrinsic_image_size:
+   case nir_intrinsic_image_deref_size:
+      grouped = true;
+      break;
+   default:
+      grouped = false;
+      break;
+   }
+
+   /* The resource is the instruction that produces the first source. */
+   return grouped ? intr->src[0].ssa->parent_instr : NULL;
+}
+
+/* Tell whether an instruction is one of the loads that this pass groups. */
+static bool
+is_grouped_load(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_tex:
+      /* texture_size is included because it has the same latency as cache
+       * hits.
+       */
+      return true;
+
+   case nir_instr_type_intrinsic:
+      /* Grouped intrinsics are exactly those that report a resource. */
+      return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL;
+
+   default:
+      return false;
+   }
+}
+
+static bool
+can_move(nir_instr *instr, uint8_t current_indirection_level)
+{
+   /* Loads of the level being grouped stay where they are: grouping works
+    * by moving everything else out of that level's first/last instruction
+    * range.
+    */
+   if (instr->pass_flags == current_indirection_level &&
+       is_grouped_load(instr))
+      return false;
+
+   switch (instr->type) {
+   case nir_instr_type_alu:
+   case nir_instr_type_deref:
+   case nir_instr_type_tex:
+   case nir_instr_type_load_const:
+   case nir_instr_type_ssa_undef:
+      return true;
+
+   case nir_instr_type_intrinsic:
+      /* Intrinsics are movable only when they can be reordered. */
+      return nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr));
+
+   default:
+      return false;
+   }
+}
+
+static nir_instr *
+get_uniform_inst_resource(nir_instr *instr)
+{
+   if (instr->type == nir_instr_type_intrinsic)
+      return get_intrinsic_resource(nir_instr_as_intrinsic(instr));
+
+   if (instr->type != nir_instr_type_tex)
+      return NULL;
+
+   nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+   /* No resource is reported for non-uniform texture accesses. */
+   if (tex->texture_non_uniform)
+      return NULL;
+
+   /* The first texture deref or texture handle source identifies the
+    * resource.
+    */
+   for (unsigned s = 0; s < tex->num_srcs; s++) {
+      if (tex->src[s].src_type == nir_tex_src_texture_deref ||
+          tex->src[s].src_type == nir_tex_src_texture_handle)
+         return tex->src[s].src.ssa->parent_instr;
+   }
+
+   return NULL;
+}
+
+struct check_sources_state
+{
+ nir_block *block; /* block of the first load of the range being grouped */
+ uint32_t first_index; /* index of that first load */
+};
+
+static bool
+has_only_sources_less_than(nir_src *src, void *data)
+{
+   const struct check_sources_state *limits = data;
+   nir_instr *producer = src->ssa->parent_instr;
+
+   /* A source produced in another block never stops the walk. */
+   if (producer->block != limits->block)
+      return true;
+
+   /* Returning true lets nir_foreach_src keep going; a producer whose index
+    * is at or past first_index stops it.
+    */
+   return producer->index < limits->first_index;
+}
+
+static void
+group_loads(nir_instr *first, nir_instr *last)
+{
+ /* Group the loads in the range [first, last] by moving the movable
+ * instructions that sit between the two out of the range.
+ * first->pass_flags is handed to can_move() as the indirection level
+ * that is being grouped.
+ *
+ * Pass 1: walk the instruction range between the first and last backward,
+ * and move those that have no uses within the range after the last one.
+ */
+ for (nir_instr *instr = exec_node_data_backward(nir_instr,
+ last->node.prev, node);
+ instr != first;
+ instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) {
+ /* Only move instructions without side effects. can_move() also keeps
+ * the grouped loads of this indirection level in place.
+ */
+ if (!can_move(instr, first->pass_flags))
+ continue;
+
+ nir_ssa_def *def = nir_instr_ssa_def(instr);
+ if (def) { /* instructions without an SSA def are left alone in this pass */
+ bool all_uses_after_last = true;
+
+ nir_foreach_use(use, def) {
+ if (use->parent_instr->block == instr->block &&
+ use->parent_instr->index <= last->index) {
+ all_uses_after_last = false; /* used in this block at or before 'last' */
+ break;
+ }
+ }
+
+ if (all_uses_after_last) {
+ nir_instr *move_instr = instr;
+ /* Point the cursor at the following instruction before moving the
+ * current one, so that the loop's step to node.prev continues from
+ * the original position.
+ */
+ instr = exec_node_data_forward(nir_instr, instr->node.next, node);
+
+ /* Move the instruction after the last and update its index
+ * to indicate that it's after it.
+ */
+ nir_instr_move(nir_after_instr(last), move_instr);
+ move_instr->index = last->index + 1;
+ }
+ }
+ }
+
+ struct check_sources_state state;
+ state.block = first->block;
+ state.first_index = first->index;
+
+ /* Pass 2: walk the instruction range between the first and last forward,
+ * and move those that have no sources within the range before the first
+ * one.
+ */
+ for (nir_instr *instr = exec_node_data_forward(nir_instr,
+ first->node.next, node);
+ instr != last;
+ instr = exec_node_data_forward(nir_instr, instr->node.next, node)) {
+ /* Only move instructions without side effects. can_move() also keeps
+ * the grouped loads of this indirection level in place.
+ */
+ if (!can_move(instr, first->pass_flags))
+ continue;
+
+ if (nir_foreach_src(instr, has_only_sources_less_than, &state)) {
+ nir_instr *move_instr = instr;
+ /* Point the cursor at the preceding instruction before moving the
+ * current one, so that the loop's step to node.next continues from
+ * the original position.
+ */
+ instr = exec_node_data_backward(nir_instr, instr->node.prev, node);
+
+ /* Move the instruction before the first and update its index
+ * to indicate that it's before it.
+ */
+ nir_instr_move(nir_before_instr(first), move_instr);
+ move_instr->index = first->index - 1;
+ }
+ }
+}
+
+static bool
+is_pseudo_inst(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_alu:
+   case nir_instr_type_call:
+   case nir_instr_type_tex:
+   case nir_instr_type_intrinsic:
+      return false;
+
+   default:
+      /* Other instructions do not usually contribute to the shader binary
+       * size.
+       */
+      return true;
+   }
+}
+
+static void
+set_instr_indices(nir_block *block)
+{
+   /* Numbering begins at 1 so that index 0 stays free for an instruction
+    * that gets moved in front of the very first one.
+    */
+   unsigned next_index = 1;
+   bool prev_was_pseudo = false;
+
+   nir_foreach_instr(instr, block) {
+      const bool pseudo = is_pseudo_inst(instr);
+
+      /* A grouped load must not share its index with the pseudo instruction
+       * right before it.
+       */
+      if (prev_was_pseudo && is_grouped_load(instr))
+         next_index++;
+
+      /* Record the instruction's position within the block. */
+      instr->index = next_index;
+
+      /* Pseudo instructions don't advance the counter. */
+      if (!pseudo)
+         next_index++;
+
+      prev_was_pseudo = pseudo;
+   }
+}
+
+static void
+handle_load_range(nir_instr **first, nir_instr **last,
+                  nir_instr *current, unsigned max_distance)
+{
+   /* Nothing to do until both ends of a range are known. */
+   if (!*first || !*last)
+      return;
+
+   /* With a current instruction, wait until its index lies more than
+    * max_distance past the first load. A NULL current groups
+    * unconditionally.
+    */
+   if (current && current->index <= (*first)->index + max_distance)
+      return;
+
+   nir_instr *range_start = *first;
+
+   assert(range_start != *last);
+   group_loads(range_start, *last);
+
+   /* Renumber the block now that instructions have been moved. */
+   set_instr_indices(range_start->block);
+
+   /* Start looking for a new range. */
+   *first = NULL;
+   *last = NULL;
+}
+
+static bool
+is_barrier(nir_instr *instr)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_discard:
+   case nir_intrinsic_discard_if:
+   case nir_intrinsic_terminate:
+   case nir_intrinsic_terminate_if:
+      return true;
+
+   default:
+      /* Everything else is a barrier only if its name says so. */
+      /* TODO: nir_intrinsics.py could do this */
+      return strstr(nir_intrinsic_infos[intrin->intrinsic].name,
+                    "barrier") != NULL;
+   }
+}
+
+struct indirection_state
+{
+ nir_block *block; /* block of the instruction whose sources are being walked */
+ unsigned indirections; /* running maximum over the sources seen so far */
+};
+
+/* Declared early because it and gather_indirections call each other. */
+static unsigned
+get_num_indirections(nir_instr *instr);
+
+/* nir_foreach_src callback: fold one source into the running maximum
+ * indirection count kept in struct indirection_state.
+ */
+static bool
+gather_indirections(nir_src *src, void *data)
+{
+   struct indirection_state *acc = data;
+   nir_instr *producer = src->ssa->parent_instr;
+
+   /* Only sources produced in the same block are counted. */
+   if (producer->block != acc->block)
+      return true; /* nir_foreach_src should keep going */
+
+   unsigned depth = get_num_indirections(producer);
+
+   /* A producer that is itself a load adds one more level. */
+   if (producer->type == nir_instr_type_tex || is_memory_load(producer))
+      depth++;
+
+   if (depth > acc->indirections)
+      acc->indirections = depth;
+
+   return true; /* nir_foreach_src should keep going */
+}
+
+/* Compute how many load indirections lead up to an instruction within its
+ * block. The result is cached in instr->index.
+ */
+static unsigned
+get_num_indirections(nir_instr *instr)
+{
+   /* Phis are not traversed: one that points back to the current block
+    * (such as a loop body) could make the recursion infinite.
+    */
+   if (instr->type == nir_instr_type_phi)
+      return 0;
+
+   /* Any index other than UINT32_MAX is the result of an earlier visit. */
+   if (instr->index == UINT32_MAX) {
+      struct indirection_state state = {
+         .block = instr->block,
+         .indirections = 0,
+      };
+
+      nir_foreach_src(instr, gather_indirections, &state);
+      instr->index = state.indirections;
+   }
+
+   return instr->index;
+}
+
+static void
+process_block(nir_block *block, nir_load_grouping grouping,
+ unsigned max_distance)
+{
+ int max_indirection = -1; /* -1: no grouped load seen yet */
+ unsigned num_inst_per_level[256] = {0}; /* grouped loads per (clamped) indirection level */
+
+ /* Group the loads of one block: compute the indirection level of every
+ * grouped load, then group each level separately.
+ *
+ * UINT32_MAX means the instruction has not been visited. Once
+ * an instruction has been visited and its indirection level has been
+ * determined, we'll store the indirection level in the index. The next
+ * instruction that visits it will use the index instead of recomputing
+ * the indirection level, which would result in an exponential time
+ * complexity.
+ */
+ nir_foreach_instr(instr, block) {
+ instr->index = UINT32_MAX; /* unknown */
+ }
+
+ /* Count the number of load indirections for each load instruction
+ * within this block. Store it in pass_flags.
+ */
+ nir_foreach_instr(instr, block) {
+ if (is_grouped_load(instr)) {
+ unsigned indirections = get_num_indirections(instr);
+
+ /* pass_flags has only 8 bits */
+ indirections = MIN2(indirections, 255);
+ num_inst_per_level[indirections]++;
+ instr->pass_flags = indirections;
+
+ max_indirection = MAX2(max_indirection, (int)indirections);
+ }
+ }
+
+ /* 255 contains all indirection levels >= 255, so ignore them. */
+ max_indirection = MIN2(max_indirection, 254);
+
+ /* Each indirection level is grouped. */
+ for (int level = 0; level <= max_indirection; level++) {
+ if (num_inst_per_level[level] <= 1)
+ continue; /* fewer than two loads at this level: nothing to group */
+
+ set_instr_indices(block); /* from here on, index is the position within the block */
+
+ nir_instr *resource = NULL;
+ nir_instr *first_load = NULL, *last_load = NULL;
+
+ /* Find the first and last instruction that use the same
+ * resource and are within a certain distance of each other.
+ * If found, group them by moving all movable instructions
+ * between them out.
+ */
+ nir_foreach_instr(current, block) {
+ /* Don't group across barriers. */
+ if (is_barrier(current)) {
+ /* Group unconditionally. handle_load_range() clears the pointers
+ * only when it actually groups, so a lone first_load is dropped
+ * explicitly below.
+ */
+ handle_load_range(&first_load, &last_load, NULL, 0);
+ first_load = NULL;
+ last_load = NULL;
+ continue;
+ }
+
+ /* Only group load instructions with the same indirection level. */
+ if (current->pass_flags == level && is_grouped_load(current)) {
+ nir_instr *current_resource;
+
+ switch (grouping) {
+ case nir_group_all: /* the first load opens the range, every later one extends it */
+ if (!first_load)
+ first_load = current;
+ else
+ last_load = current;
+ break;
+
+ case nir_group_same_resource_only: /* only loads matching the first load's resource extend the range */
+ current_resource = get_uniform_inst_resource(current);
+
+ if (current_resource) {
+ if (!first_load) {
+ first_load = current;
+ resource = current_resource;
+ } else if (current_resource == resource) {
+ last_load = current;
+ }
+ }
+ }
+ }
+
+ /* Group only if we exceeded the maximum distance. */
+ handle_load_range(&first_load, &last_load, current, max_distance);
+ }
+
+ /* Group unconditionally: flush whatever range is still open at the end
+ * of the block.
+ */
+ handle_load_range(&first_load, &last_load, NULL, 0);
+ }
+}
+
+/* Run load grouping on every block of every function that has an
+ * implementation.
+ *
+ * max_distance is the maximum distance between the first and last instruction
+ * in a group.
+ */
+void
+nir_group_loads(nir_shader *shader, nir_load_grouping grouping,
+                unsigned max_distance)
+{
+   nir_foreach_function(function, shader) {
+      /* Functions without an implementation have nothing to process. */
+      if (!function->impl)
+         continue;
+
+      nir_foreach_block(block, function->impl) {
+         process_block(block, grouping, max_distance);
+      }
+
+      nir_metadata_preserve(function->impl,
+                            nir_metadata_block_index |
+                            nir_metadata_dominance |
+                            nir_metadata_loop_analysis);
+   }
+}
More information about the mesa-commit
mailing list