Mesa (master): r600/sfn: add lowering passes to get 64 bit ops lowered to 32 bit vec2
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Jan 12 19:37:15 UTC 2021
Module: Mesa
Branch: master
Commit: 165fb5117bf70402e66d34538d4085e060f57fea
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=165fb5117bf70402e66d34538d4085e060f57fea
Author: Gert Wollny <gert.wollny at collabora.com>
Date: Sat Nov 28 16:34:26 2020 +0100
r600/sfn: add lowering passes to get 64 bit ops lowered to 32 bit vec2
The lower_doubles and lower_int64 passes don't lower all 64 bit IO ops, nor
the merging to and splitting from 64 bit values. So here goes a bunch of
lowering passes that take care of this and also of merging IO that might
have been split.
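Roughly, r600_nir_split_64bit_io splits 64 bit loads and stores with three
or four components into a two-component part and a remainder,
r600_nir_64_to_vec2 then rewrites the remaining 64 bit values as 32 bit
vec2 (i.e. with twice the number of 32 bit components), and
r600_merge_vec2_stores fuses the resulting partial output stores back into
one store per slot, e.g.:

  load dvec4 (slot n)  =>  load dvec2 (slot n) + load dvec2 (slot n+1)
  dvec2 value          =>  32 bit vec4 value
  2x store vec2 (xy/zw of slot n)  =>  1x store vec4 (slot n)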
Signed-off-by: Gert Wollny <gert.wollny at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7824>
---
src/gallium/drivers/r600/meson.build | 1 +
src/gallium/drivers/r600/sfn/sfn_nir.h | 6 +
.../drivers/r600/sfn/sfn_nir_lower_64bit.cpp | 1064 ++++++++++++++++++++
3 files changed, 1071 insertions(+)
diff --git a/src/gallium/drivers/r600/meson.build b/src/gallium/drivers/r600/meson.build
index 880dad590cb..c5c8a99e1e0 100644
--- a/src/gallium/drivers/r600/meson.build
+++ b/src/gallium/drivers/r600/meson.build
@@ -149,6 +149,7 @@ files_r600 = files(
'sfn/sfn_liverange.h',
'sfn/sfn_nir.cpp',
'sfn/sfn_nir.h',
+ 'sfn/sfn_nir_lower_64bit.cpp',
'sfn/sfn_nir_lower_fs_out_to_vector.cpp',
'sfn/sfn_nir_lower_fs_out_to_vector.h',
'sfn/sfn_nir_lower_tess_io.cpp',
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.h b/src/gallium/drivers/r600/sfn/sfn_nir.h
index f3209109319..2740f425bd5 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.h
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.h
@@ -60,6 +60,12 @@ bool r600_lower_scratch_addresses(nir_shader *shader);
bool r600_lower_ubo_to_align16(nir_shader *shader);
+bool r600_nir_split_64bit_io(nir_shader *sh);
+
+bool r600_nir_64_to_vec2(nir_shader *sh);
+
+bool r600_merge_vec2_stores(nir_shader *shader);
+
class Shader {
public:
std::vector<InstructionBlock>& m_ir;
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp b/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp
new file mode 100644
index 00000000000..14caedd7195
--- /dev/null
+++ b/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp
@@ -0,0 +1,1064 @@
+/* -*- mesa-c++ -*-
+ *
+ * Copyright (c) 2020 Collabora LTD
+ *
+ * Author: Gert Wollny <gert.wollny at collabora.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sfn_nir.h"
+
+#include "nir.h"
+#include "nir_builder.h"
+
+#include <map>
+#include <vector>
+#include <iostream>
+
+namespace r600 {
+
+using std::map;
+using std::pair;
+using std::make_pair;
+using std::vector;
+
+class LowerSplit64BitVar : public NirLowerInstruction {
+public:
+
+ ~LowerSplit64BitVar();
+ using VarSplit = pair<nir_variable*, nir_variable*>;
+ using VarMap = map<unsigned, VarSplit>;
+
+ nir_ssa_def *
+ split_double_load_deref(nir_intrinsic_instr *intr);
+
+ nir_ssa_def *
+ split_double_store_deref(nir_intrinsic_instr *intr);
+
+private:
+ nir_ssa_def *
+ split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);
+
+ nir_ssa_def *
+ split_load_deref_var(nir_intrinsic_instr *intr);
+
+ nir_ssa_def *
+ split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref);
+
+ nir_ssa_def *
+ split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);
+
+ VarSplit get_var_pair(nir_variable *old_var);
+
+ nir_ssa_def *
+ merge_64bit_loads(nir_ssa_def *load1, nir_ssa_def *load2, bool out_is_vec3);
+
+ nir_ssa_def *split_double_load(nir_intrinsic_instr *load1);
+
+ nir_ssa_def *
+ split_store_output(nir_intrinsic_instr *store1);
+
+ nir_ssa_def *split_double_load_uniform(nir_intrinsic_instr *intr);
+
+ nir_ssa_def *
+ split_double_load_ssbo(nir_intrinsic_instr *intr);
+
+ nir_ssa_def *
+ split_double_load_ubo(nir_intrinsic_instr *intr);
+
+ nir_ssa_def *
+ split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);
+
+ nir_ssa_def *
+ split_reduction3(nir_alu_instr *alu,
+ nir_op op1, nir_op op2, nir_op reduction);
+
+ nir_ssa_def *
+ split_reduction4(nir_alu_instr *alu,
+ nir_op op1, nir_op op2, nir_op reduction);
+
+ nir_ssa_def *split_bcsel(nir_alu_instr *alu);
+
+ nir_ssa_def *split_load_const(nir_load_const_instr *lc);
+
+ bool filter(const nir_instr *instr) const override;
+ nir_ssa_def *lower(nir_instr *instr) override;
+
+ VarMap m_varmap;
+ vector<nir_variable*> m_old_vars;
+ vector<nir_instr *> m_old_stores;
+};
+
+
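+/* Select the instructions this pass splits: 64 bit IO (deref, input,
+ * uniform, UBO, and SSBO loads, plus deref and output stores) with three
+ * or four components, a handful of vec3/vec4 reductions, bcsel, and
+ * load_const. Everything with at most two 64 bit components is left for
+ * the vec2 lowering further below. */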
+bool
+LowerSplit64BitVar::filter(const nir_instr *instr) const
+{
+ switch (instr->type) {
+ case nir_instr_type_intrinsic: {
+ auto intr = nir_instr_as_intrinsic(instr);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_deref:
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ssbo:
+ if (nir_dest_bit_size(intr->dest) != 64)
+ return false;
+ return nir_dest_num_components(intr->dest) >= 3;
+ case nir_intrinsic_store_output:
+ if (nir_src_bit_size(intr->src[0]) != 64)
+ return false;
+ return nir_src_num_components(intr->src[0]) >= 3;
+ case nir_intrinsic_store_deref:
+ if (nir_src_bit_size(intr->src[1]) != 64)
+ return false;
+ return nir_src_num_components(intr->src[1]) >= 3;
+ default:
+ return false;
+ }
+ }
+ case nir_instr_type_alu: {
+ auto alu = nir_instr_as_alu(instr);
+ switch (alu->op) {
+ case nir_op_bcsel:
+ if (nir_dest_num_components(alu->dest.dest) < 3)
+ return false;
+ return nir_dest_bit_size(alu->dest.dest) == 64;
+ case nir_op_bany_fnequal3:
+ case nir_op_bany_fnequal4:
+ case nir_op_ball_fequal3:
+ case nir_op_ball_fequal4:
+ case nir_op_bany_inequal3:
+ case nir_op_bany_inequal4:
+ case nir_op_ball_iequal3:
+ case nir_op_ball_iequal4:
+ case nir_op_fdot3:
+ case nir_op_fdot4:
+ return nir_src_bit_size(alu->src[1].src) == 64;
+ default:
+ return false;
+ }
+ }
+ case nir_instr_type_load_const: {
+ auto lc = nir_instr_as_load_const(instr);
+ if (lc->def.bit_size != 64)
+ return false;
+ return lc->def.num_components >= 3;
+ }
+ default:
+ return false;
+ }
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::merge_64bit_loads(nir_ssa_def *load1,
+ nir_ssa_def *load2, bool out_is_vec3)
+{
+ if (out_is_vec3)
+ return nir_vec3(b, nir_channel(b, load1, 0),
+ nir_channel(b, load1, 1),
+ nir_channel(b, load2, 0));
+ else
+ return nir_vec4(b, nir_channel(b, load1, 0),
+ nir_channel(b, load1, 1),
+ nir_channel(b, load2, 0),
+ nir_channel(b, load2, 1));
+}
+
+LowerSplit64BitVar::~LowerSplit64BitVar()
+{
+ for(auto&& v: m_old_vars)
+ exec_node_remove(&v->node);
+
+ for(auto&& v: m_old_stores)
+ nir_instr_remove(v);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
+{
+ auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+ if (deref->deref_type == nir_deref_type_var)
+ return split_store_deref_var(intr, deref);
+ else if (deref->deref_type == nir_deref_type_array)
+ return split_store_deref_array(intr, deref);
+ else {
+ assert(0 && "only splitting of stores to vars and arrays is supported");
+ }
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
+{
+ auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+ if (deref->deref_type == nir_deref_type_var)
+ return split_load_deref_var(intr);
+ else if (deref->deref_type == nir_deref_type_array)
+ return split_load_deref_array(intr, deref->arr.index);
+ else {
+ assert(0 && "only splitting of loads from vars and arrays is supported");
+ return nullptr;
+ }
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
+{
+ auto old_var = nir_intrinsic_get_var(intr, 0);
+ unsigned old_components = old_var->type->without_array()->components();
+
+ assert(old_components > 2 && old_components <= 4);
+
+ auto vars = get_var_pair(old_var);
+
+ auto deref1 = nir_build_deref_var(b, vars.first);
+ auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, index, 1));
+ auto load1 = nir_build_load_deref(b, 2, 64, &deref_array1->dest.ssa, (enum gl_access_qualifier)0);
+
+ auto deref2 = nir_build_deref_var(b, vars.second);
+ auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, index, 1));
+
+ auto load2 = nir_build_load_deref(b, old_components - 2, 64, &deref_array2->dest.ssa, (enum gl_access_qualifier)0);
+
+ return merge_64bit_loads(load1, load2, old_components == 3);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref)
+{
+ auto old_var = nir_intrinsic_get_var(intr, 0);
+ unsigned old_components = old_var->type->without_array()->components();
+
+ assert(old_components > 2 && old_components <= 4);
+
+ auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
+
+ auto vars = get_var_pair(old_var);
+
+ auto deref1 = nir_build_deref_var(b, vars.first);
+ auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, deref->arr.index, 1));
+
+ nir_build_store_deref(b, &deref_array1->dest.ssa, src_xy, 3);
+
+ auto deref2 = nir_build_deref_var(b, vars.second);
+ auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, deref->arr.index, 1));
+
+ if (old_components == 3)
+ nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
+ else
+ nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
+
+ return progress_replace;
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref)
+{
+ auto old_var = nir_intrinsic_get_var(intr, 0);
+ unsigned old_components = old_var->type->without_array()->components();
+
+ assert(old_components > 2 && old_components <= 4);
+
+ auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
+
+ auto vars = get_var_pair(old_var);
+
+ auto deref1 = nir_build_deref_var(b, vars.first);
+ nir_build_store_deref(b, &deref1->dest.ssa, src_xy, 3);
+
+ auto deref2 = nir_build_deref_var(b, vars.second);
+ if (old_components == 3)
+ nir_build_store_deref(b, &deref2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
+ else
+ nir_build_store_deref(b, &deref2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
+
+ return progress_replace;
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
+{
+ auto old_var = nir_intrinsic_get_var(intr, 0);
+ auto vars = get_var_pair(old_var);
+ unsigned old_components = old_var->type->components();
+
+ nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
+ auto *load1 = nir_load_deref(b, deref1);
+
+ nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
+ deref2->type = vars.second->type;
+
+ auto *load2 = nir_load_deref(b, deref2);
+
+ return merge_64bit_loads(load1, load2, old_components == 3);
+}
+
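+/* Look up (or create on first use) the pair of variables that replaces a
+ * 64 bit variable with more than two components: e.g. a dvec4 becomes a
+ * dvec2 in the original slot plus a dvec2 in the following one, hence the
+ * bumped driver_location and location for shader inputs and outputs. */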
+LowerSplit64BitVar::VarSplit
+LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
+{
+ auto split_vars = m_varmap.find(old_var->data.driver_location);
+
+ assert(old_var->type->without_array()->components() > 2);
+
+ if (split_vars == m_varmap.end()) {
+ auto var1 = nir_variable_clone(old_var, b->shader);
+ auto var2 = nir_variable_clone(old_var, b->shader);
+
+ var1->type = glsl_dvec_type(2);
+ var2->type = glsl_dvec_type(old_var->type->without_array()->components() - 2);
+
+ if (old_var->type->is_array()) {
+ var1->type = glsl_array_type(var1->type, old_var->type->array_size(), 0);
+ var2->type = glsl_array_type(var2->type, old_var->type->array_size(), 0);
+ }
+
+ if (old_var->data.mode == nir_var_shader_in ||
+ old_var->data.mode == nir_var_shader_out) {
+ ++var2->data.driver_location;
+ ++var2->data.location;
+ nir_shader_add_variable(b->shader, var1);
+ nir_shader_add_variable(b->shader, var2);
+ } else if (old_var->data.mode == nir_var_function_temp) {
+ exec_list_push_tail(&b->impl->locals, &var1->node);
+ exec_list_push_tail(&b->impl->locals, &var2->node);
+ }
+
+ m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
+ }
+ return m_varmap[old_var->data.driver_location];
+}
+
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
+{
+ unsigned old_components = nir_dest_num_components(load1->dest);
+ auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
+ nir_io_semantics sem = nir_intrinsic_io_semantics(load1);
+
+ load1->dest.ssa.num_components = 2;
+ sem.num_slots = 1;
+ nir_intrinsic_set_io_semantics(load1, sem);
+
+ load2->dest.ssa.num_components = old_components - 2;
+ sem.location += 1;
+ nir_intrinsic_set_io_semantics(load2, sem);
+ nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
+ nir_builder_instr_insert(b, &load2->instr);
+
+ return merge_64bit_loads(&load1->dest.ssa, &load2->dest.ssa, old_components == 3);
+}
+
+
+nir_ssa_def *
+LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
+{
+ auto src = store1->src[0];
+ unsigned old_components = nir_src_num_components(src);
+ nir_io_semantics sem = nir_intrinsic_io_semantics(store1);
+
+ auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
+ auto src1 = nir_channels(b, src.ssa, 3);
+ auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);
+
+ nir_instr_rewrite_src(&store1->instr, &src, nir_src_for_ssa(src1));
+ nir_intrinsic_set_write_mask(store1, 3);
+
+ nir_instr_rewrite_src(&store2->instr, &src, nir_src_for_ssa(src2));
+ nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);
+
+ sem.num_slots = 1;
+ nir_intrinsic_set_io_semantics(store1, sem);
+
+ sem.location += 1;
+ nir_intrinsic_set_io_semantics(store2, sem);
+ nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));
+
+ nir_builder_instr_insert(b, &store2->instr);
+ return progress_keep;
+}
+
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
+{
+ unsigned second_components = nir_dest_num_components(intr->dest) - 2;
+ nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
+ load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
+ nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
+ nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
+ nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
+ load2->num_components = second_components;
+
+ nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
+ nir_builder_instr_insert(b, &load2->instr);
+
+ intr->dest.ssa.num_components = intr->num_components = 2;
+
+ if (second_components == 1)
+ return nir_vec3(b, nir_channel(b, &intr->dest.ssa, 0),
+ nir_channel(b, &intr->dest.ssa, 1),
+ nir_channel(b, &load2->dest.ssa, 0));
+ else
+ return nir_vec4(b, nir_channel(b, &intr->dest.ssa, 0),
+ nir_channel(b, &intr->dest.ssa, 1),
+ nir_channel(b, &load2->dest.ssa, 0),
+ nir_channel(b, &load2->dest.ssa, 1));
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
+{
+ unsigned second_components = nir_dest_num_components(intr->dest) - 2;
+ nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
+
+ auto new_src0 = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
+ nir_instr_rewrite_src(&load2->instr, &load2->src[0], new_src0);
+ load2->num_components = second_components;
+ nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
+
+ nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
+ nir_builder_instr_insert(b, &load2->instr);
+
+ intr->dest.ssa.num_components = intr->num_components = 2;
+
+ return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
+}
+
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
+{
+ unsigned second_components = nir_dest_num_components(intr->dest) - 2;
+ nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
+ load2->src[0] = intr->src[0];
+ load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
+ nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
+ nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
+ nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
+ nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
+ nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr) + 16);
+
+ load2->num_components = second_components;
+
+ nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
+ nir_builder_instr_insert(b, &load2->instr);
+
+ intr->dest.ssa.num_components = intr->num_components = 2;
+
+ return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
+}
+
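+/* Split a vec3/vec4 reduction into reductions of the two halves, combined
+ * by the reduction opcode, e.g. fdot4(a, b) becomes
+ * fadd(fdot2(a.xy, b.xy), fdot2(a.zw, b.zw)), and ball_fequal3(a, b)
+ * becomes iand(ball_fequal2(a.xy, b.xy), feq(a.z, b.z)). */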
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction)
+{
+ auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
+ auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
+ return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
+ nir_op op1, nir_op op2, nir_op reduction)
+{
+ nir_ssa_def *src[2][2];
+
+ src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
+ src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
+
+ src[1][0] = nir_channel(b, nir_ssa_for_src(b, alu->src[0].src, 3), 2);
+ src[1][1] = nir_channel(b, nir_ssa_for_src(b, alu->src[1].src, 3), 2);
+
+ return split_reduction(src, op1, op2, reduction);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
+ nir_op op1, nir_op op2, nir_op reduction)
+{
+ nir_ssa_def *src[2][2];
+
+ src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
+ src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
+
+ src[1][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 4), 0xc);
+ src[1][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 4), 0xc);
+
+ return split_reduction(src, op1, op2, reduction);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
+{
+ nir_ssa_def *dest[4];
+ for (unsigned i = 0; i < nir_dest_num_components(alu->dest.dest); ++i) {
+ dest[i] = nir_bcsel(b,
+ nir_channel(b, alu->src[0].src.ssa, i),
+ nir_channel(b, alu->src[1].src.ssa, i),
+ nir_channel(b, alu->src[2].src.ssa, i));
+ }
+ return nir_vec(b, dest, nir_dest_num_components(alu->dest.dest));
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
+{
+ nir_ssa_def *ir[4];
+ for (unsigned i = 0; i < lc->def.num_components; ++i)
+ ir[i] = nir_imm_double(b, lc->value[i].f64);
+
+ return nir_vec(b, ir, lc->def.num_components);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::lower(nir_instr *instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_intrinsic: {
+ auto intr = nir_instr_as_intrinsic(instr);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_deref:
+ return this->split_double_load_deref(intr);
+ case nir_intrinsic_load_uniform:
+ return split_double_load_uniform(intr);
+ case nir_intrinsic_load_ubo:
+ return split_double_load_ubo(intr);
+ case nir_intrinsic_load_ssbo:
+ return split_double_load_ssbo(intr);
+ case nir_intrinsic_load_input:
+ return split_double_load(intr);
+ case nir_intrinsic_store_output:
+ return split_store_output(intr);
+ case nir_intrinsic_store_deref:
+ return split_double_store_deref(intr);
+ default:
+ assert(0);
+ }
+ }
+ case nir_instr_type_alu: {
+ auto alu = nir_instr_as_alu(instr);
+ switch (alu->op) {
+ case nir_op_bany_fnequal3:
+ return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
+ case nir_op_ball_fequal3:
+ return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
+ case nir_op_bany_inequal3:
+ return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
+ case nir_op_ball_iequal3:
+ return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
+ case nir_op_fdot3:
+ return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
+ case nir_op_bany_fnequal4:
+ return split_reduction4(alu, nir_op_bany_fnequal2, nir_op_bany_fnequal2, nir_op_ior);
+ case nir_op_ball_fequal4:
+ return split_reduction4(alu, nir_op_ball_fequal2, nir_op_ball_fequal2, nir_op_iand);
+ case nir_op_bany_inequal4:
+ return split_reduction4(alu, nir_op_bany_inequal2, nir_op_bany_inequal2, nir_op_ior);
+ case nir_op_ball_iequal4:
+ return split_reduction4(alu, nir_op_ball_iequal2, nir_op_ball_iequal2, nir_op_iand);
+ case nir_op_fdot4:
+ return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
+ case nir_op_bcsel:
+ return split_bcsel(alu);
+ default:
+ assert(0);
+ }
+ }
+ case nir_instr_type_load_const: {
+ auto lc = nir_instr_as_load_const(instr);
+ return split_load_const(lc);
+ }
+ default:
+ assert(0);
+ }
+ return nullptr;
+}
+
+/* Split 64 bit instructions so that at most two 64 bit components are
+ * used in one instruction */
+
+bool
+r600_nir_split_64bit_io(nir_shader *sh)
+{
+ return LowerSplit64BitVar().run(sh);
+}
+
+/* Lower all remaining 64 bit operations to operations on twice as many
+ * 32 bit components. */
+class Lower64BitToVec2 : public NirLowerInstruction {
+
+private:
+ bool filter(const nir_instr *instr) const override;
+ nir_ssa_def *lower(nir_instr *instr) override;
+
+ nir_ssa_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
+ nir_ssa_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
+ nir_ssa_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
+ nir_ssa_def *load_64_to_vec2(nir_intrinsic_instr *intr);
+ nir_ssa_def *store_64_to_vec2(nir_intrinsic_instr *intr);
+};
+
+bool
+Lower64BitToVec2::filter(const nir_instr *instr) const
+{
+ switch (instr->type) {
+ case nir_instr_type_intrinsic: {
+ auto intr = nir_instr_as_intrinsic(instr);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_deref:
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ubo_vec4:
+ case nir_intrinsic_load_ssbo:
+ return nir_dest_bit_size(intr->dest) == 64;
+ case nir_intrinsic_store_deref: {
+ if (nir_src_bit_size(intr->src[1]) == 64)
+ return true;
+ auto var = nir_intrinsic_get_var(intr, 0);
+ if (var->type->without_array()->bit_size() == 64)
+ return true;
+ return (var->type->without_array()->components() != intr->num_components);
+ }
+ default:
+ return false;
+ }
+ }
+ case nir_instr_type_alu: {
+ auto alu = nir_instr_as_alu(instr);
+ return nir_dest_bit_size(alu->dest.dest) == 64;
+ }
+ case nir_instr_type_phi: {
+ auto phi = nir_instr_as_phi(instr);
+ return nir_dest_bit_size(phi->dest) == 64;
+ }
+ case nir_instr_type_load_const: {
+ auto lc = nir_instr_as_load_const(instr);
+ return lc->def.bit_size == 64;
+ }
+ case nir_instr_type_ssa_undef: {
+ auto undef = nir_instr_as_ssa_undef(instr);
+ return undef->def.bit_size == 64;
+ }
+ default:
+ return false;
+ }
+}
+
+nir_ssa_def *
+Lower64BitToVec2::lower(nir_instr *instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_intrinsic: {
+ auto intr = nir_instr_as_intrinsic(instr);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_deref:
+ return load_deref_64_to_vec2(intr);
+ case nir_intrinsic_load_uniform:
+ return load_uniform_64_to_vec2(intr);
+ case nir_intrinsic_load_ssbo:
+ return load_ssbo_64_to_vec2(intr);
+ case nir_intrinsic_load_input:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ubo_vec4:
+ return load_64_to_vec2(intr);
+ case nir_intrinsic_store_deref:
+ return store_64_to_vec2(intr);
+ default:
+ return nullptr;
+ }
+ }
+ case nir_instr_type_alu: {
+ auto alu = nir_instr_as_alu(instr);
+ alu->dest.dest.ssa.bit_size = 32;
+ alu->dest.dest.ssa.num_components *= 2;
+ alu->dest.write_mask = (1 << alu->dest.dest.ssa.num_components) - 1;
+ switch (alu->op) {
+ case nir_op_pack_64_2x32_split:
+ alu->op = nir_op_vec2;
+ break;
+ case nir_op_pack_64_2x32:
+ alu->op = nir_op_mov;
+ break;
+ case nir_op_vec2:
+ return nir_vec4(b,
+ nir_channel(b, alu->src[0].src.ssa, 0),
+ nir_channel(b, alu->src[0].src.ssa, 1),
+ nir_channel(b, alu->src[1].src.ssa, 0),
+ nir_channel(b, alu->src[1].src.ssa, 1));
+ default:
+ return NULL;
+ }
+ return progress_keep;
+ }
+ case nir_instr_type_phi: {
+ auto phi = nir_instr_as_phi(instr);
+ phi->dest.ssa.bit_size = 32;
+ phi->dest.ssa.num_components = 2;
+ return progress_keep;
+ }
+ case nir_instr_type_load_const: {
+ auto lc = nir_instr_as_load_const(instr);
+ assert(lc->def.num_components < 3);
+ nir_const_value val[4] = {0};
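+ /* Each 64 bit component is emitted as two 32 bit words, low word first,
+ * so e.g. the double 1.0 (0x3ff0000000000000) becomes
+ * (0x00000000, 0x3ff00000). */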
+ for (uint i = 0; i < lc->def.num_components; ++i) {
+ uint64_t v = lc->value[i].u64;
+ val[2 * i].u32 = v & 0xffffffff;
+ val[2 * i + 1].u32 = (v >> 32) & 0xffffffff;
+ }
+
+ return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
+ }
+ case nir_instr_type_ssa_undef: {
+ auto undef = nir_instr_as_ssa_undef(instr);
+ undef->def.num_components *= 2;
+ undef->def.bit_size = 32;
+ return progress_keep;
+ }
+ default:
+ return nullptr;
+ }
+
+}
+
+
+nir_ssa_def *
+Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
+{
+ auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+ auto var = nir_intrinsic_get_var(intr, 0);
+ unsigned components = var->type->without_array()->components();
+ if (var->type->without_array()->bit_size() == 64) {
+ components *= 2;
+ if (deref->deref_type == nir_deref_type_var) {
+ var->type = glsl_vec_type(components);
+ } else if (deref->deref_type == nir_deref_type_array) {
+
+ var->type = glsl_array_type(glsl_vec_type(components),
+ var->type->array_size(), 0);
+
+ } else {
+ nir_print_shader(b->shader, stderr);
+ assert(0 && "Only lowring of var and array derefs supported\n");
+ }
+ }
+ deref->type = var->type;
+ if (deref->deref_type == nir_deref_type_array) {
+ auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
+ deref_array->type = var->type;
+ deref->type = deref_array->type->without_array();
+ }
+
+ intr->num_components = components;
+ intr->dest.ssa.bit_size = 32;
+ intr->dest.ssa.num_components = components;
+ return progress_keep;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
+{
+ auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+ auto var = nir_intrinsic_get_var(intr, 0);
+
+ unsigned components = var->type->without_array()->components();
+ unsigned wrmask = nir_intrinsic_write_mask(intr);
+ if (var->type->without_array()->bit_size() == 64) {
+ components *= 2;
+ if (deref->deref_type == nir_deref_type_var) {
+ var->type = glsl_vec_type(components);
+ } else if (deref->deref_type == nir_deref_type_array) {
+ var->type = glsl_array_type(glsl_vec_type(components),
+ var->type->array_size(), 0);
+ } else {
+ nir_print_shader(b->shader, stderr);
+ assert(0 && "Only lowring of var and array derefs supported\n");
+ }
+ }
+ deref->type = var->type;
+ if (deref->deref_type == nir_deref_type_array) {
+ auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
+ deref_array->type = var->type;
+ deref->type = deref_array->type->without_array();
+ }
+ intr->num_components = components;
+ nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
+ return progress_keep;
+}
+
+
+nir_ssa_def *
+Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
+{
+ intr->num_components *= 2;
+ intr->dest.ssa.bit_size = 32;
+ intr->dest.ssa.num_components *= 2;
+ nir_intrinsic_set_dest_type(intr, nir_type_float32);
+ return progress_keep;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
+{
+ intr->num_components *= 2;
+ intr->dest.ssa.bit_size = 32;
+ intr->dest.ssa.num_components *= 2;
+ nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
+ return progress_keep;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
+{
+ intr->num_components *= 2;
+ intr->dest.ssa.bit_size = 32;
+ intr->dest.ssa.num_components *= 2;
+ return progress_keep;
+}
+
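+/* Callback for nir_foreach_src: record in *state whether an instruction
+ * has a 64 bit source, and stop scanning as soon as one is found. */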
+static bool store_64bit_intr(nir_src *src, void *state)
+{
+ bool *s = (bool *)state;
+ *s = nir_src_bit_size(*src) == 64;
+ return !*s;
+}
+
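+/* Callback for nir_foreach_src: retype any 64 bit source in place to twice
+ * as many 32 bit components. */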
+static bool double2vec2(nir_src *src, void *state)
+{
+ if (nir_src_bit_size(*src) != 64)
+ return true;
+
+ assert(src->is_ssa);
+ src->ssa->bit_size = 32;
+ src->ssa->num_components *= 2;
+ return true;
+}
+
+bool
+r600_nir_64_to_vec2(nir_shader *sh)
+{
+ vector<nir_instr*> intr64bit;
+ nir_foreach_function(function, sh) {
+ if (function->impl) {
+ nir_builder b;
+ nir_builder_init(&b, function->impl);
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ switch (instr->type) {
+ case nir_instr_type_alu: {
+ bool success = false;
+ nir_foreach_src(instr, store_64bit_intr, &success);
+ if (success)
+ intr64bit.push_back(instr);
+ break;
+ }
+ case nir_instr_type_intrinsic: {
+ auto ir = nir_instr_as_intrinsic(instr);
+ switch (ir->intrinsic) {
+ case nir_intrinsic_store_output:
+ case nir_intrinsic_store_ssbo: {
+ bool success = false;
+ nir_foreach_src(instr, store_64bit_intr, &success);
+ if (success) {
+ auto wm = nir_intrinsic_write_mask(ir);
+ nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
+ ir->num_components *= 2;
+ }
+ break;
+ }
+ default:
+ ;
+ }
+ }
+ default:
+ ;
+ }
+ }
+ }
+ }
+ }
+
+ bool result = Lower64BitToVec2().run(sh);
+
+ if (result || !intr64bit.empty()) {
+
+ for(auto&& instr: intr64bit) {
+ if (instr->type == nir_instr_type_alu) {
+ auto alu = nir_instr_as_alu(instr);
+ auto alu_info = nir_op_infos[alu->op];
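+ /* Every 64 bit channel now occupies two 32 bit channels, so a swizzle
+ * selecting old channel k has to select channels 2k and 2k+1 (e.g. .y
+ * becomes .zw), while unpack_64_2x32_split_x/y degenerate to movs that
+ * select only the even resp. odd halves. */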
+ for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
+ int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
+ for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
+ if (!nir_alu_instr_channel_used(alu, i, k)) {
+ continue;
+ }
+
+ switch (alu->op) {
+ case nir_op_unpack_64_2x32_split_x:
+ swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
+ alu->op = nir_op_mov;
+ break;
+ case nir_op_unpack_64_2x32_split_y:
+ swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
+ alu->op = nir_op_mov;
+ break;
+ case nir_op_unpack_64_2x32:
+ alu->op = nir_op_mov;
+ break;
+ case nir_op_bcsel:
+ if (i == 0) {
+ swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
+ break;
+ }
+ /* fallthrough */
+ default:
+ swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
+ swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
+ }
+ }
+ for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
+ alu->src[i].swizzle[k] = swizzle[k];
+ }
+ }
+ } else
+ nir_foreach_src(instr, double2vec2, nullptr);
+ }
+ result = true;
+ }
+
+ return result;
+}
+
+using std::map;
+using std::vector;
+using std::pair;
+
+class StoreMerger {
+public:
+ StoreMerger(nir_shader *shader);
+ void collect_stores();
+ bool combine();
+ void combine_one_slot(vector<nir_intrinsic_instr*>& stores);
+
+ using StoreCombos = map<unsigned, vector<nir_intrinsic_instr*>>;
+
+ StoreCombos m_stores;
+ nir_shader *sh;
+ nir_builder b;
+};
+
+StoreMerger::StoreMerger(nir_shader *shader):
+ sh(shader)
+{
+}
+
+
+void StoreMerger::collect_stores()
+{
+ unsigned vertex = 0;
+ nir_foreach_function(function, sh) {
+ if (function->impl) {
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ auto ir = nir_instr_as_intrinsic(instr);
+ if (ir->intrinsic == nir_intrinsic_emit_vertex ||
+ ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
+ ++vertex;
+ continue;
+ }
+ if (ir->intrinsic != nir_intrinsic_store_output)
+ continue;
+
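+ /* Group stores by output slot; for geometry shaders the emitted vertex
+ * and the stream are folded into the key, so only stores belonging to
+ * the same slot, vertex, and stream can be merged. */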
+ unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
+ 8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
+ m_stores[index].push_back(ir);
+ }
+ }
+ }
+ }
+}
+
+bool StoreMerger::combine()
+{
+ bool progress = false;
+ for(auto&& i : m_stores) {
+ if (i.second.size() < 2)
+ continue;
+
+ combine_one_slot(i.second);
+ progress = true;
+ }
+ return progress;
+}
+
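+/* Rewrite the last store of a slot to write all collected components at
+ * once, then drop the other stores: e.g. two vec2 stores covering the xy
+ * and zw components of one slot become a single vec4 store with write
+ * mask 0xf. */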
+void StoreMerger::combine_one_slot(vector<nir_intrinsic_instr*>& stores)
+{
+ nir_ssa_def *srcs[4] = {nullptr};
+
+ nir_builder b;
+ nir_builder_init(&b, nir_shader_get_entrypoint(sh));
+ auto last_store = *stores.rbegin();
+
+ b.cursor = nir_before_instr(&last_store->instr);
+
+ unsigned comps = 0;
+ unsigned writemask = 0;
+ unsigned first_comp = 4;
+ for (auto&& store : stores) {
+ int cmp = nir_intrinsic_component(store);
+ for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
+ unsigned out_comp = i + cmp;
+ srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
+ writemask |= 1 << out_comp;
+ if (first_comp > out_comp)
+ first_comp = out_comp;
+ }
+ }
+
+ auto new_src = nir_vec(&b, srcs, comps);
+
+ nir_instr_rewrite_src(&last_store->instr, &last_store->src[0], nir_src_for_ssa(new_src));
+ last_store->num_components = comps;
+ nir_intrinsic_set_component(last_store, first_comp);
+ nir_intrinsic_set_write_mask(last_store, writemask);
+
+ for (auto i = stores.begin(); i != stores.end() - 1; ++i)
+ nir_instr_remove(&(*i)->instr);
+}
+
+bool r600_merge_vec2_stores(nir_shader *shader)
+{
+ r600::StoreMerger merger(shader);
+ merger.collect_stores();
+ return merger.combine();
+}
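+
+/* A sketch of the intended pass order; the actual call sites live in the
+ * r600 NIR pipeline and are not part of this patch, so take this as an
+ * assumption:
+ *
+ * r600_nir_split_64bit_io(sh); // at most two 64 bit components per op
+ * r600_nir_64_to_vec2(sh); // rewrite 64 bit values as 32 bit vec2
+ * ... // other lowering and optimization passes
+ * r600_merge_vec2_stores(sh); // re-merge the output stores split above
+ */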
+
+} // end namespace r600
+
+