Mesa (master): r600/sfn: Use load_ubo_vec4 lowering pass

Thu Sep 17 10:27:29 UTC 2020

Module: Mesa
Branch: master
Commit: 18e97817148eb376274a8749c03b45c2f817c139
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=18e97817148eb376274a8749c03b45c2f817c139

Author: Gert Wollny <gert.wollny at collabora.com>
Date:   Sun Sep  6 23:40:24 2020 +0200

r600/sfn: Use load_ubo_vec4 lowering pass

This replaces the r600-specific lowering pass that aligned UBO loads to
16-byte boundaries with the common nir_lower_ubo_vec4 pass.

v2: use nir functions to query constants in ubo_vec4 (Eric)

Signed-off-by: Gert Wollny <gert.wollny at collabora.com>
Reviewed-by: Eric Anholt <eric at anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6743>

---

 src/gallium/drivers/r600/sfn/sfn_nir.cpp         | 100 +----------------------
 src/gallium/drivers/r600/sfn/sfn_shader_base.cpp |  55 +++++++------
 src/gallium/drivers/r600/sfn/sfn_shader_base.h   |   2 +-
 3 files changed, 31 insertions(+), 126 deletions(-)

diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index 7c88b4a4b49..091f4ffface 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -390,100 +390,6 @@ bool r600_lower_scratch_addresses(nir_shader *shader)
    return progress;
 }
 
-static nir_ssa_def *
-r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options)
-{
-   b->cursor = nir_before_instr(instr);
-
-   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
-   assert(op->intrinsic == nir_intrinsic_load_ubo);
-
-   bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0]));
-
-   nir_ssa_def *offset = op->src[1].ssa;
-
-   /* This is ugly: With const addressing we can actually set a proper fetch target mask,
-    * but for this we need the component encoded, we don't shift and do de decoding in the
-    * backend. Otherwise we shift by four and resolve the component here
-    * (TODO: encode the start component in the intrinsic when the offset base is non-constant
-    * but a multiple of 16 */
-
-   nir_ssa_def *new_offset = offset;
-   if (!const_address)
-      new_offset = nir_ishr(b, offset,  nir_imm_int(b, 4));
-
-   nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600);
-   load->num_components = const_address ? op->num_components : 4;
-   load->src[0] = op->src[0];
-   load->src[1] = nir_src_for_ssa(new_offset);
-   nir_intrinsic_set_align(load, nir_intrinsic_align_mul(op), nir_intrinsic_align_offset(op));
-
-   nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL);
-   nir_builder_instr_insert(b, &load->instr);
-
-   /* when four components are loaded or both the offset and the location
-    * are constant, then the backend can deal with it better */
-   if (op->num_components == 4 || const_address)
-      return &load->dest.ssa;
-
-   /* What comes below is a performance disaster when the offset is not constant
-    * because then we have to assume that any component can be the first one and we
-    * have to pick the result manually. */
-   nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset,  nir_imm_int(b, 2)),
-                                      nir_imm_int(b,3));
-
-   const unsigned swz_000[4] = {0, 0, 0};
-   nir_ssa_def *component_select = nir_ieq(b, r600_imm_ivec3(b, 0, 1, 2),
-                                           nir_swizzle(b, first_comp, swz_000, 3));
-
-   if (op->num_components == 1) {
-      nir_ssa_def *check0 = nir_bcsel(b, nir_channel(b, component_select, 0),
-                                      nir_channel(b, &load->dest.ssa, 0),
-                                      nir_channel(b, &load->dest.ssa, 3));
-      nir_ssa_def *check1 = nir_bcsel(b, nir_channel(b, component_select, 1),
-                                      nir_channel(b, &load->dest.ssa, 1),
-                                      check0);
-      return nir_bcsel(b, nir_channel(b, component_select, 2),
-                       nir_channel(b, &load->dest.ssa, 2),
-                       check1);
-   } else if (op->num_components == 2) {
-      const unsigned szw_01[2] = {0, 1};
-      const unsigned szw_12[2] = {1, 2};
-      const unsigned szw_23[2] = {2, 3};
-
-      nir_ssa_def *check0 = nir_bcsel(b, nir_channel(b, component_select, 0),
-                                      nir_swizzle(b, &load->dest.ssa, szw_01, 2),
-                                      nir_swizzle(b, &load->dest.ssa, szw_23, 2));
-      return nir_bcsel(b, nir_channel(b, component_select, 1),
-                                      nir_swizzle(b, &load->dest.ssa, szw_12, 2),
-                                      check0);
-   } else {
-      const unsigned szw_012[3] = {0, 1, 2};
-      const unsigned szw_123[3] = {1, 2, 3};
-      return nir_bcsel(b, nir_channel(b, component_select, 0),
-                       nir_swizzle(b, &load->dest.ssa, szw_012, 3),
-                       nir_swizzle(b, &load->dest.ssa, szw_123, 3));
-   }
-}
-
-bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options)
-{
-   if (instr->type != nir_instr_type_intrinsic)
-      return false;
-
-   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
-   return op->intrinsic == nir_intrinsic_load_ubo;
-}
-
-
-bool r600_lower_ubo_to_align16(nir_shader *shader)
-{
-   return nir_shader_lower_instructions(shader,
-                                        r600_lower_ubo_to_align16_filter,
-                                        r600_lower_ubo_to_align16_impl,
-                                        nullptr);
-}
-
 static void
 insert_uniform_sorted(struct exec_list *var_list, nir_variable *new_var)
 {
@@ -875,10 +781,8 @@ int r600_shader_from_nir(struct r600_context *rctx,
    const nir_function *func = reinterpret_cast<const nir_function *>(exec_list_get_head_const(&sel->nir->functions));
    assert(func->impl->registers.length() == 0 && !has_saturate(func));
 
-   if (true) {
-      optimize_once(sel->nir);
-      NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16);
-   }
+   NIR_PASS_V(sel->nir, nir_lower_ubo_vec4);
+
    /* It seems the output of this optimization is cached somewhere, and
     * when there are registers, then we can no longer copy propagate, so
     * skip the optimization then. (There is probably a better way, but yeah)
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
index 04953c1679f..13a4c688ddf 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
@@ -634,8 +634,8 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins
    case nir_intrinsic_discard:
    case nir_intrinsic_discard_if:
       return emit_discard_if(instr);
-   case nir_intrinsic_load_ubo_r600:
-      return emit_load_ubo(instr);
+   case nir_intrinsic_load_ubo_vec4:
+      return emit_load_ubo_vec4(instr);
    case nir_intrinsic_load_tcs_in_param_base_r600:
       return emit_load_tcs_param_base(instr, 0);
    case nir_intrinsic_load_tcs_out_param_base_r600:
@@ -777,7 +777,7 @@ GPRVector ShaderFromNirProcessor::vec_from_nir_with_fetch_constant(const nir_src
 
    /* Now check whether all inputs come from the same GPR, and fill
     * empty slots in the vector with unused swizzles, bail out if
-    * the sources are not from the same GPR
+    * the sources are not from the same GPR
     */
 
    if (use_same) {
@@ -829,26 +829,19 @@ GPRVector ShaderFromNirProcessor::vec_from_nir_with_fetch_constant(const nir_src
       return GPRVector(v);;
 }
 
-bool ShaderFromNirProcessor::emit_load_ubo(nir_intrinsic_instr* instr)
+bool ShaderFromNirProcessor::emit_load_ubo_vec4(nir_intrinsic_instr* instr)
 {
-   nir_src& src0 = instr->src[0];
-   nir_src& src1 = instr->src[1];
-
-   int sel_bufid_reg = src0.is_ssa ? src0.ssa->index : src0.reg.reg->index;
-   const nir_load_const_instr* literal0 = get_literal_constant(sel_bufid_reg);
-
-   int ofs_reg = src1.is_ssa ? src1.ssa->index : src1.reg.reg->index;
-   const nir_load_const_instr* literal1 = get_literal_constant(ofs_reg);
-   if (literal0) {
-      if (literal1) {
-         uint bufid = literal0->value[0].u32;
-         uint buf_ofs = literal1->value[0].u32 >> 4;
-         int buf_cmp = ((literal1->value[0].u32 >> 2) & 3);
+   auto bufid = nir_src_as_const_value(instr->src[0]);
+   auto buf_offset = nir_src_as_const_value(instr->src[1]);
+
+   if (bufid) {
+      if (buf_offset) {
+         int buf_cmp = nir_intrinsic_component(instr);
          AluInstruction *ir = nullptr;
-         for (int i = 0; i < instr->num_components; ++i) {
+         for (unsigned i = 0; i < nir_dest_num_components(instr->dest); ++i) {
             int cmp = buf_cmp + i;
             assert(cmp < 4);
-            auto u = PValue(new UniformValue(512 +  buf_ofs, cmp, bufid + 1));
+            auto u = PValue(new UniformValue(512 +  buf_offset->u32, cmp, bufid->u32));
             if (instr->dest.is_ssa)
                load_preloaded_value(instr->dest, i, u);
             else {
@@ -861,21 +854,24 @@ bool ShaderFromNirProcessor::emit_load_ubo(nir_intrinsic_instr* instr)
          return true;
 
       } else {
-         /* literal0 is lost ...*/
-         return load_uniform_indirect(instr, from_nir(instr->src[1], 0, 0), 0, literal0->value[0].u32 + 1);
+         return load_uniform_indirect(instr, from_nir(instr->src[1], 0, 0), 0, bufid->u32);
       }
    } else {
-      /* TODO: This can also be solved by using the CF indes on the ALU block, and
-       * this would probably make sense when there are more then one loads with
-       * the same buffer ID. */
+      /* TODO: if buf_offset is constant then this can also be solved by using the CF index
+       * on the ALU block, and this would probably make sense when there is more than one
+       * load with the same buffer ID. */
       PValue bufid = from_nir(instr->src[0], 0, 0);
       PValue addr = from_nir_with_fetch_constant(instr->src[1], 0);
       GPRVector trgt;
-      for (int i = 0; i < 4; ++i)
+      std::array<int, 4> swz = {7,7,7,7};
+      for (unsigned i = 0; i < nir_dest_num_components(instr->dest); ++i) {
          trgt.set_reg_i(i, from_nir(instr->dest, i));
+         swz[i] = i + nir_intrinsic_component(instr);
+      }
 
       auto ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, 0,
-                                     1, bufid, bim_zero);
+                                     0, bufid, bim_zero);
+      ir->set_dest_swizzle(swz);
 
       emit_instruction(ir);
       for (int i = 0; i < instr->num_components ; ++i) {
@@ -887,6 +883,7 @@ bool ShaderFromNirProcessor::emit_load_ubo(nir_intrinsic_instr* instr)
 
 }
 
+
 bool ShaderFromNirProcessor::emit_discard_if(nir_intrinsic_instr* instr)
 {
    r600::sfn_log << SfnLog::instr << "emit '"
@@ -919,8 +916,11 @@ bool ShaderFromNirProcessor::load_uniform_indirect(nir_intrinsic_instr* instr, P
    }
 
    GPRVector trgt;
-   for (int i = 0; i < 4; ++i)
+   std::array<int, 4> swz = {7,7,7,7};
+   for (int i = 0; i < 4; ++i) {
       trgt.set_reg_i(i, from_nir(instr->dest, i));
+      swz[i] = i + nir_intrinsic_component(instr);
+   }
 
    if (addr->type() != Value::gpr) {
       emit_instruction(op1_mov, trgt.reg_i(0), {addr}, {alu_write, alu_last_instr});
@@ -930,6 +930,7 @@ bool ShaderFromNirProcessor::load_uniform_indirect(nir_intrinsic_instr* instr, P
    /* FIXME: buffer index and index mode are not set correctly */
    auto ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, offest,
                                   bufferid, PValue(), bim_none);
+   ir->set_dest_swizzle(swz);
    emit_instruction(ir);
    m_sh_info.indirect_files |= 1 << TGSI_FILE_CONSTANT;
    return true;
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.h b/src/gallium/drivers/r600/sfn/sfn_shader_base.h
index bd2cd9ce680..2bf094aa5b3 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.h
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.h
@@ -150,7 +150,7 @@ private:
    virtual bool emit_intrinsic_instruction_override(nir_intrinsic_instr* instr);
    bool emit_tex_instruction(nir_instr* instr);
    bool emit_discard_if(nir_intrinsic_instr* instr);
-   bool emit_load_ubo(nir_intrinsic_instr* instr);
+   bool emit_load_ubo_vec4(nir_intrinsic_instr* instr);
    bool emit_ssbo_atomic_add(nir_intrinsic_instr* instr);
    bool load_uniform_indirect(nir_intrinsic_instr* instr, PValue addr, int offest, int bufid);