Mesa (master): aco: use non-sequential addressing

Wed Jan 20 17:00:54 UTC 2021

Module: Mesa
Branch: master
Commit: c353895c92270c0e2a6e2b849c24d558efae0d5e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c353895c92270c0e2a6e2b849c24d558efae0d5e

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Thu Jan 14 19:58:13 2021 +0000

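aco: use non-sequential addressing

On GFX10+, pass each MIMG address component as its own v1 operand and
use the NSA (non-sequential address) encoding when the address VGPRs
are not consecutive. Older chips still combine the coordinates into a
single vector operand.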

fossil-db (GFX10.3):
Totals from 70493 (50.57% of 139391) affected shaders:
SGPRs: 4232624 -> 4231808 (-0.02%); split: -0.09%, +0.07%
VGPRs: 2831772 -> 2764740 (-2.37%); split: -2.53%, +0.17%
CodeSize: 225584412 -> 225048740 (-0.24%); split: -0.44%, +0.21%
MaxWaves: 875319 -> 878837 (+0.40%); split: +0.44%, -0.04%
Instrs: 43157803 -> 42496421 (-1.53%); split: -1.54%, +0.01%
Cycles: 1656380132 -> 1641532056 (-0.90%); split: -0.94%, +0.04%

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8523>

---

 src/amd/compiler/aco_assembler.cpp             | 17 +++++++++
 src/amd/compiler/aco_instruction_selection.cpp | 52 ++++++++++++++++----------
 src/amd/compiler/aco_validate.cpp              | 13 +++++--
 src/amd/compiler/tests/test_isel.cpp           | 16 ++++----
 4 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 35f07dddb85..05ec485a2cf 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -428,6 +428,15 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
       break;
    }
    case Format::MIMG: {
+      unsigned use_nsa = false;
+      unsigned addr_dwords = instr->operands.size() - 3;
+      for (unsigned i = 1; i < addr_dwords; i++) {
+         if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
+            use_nsa = true;
+      }
+      assert(!use_nsa || ctx.chip_class >= GFX10);
+      unsigned nsa_dwords = use_nsa ? DIV_ROUND_UP(addr_dwords - 1, 4) : 0;
+
       MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
       uint32_t encoding = (0b111100 << 26);
       encoding |= mimg->slc ? 1 << 25 : 0;
@@ -443,6 +452,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
          encoding |= mimg->da ? 1 << 14 : 0;
       } else {
          encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
+         encoding |= nsa_dwords << 1;
          encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */
          encoding |= mimg->dlc ? 1 << 7 : 0;
       }
@@ -465,6 +475,13 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
       }
 
       out.push_back(encoding);
+
+      if (nsa_dwords) {
+         out.resize(out.size() + nsa_dwords);
+         std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);
+         for (unsigned i = 0; i < addr_dwords - 1; i++)
+            nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);
+      }
       break;
    }
    case Format::FLAT:
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index d97604f8571..08ad9572bdb 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5841,38 +5841,52 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op,
                                    Definition dst,
                                    Temp rsrc,
                                    Operand samp,
-                                   const std::vector<Temp>& coords,
+                                   std::vector<Temp> coords,
                                    unsigned num_wqm_coords=0,
                                    Operand vdata=Operand(v1))
 {
-   Temp coord = coords[0];
-   if (coords.size() > 1) {
-      coord = bld.tmp(RegType::vgpr, coords.size());
+   if (bld.program->chip_class < GFX10) {
+      Temp coord = coords[0];
+      if (coords.size() > 1) {
+         coord = bld.tmp(RegType::vgpr, coords.size());
 
-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
-      for (unsigned i = 0; i < coords.size(); i++)
-         vec->operands[i] = Operand(coords[i]);
-      vec->definitions[0] = Definition(coord);
-      bld.insert(std::move(vec));
-   } else if (coord.type() == RegType::sgpr) {
-      coord = bld.copy(bld.def(v1), coord);
-   }
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
+         for (unsigned i = 0; i < coords.size(); i++)
+            vec->operands[i] = Operand(coords[i]);
+         vec->definitions[0] = Definition(coord);
+         bld.insert(std::move(vec));
+      } else if (coord.type() == RegType::sgpr) {
+         coord = bld.copy(bld.def(v1), coord);
+      }
 
-   if (num_wqm_coords) {
-      /* We don't need the bias, sample index, compare value or offset to be
-       * computed in WQM but if the p_create_vector copies the coordinates, then it
-       * needs to be in WQM. */
-      coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
+      if (num_wqm_coords) {
+         /* We don't need the bias, sample index, compare value or offset to be
+          * computed in WQM but if the p_create_vector copies the coordinates, then it
+          * needs to be in WQM. */
+         coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
+      }
+
+      coords[0] = coord;
+      coords.resize(1);
+   } else {
+      for (unsigned i = 0; i < num_wqm_coords; i++)
+         coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
+
+      for (Temp& coord : coords) {
+         if (coord.type() == RegType::sgpr)
+            coord = bld.copy(bld.def(v1), coord);
+      }
    }
 
    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(
-      op, Format::MIMG, 4, dst.isTemp())};
+      op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
    if (dst.isTemp())
       mimg->definitions[0] = dst;
    mimg->operands[0] = Operand(rsrc);
    mimg->operands[1] = samp;
    mimg->operands[2] = vdata;
-   mimg->operands[3] = Operand(coord);
+   for (unsigned i = 0; i < coords.size(); i++)
+      mimg->operands[3 + i] = Operand(coords[i]);
 
    MIMG_instruction *res = mimg.get();
    bld.insert(std::move(mimg));
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 3abb21b5796..4c707b49699 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -436,7 +436,7 @@ bool validate_ir(Program* program)
             break;
          }
          case Format::MIMG: {
-            check(instr->operands.size() == 4, "MIMG instructions must have 4 operands", instr.get());
+            check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
             check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
                   "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
             if (instr->operands[1].hasRegClass())
@@ -447,8 +447,15 @@ bool validate_ir(Program* program)
                check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
                      "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
             }
-            check(instr->operands[3].hasRegClass() && instr->operands[3].regClass().type() == RegType::vgpr,
-                  "MIMG operands[3] (VADDR) must be VGPR", instr.get());
+            check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
+            for (unsigned i = 3; i < instr->operands.size(); i++) {
+               if (instr->operands.size() == 4) {
+                  check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
+                        "MIMG operands[3] (VADDR) must be VGPR", instr.get());
+               } else {
+                  check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
+               }
+            }
             check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
                   "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
             break;
diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp
index c911fea839b..208833c54fc 100644
--- a/src/amd/compiler/tests/test_isel.cpp
+++ b/src/amd/compiler/tests/test_isel.cpp
@@ -149,18 +149,18 @@ BEGIN_TEST(isel.sparse.clause)
          };
          void main() {
             //>> v5: (noCSE)%zero0 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
             //>> v5: (noCSE)%zero1 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
             //>> v5: (noCSE)%zero2 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
             //>> v5: (noCSE)%zero3 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
             //>> s_clause 0x3
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
             code[0] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(1, 0), res[0]);
             code[1] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(2, 0), res[1]);
             code[2] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(3, 0), res[2]);