Mesa (master): aco: use bit vectors for liveness sets

Mon Sep 21 14:01:27 UTC 2020

Module: Mesa
Branch: master
Commit: d2c18b7bf37f6b4fd7b0a2f7cf2fc97c5305ea4c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d2c18b7bf37f6b4fd7b0a2f7cf2fc97c5305ea4c

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Mon Sep 14 16:45:55 2020 +0100

aco: use bit vectors for liveness sets

This seems to be much faster than hash sets. When compiling pipelines from
5 games, live_var_analysis takes about a third the time it used to and
fossilize-replay is ~1.77% faster.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Reviewed-by: Tony Wasserka <tony.wasserka at gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6733>

---

 src/amd/compiler/aco_ir.h                    |  11 +-
 src/amd/compiler/aco_live_var_analysis.cpp   |  23 +++--
 src/amd/compiler/aco_lower_to_cssa.cpp       |   4 +-
 src/amd/compiler/aco_register_allocation.cpp |  22 ++--
 src/amd/compiler/aco_util.h                  | 146 +++++++++++++++++++++++++++
 src/amd/compiler/aco_validate.cpp            |   3 +-
 6 files changed, 175 insertions(+), 34 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 04c4dd2eb23..ade74356454 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1647,16 +1647,9 @@ private:
    uint32_t allocationID = 1;
 };
 
-struct TempHash {
-   std::size_t operator()(Temp t) const {
-      return t.id();
-   }
-};
-using TempSet = std::unordered_set<Temp, TempHash>;
-
 struct live {
    /* live temps out per block */
-   std::vector<TempSet> live_out;
+   std::vector<IDSet> live_out;
    /* register demand (sgpr/vgpr) per instruction per block */
    std::vector<std::vector<RegisterDemand>> register_demand;
 };
@@ -1692,7 +1685,7 @@ void value_numbering(Program* program);
 void optimize(Program* program);
 void setup_reduce_temp(Program* program);
 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
-void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
+void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block);
 void ssa_elimination(Program* program);
 void lower_to_hw_instr(Program* program);
 void schedule_program(Program* program, live& live_vars);
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 1c47eb3d6a4..467fc631ac5 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -91,18 +91,18 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
 
    register_demand.resize(block->instructions.size());
    block->register_demand = RegisterDemand();
-   TempSet live = lives.live_out[block->index];
+   IDSet live = lives.live_out[block->index];
 
    /* add the live_out_exec to live */
    bool exec_live = false;
    if (block->live_out_exec != Temp()) {
-      live.insert(block->live_out_exec);
+      live.insert(block->live_out_exec.id());
       exec_live = true;
    }
 
    /* initialize register demand */
-   for (Temp t : live)
-      new_demand += t;
+   for (unsigned t : live)
+      new_demand += Temp(t, program->temp_rc[t]);
    new_demand.sgpr -= phi_sgpr_ops[block->index];
 
    /* traverse the instructions backwards */
@@ -126,7 +126,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
             program->needs_vcc = true;
 
          const Temp temp = definition.getTemp();
-         const size_t n = live.erase(temp);
+         const size_t n = live.erase(temp.id());
 
          if (n) {
             new_demand -= temp;
@@ -158,7 +158,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
             if (operand.isFixed() && operand.physReg() == vcc)
                program->needs_vcc = true;
             const Temp temp = operand.getTemp();
-            const bool inserted = live.insert(temp).second;
+            const bool inserted = live.insert(temp.id()).second;
             if (inserted) {
                operand.setFirstKill(true);
                for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
@@ -198,7 +198,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
       if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
          program->needs_vcc = true;
       const Temp temp = definition.getTemp();
-      const size_t n = live.erase(temp);
+      const size_t n = live.erase(temp.id());
 
       if (n)
          definition.setKill(false);
@@ -209,12 +209,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
    }
 
    /* now, we need to merge the live-ins into the live-out sets */
-   for (Temp t : live) {
-      std::vector<unsigned>& preds = t.is_linear() ? block->linear_preds : block->logical_preds;
+   for (unsigned t : live) {
+      RegClass rc = program->temp_rc[t];
+      std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
 
 #ifndef NDEBUG
       if (preds.empty())
-         aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t.id(), block->index);
+         aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index);
 #endif
 
       for (unsigned pred_idx : preds) {
@@ -240,7 +241,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
          if (operand.isFixed() && operand.physReg() == vcc)
             program->needs_vcc = true;
          /* check if we changed an already processed block */
-         const bool inserted = lives.live_out[preds[i]].insert(operand.getTemp()).second;
+         const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
          if (inserted) {
             operand.setKill(true);
             worklist.insert(preds[i]);
diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp
index 4bd1c68ca0c..8d35e17ff36 100644
--- a/src/amd/compiler/aco_lower_to_cssa.cpp
+++ b/src/amd/compiler/aco_lower_to_cssa.cpp
@@ -88,7 +88,7 @@ bool collect_phi_info(cssa_ctx& ctx)
                          ctx.program->blocks[pred].logical_idom :
                          ctx.program->blocks[pred].linear_idom;
                } while (def_points[i] != pred &&
-                        ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
+                        ctx.live_vars.live_out[pred].count(op.tempId()));
             }
          }
 
@@ -118,7 +118,7 @@ bool collect_phi_info(cssa_ctx& ctx)
             } else if (def_points[i] == block.index) {
                interferes = true;
             /* operand might interfere with phi-def */
-            } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) {
+            } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].tempId())) {
                interferes = true;
             /* else check for interferences with other operands */
             } else {
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index aa35e2826e7..b57bb83904a 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1701,7 +1701,7 @@ void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
 } /* end namespace */
 
 
-void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
+void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block)
 {
    ra_ctx ctx(program);
    std::vector<std::vector<Temp>> phi_ressources;
@@ -1711,14 +1711,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
       Block& block = *it;
 
       /* first, compute the death points of all live vars within the block */
-      TempSet& live = live_out_per_block[block.index];
+      IDSet& live = live_out_per_block[block.index];
 
       std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
       for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
          aco_ptr<Instruction>& instr = *rit;
          if (is_phi(instr)) {
             if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
-               live.erase(instr->definitions[0].getTemp());
+               live.erase(instr->definitions[0].tempId());
                continue;
             }
             /* collect information about affinity-related temporaries */
@@ -1748,7 +1748,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
             /* add operands to live variables */
             for (const Operand& op : instr->operands) {
                if (op.isTemp())
-                  live.emplace(op.getTemp());
+                  live.insert(op.tempId());
             }
          }
 
@@ -1757,7 +1757,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
             const Definition& def = instr->definitions[i];
             if (!def.isTemp())
                continue;
-            live.erase(def.getTemp());
+            live.erase(def.tempId());
             /* mark last-seen phi operand */
             std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
             if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
@@ -1793,14 +1793,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
    std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
 
    for (Block& block : program->blocks) {
-      TempSet& live = live_out_per_block[block.index];
+      IDSet& live = live_out_per_block[block.index];
       /* initialize register file */
       assert(block.index != 0 || live.empty());
       RegisterFile register_file;
       ctx.war_hint.reset();
 
-      for (Temp t : live) {
-         Temp renamed = handle_live_in(ctx, t, &block);
+      for (unsigned t : live) {
+         Temp renamed = handle_live_in(ctx, Temp(t, program->temp_rc[t]), &block);
          assignment& var = ctx.assignments[renamed.id()];
          /* due to live-range splits, the live-in might be a phi, now */
          if (var.assigned)
@@ -1953,7 +1953,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
             register_file.fill(definition);
             ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
          }
-         live.emplace(definition.getTemp());
+         live.insert(definition.tempId());
 
          /* update phi affinities */
          for (const Operand& op : phi->operands) {
@@ -2154,7 +2154,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
 
             /* set live if it has a kill point */
             if (!definition.isKill())
-               live.emplace(definition.getTemp());
+               live.insert(definition.tempId());
 
             ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
             register_file.fill(definition);
@@ -2215,7 +2215,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
 
             /* set live if it has a kill point */
             if (!definition->isKill())
-               live.emplace(definition->getTemp());
+               live.insert(definition->tempId());
 
             ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
             register_file.fill(*definition);
diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h
index 7728747ca57..6cb9e4ee607 100644
--- a/src/amd/compiler/aco_util.h
+++ b/src/amd/compiler/aco_util.h
@@ -27,6 +27,7 @@
 
 #include <cassert>
 #include <iterator>
+#include <vector>
 
 namespace aco {
 
@@ -234,6 +235,151 @@ private:
    size_type length{ 0 };     //!> Size of the span
 };
 
+/*
+ * Cache-friendly set of 32-bit IDs with O(1) insert/erase/lookup and
+ * the ability to efficiently iterate over contained elements.
+ *
+ * Internally implemented as a bit vector: If the set contains an ID, the
+ * corresponding bit is set. It doesn't use std::vector<bool> since we then
+ * couldn't efficiently iterate over the elements.
+ *
+ * The interface resembles a subset of std::set/std::unordered_set.
+ */
+struct IDSet {
+   struct Iterator {
+      const IDSet *set;
+      union {
+         struct {
+            uint32_t bit:6;
+            uint32_t word:26;
+         };
+         uint32_t id;
+      };
+
+      Iterator& operator ++();
+
+      bool operator != (const Iterator& other) const;
+
+      uint32_t operator * () const;
+   };
+
+   size_t count(uint32_t id) const {
+      if (id >= words.size() * 64)
+         return 0;
+
+      return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0;
+   }
+
+   Iterator find(uint32_t id) const {
+      if (!count(id))
+         return end();
+
+      Iterator it;
+      it.set = this;
+      it.bit = id % 64u;
+      it.word = id / 64u;
+      return it;
+   }
+
+   std::pair<Iterator, bool> insert(uint32_t id) {
+      if (words.size() * 64u <= id)
+         words.resize(DIV_ROUND_UP(id + 1, 64u));
+
+      Iterator it;
+      it.set = this;
+      it.bit = id % 64u;
+      it.word = id / 64u;
+
+      uint64_t mask = 1ull << it.bit;
+      if (words[it.word] & mask)
+         return std::make_pair(it, false);
+
+      words[it.word] |= mask;
+      bits_set++;
+      return std::make_pair(it, true);
+   }
+
+   size_t erase(uint32_t id) {
+      if (!count(id))
+         return 0;
+
+      words[id / 64u] ^= 1ull << (id % 64u);
+      bits_set--;
+      return 1;
+   }
+
+   Iterator cbegin() const {
+      Iterator it;
+      it.set = this;
+      for (size_t i = 0; i < words.size(); i++) {
+         if (words[i]) {
+            it.word = i;
+            it.bit = ffsll(words[i]) - 1;
+            return it;
+         }
+      }
+      return end();
+   }
+
+   Iterator cend() const {
+      Iterator it;
+      it.set = this;
+      it.word = words.size();
+      it.bit = 0;
+      return it;
+   }
+
+   Iterator begin() const {
+      return cbegin();
+   }
+
+   Iterator end() const {
+      return cend();
+   }
+
+   bool empty() const {
+      return bits_set == 0;
+   }
+
+   size_t size() const {
+      return bits_set;
+   }
+
+   std::vector<uint64_t> words;
+   uint32_t bits_set;
+};
+
+inline IDSet::Iterator& IDSet::Iterator::operator ++() {
+   uint64_t m = set->words[word];
+   m &= ~((2ull << bit) - 1ull);
+   if (!m) {
+      /* don't continue past the end */
+      if (word == set->words.size())
+         return *this;
+
+      word++;
+      for (; word < set->words.size(); word++) {
+         if (set->words[word]) {
+            bit = ffsll(set->words[word]) - 1;
+            return *this;
+         }
+      }
+      bit = 0;
+   } else {
+      bit = ffsll(m) - 1;
+   }
+   return *this;
+}
+
+inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const {
+   assert(set == other.set);
+   return id != other.id;
+}
+
+inline uint32_t IDSet::Iterator::operator * () const {
+   return (word << 6) | bit;
+}
+
 } // namespace aco
 
 #endif // ACO_UTIL_H
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 02852d648e8..4a22dac3cd4 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -739,7 +739,8 @@ bool validate_ra(Program *program, const struct radv_nir_compiler_options *optio
       regs.fill(0);
 
       std::set<Temp> live;
-      live.insert(live_vars.live_out[block.index].begin(), live_vars.live_out[block.index].end());
+      for (unsigned id : live_vars.live_out[block.index])
+         live.insert(Temp(id, program->temp_rc[id]));
       /* remove killed p_phi sgpr operands */
       for (Temp tmp : phi_sgpr_ops[block.index])
          live.erase(tmp);