Mesa (master): aco: use bit vectors for liveness sets
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Mon Sep 21 14:01:27 UTC 2020
Module: Mesa
Branch: master
Commit: d2c18b7bf37f6b4fd7b0a2f7cf2fc97c5305ea4c
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=d2c18b7bf37f6b4fd7b0a2f7cf2fc97c5305ea4c
Author: Rhys Perry <pendingchaos02 at gmail.com>
Date: Mon Sep 14 16:45:55 2020 +0100
aco: use bit vectors for liveness sets
This seems to be much faster than hash sets. When compiling pipelines from
5 games, live_var_analysis takes about a third the time it used to and
fossilize-replay is ~1.77% faster.
Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Reviewed-by: Tony Wasserka <tony.wasserka at gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6733>
---
src/amd/compiler/aco_ir.h | 11 +-
src/amd/compiler/aco_live_var_analysis.cpp | 23 +++--
src/amd/compiler/aco_lower_to_cssa.cpp | 4 +-
src/amd/compiler/aco_register_allocation.cpp | 22 ++--
src/amd/compiler/aco_util.h | 146 +++++++++++++++++++++++++++
src/amd/compiler/aco_validate.cpp | 3 +-
6 files changed, 175 insertions(+), 34 deletions(-)
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 04c4dd2eb23..ade74356454 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1647,16 +1647,9 @@ private:
uint32_t allocationID = 1;
};
-struct TempHash {
- std::size_t operator()(Temp t) const {
- return t.id();
- }
-};
-using TempSet = std::unordered_set<Temp, TempHash>;
-
struct live {
/* live temps out per block */
- std::vector<TempSet> live_out;
+ std::vector<IDSet> live_out;
/* register demand (sgpr/vgpr) per instruction per block */
std::vector<std::vector<RegisterDemand>> register_demand;
};
@@ -1692,7 +1685,7 @@ void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
-void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
+void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 1c47eb3d6a4..467fc631ac5 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -91,18 +91,18 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
register_demand.resize(block->instructions.size());
block->register_demand = RegisterDemand();
- TempSet live = lives.live_out[block->index];
+ IDSet live = lives.live_out[block->index];
/* add the live_out_exec to live */
bool exec_live = false;
if (block->live_out_exec != Temp()) {
- live.insert(block->live_out_exec);
+ live.insert(block->live_out_exec.id());
exec_live = true;
}
/* initialize register demand */
- for (Temp t : live)
- new_demand += t;
+ for (unsigned t : live)
+ new_demand += Temp(t, program->temp_rc[t]);
new_demand.sgpr -= phi_sgpr_ops[block->index];
/* traverse the instructions backwards */
@@ -126,7 +126,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
program->needs_vcc = true;
const Temp temp = definition.getTemp();
- const size_t n = live.erase(temp);
+ const size_t n = live.erase(temp.id());
if (n) {
new_demand -= temp;
@@ -158,7 +158,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if (operand.isFixed() && operand.physReg() == vcc)
program->needs_vcc = true;
const Temp temp = operand.getTemp();
- const bool inserted = live.insert(temp).second;
+ const bool inserted = live.insert(temp.id()).second;
if (inserted) {
operand.setFirstKill(true);
for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
@@ -198,7 +198,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
program->needs_vcc = true;
const Temp temp = definition.getTemp();
- const size_t n = live.erase(temp);
+ const size_t n = live.erase(temp.id());
if (n)
definition.setKill(false);
@@ -209,12 +209,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
}
/* now, we need to merge the live-ins into the live-out sets */
- for (Temp t : live) {
- std::vector<unsigned>& preds = t.is_linear() ? block->linear_preds : block->logical_preds;
+ for (unsigned t : live) {
+ RegClass rc = program->temp_rc[t];
+ std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;
#ifndef NDEBUG
if (preds.empty())
- aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t.id(), block->index);
+ aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index);
#endif
for (unsigned pred_idx : preds) {
@@ -240,7 +241,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if (operand.isFixed() && operand.physReg() == vcc)
program->needs_vcc = true;
/* check if we changed an already processed block */
- const bool inserted = lives.live_out[preds[i]].insert(operand.getTemp()).second;
+ const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
if (inserted) {
operand.setKill(true);
worklist.insert(preds[i]);
diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp
index 4bd1c68ca0c..8d35e17ff36 100644
--- a/src/amd/compiler/aco_lower_to_cssa.cpp
+++ b/src/amd/compiler/aco_lower_to_cssa.cpp
@@ -88,7 +88,7 @@ bool collect_phi_info(cssa_ctx& ctx)
ctx.program->blocks[pred].logical_idom :
ctx.program->blocks[pred].linear_idom;
} while (def_points[i] != pred &&
- ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
+ ctx.live_vars.live_out[pred].count(op.tempId()));
}
}
@@ -118,7 +118,7 @@ bool collect_phi_info(cssa_ctx& ctx)
} else if (def_points[i] == block.index) {
interferes = true;
/* operand might interfere with phi-def */
- } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) {
+ } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].tempId())) {
interferes = true;
/* else check for interferences with other operands */
} else {
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index aa35e2826e7..b57bb83904a 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1701,7 +1701,7 @@ void try_remove_trivial_phi(ra_ctx& ctx, Temp temp)
} /* end namespace */
-void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
+void register_allocation(Program *program, std::vector<IDSet>& live_out_per_block)
{
ra_ctx ctx(program);
std::vector<std::vector<Temp>> phi_ressources;
@@ -1711,14 +1711,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
Block& block = *it;
/* first, compute the death points of all live vars within the block */
- TempSet& live = live_out_per_block[block.index];
+ IDSet& live = live_out_per_block[block.index];
std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
aco_ptr<Instruction>& instr = *rit;
if (is_phi(instr)) {
if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {
- live.erase(instr->definitions[0].getTemp());
+ live.erase(instr->definitions[0].tempId());
continue;
}
/* collect information about affinity-related temporaries */
@@ -1748,7 +1748,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
/* add operands to live variables */
for (const Operand& op : instr->operands) {
if (op.isTemp())
- live.emplace(op.getTemp());
+ live.insert(op.tempId());
}
}
@@ -1757,7 +1757,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
const Definition& def = instr->definitions[i];
if (!def.isTemp())
continue;
- live.erase(def.getTemp());
+ live.erase(def.tempId());
/* mark last-seen phi operand */
std::unordered_map<unsigned, unsigned>::iterator it = temp_to_phi_ressources.find(def.tempId());
if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
@@ -1793,14 +1793,14 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
for (Block& block : program->blocks) {
- TempSet& live = live_out_per_block[block.index];
+ IDSet& live = live_out_per_block[block.index];
/* initialize register file */
assert(block.index != 0 || live.empty());
RegisterFile register_file;
ctx.war_hint.reset();
- for (Temp t : live) {
- Temp renamed = handle_live_in(ctx, t, &block);
+ for (unsigned t : live) {
+ Temp renamed = handle_live_in(ctx, Temp(t, program->temp_rc[t]), &block);
assignment& var = ctx.assignments[renamed.id()];
/* due to live-range splits, the live-in might be a phi, now */
if (var.assigned)
@@ -1953,7 +1953,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
register_file.fill(definition);
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
}
- live.emplace(definition.getTemp());
+ live.insert(definition.tempId());
/* update phi affinities */
for (const Operand& op : phi->operands) {
@@ -2154,7 +2154,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
/* set live if it has a kill point */
if (!definition.isKill())
- live.emplace(definition.getTemp());
+ live.insert(definition.tempId());
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
register_file.fill(definition);
@@ -2215,7 +2215,7 @@ void register_allocation(Program *program, std::vector<TempSet>& live_out_per_bl
/* set live if it has a kill point */
if (!definition->isKill())
- live.emplace(definition->getTemp());
+ live.insert(definition->tempId());
ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
register_file.fill(*definition);
diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h
index 7728747ca57..6cb9e4ee607 100644
--- a/src/amd/compiler/aco_util.h
+++ b/src/amd/compiler/aco_util.h
@@ -27,6 +27,7 @@
#include <cassert>
#include <iterator>
+#include <vector>
namespace aco {
@@ -234,6 +235,151 @@ private:
size_type length{ 0 }; //!> Size of the span
};
+/*
+ * Cache-friendly set of 32-bit IDs with O(1) insert/erase/lookup and
+ * the ability to efficiently iterate over contained elements.
+ *
+ * Internally implemented as a bit vector: If the set contains an ID, the
+ * corresponding bit is set. It doesn't use std::vector<bool> since we then
+ * couldn't efficiently iterate over the elements.
+ *
+ * The interface resembles a subset of std::set/std::unordered_set.
+ */
+struct IDSet {
+ struct Iterator {
+ const IDSet *set;
+ union {
+ struct {
+ uint32_t bit:6;
+ uint32_t word:26;
+ };
+ uint32_t id;
+ };
+
+ Iterator& operator ++();
+
+ bool operator != (const Iterator& other) const;
+
+ uint32_t operator * () const;
+ };
+
+ size_t count(uint32_t id) const {
+ if (id >= words.size() * 64)
+ return 0;
+
+ return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0;
+ }
+
+ Iterator find(uint32_t id) const {
+ if (!count(id))
+ return end();
+
+ Iterator it;
+ it.set = this;
+ it.bit = id % 64u;
+ it.word = id / 64u;
+ return it;
+ }
+
+ std::pair<Iterator, bool> insert(uint32_t id) {
+ if (words.size() * 64u <= id)
+ words.resize(DIV_ROUND_UP(id + 1, 64u));
+
+ Iterator it;
+ it.set = this;
+ it.bit = id % 64u;
+ it.word = id / 64u;
+
+ uint64_t mask = 1ull << it.bit;
+ if (words[it.word] & mask)
+ return std::make_pair(it, false);
+
+ words[it.word] |= mask;
+ bits_set++;
+ return std::make_pair(it, true);
+ }
+
+ size_t erase(uint32_t id) {
+ if (!count(id))
+ return 0;
+
+ words[id / 64u] ^= 1ull << (id % 64u);
+ bits_set--;
+ return 1;
+ }
+
+ Iterator cbegin() const {
+ Iterator it;
+ it.set = this;
+ for (size_t i = 0; i < words.size(); i++) {
+ if (words[i]) {
+ it.word = i;
+ it.bit = ffsll(words[i]) - 1;
+ return it;
+ }
+ }
+ return end();
+ }
+
+ Iterator cend() const {
+ Iterator it;
+ it.set = this;
+ it.word = words.size();
+ it.bit = 0;
+ return it;
+ }
+
+ Iterator begin() const {
+ return cbegin();
+ }
+
+ Iterator end() const {
+ return cend();
+ }
+
+ bool empty() const {
+ return bits_set == 0;
+ }
+
+ size_t size() const {
+ return bits_set;
+ }
+
+ std::vector<uint64_t> words;
+ uint32_t bits_set;
+};
+
+inline IDSet::Iterator& IDSet::Iterator::operator ++() {
+ uint64_t m = set->words[word];
+ m &= ~((2ull << bit) - 1ull);
+ if (!m) {
+ /* don't continue past the end */
+ if (word == set->words.size())
+ return *this;
+
+ word++;
+ for (; word < set->words.size(); word++) {
+ if (set->words[word]) {
+ bit = ffsll(set->words[word]) - 1;
+ return *this;
+ }
+ }
+ bit = 0;
+ } else {
+ bit = ffsll(m) - 1;
+ }
+ return *this;
+}
+
+inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const {
+ assert(set == other.set);
+ return id != other.id;
+}
+
+inline uint32_t IDSet::Iterator::operator * () const {
+ return (word << 6) | bit;
+}
+
} // namespace aco
#endif // ACO_UTIL_H
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 02852d648e8..4a22dac3cd4 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -739,7 +739,8 @@ bool validate_ra(Program *program, const struct radv_nir_compiler_options *optio
regs.fill(0);
std::set<Temp> live;
- live.insert(live_vars.live_out[block.index].begin(), live_vars.live_out[block.index].end());
+ for (unsigned id : live_vars.live_out[block.index])
+ live.insert(Temp(id, program->temp_rc[id]));
/* remove killed p_phi sgpr operands */
for (Temp tmp : phi_sgpr_ops[block.index])
live.erase(tmp);
More information about the mesa-commit
mailing list