From: Rhys Perry Date: Mon, 14 Sep 2020 15:45:55 +0000 (+0100) Subject: aco: use bit vectors for liveness sets X-Git-Tag: upstream/21.0.0~5244 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d2c18b7bf37f6b4fd7b0a2f7cf2fc97c5305ea4c;p=platform%2Fupstream%2Fmesa.git aco: use bit vectors for liveness sets This seems to be much faster than hash sets. When compiling pipelines from 5 games, live_var_analysis takes about a third the time it used to and fossilize-replay is ~1.77% faster. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Reviewed-by: Tony Wasserka Part-of: --- diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 04c4dd2..ade7435 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1647,16 +1647,9 @@ private: uint32_t allocationID = 1; }; -struct TempHash { - std::size_t operator()(Temp t) const { - return t.id(); - } -}; -using TempSet = std::unordered_set; - struct live { /* live temps out per block */ - std::vector live_out; + std::vector live_out; /* register demand (sgpr/vgpr) per instruction per block */ std::vector> register_demand; }; @@ -1692,7 +1685,7 @@ void value_numbering(Program* program); void optimize(Program* program); void setup_reduce_temp(Program* program); void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); -void register_allocation(Program *program, std::vector& live_out_per_block); +void register_allocation(Program *program, std::vector& live_out_per_block); void ssa_elimination(Program* program); void lower_to_hw_instr(Program* program); void schedule_program(Program* program, live& live_vars); diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 1c47eb3..467fc63 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -91,18 +91,18 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, register_demand.resize(block->instructions.size()); block->register_demand = RegisterDemand(); - TempSet live = lives.live_out[block->index]; + IDSet live = lives.live_out[block->index]; /* add the live_out_exec to live */ bool exec_live = false; if (block->live_out_exec != Temp()) { - live.insert(block->live_out_exec); + live.insert(block->live_out_exec.id()); exec_live = true; } /* initialize register demand */ - for (Temp t : live) - new_demand += t; + for (unsigned t : live) + new_demand += Temp(t, program->temp_rc[t]); new_demand.sgpr -= phi_sgpr_ops[block->index]; /* traverse the instructions backwards */ @@ -126,7 +126,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, program->needs_vcc = true; const Temp temp = definition.getTemp(); - const size_t n = live.erase(temp); + const size_t n = live.erase(temp.id()); if (n) { new_demand -= temp; @@ -158,7 +158,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, if (operand.isFixed() && operand.physReg() == vcc) program->needs_vcc = true; const Temp temp = operand.getTemp(); - const bool inserted = live.insert(temp).second; + const bool inserted = live.insert(temp.id()).second; if (inserted) { operand.setFirstKill(true); for (unsigned j = i + 1; j < insn->operands.size(); ++j) { @@ -198,7 +198,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc) program->needs_vcc = true; const Temp temp = definition.getTemp(); - const size_t n = live.erase(temp); + const size_t n = live.erase(temp.id()); if (n) definition.setKill(false); @@ -209,12 +209,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, } /* now, we need to merge the live-ins into the live-out sets */ - for (Temp t : live) { - std::vector& preds = t.is_linear() ? block->linear_preds : block->logical_preds; + for (unsigned t : live) { + RegClass rc = program->temp_rc[t]; + std::vector& preds = rc.is_linear() ? block->linear_preds : block->logical_preds; #ifndef NDEBUG if (preds.empty()) - aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t.id(), block->index); + aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index); #endif for (unsigned pred_idx : preds) { @@ -240,7 +241,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, if (operand.isFixed() && operand.physReg() == vcc) program->needs_vcc = true; /* check if we changed an already processed block */ - const bool inserted = lives.live_out[preds[i]].insert(operand.getTemp()).second; + const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second; if (inserted) { operand.setKill(true); worklist.insert(preds[i]); diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp index 4bd1c68..8d35e17 100644 --- a/src/amd/compiler/aco_lower_to_cssa.cpp +++ b/src/amd/compiler/aco_lower_to_cssa.cpp @@ -88,7 +88,7 @@ bool collect_phi_info(cssa_ctx& ctx) ctx.program->blocks[pred].logical_idom : ctx.program->blocks[pred].linear_idom; } while (def_points[i] != pred && - ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end()); + ctx.live_vars.live_out[pred].count(op.tempId())); } } @@ -118,7 +118,7 @@ bool collect_phi_info(cssa_ctx& ctx) } else if (def_points[i] == block.index) { interferes = true; /* operand might interfere with phi-def */ - } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) { + } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].tempId())) { interferes = true; /* else check for interferences with other operands */ } else { diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index aa35e28..b57bb83 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1701,7 +1701,7 @@ void try_remove_trivial_phi(ra_ctx& ctx, Temp temp) } /* end namespace */ -void register_allocation(Program *program, std::vector& live_out_per_block) +void register_allocation(Program *program, std::vector& live_out_per_block) { ra_ctx ctx(program); std::vector> phi_ressources; @@ -1711,14 +1711,14 @@ void register_allocation(Program *program, std::vector& live_out_per_bl Block& block = *it; /* first, compute the death points of all live vars within the block */ - TempSet& live = live_out_per_block[block.index]; + IDSet& live = live_out_per_block[block.index]; std::vector>::reverse_iterator rit; for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { aco_ptr& instr = *rit; if (is_phi(instr)) { if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) { - live.erase(instr->definitions[0].getTemp()); + live.erase(instr->definitions[0].tempId()); continue; } /* collect information about affinity-related temporaries */ @@ -1748,7 +1748,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bl /* add operands to live variables */ for (const Operand& op : instr->operands) { if (op.isTemp()) - live.emplace(op.getTemp()); + live.insert(op.tempId()); } } @@ -1757,7 +1757,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bl const Definition& def = instr->definitions[i]; if (!def.isTemp()) continue; - live.erase(def.getTemp()); + live.erase(def.tempId()); /* mark last-seen phi operand */ std::unordered_map::iterator it = temp_to_phi_ressources.find(def.tempId()); if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) { @@ -1793,14 +1793,14 @@ void register_allocation(Program *program, std::vector& live_out_per_bl std::vector> sgpr_live_in(program->blocks.size()); for (Block& block : program->blocks) { - TempSet& live = live_out_per_block[block.index]; + IDSet& live = live_out_per_block[block.index]; /* initialize register file */ assert(block.index != 0 || live.empty()); RegisterFile register_file; ctx.war_hint.reset(); - for (Temp t : live) { - Temp renamed = handle_live_in(ctx, t, &block); + for (unsigned t : live) { + Temp renamed = handle_live_in(ctx, Temp(t, program->temp_rc[t]), &block); assignment& var = ctx.assignments[renamed.id()]; /* due to live-range splits, the live-in might be a phi, now */ if (var.assigned) @@ -1953,7 +1953,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bl register_file.fill(definition); ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; } - live.emplace(definition.getTemp()); + live.insert(definition.tempId()); /* update phi affinities */ for (const Operand& op : phi->operands) { @@ -2154,7 +2154,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bl /* set live if it has a kill point */ if (!definition.isKill()) - live.emplace(definition.getTemp()); + live.insert(definition.tempId()); ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; register_file.fill(definition); @@ -2215,7 +2215,7 @@ void register_allocation(Program *program, std::vector& live_out_per_bl /* set live if it has a kill point */ if (!definition->isKill()) - live.emplace(definition->getTemp()); + live.insert(definition->tempId()); ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()}; register_file.fill(*definition); diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h index 7728747..6cb9e4e 100644 --- a/src/amd/compiler/aco_util.h +++ b/src/amd/compiler/aco_util.h @@ -27,6 +27,7 @@ #include #include +#include namespace aco { @@ -234,6 +235,151 @@ private: size_type length{ 0 }; //!> Size of the span }; +/* + * Cache-friendly set of 32-bit IDs with O(1) insert/erase/lookup and + * the ability to efficiently iterate over contained elements. + * + * Internally implemented as a bit vector: If the set contains an ID, the + * corresponding bit is set. It doesn't use std::vector since we then + * couldn't efficiently iterate over the elements. + * + * The interface resembles a subset of std::set/std::unordered_set. + */ +struct IDSet { + struct Iterator { + const IDSet *set; + union { + struct { + uint32_t bit:6; + uint32_t word:26; + }; + uint32_t id; + }; + + Iterator& operator ++(); + + bool operator != (const Iterator& other) const; + + uint32_t operator * () const; + }; + + size_t count(uint32_t id) const { + if (id >= words.size() * 64) + return 0; + + return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0; + } + + Iterator find(uint32_t id) const { + if (!count(id)) + return end(); + + Iterator it; + it.set = this; + it.bit = id % 64u; + it.word = id / 64u; + return it; + } + + std::pair insert(uint32_t id) { + if (words.size() * 64u <= id) + words.resize(DIV_ROUND_UP(id + 1, 64u)); + + Iterator it; + it.set = this; + it.bit = id % 64u; + it.word = id / 64u; + + uint64_t mask = 1ull << it.bit; + if (words[it.word] & mask) + return std::make_pair(it, false); + + words[it.word] |= mask; + bits_set++; + return std::make_pair(it, true); + } + + size_t erase(uint32_t id) { + if (!count(id)) + return 0; + + words[id / 64u] ^= 1ull << (id % 64u); + bits_set--; + return 1; + } + + Iterator cbegin() const { + Iterator it; + it.set = this; + for (size_t i = 0; i < words.size(); i++) { + if (words[i]) { + it.word = i; + it.bit = ffsll(words[i]) - 1; + return it; + } + } + return end(); + } + + Iterator cend() const { + Iterator it; + it.set = this; + it.word = words.size(); + it.bit = 0; + return it; + } + + Iterator begin() const { + return cbegin(); + } + + Iterator end() const { + return cend(); + } + + bool empty() const { + return bits_set == 0; + } + + size_t size() const { + return bits_set; + } + + std::vector words; + uint32_t bits_set; +}; + +inline IDSet::Iterator& IDSet::Iterator::operator ++() { + uint64_t m = set->words[word]; + m &= ~((2ull << bit) - 1ull); + if (!m) { + /* don't continue past the end */ + if (word == set->words.size()) + return *this; + + word++; + for (; word < set->words.size(); word++) { + if (set->words[word]) { + bit = ffsll(set->words[word]) - 1; + return *this; + } + } + bit = 0; + } else { + bit = ffsll(m) - 1; + } + return *this; +} + +inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const { + assert(set == other.set); + return id != other.id; +} + +inline uint32_t IDSet::Iterator::operator * () const { + return (word << 6) | bit; +} + } // namespace aco #endif // ACO_UTIL_H diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 02852d6..4a22dac 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -739,7 +739,8 @@ bool validate_ra(Program *program, const struct radv_nir_compiler_options *optio regs.fill(0); std::set live; - live.insert(live_vars.live_out[block.index].begin(), live_vars.live_out[block.index].end()); + for (unsigned id : live_vars.live_out[block.index]) + live.insert(Temp(id, program->temp_rc[id])); /* remove killed p_phi sgpr operands */ for (Temp tmp : phi_sgpr_ops[block.index]) live.erase(tmp);