From e43a3a66a9d8a99021d76ff4d07dec7b8cfd62ca Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 9 Apr 2012 20:58:39 +0200 Subject: [PATCH] nv50/ir: rewrite the register allocator as GCRA, with spilling This is more flexible than the linear scan, and we don't need the separate allocation pass for constrained values anymore. --- src/gallium/drivers/nv50/codegen/nv50_ir.cpp | 94 +- src/gallium/drivers/nv50/codegen/nv50_ir.h | 25 +- src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp | 4 + .../drivers/nv50/codegen/nv50_ir_from_tgsi.cpp | 1 + src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h | 5 + .../drivers/nv50/codegen/nv50_ir_peephole.cpp | 2 +- src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp | 3 + src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp | 1588 +++++++++++++++----- src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp | 118 +- src/gallium/drivers/nv50/codegen/nv50_ir_util.h | 47 +- .../drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp | 6 +- 11 files changed, 1475 insertions(+), 418 deletions(-) diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp index 314701b..cdf0a13 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp @@ -222,66 +222,17 @@ Value::Value() reg.size = 4; } -bool -Value::coalesce(Value *jval, bool force) -{ - Value *repr = this->join; // new representative - Value *jrep = jval->join; - - if (reg.file != jval->reg.file || reg.size != jval->reg.size) { - if (!force) - return false; - ERROR("forced coalescing of values of different sizes/files"); - } - - if (!force && (repr->reg.data.id != jrep->reg.data.id)) { - if (repr->reg.data.id >= 0 && - jrep->reg.data.id >= 0) - return false; - if (jrep->reg.data.id >= 0) { - repr = jval->join; - jrep = this->join; - jval = this; - } - - // need to check all fixed register values of the program for overlap - Function *func = defs.front()->getInsn()->bb->getFunction(); - - // TODO: put values in by register-id bins per function - ArrayList::Iterator iter = func->allLValues.iterator(); - for (; !iter.end(); iter.next()) { - Value *fixed = reinterpret_cast(iter.get()); - assert(fixed); - if (fixed->reg.data.id == repr->reg.data.id) - if (fixed->livei.overlaps(jrep->livei)) - return false; - } - } - if (repr->livei.overlaps(jrep->livei)) { - if (!force) - return false; - // do we really want this ? if at all, only for constraint ops - INFO("NOTE: forced coalescing with live range overlap\n"); - } - - for (DefIterator it = jrep->defs.begin(); it != jrep->defs.end(); ++it) - (*it)->get()->join = repr; - - repr->defs.insert(repr->defs.end(), - jrep->defs.begin(), jrep->defs.end()); - repr->livei.unify(jrep->livei); - - assert(repr->join == repr && jval->join == repr); - return true; -} - LValue::LValue(Function *fn, DataFile file) { reg.file = file; reg.size = (file != FILE_PREDICATE) ? 4 : 1; reg.data.id = -1; - affinity = -1; + compMask = 0; + compound = 0; + ssa = 0; + fixedReg = 0; + noSpill = 0; fn->add(this, this->id); } @@ -294,7 +245,11 @@ LValue::LValue(Function *fn, LValue *lval) reg.size = lval->reg.size; reg.data.id = -1; - affinity = -1; + compMask = 0; + compound = 0; + ssa = 0; + fixedReg = 0; + noSpill = 0; fn->add(this, this->id); } @@ -523,8 +478,8 @@ Value::interfers(const Value *that) const idA = this->join->reg.data.offset; idB = that->join->reg.data.offset; } else { - idA = this->join->reg.data.id * this->reg.size; - idB = that->join->reg.data.id * that->reg.size; + idA = this->join->reg.data.id * MIN2(this->reg.size, 4); + idB = that->join->reg.data.id * MIN2(that->reg.size, 4); } if (idA < idB) @@ -539,8 +494,6 @@ Value::interfers(const Value *that) const bool Value::equals(const Value *that, bool strict) const { - that = that->join; - if (strict) return this == that; @@ -754,20 +707,38 @@ Instruction::clone(ClonePolicy& pol, Instruction *i) const } unsigned int -Instruction::defCount(unsigned int mask) const +Instruction::defCount(unsigned int mask, bool singleFile) const { unsigned int i, n; + if (singleFile) { + unsigned int d = ffs(mask); + if (!d) + return 0; + for (i = d--; defExists(i); ++i) + if (getDef(i)->reg.file != getDef(d)->reg.file) + mask &= ~(1 << i); + } + for (n = 0, i = 0; this->defExists(i); ++i, mask >>= 1) n += mask & 1; return n; } unsigned int -Instruction::srcCount(unsigned int mask) const +Instruction::srcCount(unsigned int mask, bool singleFile) const { unsigned int i, n; + if (singleFile) { + unsigned int s = ffs(mask); + if (!s) + return 0; + for (i = s--; srcExists(i); ++i) + if (getSrc(i)->reg.file != getSrc(s)->reg.file) + mask &= ~(1 << i); + } + for (n = 0, i = 0; this->srcExists(i); ++i, mask >>= 1) n += mask & 1; return n; @@ -1137,6 +1108,7 @@ out: info->bin.maxGPR = prog->maxGPR; info->bin.code = prog->code; info->bin.codeSize = prog->binSize; + info->bin.tlsSpace = prog->tlsSize; delete prog; nv50_ir::Target::destroy(targ); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h index b06d093..a52cc9a 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h @@ -219,6 +219,7 @@ enum DataFile FILE_PREDICATE, // boolean predicate FILE_FLAGS, // zero/sign/carry/overflow bits FILE_ADDRESS, + LAST_REGISTER_FILE = FILE_ADDRESS, FILE_IMMEDIATE, FILE_MEMORY_CONST, FILE_SHADER_INPUT, @@ -320,7 +321,7 @@ struct Storage float f32; double f64; int32_t offset; // offset from 0 (base of address space) - int32_t id; // register id (< 0 if virtual/unassigned) + int32_t id; // register id (< 0 if virtual/unassigned, in units <= 4) struct { SVSemantic sv; int index; @@ -473,8 +474,6 @@ public: inline const Symbol *asSym() const; inline const ImmediateValue *asImm() const; - bool coalesce(Value *, bool force = false); - inline bool inFile(DataFile f) { return reg.file == f; } static inline Value *get(Iterator&); @@ -506,9 +505,11 @@ public: virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; public: - unsigned ssa : 1; - - int affinity; + unsigned compMask : 8; // compound/component mask + unsigned compound : 1; // used by RA, value involved in split/merge + unsigned ssa : 1; + unsigned fixedReg : 1; // set & used by RA, earlier just use (id < 0) + unsigned noSpill : 1; // do not spill (e.g. if spill temporary already) }; class Symbol : public Value @@ -611,7 +612,7 @@ public: return s < srcs.size() && srcs[s].exists(); } - inline bool constrainedDefs() const { return defExists(1); } + inline bool constrainedDefs() const; bool setPredicate(CondCode ccode, Value *); inline Value *getPredicate() const; @@ -622,9 +623,9 @@ public: inline void setFlagsDef(int d, Value *); unsigned int defCount() const { return defs.size(); }; - unsigned int defCount(unsigned int mask) const; + unsigned int defCount(unsigned int mask, bool singleFile = false) const; unsigned int srcCount() const { return srcs.size(); }; - unsigned int srcCount(unsigned int mask) const; + unsigned int srcCount(unsigned int mask, bool singleFile = false) const; // save & remove / set indirect[0,1] and predicate source void takeExtraSources(int s, Value *[3]); @@ -965,6 +966,11 @@ public: uint32_t binPos; uint32_t binSize; + Value *stackPtr; + + uint32_t tlsBase; // base address for l[] space (if no stack pointer is used) + uint32_t tlsSize; + ArrayList allBBlocks; ArrayList allInsns; ArrayList allLValues; @@ -1036,6 +1042,7 @@ public: uint32_t *code; uint32_t binSize; + uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL int maxGPR; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp index 7b211a2..ebfb07a 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp @@ -41,6 +41,10 @@ Function::Function(Program *p, const char *fnName, uint32_t label) binPos = 0; binSize = 0; + stackPtr = NULL; + tlsBase = 0; + tlsSize = 0; + prog->add(this, id); } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp index c419a7d..9fab1c3 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp @@ -2420,6 +2420,7 @@ Program::makeFromTGSI(struct nv50_ir_prog_info *info) tgsi::Source src(info); if (!src.scanSource()) return false; + tlsSize = info->bin.tlsSpace; Converter builder(this, &src); return builder.run(); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h b/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h index b06f217..4ce9deb 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h @@ -192,6 +192,11 @@ Instruction *Value::getUniqueInsn() const return defs.front()->getInsn(); } +inline bool Instruction::constrainedDefs() const +{ + return defExists(1) || op == OP_UNION; +} + Value *Instruction::getIndirect(int s, int dim) const { return srcs[s].isIndirect(dim) ? getSrc(srcs[s].indirect[dim]) : NULL; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp index 867bcb0..2cca449 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp @@ -48,7 +48,7 @@ Instruction::isNop() const } if (op == OP_MOV || op == OP_UNION) { - if (!def(0).rep()->equals(getSrc(0))) + if (!getDef(0)->equals(getSrc(0))) return false; if (op == OP_UNION) if (!def(0).rep()->equals(getSrc(1))) diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp index a831145..6138233 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp @@ -283,6 +283,9 @@ int LValue::print(char *buf, size_t size, DataType ty) const else if (reg.size == 16) postFix = "q"; + else + if (reg.size == 12) + postFix = "t"; break; case FILE_PREDICATE: r = 'p'; col = TXT_REGISTER; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp index f13335f..cd8f289 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp @@ -23,7 +23,8 @@ #include "nv50_ir.h" #include "nv50_ir_target.h" -#include "nv50/nv50_debug.h" +#include +#include namespace nv50_ir { @@ -32,41 +33,74 @@ namespace nv50_ir { class RegisterSet { public: - RegisterSet(); RegisterSet(const Target *); void init(const Target *); - void reset(); // reset allocation status, but not max assigned regs + void reset(DataFile, bool resetMax = false); void periodicMask(DataFile f, uint32_t lock, uint32_t unlock); void intersect(DataFile f, const RegisterSet *); - bool assign(Value **, int nr); - void release(const Value *); + bool assign(int32_t& reg, DataFile f, unsigned int size); + void release(DataFile f, int32_t reg, unsigned int size); + bool occupy(DataFile f, int32_t reg, unsigned int size); bool occupy(const Value *); + void occupyMask(DataFile f, int32_t reg, uint8_t mask); - int getMaxAssigned(DataFile f) const { return fill[f]; } + inline int getMaxAssigned(DataFile f) const { return fill[f]; } + + inline unsigned int getFileSize(DataFile f, uint8_t regSize) const + { + if (restrictedGPR16Range && f == FILE_GPR && regSize == 2) + return (last[f] + 1) / 2; + return last[f] + 1; + } + + inline unsigned int units(DataFile f, unsigned int size) const + { + return size >> unit[f]; + } + // for regs of size >= 4, id is counted in 4-byte words (like nv50/c0 binary) + inline unsigned int idToBytes(Value *v) const + { + return v->reg.data.id * MIN2(v->reg.size, 4); + } + inline unsigned int idToUnits(Value *v) const + { + return units(v->reg.file, idToBytes(v)); + } + inline int bytesToId(Value *v, unsigned int bytes) const + { + if (v->reg.size < 4) + return units(v->reg.file, bytes); + return bytes / 4; + } + inline int unitsToId(DataFile f, int u, uint8_t size) const + { + if (u < 0) + return -1; + return (size < 4) ? u : ((u << unit[f]) / 4); + } void print() const; private: - uint32_t bits[FILE_ADDRESS + 1][(MAX_REGISTER_FILE_SIZE + 31) / 32]; + BitSet bits[LAST_REGISTER_FILE + 1]; + + int unit[LAST_REGISTER_FILE + 1]; // log2 of allocation granularity - int unit[FILE_ADDRESS + 1]; // log2 of allocation granularity + int last[LAST_REGISTER_FILE + 1]; + int fill[LAST_REGISTER_FILE + 1]; - int last[FILE_ADDRESS + 1]; - int fill[FILE_ADDRESS + 1]; + const bool restrictedGPR16Range; }; void -RegisterSet::reset() +RegisterSet::reset(DataFile f, bool resetMax) { - memset(bits, 0, sizeof(bits)); -} - -RegisterSet::RegisterSet() -{ - reset(); + bits[f].fill(0); + if (resetMax) + fill[f] = -1; } void @@ -78,118 +112,83 @@ RegisterSet::init(const Target *targ) unit[rf] = targ->getFileUnit(f); fill[rf] = -1; assert(last[rf] < MAX_REGISTER_FILE_SIZE); + bits[rf].allocate(last[rf] + 1, true); } } RegisterSet::RegisterSet(const Target *targ) + : restrictedGPR16Range(targ->getChipset() < 0xc0) { - reset(); init(targ); + for (unsigned int i = 0; i <= LAST_REGISTER_FILE; ++i) + reset(static_cast(i)); } void RegisterSet::periodicMask(DataFile f, uint32_t lock, uint32_t unlock) { - for (int i = 0; i < (last[f] + 31) / 32; ++i) - bits[f][i] = (bits[f][i] | lock) & ~unlock; + bits[f].periodicMask32(lock, unlock); } void RegisterSet::intersect(DataFile f, const RegisterSet *set) { - for (int i = 0; i < (last[f] + 31) / 32; ++i) - bits[f][i] |= set->bits[f][i]; + bits[f] |= set->bits[f]; } void RegisterSet::print() const { INFO("GPR:"); - for (int i = 0; i < (last[FILE_GPR] + 31) / 32; ++i) - INFO(" %08x", bits[FILE_GPR][i]); + bits[FILE_GPR].print(); INFO("\n"); } bool -RegisterSet::assign(Value **def, int nr) +RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size) { - DataFile f = def[0]->reg.file; - int n = nr; - if (n == 3) - n = 4; - int s = (n * def[0]->reg.size) >> unit[f]; - uint32_t m = (1 << s) - 1; - - int id = last[f] + 1; - int i; - - for (i = 0; (i * 32) < last[f]; ++i) { - if (bits[f][i] == 0xffffffff) - continue; - - for (id = 0; id < 32; id += s) - if (!(bits[f][i] & (m << id))) - break; - if (id < 32) - break; - } - id += i * 32; - if (id > last[f]) + reg = bits[f].findFreeRange(size); + if (reg < 0) return false; - - bits[f][id / 32] |= m << (id % 32); - - if (id + (s - 1) > fill[f]) - fill[f] = id + (s - 1); - - for (i = 0; i < nr; ++i, ++id) - if (!def[i]->livei.isEmpty()) // XXX: really increased id if empty ? - def[i]->reg.data.id = id; + fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1)); return true; } bool -RegisterSet::occupy(const Value *val) +RegisterSet::occupy(const Value *v) { - int id = val->reg.data.id; - unsigned int f = val->reg.file; + return occupy(v->reg.file, v->reg.data.id, v->reg.size >> unit[v->reg.file]); +} - uint32_t m = (1 << (val->reg.size >> unit[f])) - 1; +void +RegisterSet::occupyMask(DataFile f, int32_t reg, uint8_t mask) +{ + bits[f].setMask(reg & ~31, static_cast(mask) << (reg % 32)); +} - if (id < 0 || bits[f][id / 32] & m << (id % 32)) +bool +RegisterSet::occupy(DataFile f, int32_t reg, unsigned int size) +{ + if (bits[f].testRange(reg, size)) return false; - INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %x\n", f, id, m); + bits[f].setRange(reg, size); - bits[f][id / 32] |= m << (id % 32); + INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %u\n", f, reg, size); - if (fill[f] < id) - fill[f] = id; + fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1)); return true; } void -RegisterSet::release(const Value *val) +RegisterSet::release(DataFile f, int32_t reg, unsigned int size) { - int id = val->reg.data.id; - if (id < 0) - return; - unsigned int f = val->reg.file; - - uint32_t m = (1 << (val->reg.size >> unit[f])) - 1; + bits[f].clrRange(reg, size); - INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %x\n", f, id, m); - - bits[f][id / 32] &= ~(m << (id % 32)); + INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %u\n", f, reg, size); } -#define JOIN_MASK_PHI (1 << 0) -#define JOIN_MASK_UNION (1 << 1) -#define JOIN_MASK_MOV (1 << 2) -#define JOIN_MASK_TEX (1 << 3) -#define JOIN_MASK_CONSTRAINT (1 << 4) - class RegAlloc { public: @@ -199,11 +198,6 @@ public: bool execFunc(); private: - bool coalesceValues(unsigned int mask); - bool linearScan(); - bool allocateConstrainedValues(); - -private: class PhiMovesPass : public Pass { private: virtual bool visit(BasicBlock *); @@ -230,19 +224,25 @@ private: bool insertConstraintMoves(); + void condenseDefs(Instruction *); + void condenseSrcs(Instruction *, const int first, const int last); + void addHazard(Instruction *i, const ValueRef *src); void textureMask(TexInstruction *); void addConstraint(Instruction *, int s, int n); bool detectConflict(Instruction *, int s); - DLList constrList; + // target specific functions, TODO: put in subclass or Target + void texConstraintNV50(TexInstruction *); + void texConstraintNVC0(TexInstruction *); + void texConstraintNVE0(TexInstruction *); + + std::list constrList; + + const Target *targ; }; bool buildLiveSets(BasicBlock *); - void collectLValues(DLList&, bool assignedOnly); - - void insertOrderedTail(DLList&, Value *); - inline Instruction *insnBySerial(int); private: Program *prog; @@ -254,11 +254,36 @@ private: int sequence; // for manual passes through CFG }; -Instruction * -RegAlloc::insnBySerial(int serial) +typedef std::pair ValuePair; + +class SpillCodeInserter { - return reinterpret_cast(insns.get(serial)); -} +public: + SpillCodeInserter(Function *fn) : func(fn), stackSize(0), stackBase(0) { } + + bool run(const std::list&); + + Symbol *assignSlot(const Interval&, unsigned int size); + inline int32_t getStackSize() const { return stackSize; } + +private: + Function *func; + + struct SpillSlot + { + Interval occup; + std::list residents; // needed to recalculate occup + Symbol *sym; + int32_t offset; + inline uint8_t size() const { return sym->reg.size; } + }; + std::list slots; + int32_t stackSize; + int32_t stackBase; + + LValue *unspill(Instruction *usei, LValue *, Value *slot); + void spill(Instruction *defi, Value *slot, LValue *); +}; void RegAlloc::BuildIntervalsPass::addLiveRange(Value *val, @@ -312,23 +337,26 @@ RegAlloc::PhiMovesPass::visit(BasicBlock *bb) Instruction *phi, *mov; BasicBlock *pb, *pn; + std::stack stack; + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { - pb = pn = BasicBlock::get(ei.getNode()); + pb = BasicBlock::get(ei.getNode()); assert(pb); - - if (needNewElseBlock(bb, pb)) { - pn = new BasicBlock(func); - - // deletes an edge, iterator is invalid after this: - pb->cfg.detach(&bb->cfg); - pb->cfg.attach(&pn->cfg, Graph::Edge::TREE); - pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD); // XXX: check order ! - - assert(pb->getExit()->op != OP_CALL); - if (pb->getExit()->asFlow()->target.bb == bb) - pb->getExit()->asFlow()->target.bb = pn; - break; - } + if (needNewElseBlock(bb, pb)) + stack.push(pb); + } + while (!stack.empty()) { + pb = stack.top(); + pn = new BasicBlock(func); + stack.pop(); + + pb->cfg.detach(&bb->cfg); + pb->cfg.attach(&pn->cfg, Graph::Edge::TREE); + pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD); + + assert(pb->getExit()->op != OP_CALL); + if (pb->getExit()->asFlow()->target.bb == bb) + pb->getExit()->asFlow()->target.bb = pn; } // insert MOVs (phi->src(j) should stem from j-th in-BB) @@ -567,36 +595,350 @@ RegAlloc::BuildIntervalsPass::visit(BasicBlock *bb) return true; } + +#define JOIN_MASK_PHI (1 << 0) +#define JOIN_MASK_UNION (1 << 1) +#define JOIN_MASK_MOV (1 << 2) +#define JOIN_MASK_TEX (1 << 3) + +class GCRA +{ +public: + GCRA(Function *, SpillCodeInserter&); + ~GCRA(); + + bool allocateRegisters(ArrayList& insns); + + void printNodeInfo() const; + +private: + class RIG_Node : public Graph::Node + { + public: + RIG_Node(); + + void init(const RegisterSet&, LValue *); + + void addInterference(RIG_Node *); + void addRegPreference(RIG_Node *); + + inline LValue *getValue() const + { + return reinterpret_cast(data); + } + inline void setValue(LValue *lval) { data = lval; } + + inline uint8_t getCompMask() const + { + return ((1 << colors) - 1) << (reg & 7); + } + + static inline RIG_Node *get(const Graph::EdgeIterator& ei) + { + return static_cast(ei.getNode()); + } + + public: + uint32_t degree; + uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable + uint16_t colors; + + DataFile f; + int32_t reg; + + float weight; + + // list pointers for simplify() phase + RIG_Node *next; + RIG_Node *prev; + + // union of the live intervals of all coalesced values (we want to retain + // the separate intervals for testing interference of compound values) + Interval livei; + + std::list prefRegs; + }; + +private: + inline RIG_Node *getNode(const LValue *v) const { return &nodes[v->id]; } + + void buildRIG(ArrayList&); + bool coalesce(ArrayList&); + bool doCoalesce(ArrayList&, unsigned int mask); + void calculateSpillWeights(); + void simplify(); + bool selectRegisters(); + void cleanup(const bool success); + + void simplifyEdge(RIG_Node *, RIG_Node *); + void simplifyNode(RIG_Node *); + + bool coalesceValues(Value *, Value *, bool force); + void resolveSplitsAndMerges(); + void makeCompound(Instruction *, bool isSplit); + + inline void checkInterference(const RIG_Node *, Graph::EdgeIterator&); + + inline void insertOrderedTail(std::list&, RIG_Node *); + void checkList(std::list&); + +private: + std::stack stack; + + // list headers for simplify() phase + RIG_Node lo[2]; + RIG_Node hi; + + Graph RIG; + RIG_Node *nodes; + unsigned int nodeCount; + + Function *func; + Program *prog; + + static uint8_t relDegree[17][17]; + + RegisterSet regs; + + // need to fixup register id for participants of OP_MERGE/SPLIT + std::list merges; + std::list splits; + + SpillCodeInserter& spill; + std::list mustSpill; +}; + +uint8_t GCRA::relDegree[17][17]; + +GCRA::RIG_Node::RIG_Node() : Node(NULL), next(this), prev(this) +{ + colors = 0; +} + +void +GCRA::printNodeInfo() const +{ + for (unsigned int i = 0; i < nodeCount; ++i) { + if (!nodes[i].colors) + continue; + INFO("RIG_Node[%%%i]($[%u]%i): %u colors, weight %f, deg %u/%u\n X", + i, + nodes[i].f,nodes[i].reg,nodes[i].colors, + nodes[i].weight, + nodes[i].degree, nodes[i].degreeLimit); + + for (Graph::EdgeIterator ei = nodes[i].outgoing(); !ei.end(); ei.next()) + INFO(" %%%i", RIG_Node::get(ei)->getValue()->id); + for (Graph::EdgeIterator ei = nodes[i].incident(); !ei.end(); ei.next()) + INFO(" %%%i", RIG_Node::get(ei)->getValue()->id); + INFO("\n"); + } +} + +void +GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval) +{ + setValue(lval); + if (lval->reg.data.id >= 0) + lval->noSpill = lval->fixedReg = 1; + + colors = regs.units(lval->reg.file, lval->reg.size); + f = lval->reg.file; + reg = -1; + if (lval->reg.data.id >= 0) + reg = regs.idToUnits(lval); + + weight = std::numeric_limits::infinity(); + degree = 0; + degreeLimit = regs.getFileSize(f, lval->reg.size); + + livei.insert(lval->livei); +} + +bool +GCRA::coalesceValues(Value *dst, Value *src, bool force) +{ + LValue *rep = dst->join->asLValue(); + LValue *val = src->join->asLValue(); + + if (!force && val->reg.data.id >= 0) { + rep = src->join->asLValue(); + val = dst->join->asLValue(); + } + RIG_Node *nRep = &nodes[rep->id]; + RIG_Node *nVal = &nodes[val->id]; + + if (src->reg.file != dst->reg.file) { + if (!force) + return false; + WARN("forced coalescing of values in different files !\n"); + } + if (!force && dst->reg.size != src->reg.size) + return false; + + if ((rep->reg.data.id >= 0) && (rep->reg.data.id != val->reg.data.id)) { + if (force) { + if (val->reg.data.id >= 0) + WARN("forced coalescing of values in different fixed regs !\n"); + } else { + if (val->reg.data.id >= 0) + return false; + // make sure that there is no overlap with the fixed register of rep + for (ArrayList::Iterator it = func->allLValues.iterator(); + !it.end(); it.next()) { + Value *reg = reinterpret_cast(it.get())->asLValue(); + assert(reg); + if (reg->interfers(rep) && reg->livei.overlaps(nVal->livei)) + return false; + } + } + } + + if (!force && nRep->livei.overlaps(nVal->livei)) + return false; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "joining %%%i($%i) <- %%%i\n", + rep->id, rep->reg.data.id, val->id); + + // set join pointer of all values joined with val + for (Value::DefIterator def = val->defs.begin(); def != val->defs.end(); + ++def) + (*def)->get()->join = rep; + assert(rep->join == rep && val->join == rep); + + // add val's definitions to rep and extend the live interval of its RIG node + rep->defs.insert(rep->defs.end(), val->defs.begin(), val->defs.end()); + nRep->livei.unify(nVal->livei); + return true; +} + +bool +GCRA::coalesce(ArrayList& insns) +{ + bool ret = doCoalesce(insns, JOIN_MASK_PHI); + if (!ret) + return false; + switch (func->getProgram()->getTarget()->getChipset() & ~0xf) { + case 0x50: + case 0x80: + case 0x90: + case 0xa0: + ret = doCoalesce(insns, JOIN_MASK_UNION | JOIN_MASK_TEX); + break; + case 0xc0: + case 0xd0: + case 0xe0: + ret = doCoalesce(insns, JOIN_MASK_UNION); + break; + default: + break; + } + if (!ret) + return false; + return doCoalesce(insns, JOIN_MASK_MOV); +} + +static inline uint8_t makeCompMask(int compSize, int base, int size) +{ + uint8_t m = ((1 << size) - 1) << base; + + switch (compSize) { + case 1: + return 0xff; + case 2: + m |= (m << 2); + return (m << 4) | m; + case 3: + case 4: + return (m << 4) | m; + default: + assert(compSize <= 8); + return m; + } +} + +static inline void copyCompound(Value *dst, Value *src) +{ + LValue *ldst = dst->asLValue(); + LValue *lsrc = src->asLValue(); + + ldst->compound = lsrc->compound; + ldst->compMask = lsrc->compMask; +} + +void +GCRA::makeCompound(Instruction *insn, bool split) +{ + LValue *rep = (split ? insn->getSrc(0) : insn->getDef(0))->asLValue(); + + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { + INFO("makeCompound(split = %i): ", split); + insn->print(); + } + + const unsigned int size = getNode(rep)->colors; + unsigned int base = 0; + + if (!rep->compound) + rep->compMask = 0xff; + rep->compound = 1; + + for (int c = 0; split ? insn->defExists(c) : insn->srcExists(c); ++c) { + LValue *val = (split ? insn->getDef(c) : insn->getSrc(c))->asLValue(); + + val->compound = 1; + if (!val->compMask) + val->compMask = 0xff; + val->compMask &= makeCompMask(size, base, getNode(val)->colors); + assert(val->compMask); + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "compound: %%%i:%02x <- %%%i:%02x\n", + rep->id, rep->compMask, val->id, val->compMask); + + base += getNode(val)->colors; + } + assert(base == size); +} + bool -RegAlloc::coalesceValues(unsigned int mask) +GCRA::doCoalesce(ArrayList& insns, unsigned int mask) { int c, n; for (n = 0; n < insns.getSize(); ++n) { Instruction *i; - Instruction *insn = insnBySerial(n); + Instruction *insn = reinterpret_cast(insns.get(n)); switch (insn->op) { case OP_PHI: if (!(mask & JOIN_MASK_PHI)) break; for (c = 0; insn->srcExists(c); ++c) - if (!insn->getDef(0)->coalesce(insn->getSrc(c), false)) { + if (!coalesceValues(insn->getDef(0), insn->getSrc(c), false)) { + // this is bad ERROR("failed to coalesce phi operands\n"); return false; } break; case OP_UNION: + case OP_MERGE: if (!(mask & JOIN_MASK_UNION)) break; for (c = 0; insn->srcExists(c); ++c) - insn->getDef(0)->coalesce(insn->getSrc(c), true); + coalesceValues(insn->getDef(0), insn->getSrc(c), true); + if (insn->op == OP_MERGE) { + merges.push_back(insn); + if (insn->srcExists(1)) + makeCompound(insn, false); + } break; - case OP_CONSTRAINT: - if (!(mask & JOIN_MASK_CONSTRAINT)) + case OP_SPLIT: + if (!(mask & JOIN_MASK_UNION)) break; - for (c = 0; c < 4 && insn->srcExists(c); ++c) - insn->getDef(c)->coalesce(insn->getSrc(c), true); + splits.push_back(insn); + for (c = 0; insn->defExists(c); ++c) + coalesceValues(insn->getSrc(0), insn->getDef(c), true); + makeCompound(insn, true); break; case OP_MOV: if (!(mask & JOIN_MASK_MOV)) @@ -605,11 +947,13 @@ RegAlloc::coalesceValues(unsigned int mask) if (!insn->getDef(0)->uses.empty()) i = insn->getDef(0)->uses.front()->getInsn(); // if this is a contraint-move there will only be a single use - if (i && i->op == OP_CONSTRAINT) + if (i && i->op == OP_MERGE) // do we really still need this ? break; i = insn->getSrc(0)->getUniqueInsn(); - if (i && !i->constrainedDefs()) - insn->getDef(0)->coalesce(insn->getSrc(0), false); + if (i && !i->constrainedDefs()) { + if (coalesceValues(insn->getDef(0), insn->getSrc(0), false)) + copyCompound(insn->getSrc(0), insn->getDef(0)); + } break; case OP_TEX: case OP_TXB: @@ -621,8 +965,8 @@ RegAlloc::coalesceValues(unsigned int mask) case OP_TEXCSAA: if (!(mask & JOIN_MASK_TEX)) break; - for (c = 0; c < 4 && insn->srcExists(c); ++c) - insn->getDef(c)->coalesce(insn->getSrc(c), true); + for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c) + coalesceValues(insn->getDef(c), insn->getSrc(c), true); break; default: break; @@ -632,186 +976,555 @@ RegAlloc::coalesceValues(unsigned int mask) } void -RegAlloc::insertOrderedTail(DLList &list, Value *val) +GCRA::RIG_Node::addInterference(RIG_Node *node) +{ + this->degree += relDegree[node->colors][colors]; + node->degree += relDegree[colors][node->colors]; + + this->attach(node, Graph::Edge::CROSS); +} + +void +GCRA::RIG_Node::addRegPreference(RIG_Node *node) +{ + prefRegs.push_back(node); +} + +GCRA::GCRA(Function *fn, SpillCodeInserter& spill) : + func(fn), + regs(fn->getProgram()->getTarget()), + spill(spill) { - // we insert the live intervals in order, so this should be short - DLList::Iterator iter = list.revIterator(); - const int begin = val->livei.begin(); - for (; !iter.end(); iter.next()) { - if (reinterpret_cast(iter.get())->livei.begin() <= begin) + prog = func->getProgram(); + + // initialize relative degrees array - i takes away from j + for (int i = 1; i <= 16; ++i) + for (int j = 1; j <= 16; ++j) + relDegree[i][j] = j * ((i + j - 1) / j); +} + +GCRA::~GCRA() +{ + if (nodes) + delete[] nodes; +} + +void +GCRA::checkList(std::list& lst) +{ + GCRA::RIG_Node *prev = NULL; + + for (std::list::iterator it = lst.begin(); + it != lst.end(); + ++it) { + assert((*it)->getValue()->join == (*it)->getValue()); + if (prev) + assert(prev->livei.begin() <= (*it)->livei.begin()); + prev = *it; + } +} + +void +GCRA::insertOrderedTail(std::list& list, RIG_Node *node) +{ + if (node->livei.isEmpty()) + return; + // only the intervals of joined values don't necessarily arrive in order + std::list::iterator prev, it; + for (it = list.end(); it != list.begin(); it = prev) { + prev = it; + --prev; + if ((*prev)->livei.begin() <= node->livei.begin()) break; } - iter.insert(val); + list.insert(it, node); } -static void -checkList(DLList &list) +void +GCRA::buildRIG(ArrayList& insns) { - Value *prev = NULL; - Value *next = NULL; + std::list values, active; + + for (std::deque::iterator it = func->ins.begin(); + it != func->ins.end(); ++it) + insertOrderedTail(values, getNode(it->get()->asLValue())); + + for (int i = 0; i < insns.getSize(); ++i) { + Instruction *insn = reinterpret_cast(insns.get(i)); + for (int d = 0; insn->defExists(d); ++d) + if (insn->getDef(d)->rep() == insn->getDef(d)) + insertOrderedTail(values, getNode(insn->getDef(d)->asLValue())); + } + checkList(values); - for (DLList::Iterator iter = list.iterator(); !iter.end(); iter.next()) { - next = Value::get(iter); - assert(next); - if (prev) { - assert(prev->livei.begin() <= next->livei.begin()); + while (!values.empty()) { + RIG_Node *cur = values.front(); + + for (std::list::iterator it = active.begin(); + it != active.end(); + ++it) { + RIG_Node *node = *it; + + if (node->livei.end() <= cur->livei.begin()) { + it = active.erase(it); + --it; + } else + if (node->f == cur->f && node->livei.overlaps(cur->livei)) { + cur->addInterference(node); + } } - assert(next->join == next); - prev = next; + values.pop_front(); + active.push_back(cur); } } void -RegAlloc::collectLValues(DLList &list, bool assignedOnly) +GCRA::calculateSpillWeights() { - for (std::deque::iterator it = func->ins.begin(); - it != func->ins.end(); ++it) { - if (!assignedOnly || it->get()->reg.data.id >= 0) - insertOrderedTail(list, it->get()); - } + for (unsigned int i = 0; i < nodeCount; ++i) { + RIG_Node *const n = &nodes[i]; + if (!nodes[i].colors || nodes[i].livei.isEmpty()) + continue; + if (nodes[i].reg >= 0) { + // update max reg + regs.occupy(n->f, n->reg, n->colors); + continue; + } + LValue *val = nodes[i].getValue(); - for (int n = 0; n < insns.getSize(); ++n) { - Instruction *i = insnBySerial(n); + if (!val->noSpill) { + int rc = 0; + for (Value::DefIterator it = val->defs.begin(); + it != val->defs.end(); + ++it) + rc += (*it)->get()->refCount(); - for (int d = 0; i->defExists(d); ++d) - if (!i->getDef(d)->livei.isEmpty()) - if (!assignedOnly || i->getDef(d)->reg.data.id >= 0) - insertOrderedTail(list, i->getDef(d)); + nodes[i].weight = + (float)rc * (float)rc / (float)nodes[i].livei.extent(); + } + + if (nodes[i].degree < nodes[i].degreeLimit) { + int l = 0; + if (val->reg.size > 4) + l = 1; + DLLIST_ADDHEAD(&lo[l], &nodes[i]); + } else { + DLLIST_ADDHEAD(&hi, &nodes[i]); + } } - checkList(list); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + printNodeInfo(); } -bool -RegAlloc::allocateConstrainedValues() +void +GCRA::simplifyEdge(RIG_Node *a, RIG_Node *b) { - Value *defs[4]; - RegisterSet regSet[4]; - DLList regVals; + bool move = b->degree >= b->degreeLimit; - INFO_DBG(prog->dbgFlags, REG_ALLOC, "RA: allocating constrained values\n"); + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "edge: (%%%i, deg %u/%u) >-< (%%%i, deg %u/%u)\n", + a->getValue()->id, a->degree, a->degreeLimit, + b->getValue()->id, b->degree, b->degreeLimit); - collectLValues(regVals, true); + b->degree -= relDegree[a->colors][b->colors]; - for (int c = 0; c < 4; ++c) - regSet[c].init(prog->getTarget()); + move = move && b->degree < b->degreeLimit; + if (move && !DLLIST_EMPTY(b)) { + int l = (b->getValue()->reg.size > 4) ? 1 : 0; + DLLIST_DEL(b); + DLLIST_ADDTAIL(&lo[l], b); + } +} - for (int n = 0; n < insns.getSize(); ++n) { - Instruction *i = insnBySerial(n); +void +GCRA::simplifyNode(RIG_Node *node) +{ + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) + simplifyEdge(node, RIG_Node::get(ei)); - const int vecSize = i->defCount(0xf); - if (vecSize < 2) - continue; - assert(vecSize <= 4); + for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next()) + simplifyEdge(node, RIG_Node::get(ei)); - for (int c = 0; c < vecSize; ++c) - defs[c] = i->def(c).rep(); + DLLIST_DEL(node); + stack.push(node->getValue()->id); - if (defs[0]->reg.data.id >= 0) { - for (int c = 1; c < vecSize; ++c) { - assert(defs[c]->reg.data.id >= 0); + INFO_DBG(prog->dbgFlags, REG_ALLOC, "SIMPLIFY: pushed %%%i%s\n", + node->getValue()->id, + (node->degree < node->degreeLimit) ? "" : "(spill)"); +} + +void +GCRA::simplify() +{ + for (;;) { + if (!DLLIST_EMPTY(&lo[0])) { + do { + simplifyNode(lo[0].next); + } while (!DLLIST_EMPTY(&lo[0])); + } else + if (!DLLIST_EMPTY(&lo[1])) { + simplifyNode(lo[1].next); + } else + if (!DLLIST_EMPTY(&hi)) { + RIG_Node *best = hi.next; + float bestScore = best->weight / (float)best->degree; + // spill candidate + for (RIG_Node *it = best->next; it != &hi; it = it->next) { + float score = it->weight / (float)it->degree; + if (score < bestScore) { + best = it; + bestScore = score; + } } - continue; + if (isinf(bestScore)) { + ERROR("no viable spill candidates left\n"); + break; + } + simplifyNode(best); + } else { + break; } + } +} + +void +GCRA::checkInterference(const RIG_Node *node, Graph::EdgeIterator& ei) +{ + const RIG_Node *intf = RIG_Node::get(ei); + + if (intf->reg < 0) + return; + const LValue *vA = node->getValue(); + const LValue *vB = intf->getValue(); - for (int c = 0; c < vecSize; ++c) { - uint32_t mask; - regSet[c].reset(); + const uint8_t intfMask = ((1 << intf->colors) - 1) << (intf->reg & 7); - for (DLList::Iterator it = regVals.iterator(); !it.end(); it.next()) { - Value *rVal = Value::get(it); - if (rVal->reg.data.id >= 0 && rVal->livei.overlaps(defs[c]->livei)) - regSet[c].occupy(rVal); + if (vA->compound | vB->compound) { + // NOTE: this only works for >aligned< register tuples ! + for (Value::DefCIterator D = vA->defs.begin(); D != vA->defs.end(); ++D) { + for (Value::DefCIterator d = vB->defs.begin(); d != vB->defs.end(); ++d) { + const LValue *vD = (*D)->get()->asLValue(); + const LValue *vd = (*d)->get()->asLValue(); + + if (!vD->livei.overlaps(vd->livei)) { + INFO_DBG(prog->dbgFlags, REG_ALLOC, "(%%%i) X (%%%i): no overlap\n", + vD->id, vd->id); + continue; } - mask = 0x11111111; - if (vecSize == 2) // granularity is 2 instead of 4 - mask |= 0x11111111 << 2; - regSet[c].periodicMask(defs[0]->reg.file, 0, ~(mask << c)); - if (!defs[c]->livei.isEmpty()) - insertOrderedTail(regVals, defs[c]); + uint8_t mask = vD->compound ? vD->compMask : ~0; + if (vd->compound) { + assert(vB->compound); + mask &= vd->compMask & vB->compMask; + } else { + mask &= intfMask; + } + + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "(%%%i)%02x X (%%%i)%02x & %02x: $r%i.%02x\n", + vD->id, + vD->compound ? vD->compMask : 0xff, + vd->id, + vd->compound ? vd->compMask : intfMask, + vB->compMask, intf->reg & ~7, mask); + if (mask) + regs.occupyMask(node->f, intf->reg & ~7, mask); } - for (int c = 1; c < vecSize; ++c) - regSet[0].intersect(defs[0]->reg.file, ®Set[c]); + } + } else { + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "(%%%i) X (%%%i): $r%i + %u\n", + vA->id, vB->id, intf->reg, intf->colors); + regs.occupy(node->f, intf->reg, intf->colors); + } +} - if (!regSet[0].assign(&defs[0], vecSize)) // TODO: spilling - return false; +bool +GCRA::selectRegisters() +{ + INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nSELECT phase\n"); + + while (!stack.empty()) { + RIG_Node *node = &nodes[stack.top()]; + stack.pop(); + + regs.reset(node->f); + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nNODE[%%%i, %u colors]\n", + node->getValue()->id, node->colors); + + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) + checkInterference(node, ei); + for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next()) + checkInterference(node, ei); + + if (!node->prefRegs.empty()) { + for (std::list::const_iterator it = node->prefRegs.begin(); + it != node->prefRegs.end(); + ++it) { + if ((*it)->reg >= 0 && + regs.occupy(node->f, (*it)->reg, node->colors)) { + node->reg = (*it)->reg; + break; + } + } + } + if (node->reg >= 0) + continue; + LValue *lval = node->getValue(); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + regs.print(); + bool ret = regs.assign(node->reg, node->f, node->colors); + if (ret) { + INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg); + lval->compMask = node->getCompMask(); + } else { + INFO_DBG(prog->dbgFlags, REG_ALLOC, "must spill: %%%i (size %u)\n", + lval->id, lval->reg.size); + Symbol *slot = NULL; + if (lval->reg.file == FILE_GPR) + slot = spill.assignSlot(node->livei, lval->reg.size); + mustSpill.push_back(ValuePair(lval, slot)); + } + } + if (!mustSpill.empty()) + return false; + for (unsigned int i = 0; i < nodeCount; ++i) { + LValue *lval = nodes[i].getValue(); + if (nodes[i].reg >= 0 && nodes[i].colors > 0) + lval->reg.data.id = + regs.unitsToId(nodes[i].f, nodes[i].reg, lval->reg.size); } - for (int c = 0; c < 4; c += 2) - if (regSet[c].getMaxAssigned(FILE_GPR) > prog->maxGPR) - prog->maxGPR = regSet[c].getMaxAssigned(FILE_GPR); return true; } bool -RegAlloc::linearScan() +GCRA::allocateRegisters(ArrayList& insns) { - Value *cur, *val; - DLList unhandled, active, inactive; - RegisterSet f(prog->getTarget()), free(prog->getTarget()); + bool ret; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "allocateRegisters to %u instructions\n", insns.getSize()); - INFO_DBG(prog->dbgFlags, REG_ALLOC, "RA: linear scan\n"); + nodeCount = func->allLValues.getSize(); + nodes = new RIG_Node[nodeCount]; + if (!nodes) + return false; + for (unsigned int i = 0; i < nodeCount; ++i) { + LValue *lval = reinterpret_cast(func->allLValues.get(i)); + if (lval) { + nodes[i].init(regs, lval); + RIG.insert(&nodes[i]); + } + } - collectLValues(unhandled, false); + // coalesce first, we use only 1 RIG node for a group of joined values + ret = coalesce(insns); + if (!ret) + goto out; - for (DLList::Iterator cI = unhandled.iterator(); !cI.end();) { - cur = Value::get(cI); - cI.erase(); + if (func->getProgram()->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + func->printLiveIntervals(); - for (DLList::Iterator aI = active.iterator(); !aI.end();) { - val = Value::get(aI); - if (val->livei.end() <= cur->livei.begin()) { - free.release(val); - aI.erase(); - } else - if (!val->livei.contains(cur->livei.begin())) { - free.release(val); - aI.moveToList(inactive); - } else { - aI.next(); - } + buildRIG(insns); + calculateSpillWeights(); + simplify(); + + ret = selectRegisters(); + if (!ret) { + INFO_DBG(prog->dbgFlags, REG_ALLOC, + "selectRegisters failed, inserting spill code ...\n"); + regs.reset(FILE_GPR, true); + spill.run(mustSpill); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + func->print(); + } else { + prog->maxGPR = regs.getMaxAssigned(FILE_GPR); + } + +out: + cleanup(ret); + return ret; +} + +void +GCRA::cleanup(const bool success) +{ + mustSpill.clear(); + + for (ArrayList::Iterator it = func->allLValues.iterator(); + !it.end(); it.next()) { + LValue *lval = reinterpret_cast(it.get()); + + lval->livei.clear(); + + lval->compound = 0; + lval->compMask = 0; + + if (lval->join == lval) + continue; + + if (success) { + lval->reg.data.id = lval->join->reg.data.id; + } else { + for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end(); + ++d) + lval->join->defs.remove(*d); + lval->join = lval; } + } - for (DLList::Iterator iI = inactive.iterator(); !iI.end();) { - val = Value::get(iI); - if (val->livei.end() <= cur->livei.begin()) { - iI.erase(); - } else - if (val->livei.contains(cur->livei.begin())) { - free.occupy(val); - iI.moveToList(active); - } else { - iI.next(); + if (success) + resolveSplitsAndMerges(); + splits.clear(); // avoid duplicate entries on next coalesce pass + merges.clear(); + + delete[] nodes; + nodes = NULL; +} + +Symbol * +SpillCodeInserter::assignSlot(const Interval &livei, unsigned int size) +{ + SpillSlot slot; + int32_t offsetBase = stackSize; + int32_t offset; + std::list::iterator pos = slots.end(), it = slots.begin(); + + if (offsetBase % size) + offsetBase += size - (offsetBase % size); + + slot.sym = NULL; + + for (offset = offsetBase; offset < stackSize; offset += size) { + while (it != slots.end() && it->offset < offset) + ++it; + if (it == slots.end()) // no slots left + break; + std::list::iterator bgn = it; + + while (it != slots.end() && it->offset < (offset + size)) { + it->occup.print(); + if (it->occup.overlaps(livei)) + break; + ++it; + } + if (it == slots.end() || it->offset >= (offset + size)) { + // fits + for (; bgn != slots.end() && bgn->offset < (offset + size); ++bgn) { + bgn->occup.insert(livei); + if (bgn->size() == size) + slot.sym = bgn->sym; } + break; } - f = free; + } + if (!slot.sym) { + stackSize = offset + size; + slot.offset = offset; + slot.sym = new_Symbol(func->getProgram(), FILE_MEMORY_LOCAL); + if (!func->stackPtr) + offset += func->tlsBase; + slot.sym->setAddress(NULL, offset); + slot.sym->reg.size = size; + slots.insert(pos, slot)->occup.insert(livei); + } + return slot.sym; +} - for (DLList::Iterator iI = inactive.iterator(); !iI.end(); iI.next()) { - val = Value::get(iI); - if (val->livei.overlaps(cur->livei)) - f.occupy(val); - } +void +SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval) +{ + const DataType ty = typeOfSize(slot->reg.size); + + Instruction *st; + if (slot->reg.file == FILE_MEMORY_LOCAL) { + st = new_Instruction(func, OP_STORE, ty); + st->setSrc(0, slot); + st->setSrc(1, lval); + lval->noSpill = 1; + } else { + st = new_Instruction(func, OP_CVT, ty); + st->setDef(0, slot); + st->setSrc(0, lval); + } + defi->bb->insertAfter(defi, st); +} - for (DLList::Iterator uI = unhandled.iterator(); !uI.end(); uI.next()) { - val = Value::get(uI); - if (val->reg.data.id >= 0 && val->livei.overlaps(cur->livei)) - f.occupy(val); - } +LValue * +SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot) +{ + const DataType ty = typeOfSize(slot->reg.size); + + lval = cloneShallow(func, lval); - if (cur->reg.data.id < 0) { - bool spill = !f.assign(&cur, 1); - if (spill) { - ERROR("out of registers of file %u\n", cur->reg.file); - abort(); + Instruction *ld; + if (slot->reg.file == FILE_MEMORY_LOCAL) { + lval->noSpill = 1; + ld = new_Instruction(func, OP_LOAD, ty); + } else { + ld = new_Instruction(func, OP_CVT, ty); + } + ld->setDef(0, lval); + ld->setSrc(0, slot); + + usei->bb->insertBefore(usei, ld); + return lval; +} + +bool +SpillCodeInserter::run(const std::list& lst) +{ + for (std::list::const_iterator it = lst.begin(); it != lst.end(); + ++it) { + LValue *lval = it->first->asLValue(); + Symbol *mem = it->second ? it->second->asSym() : NULL; + + for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end(); + ++d) { + Value *slot = mem ? + static_cast(mem) : new_LValue(func, FILE_GPR); + Value *tmp = NULL; + Instruction *last = NULL; + + LValue *dval = (*d)->get()->asLValue(); + Instruction *defi = (*d)->getInsn(); + + // handle uses first or they'll contain the spill stores + while (!dval->uses.empty()) { + ValueRef *u = dval->uses.front(); + Instruction *usei = u->getInsn(); + assert(usei); + if (usei->op == OP_PHI) { + tmp = (slot->reg.file == FILE_MEMORY_LOCAL) ? NULL : slot; + last = NULL; + } else + if (!last || usei != last->next) { // TODO: sort uses + tmp = unspill(usei, dval, slot); + last = usei; + } + u->set(tmp); + } + + assert(defi); + if (defi->op == OP_PHI) { + d = lval->defs.erase(d); + --d; + if (slot->reg.file == FILE_MEMORY_LOCAL) + delete_Instruction(func->getProgram(), defi); + else + defi->setDef(0, slot); + } else { + spill(defi, slot, dval); } } - free.occupy(cur); - active.insert(cur); + } - if (f.getMaxAssigned(FILE_GPR) > prog->maxGPR) - prog->maxGPR = f.getMaxAssigned(FILE_GPR); - if (free.getMaxAssigned(FILE_GPR) > prog->maxGPR) - prog->maxGPR = free.getMaxAssigned(FILE_GPR); + // TODO: We're not trying to reuse old slots in a potential next iteration. + // We have to update the slots' livei intervals to be able to do that. + stackBase = stackSize; + slots.clear(); return true; } @@ -821,8 +1534,11 @@ RegAlloc::exec() for (IteratorRef it = prog->calls.iteratorDFS(false); !it->end(); it->next()) { func = Function::get(reinterpret_cast(it->get())); + + func->tlsBase = prog->tlsSize; if (!execFunc()) return false; + prog->tlsSize += func->tlsSize; } return true; } @@ -834,8 +1550,11 @@ RegAlloc::execFunc() PhiMovesPass insertPhiMoves; ArgumentMovesPass insertArgMoves; BuildIntervalsPass buildIntervals; + SpillCodeInserter insertSpills(func); + + GCRA gcra(func, insertSpills); - unsigned int i; + unsigned int i, retries; bool ret; ret = insertConstr.exec(func); @@ -846,60 +1565,76 @@ RegAlloc::execFunc() if (!ret) goto out; - ret = insertArgMoves.run(func, true); + ret = insertArgMoves.run(func); if (!ret) goto out; - for (sequence = func->cfg.nextSequence(), i = 0; - ret && i <= func->loopNestingBound; - sequence = func->cfg.nextSequence(), ++i) - ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot())); - if (!ret) - goto out; - - func->orderInstructions(this->insns); - - ret = buildIntervals.run(func); - if (!ret) - goto out; - - ret = coalesceValues(JOIN_MASK_PHI); - if (!ret) - goto out; - switch (prog->getTarget()->getChipset() & 0xf0) { - case 0x50: - ret = coalesceValues(JOIN_MASK_UNION | JOIN_MASK_TEX); - break; - case 0xc0: - ret = coalesceValues(JOIN_MASK_UNION | JOIN_MASK_CONSTRAINT); - break; - default: - break; - } - if (!ret) - goto out; - ret = coalesceValues(JOIN_MASK_MOV); - if (!ret) - goto out; + // TODO: need to fix up spill slot usage ranges to support > 1 retry + for (retries = 0; retries < 3; ++retries) { + if (retries && (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)) + INFO("Retry: %i\n", retries); + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) + func->print(); + + // spilling to registers may add live ranges, need to rebuild everything + ret = true; + for (sequence = func->cfg.nextSequence(), i = 0; + ret && i <= func->loopNestingBound; + sequence = func->cfg.nextSequence(), ++i) + ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot())); + if (!ret) + break; + func->orderInstructions(this->insns); - if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { - func->print(); - func->printLiveIntervals(); + ret = buildIntervals.run(func); + if (!ret) + break; + ret = gcra.allocateRegisters(insns); + if (ret) + break; // success } + INFO_DBG(prog->dbgFlags, REG_ALLOC, "RegAlloc done: %i\n", ret); - ret = allocateConstrainedValues() && linearScan(); - if (!ret) - goto out; - + func->tlsSize = insertSpills.getStackSize(); out: - // TODO: should probably call destructor on LValues later instead - for (ArrayList::Iterator it = func->allLValues.iterator(); - !it.end(); it.next()) - reinterpret_cast(it.get())->livei.clear(); - return ret; } +// TODO: check if modifying Instruction::join here breaks anything +void +GCRA::resolveSplitsAndMerges() +{ + for (std::list::iterator it = splits.begin(); + it != splits.end(); + ++it) { + Instruction *split = *it; + unsigned int reg = regs.idToBytes(split->getSrc(0)); + for (int d = 0; split->defExists(d); ++d) { + Value *v = split->getDef(d); + v->reg.data.id = regs.bytesToId(v, reg); + v->join = v; + reg += v->reg.size; + } + delete_Instruction(prog, split); + } + splits.clear(); + + for (std::list::iterator it = merges.begin(); + it != merges.end(); + ++it) { + Instruction *merge = *it; + unsigned int reg = regs.idToBytes(merge->getDef(0)); + for (int s = 0; merge->srcExists(s); ++s) { + Value *v = merge->getSrc(s); + v->reg.data.id = regs.bytesToId(v, reg); + v->join = v; + reg += v->reg.size; + } + delete_Instruction(prog, merge); + } + merges.clear(); +} + bool Program::registerAllocation() { RegAlloc ra(this); @@ -936,17 +1671,10 @@ RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex) } tex->tex.mask = mask; -#if 0 // reorder or set the unused ones NULL ? - for (c = 0; c < 4; ++c) - if (!(tex->tex.mask & (1 << c))) - def[d++] = tex->getDef(c); -#endif for (c = 0; c < d; ++c) tex->setDef(c, def[c]); -#if 1 for (; c < 4; ++c) tex->setDef(c, NULL); -#endif } bool @@ -978,8 +1706,10 @@ RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n) int d; // first, look for an existing identical constraint op - for (DLList::Iterator it = constrList.iterator(); !it.end(); it.next()) { - cst = reinterpret_cast(it.get()); + for (std::list::iterator it = constrList.begin(); + it != constrList.end(); + ++it) { + cst = (*it); if (!i->bb->dominatedBy(cst->bb)) break; for (d = 0; d < n; ++d) @@ -1000,7 +1730,7 @@ RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n) } i->bb->insertBefore(i, cst); - constrList.insert(cst); + constrList.push_back(cst); } // Add a dummy use of the pointer source of >= 8 byte loads after the load @@ -1015,6 +1745,143 @@ RegAlloc::InsertConstraintsPass::addHazard(Instruction *i, const ValueRef *src) } +// b32 { %r0 %r1 %r2 %r3 } -> b128 %r0q +void +RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn) +{ + uint8_t size = 0; + int n; + for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n) + size += insn->getDef(n)->reg.size; + if (n < 2) + return; + LValue *lval = new_LValue(func, FILE_GPR); + lval->reg.size = size; + + Instruction *split = new_Instruction(func, OP_SPLIT, typeOfSize(size)); + split->setSrc(0, lval); + for (int d = 0; d < n; ++d) { + split->setDef(d, insn->getDef(d)); + insn->setDef(d, NULL); + } + insn->setDef(0, lval); + + for (int k = 1, d = n; insn->defExists(d); ++d, ++k) { + insn->setDef(k, insn->getDef(d)); + insn->setDef(d, NULL); + } + // carry over predicate if any (mainly for OP_UNION uses) + split->setPredicate(insn->cc, insn->getPredicate()); + + insn->bb->insertAfter(insn, split); + constrList.push_back(split); +} +void +RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn, + const int a, const int b) +{ + uint8_t size = 0; + if (a >= b) + return; + for (int s = a; s <= b; ++s) + size += insn->getSrc(s)->reg.size; + if (!size) + return; + LValue *lval = new_LValue(func, FILE_GPR); + lval->reg.size = size; + + Value *save[3]; + insn->takeExtraSources(0, save); + + Instruction *merge = new_Instruction(func, OP_MERGE, typeOfSize(size)); + merge->setDef(0, lval); + for (int s = a, i = 0; s <= b; ++s, ++i) { + merge->setSrc(i, insn->getSrc(s)); + insn->setSrc(s, NULL); + } + insn->setSrc(a, lval); + + for (int k = a + 1, s = b + 1; insn->srcExists(s); ++s, ++k) { + insn->setSrc(k, insn->getSrc(s)); + insn->setSrc(s, NULL); + } + insn->bb->insertBefore(insn, merge); + + insn->putExtraSources(0, save); + + constrList.push_back(merge); +} + +void +RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex) +{ + textureMask(tex); + condenseDefs(tex); + + int n = tex->srcCount(0xff, true); + if (n > 4) { + condenseSrcs(tex, 0, 3); + if (n > 5) + condenseSrcs(tex, 4, n - 1); + } else + if (n > 1) { + condenseSrcs(tex, 0, n - 1); + } +} + +void +RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex) +{ + int n, s; + + textureMask(tex); + + if (tex->op == OP_TXQ) { + s = tex->srcCount(0xff); + n = 0; + } else { + s = tex->tex.target.getArgCount(); + if (!tex->tex.target.isArray() && + (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0)) + ++s; + if (tex->op == OP_TXD && tex->tex.useOffsets) + ++s; + n = tex->srcCount(0xff) - s; + assert(n <= 4); + } + + if (s > 1) + condenseSrcs(tex, 0, s - 1); + if (n > 1) + condenseSrcs(tex, s, s + (n - 1)); + + condenseDefs(tex); +} + +void +RegAlloc::InsertConstraintsPass::texConstraintNV50(TexInstruction *tex) +{ + Value *pred = tex->getPredicate(); + if (pred) + tex->setPredicate(tex->cc, NULL); + + textureMask(tex); + + assert(tex->defExists(0) && tex->srcExists(0)); + // make src and def count match + int c; + for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) { + if (!tex->srcExists(c)) + tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue())); + if (!tex->defExists(c)) + tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue())); + } + if (pred) + tex->setPredicate(tex->cc, pred); + condenseDefs(tex); + condenseSrcs(tex, 0, c - 1); +} + // Insert constraint markers for instructions whose multiple sources must be // located in consecutive registers. bool @@ -1022,45 +1889,46 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) { TexInstruction *tex; Instruction *next; - int s, n, size; + int s, size; + + targ = bb->getProgram()->getTarget(); for (Instruction *i = bb->getEntry(); i; i = next) { next = i->next; if ((tex = i->asTex())) { - textureMask(tex); - - // FIXME: this is target specific - if (tex->op == OP_TXQ) { - s = tex->srcCount(0xff); - n = 0; - } else { - s = tex->tex.target.getArgCount(); - if (!tex->tex.target.isArray() && - (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0)) - ++s; - if (tex->op == OP_TXD && tex->tex.useOffsets) - ++s; - n = tex->srcCount(0xff) - s; - assert(n <= 4); + switch (targ->getChipset() & ~0xf) { + case 0x50: + case 0x80: + case 0x90: + case 0xa0: + texConstraintNV50(tex); + break; + case 0xc0: + case 0xd0: + texConstraintNVC0(tex); + break; + case 0xe0: + texConstraintNVE0(tex); + break; + default: + break; } - - if (s > 1) - addConstraint(i, 0, s); - if (n > 1) - addConstraint(i, s, n); } else if (i->op == OP_EXPORT || i->op == OP_STORE) { for (size = typeSizeof(i->dType), s = 1; size > 0; ++s) { assert(i->srcExists(s)); size -= i->getSrc(s)->reg.size; } - if ((s - 1) > 1) - addConstraint(i, 1, s - 1); + condenseSrcs(i, 1, s - 1); } else - if (i->op == OP_LOAD) { + if (i->op == OP_LOAD || i->op == OP_VFETCH) { + condenseDefs(i); if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8) addHazard(i, i->src(0).getIndirect(0)); + } else + if (i->op == OP_UNION) { + constrList.push_back(i); } } return true; @@ -1071,21 +1939,65 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) bool RegAlloc::InsertConstraintsPass::insertConstraintMoves() { - for (DLList::Iterator it = constrList.iterator(); !it.end(); it.next()) { - Instruction *cst = reinterpret_cast(it.get()); + for (std::list::iterator it = constrList.begin(); + it != constrList.end(); + ++it) { + Instruction *cst = *it; + Instruction *mov; + + if (cst->op == OP_SPLIT && 0) { + // spilling splits is annoying, just make sure they're separate + for (int d = 0; cst->defExists(d); ++d) { + if (!cst->getDef(d)->refCount()) + continue; + LValue *lval = new_LValue(func, cst->def(d).getFile()); + const uint8_t size = cst->def(d).getSize(); + lval->reg.size = size; + + mov = new_Instruction(func, OP_MOV, typeOfSize(size)); + mov->setSrc(0, lval); + mov->setDef(0, cst->getDef(d)); + cst->setDef(d, mov->getSrc(0)); + cst->bb->insertAfter(cst, mov); + + cst->getSrc(0)->asLValue()->noSpill = 1; + mov->getSrc(0)->asLValue()->noSpill = 1; + } + } else + if (cst->op == OP_MERGE || cst->op == OP_UNION) { + for (int s = 0; cst->srcExists(s); ++s) { + const uint8_t size = cst->src(s).getSize(); + + if (!cst->getSrc(s)->defs.size()) { + mov = new_Instruction(func, OP_NOP, typeOfSize(size)); + mov->setDef(0, cst->getSrc(s)); + cst->bb->insertBefore(cst, mov); + continue; + } + assert(cst->getSrc(s)->defs.size() == 1); // still SSA + + Instruction *defi = cst->getSrc(s)->defs.front()->getInsn(); + // catch some cases where don't really need MOVs + if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs()) + continue; + + LValue *lval = new_LValue(func, cst->src(s).getFile()); + lval->reg.size = size; - for (int s = 0; cst->srcExists(s); ++s) { - if (!detectConflict(cst, s)) - continue; - Instruction *mov = new_Instruction(func, OP_MOV, - typeOfSize(cst->src(s).getSize())); - mov->setSrc(0, cst->getSrc(s)); - mov->setDef(0, new_LValue(func, FILE_GPR)); - cst->setSrc(s, mov->getDef(0)); + mov = new_Instruction(func, OP_MOV, typeOfSize(size)); + mov->setDef(0, lval); + mov->setSrc(0, cst->getSrc(s)); + cst->setSrc(s, mov->getDef(0)); + cst->bb->insertBefore(cst, mov); - cst->bb->insertBefore(cst, mov); + cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help + + if (cst->op == OP_UNION) + mov->setPredicate(defi->cc, defi->getPredicate()); + } } } + return true; } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp index 4eaaa96..f7812a1 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp @@ -88,6 +88,11 @@ Stack::moveTo(Stack& that) this->size = 0; } +Interval::Interval(const Interval& that) : head(NULL), tail(NULL) +{ + this->insert(that); +} + Interval::~Interval() { clear(); @@ -148,7 +153,7 @@ Interval::extend(int a, int b) return true; } -bool Interval::contains(int pos) +bool Interval::contains(int pos) const { for (Range *r = head; r && r->bgn <= pos; r = r->next) if (r->end > pos) @@ -156,16 +161,37 @@ bool Interval::contains(int pos) return false; } -bool Interval::overlaps(const Interval &iv) const +bool Interval::overlaps(const Interval &that) const { +#if 1 + Range *a = this->head; + Range *b = that.head; + + while (a && b) { + if (b->bgn < a->end && + b->end > a->bgn) + return true; + if (a->end <= b->bgn) + a = a->next; + else + b = b->next; + } +#else for (Range *rA = this->head; rA; rA = rA->next) for (Range *rB = iv.head; rB; rB = rB->next) if (rB->bgn < rA->end && rB->end > rA->bgn) return true; +#endif return false; } +void Interval::insert(const Interval &that) +{ + for (Range *r = that.head; r; r = r->next) + this->extend(r->bgn, r->end); +} + void Interval::unify(Interval &that) { assert(this != &that); @@ -177,6 +203,14 @@ void Interval::unify(Interval &that) that.head = NULL; } +int Interval::length() const +{ + int len = 0; + for (Range *r = head; r; r = r->next) + len += r->bgn - r->end; + return len; +} + void Interval::print() const { if (!head) @@ -205,6 +239,27 @@ BitSet& BitSet::operator|=(const BitSet &set) return *this; } +bool BitSet::resize(unsigned int nBits) +{ + if (!data || !nBits) + return allocate(nBits, true); + const unsigned int p = (size + 31) / 32; + const unsigned int n = (nBits + 31) / 32; + if (n == p) + return true; + + data = (uint32_t *)REALLOC(data, 4 * p, 4 * n); + if (!data) { + size = 0; + return false; + } + if (n > p) + memset(&data[4 * p + 4], 0, (n - p) * 4); + + size = nBits; + return true; +} + bool BitSet::allocate(unsigned int nBits, bool zero) { if (data && size < nBits) { @@ -254,6 +309,65 @@ void BitSet::setOr(BitSet *pA, BitSet *pB) } } +int BitSet::findFreeRange(unsigned int count) const +{ + const uint32_t m = (1 << count) - 1; + int pos = size; + unsigned int i; + const unsigned int end = (size + 31) / 32; + + if (count == 1) { + for (i = 0; i < end; ++i) { + pos = ffs(~data[i]) - 1; + if (pos >= 0) + break; + } + } else + if (count == 2) { + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + uint32_t b = data[i] | (data[i] >> 1) | 0xaaaaaaaa; + pos = ffs(~b) - 1; + if (pos >= 0) + break; + } + } + } else + if (count == 4 || count == 3) { + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + uint32_t b = + (data[i] >> 0) | (data[i] >> 1) | + (data[i] >> 2) | (data[i] >> 3) | 0xeeeeeeee; + pos = ffs(~b) - 1; + if (pos >= 0) + break; + } + } + } else { + if (count <= 8) + count = 8; + else + if (count <= 16) + count = 16; + else + count = 32; + + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + for (pos = 0; pos < 32; pos += count) + if (!(data[i] & (m << pos))) + break; + if (pos < 32) + break; + } + } + } + pos += i * 32; + + return ((pos + count) <= size) ? pos : -1; +} + void BitSet::print() const { unsigned int n = 0; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h index 8303207..f348f18 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h @@ -137,6 +137,8 @@ public: (__listA)->prev = prevB; \ } while(0) +#define DLLIST_EMPTY(__list) ((__list)->next == (__list)) + #define DLLIST_FOR_EACH(list, it) \ for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next()) @@ -416,17 +418,22 @@ class Interval { public: Interval() : head(0), tail(0) { } + Interval(const Interval&); ~Interval(); bool extend(int, int); + void insert(const Interval&); void unify(Interval&); // clears source interval void clear(); - inline int begin() { return head ? head->bgn : -1; } - inline int end() { checkTail(); return tail ? tail->end : -1; } + inline int begin() const { return head ? head->bgn : -1; } + inline int end() const { checkTail(); return tail ? tail->end : -1; } inline bool isEmpty() const { return !head; } bool overlaps(const Interval&) const; - bool contains(int pos); + bool contains(int pos) const; + + inline int extent() const { return end() - begin(); } + int length() const; void print() const; @@ -477,6 +484,7 @@ public: } bool allocate(unsigned int nBits, bool zero); + bool resize(unsigned int nBits); // keep old data, zero additional bits inline unsigned int getSize() const { return size; } @@ -489,18 +497,44 @@ public: assert(i < size); data[i / 32] |= 1 << (i % 32); } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline void setRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + data[i / 32] |= ((1 << n) - 1) << (i % 32); + } + inline void setMask(unsigned int i, uint32_t m) + { + assert(i < size); + data[i / 32] |= m; + } inline void clr(unsigned int i) { assert(i < size); data[i / 32] &= ~(1 << (i % 32)); } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline void clrRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + data[i / 32] &= ~(((1 << n) - 1) << (i % 32)); + } inline bool test(unsigned int i) const { assert(i < size); return data[i / 32] & (1 << (i % 32)); } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline bool testRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + return data[i / 32] & (((1 << n) - 1) << (i % 32)); + } + + // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size). + int findFreeRange(unsigned int size) const; BitSet& operator|=(const BitSet&); @@ -514,6 +548,13 @@ public: void andNot(const BitSet&); + // bits = (bits | setMask) & ~clrMask + inline void periodicMask32(uint32_t setMask, uint32_t clrMask) + { + for (unsigned int i = 0; i < (size + 31) / 32; ++i) + data[i] = (data[i] | setMask) & ~clrMask; + } + unsigned int popCount() const; void print() const; diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp index 9c4108c..31769b2 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp @@ -1009,9 +1009,7 @@ CodeEmitterNVC0::emitTEX(const TexInstruction *i) if (i->tex.target.isShadow()) code[1] |= 1 << 24; - int src1 = i->tex.target.getArgCount(); - if (i->op == OP_TXD && i->tex.useOffsets) - ++src1; + const int src1 = MAX2(i->predSrc + 1, 1); // if predSrc == 1, no 2nd src if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) { // lzero @@ -1184,7 +1182,7 @@ CodeEmitterNVC0::emitVFETCH(const Instruction *i) emitPredicate(i); - code[0] |= (i->defCount(0xf) - 1) << 5; + code[0] |= ((i->getDef(0)->reg.size / 4) - 1) << 5; defId(i->def(0), 14); srcId(i->src(0).getIndirect(0), 20); -- 2.7.4