From: Ruiling Song Date: Fri, 28 Feb 2014 02:16:45 +0000 (+0800) Subject: GBE: Optimize scratch memory usage using register interval X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3a7bfa1300913115879c3ca6486fc5df3d1a3a20;p=contrib%2Fbeignet.git GBE: Optimize scratch memory usage using register interval As scratch memory is a limited resource in HW. And different register have the opptunity to share same scratch memory. So I introduce an allocator for scratch memory management. v2: In order to reuse the registerFilePartitioner, I rename it as SimpleAllocator, and derive ScratchAllocator & RegisterAllocator from it. v3: fix a typo, scratch size is 12KB. Signed-off-by: Ruiling Song Reviewed-by: Zhigang Gong --- diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 51a0628..b806586 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -35,24 +35,13 @@ namespace gbe { - /*! Structure that keeps track of allocation in the register file. This is - * actually needed by Context (and not only by GenContext) because both - * simulator and hardware have to deal with constant pushing which uses the - * register file - * - * Since Gen is pretty flexible, we just maintain a free list for the - * register file (as a classical allocator) and coalesce blocks when required - */ - class RegisterFilePartitioner + class SimpleAllocator { public: - RegisterFilePartitioner(void); - ~RegisterFilePartitioner(void); + SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail); + ~SimpleAllocator(void); - /*! Allocate some memory in the register file. Return 0 if out-of-memory. By - * the way, zero is not a valid offset since r0 is always preallocated by - * the hardware. Note that we always use the left most block when - * allocating, so it makes sense for constant pushing + /*! Allocate some memory from the pool. */ int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false); @@ -62,10 +51,7 @@ namespace gbe /*! Spilt a block into 2 blocks */ void splitBlock(int16_t offset, int16_t subOffset); - private: - /*! May need to make that run-time in the future */ - static const int16_t RegisterFileSize = 4*KB; - + protected: /*! Double chained list of free spaces */ struct Block { Block(int16_t offset, int16_t size) : @@ -79,6 +65,10 @@ namespace gbe * If the colascing was done, the left block is deleted */ void coalesce(Block *left, Block *right); + /*! the maximum offset */ + int16_t maxOffset; + /*! whether trigger an assertion on allocation failure */ + bool assertFail; /*! Head and tail of the free list */ Block *head; Block *tail; @@ -87,17 +77,46 @@ namespace gbe /*! Track allocated memory blocks */ map allocatedBlocks; /*! Use custom allocators */ - GBE_CLASS(RegisterFilePartitioner); + GBE_CLASS(SimpleAllocator); + }; + + /*! Structure that keeps track of allocation in the register file. This is + * actually needed by Context (and not only by GenContext) because both + * simulator and hardware have to deal with constant pushing which uses the + * register file + * + * Since Gen is pretty flexible, we just reuse the Simpleallocator + */ + + class RegisterAllocator: public SimpleAllocator { + public: + RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {} + + GBE_CLASS(RegisterAllocator); + }; + + /*! + * an allocator for scratch memory allocation. Scratch memory are used for register spilling. + * You can query how much scratch memory needed through getMaxScatchMemUsed(). + */ + + class ScratchAllocator: public SimpleAllocator { + public: + ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {} + int16_t getMaxScatchMemUsed() { return maxOffset; } + + GBE_CLASS(ScratchAllocator); }; - RegisterFilePartitioner::RegisterFilePartitioner(void) { - // r0 is always set by the HW and used at the end by EOT - const int16_t offset = GEN_REG_SIZE; - const int16_t size = RegisterFileSize - offset; - tail = head = this->newBlock(offset, size); + SimpleAllocator::SimpleAllocator(int16_t startOffset, + int16_t size, + bool _assertFail) + : maxOffset(0), + assertFail(_assertFail){ + tail = head = this->newBlock(startOffset, size); } - RegisterFilePartitioner::~RegisterFilePartitioner(void) { + SimpleAllocator::~SimpleAllocator(void) { while (this->head) { Block *next = this->head->next; this->deleteBlock(this->head); @@ -105,7 +124,7 @@ namespace gbe } } - int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment, bool bFwd) + int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd) { // Make it simple and just use the first block we find Block *list = bFwd ? head : tail; @@ -205,13 +224,16 @@ namespace gbe // Track the allocation to retrieve the size later allocatedBlocks.insert(std::make_pair(aligned, size)); + // update max offset + if(aligned + size > maxOffset) maxOffset = aligned + size; // We have a valid offset now return aligned; } + GBE_ASSERT( !assertFail ); return 0; } - void RegisterFilePartitioner::deallocate(int16_t offset) + void SimpleAllocator::deallocate(int16_t offset) { // Retrieve the size in the allocation map auto it = allocatedBlocks.find(offset); @@ -254,7 +276,7 @@ namespace gbe allocatedBlocks.erase(it); } - void RegisterFilePartitioner::coalesce(Block *left, Block *right) { + void SimpleAllocator::coalesce(Block *left, Block *right) { if (left == NULL || right == NULL) return; GBE_ASSERT(left->offset < right->offset); GBE_ASSERT(left->next == right); @@ -270,7 +292,7 @@ namespace gbe } } - void RegisterFilePartitioner::splitBlock(int16_t offset, int16_t subOffset) { + void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) { // Retrieve the size in the allocation map auto it = allocatedBlocks.find(offset); GBE_ASSERT(it != allocatedBlocks.end()); @@ -300,6 +322,7 @@ namespace gbe return i; } + /////////////////////////////////////////////////////////////////////////// // Generic Context (shared by the simulator and the HW context) /////////////////////////////////////////////////////////////////////////// @@ -311,16 +334,18 @@ namespace gbe GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS); this->liveness = GBE_NEW(ir::Liveness, const_cast(fn)); this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness); - this->partitioner = GBE_NEW_NO_ARG(RegisterFilePartitioner); + // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT + this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE); + this->scratchAllocator = GBE_NEW(ScratchAllocator, 12*KB); if (fn.getSimdWidth() == 0 || OCL_SIMD_WIDTH != 15) this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH); else this->simdWidth = fn.getSimdWidth(); - this->scratchOffset = 0; } Context::~Context(void) { - GBE_SAFE_DELETE(this->partitioner); + GBE_SAFE_DELETE(this->registerAllocator); + GBE_SAFE_DELETE(this->scratchAllocator); GBE_SAFE_DELETE(this->dag); GBE_SAFE_DELETE(this->liveness); } @@ -339,20 +364,20 @@ namespace gbe this->kernel = NULL; } if(this->kernel != NULL) { - this->kernel->scratchSize = alignScratchSize(this->scratchOffset); + this->kernel->scratchSize = alignScratchSize(scratchAllocator->getMaxScatchMemUsed()); this->kernel->ctx = this; } return this->kernel; } int16_t Context::allocate(int16_t size, int16_t alignment) { - return partitioner->allocate(size, alignment); + return registerAllocator->allocate(size, alignment); } - void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); } + void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); } void Context::splitBlock(int16_t offset, int16_t subOffset) { - partitioner->splitBlock(offset, subOffset); + registerAllocator->splitBlock(offset, subOffset); } int32_t Context::allocConstBuf(uint32_t argID) { @@ -376,10 +401,15 @@ namespace gbe return offset + GEN_REG_SIZE; } - uint32_t Context::allocateScratchMem(uint32_t size) { - uint32_t offset = scratchOffset; - scratchOffset += size; - return offset; + // FIXME TODO as we optimize scratch memory usage using the register interval. + // we need to add some dependency in post_reg_alloc scheduler, to keep scratch + // memory that are reused still keep the order + + int32_t Context::allocateScratchMem(uint32_t size) { + return scratchAllocator->allocate(size, 32, true); + } + void Context::deallocateScratchMem(int32_t offset) { + scratchAllocator->deallocate(offset); } void Context::buildStack(void) { @@ -402,7 +432,7 @@ namespace gbe uint32_t alignment) { alignment = alignment == 0 ? size : alignment; - const uint32_t offset = partitioner->allocate(size, alignment, 1); + const uint32_t offset = registerAllocator->allocate(size, alignment, 1); GBE_ASSERT(offset >= GEN_REG_SIZE); kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE)); kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE); diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp index 000612e..ac940bd 100644 --- a/backend/src/backend/context.hpp +++ b/backend/src/backend/context.hpp @@ -41,7 +41,8 @@ namespace ir { namespace gbe { class Kernel; // context creates Kernel - class RegisterFilePartitioner; // Partition register file for reg allocation + class RegisterAllocator; // allocator for physical register allocation + class ScratchAllocator; // allocator for scratch memory allocation /*! Context is the helper structure to build the Gen ISA or simulation code * from GenIR @@ -94,7 +95,9 @@ namespace gbe /*! Get (search or allocate if fail to find one) image info curbeOffset.*/ uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size); /*! allocate size scratch memory and return start address */ - uint32_t allocateScratchMem(uint32_t size); + int32_t allocateScratchMem(uint32_t size); + /*! deallocate scratch memory at offset */ + void deallocateScratchMem(int32_t offset); /*! Preallocated curbe register set including special registers. */ map curbeRegs; protected: @@ -129,11 +132,11 @@ namespace gbe Kernel *kernel; //!< Kernel we are building ir::Liveness *liveness; //!< Liveness info for the variables ir::FunctionDAG *dag; //!< Graph of values on the function - RegisterFilePartitioner *partitioner; //!< Handle register file partionning + RegisterAllocator *registerAllocator; //!< physical register allocation + ScratchAllocator *scratchAllocator; //!< scratch memory allocator set usedLabels; //!< Set of all used labels JIPMap JIPs; //!< Where to jump all labels/branches uint32_t simdWidth; //!< Number of lanes per HW threads - uint32_t scratchOffset; //!< scratch slot for next scratch memory request GBE_CLASS(Context); //!< Use custom allocators }; diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index c282b36..1ecb8ea 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -174,7 +174,7 @@ namespace gbe INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false); INLINE bool spillReg(ir::Register reg, bool isAllocated = false); INLINE bool vectorCanSpill(SelectionVector *vector); - + INLINE void allocateScratchForSpilled(); /*! Use custom allocator */ GBE_CLASS(Opaque); }; @@ -583,6 +583,8 @@ namespace gbe } if (!spilledRegs.empty()) { GBE_ASSERT(reservedReg != 0); + allocateScratchForSpilled(); + bool success = selection.spillRegs(spilledRegs, reservedReg); if (!success) { std::cerr << "Fail to spill registers." << std::endl; @@ -592,6 +594,43 @@ namespace gbe return true; } + INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled() + { + const uint32_t regNum = spilledRegs.size(); + this->starting.resize(regNum); + this->ending.resize(regNum); + uint32_t regID = 0; + for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) { + this->starting[regID] = this->ending[regID] = &intervals[it->first]; + regID++; + } + std::sort(this->starting.begin(), this->starting.end(), cmp); + std::sort(this->ending.begin(), this->ending.end(), cmp); + int toExpire = 0; + for(uint32_t i = 0; i < regNum; i++) { + const GenRegInterval * cur = starting[i]; + const GenRegInterval * exp = ending[toExpire]; + if(exp->maxID < cur->minID) { + auto it = spilledRegs.find(exp->reg); + GBE_ASSERT(it != spilledRegs.end()); + if(it->second.addr != -1) { + ctx.deallocateScratchMem(it->second.addr); + } + toExpire++; + } + auto it = spilledRegs.find(cur->reg); + GBE_ASSERT(it != spilledRegs.end()); + if(cur->minID == cur->maxID) { + it->second.addr = -1; + continue; + } + + ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg); + it->second.addr = ctx.allocateScratchMem(getFamilySize(family) + * ctx.getSimdWidth()); + } + } + INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg) { auto it = RA.find(reg); @@ -643,14 +682,8 @@ namespace gbe return false; SpillRegTag spillTag; spillTag.isTmpReg = interval.maxID == interval.minID; - if (!spillTag.isTmpReg) { - // FIXME, we can optimize scratch allocation according to - // the interval information. - ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg); - spillTag.addr = ctx.allocateScratchMem(getFamilySize(family) - * ctx.getSimdWidth()); - } else - spillTag.addr = -1; + spillTag.addr = -1; + if (isAllocated) { // If this register is allocated, we need to expire it and erase it // from the RA map.