From 10924b4bb21d527e8f1ebb3392d7f8e44bee1415 Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Wed, 25 Sep 2013 11:20:45 +0800 Subject: [PATCH] GBE: refact the curbe register payload allocation. As we already handle all the used curbe registers when we build the patch list, we can easily create a set to store all the required curbe registers, and then later, at the register allocation stage, we can simply insert the registers in that set without any other checking. This way, at the register allocation stage, we don't need to know anything about those CURBE magic numbers. We only need to use the virtual register as the key. This makes the code a little bit clearer. Most importantly, this change is needed to support dynamic curbe register allocation, for example for the image attributes. Each image may have 6 DWs, and we may have many images but only access part of the images and part of the image attributes. So we can't simply allocate a special register for all the image attributes; we need to dynamically allocate curbe registers on demand. The previous implementation does not satisfy this requirement, so I have to make this change. 
Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" --- backend/src/backend/context.cpp | 37 ++++++------ backend/src/backend/context.hpp | 5 +- backend/src/backend/gen_reg_allocation.cpp | 93 +++++++++++------------------- 3 files changed, 55 insertions(+), 80 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index a55ef04..2cacd07 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -366,7 +366,7 @@ namespace gbe this->kernel->stackSize = 1*KB; // XXX compute that in a better way } - void Context::newCurbeEntry(gbe_curbe_type value, + uint32_t Context::newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment) @@ -376,6 +376,7 @@ namespace gbe GBE_ASSERT(offset >= GEN_REG_SIZE); kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE)); kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE); + return offset; } uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size) @@ -392,12 +393,17 @@ namespace gbe return offset; } + + void Context::insertCurbeReg(ir::Register reg, uint32_t offset) { + curbeRegs.insert(std::make_pair(reg, offset)); + } + void Context::buildPatchList(void) { const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 
4u : 8u; kernel->curbeSize = 0u; // We insert the block IP mask first - this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)); + this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t))); // Go over the arguments and find the related patch locations const uint32_t argNum = fn.argNum(); @@ -411,28 +417,22 @@ namespace gbe arg.type == ir::FunctionArgument::STRUCTURE || arg.type == ir::FunctionArgument::IMAGE || arg.type == ir::FunctionArgument::SAMPLER) - this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize); + this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize)); } // Already inserted registers go here - set specialRegs; - const size_t localIDSize = sizeof(uint32_t) * this->simdWidth; - this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize); - this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize); - this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize); - this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32); - specialRegs.insert(ir::ocl::lid0); - specialRegs.insert(ir::ocl::lid1); - specialRegs.insert(ir::ocl::lid2); - specialRegs.insert(ir::ocl::samplerinfo); + insertCurbeReg(ir::ocl::lid0, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize)); + insertCurbeReg(ir::ocl::lid1, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize)); + insertCurbeReg(ir::ocl::lid2, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize)); + insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32)); // Go over all the instructions and find the special register we need // to push #define INSERT_REG(SPECIAL_REG, PATCH, WIDTH) \ if (reg == ir::ocl::SPECIAL_REG) { \ - if (specialRegs.find(reg) != specialRegs.end()) continue; \ - this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH); \ + if (curbeRegs.find(reg) != curbeRegs.end()) continue; \ + insertCurbeReg(reg, 
this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH)); \ } else bool useStackPtr = false; @@ -441,7 +441,7 @@ namespace gbe for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { const ir::Register reg = insn.getSrc(srcID); if (fn.isSpecialReg(reg) == false) continue; - if (specialRegs.contains(reg) == true) continue; + if (curbeRegs.find(reg) != curbeRegs.end()) continue; if (reg == ir::ocl::stackptr) useStackPtr = true; INSERT_REG(lsize0, LOCAL_SIZE_X, 1) INSERT_REG(lsize1, LOCAL_SIZE_Y, 1) @@ -458,17 +458,16 @@ namespace gbe INSERT_REG(numgroup2, GROUP_NUM_Z, 1) INSERT_REG(stackptr, STACK_POINTER, this->simdWidth) do {} while (0); - specialRegs.insert(reg); } }); #undef INSERT_REG // Insert the number of threads - this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)); + insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t))); // Insert the stack buffer if used if (useStackPtr) - this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize); + insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize)); // After this point the vector is immutable. Sorting it will make // research faster diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp index 50c0e70..ca2c88d 100644 --- a/backend/src/backend/context.hpp +++ b/backend/src/backend/context.hpp @@ -93,6 +93,8 @@ namespace gbe uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size); /*! allocate size scratch memory and return start address */ uint32_t allocateScratchMem(uint32_t size); + /*! Preallocated curbe register set including special registers. */ + map curbeRegs; protected: /*! Build the instruction stream. Return false if failed */ virtual bool emitCode(void) = 0; @@ -115,7 +117,8 @@ namespace gbe /*! Insert a new entry with the given size in the Curbe. 
Return the offset * of the entry */ - void newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0); + void insertCurbeReg(ir::Register, uint32_t grfOffset); + uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0); /*! Provide for each branch and label the label index target */ typedef map JIPMap; const ir::Unit &unit; //!< Unit that contains the kernel diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index a9132df..a72333d 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -65,8 +65,10 @@ namespace gbe void allocateFlags(Selection &selection); /*! Allocate the GRF registers */ bool allocateGRFs(Selection &selection); + /*! Create gen registers for all preallocated curbe registers. */ + void allocatePayloadRegs(void); /*! Create a Gen register from a register set in the payload */ - void allocatePayloadReg(gbe_curbe_type, ir::Register, uint32_t subValue = 0, uint32_t subOffset = 0); + void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0); /*! Create the intervals for each register */ /*! 
Allocate the vectors detected in the instruction selection pass */ void allocateVector(Selection &selection); @@ -124,19 +126,37 @@ namespace gbe GenRegAllocator::Opaque::Opaque(GenContext &ctx) : ctx(ctx) {} GenRegAllocator::Opaque::~Opaque(void) {} - void GenRegAllocator::Opaque::allocatePayloadReg(gbe_curbe_type value, - ir::Register reg, - uint32_t subValue, + void GenRegAllocator::Opaque::allocatePayloadReg(ir::Register reg, + uint32_t offset, uint32_t subOffset) { using namespace ir; - const Kernel *kernel = ctx.getKernel(); - const int32_t curbeOffset = kernel->getCurbeOffset(value, subValue); - if (curbeOffset >= 0) { - const uint32_t offset = GEN_REG_SIZE + curbeOffset + subOffset; - RA.insert(std::make_pair(reg, offset)); - this->intervals[reg].minID = 0; - this->intervals[reg].maxID = 0; + assert(offset >= GEN_REG_SIZE); + offset += subOffset; + RA.insert(std::make_pair(reg, offset)); + GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0)); + this->intervals[reg].minID = 0; + this->intervals[reg].maxID = 0; + } + + INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) { + using namespace ir; + for(auto &it : this->ctx.curbeRegs) + allocatePayloadReg(it.first, it.second); + + // Allocate all pushed registers (i.e. 
structure kernel arguments) + const Function &fn = ctx.getFunction(); + GBE_ASSERT(fn.getProfile() == PROFILE_OCL); + const Function::PushMap &pushMap = fn.getPushMap(); + for (const auto &pushed : pushMap) { + const uint32_t argID = pushed.second.argID; + const FunctionArgument arg = fn.getArg(argID); + + const uint32_t subOffset = pushed.second.offset; + const Register reg = pushed.second.getRegister(); + auto it = this->ctx.curbeRegs.find(arg.reg); + assert(it != ctx.curbeRegs.end()); + allocatePayloadReg(reg, it->second, subOffset); } } @@ -535,11 +555,9 @@ namespace gbe } return true; } + INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) { using namespace ir; - const Kernel *kernel = ctx.getKernel(); - const Function &fn = ctx.getFunction(); - GBE_ASSERT(fn.getProfile() == PROFILE_OCL); if (ctx.getSimdWidth() == 8) { reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE); reservedReg /= GEN_REG_SIZE; @@ -555,25 +573,7 @@ namespace gbe this->intervals.push_back(ir::Register(regID)); // Allocate the special registers (only those which are actually used) - allocatePayloadReg(GBE_CURBE_LOCAL_ID_X, ocl::lid0); - allocatePayloadReg(GBE_CURBE_LOCAL_ID_Y, ocl::lid1); - allocatePayloadReg(GBE_CURBE_LOCAL_ID_Z, ocl::lid2); - allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_X, ocl::lsize0); - allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Y, ocl::lsize1); - allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Z, ocl::lsize2); - allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_X, ocl::gsize0); - allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Y, ocl::gsize1); - allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Z, ocl::gsize2); - allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0); - allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1); - allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2); - allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim); - allocatePayloadReg(GBE_CURBE_SAMPLER_INFO, ocl::samplerinfo); - allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, 
ocl::numgroup0); - allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1); - allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2); - allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr); - allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn); + this->allocatePayloadRegs(); // Group and barrier IDs are always allocated by the hardware in r0 RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1 @@ -582,33 +582,6 @@ namespace gbe RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2 // block IP used to handle the mask in SW is always allocated - const int32_t blockIPOffset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_BLOCK_IP,0); - GBE_ASSERT(blockIPOffset >= 0 && blockIPOffset % GEN_REG_SIZE == 0); - RA.insert(std::make_pair(ocl::blockip, blockIPOffset)); - this->intervals[ocl::blockip].minID = 0; - - // Allocate all (non-structure) argument parameters - const uint32_t argNum = fn.argNum(); - for (uint32_t argID = 0; argID < argNum; ++argID) { - const FunctionArgument &arg = fn.getArg(argID); - GBE_ASSERT(arg.type == FunctionArgument::GLOBAL_POINTER || - arg.type == FunctionArgument::CONSTANT_POINTER || - arg.type == FunctionArgument::LOCAL_POINTER || - arg.type == FunctionArgument::VALUE || - arg.type == FunctionArgument::STRUCTURE || - arg.type == FunctionArgument::IMAGE || - arg.type == FunctionArgument::SAMPLER); - allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, arg.reg, argID); - } - - // Allocate all pushed registers (i.e. structure kernel arguments) - const Function::PushMap &pushMap = fn.getPushMap(); - for (const auto &pushed : pushMap) { - const uint32_t argID = pushed.second.argID; - const uint32_t subOffset = pushed.second.offset; - const Register reg = pushed.second.getRegister(); - allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, reg, argID, subOffset); - } // Compute the intervals int32_t insnID = 0; -- 2.7.4