From 5362669330dda931179ac4b32d8dbce606d2fdd0 Mon Sep 17 00:00:00 2001 From: Ruiling Song Date: Fri, 14 Feb 2014 15:04:26 +0800 Subject: [PATCH] GBE: Support 64Bit register spill. Now we support DWORD & QWORD register spill/fill. v2: only increment poolOffset by 1 when we meet a QWord register and poolOffset is 1. v3: allocate the reserved register pool in a unified way for src and dst registers. when spilling a qword register, the payload register should be retyped as dword per the bottom/top logic. put a limit on the scratch space memory size. v4: fix a typo. increase the number of reserved registers from 6 to 8 for some complex instructions. Signed-off-by: Ruiling Song Reviewed-by: Zhigang Gong --- backend/src/backend/gen_context.cpp | 45 +++++++++++++++++++----- backend/src/backend/gen_insn_selection.cpp | 56 +++++++++++++++++------------- backend/src/backend/gen_reg_allocation.cpp | 18 ++++++---- backend/src/backend/gen_reg_allocation.hpp | 1 + src/cl_command_queue_gen7.c | 3 +- 5 files changed, 82 insertions(+), 41 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index d72b19b..7a74856 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -1662,14 +1662,28 @@ namespace gbe GenRegister payload = src; payload.nr = header + 1; payload.subnr = 0; + GBE_ASSERT(src.subnr == 0); - if (payload.nr != src.nr) - p->MOV(payload, src); uint32_t regType = insn.src(0).type; uint32_t size = typeSize(regType); - assert(size <= 4); - uint32_t regNum = (stride(src.hstride)*size*simdWidth) > 32 ? 2 : 1; - this->scratchWrite(msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD); + uint32_t regSize = stride(src.hstride)*size; + + GBE_ASSERT(regSize == 4 || regSize == 8); + if(regSize == 4) { + if (payload.nr != src.nr) + p->MOV(payload, src); + uint32_t regNum = (regSize*simdWidth) > 32 ? 
2 : 1; + this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD); + } + else { //size == 8 + payload.type = GEN_TYPE_UD; + GBE_ASSERT(payload.hstride == GEN_HORIZONTAL_STRIDE_1); + loadBottomHalf(payload, src); + uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1; + this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD); + loadTopHalf(payload, src); + this->scratchWrite(msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD); + } p->pop(); } @@ -1680,10 +1694,25 @@ namespace gbe uint32_t simdWidth = p->curr.execWidth; const uint32_t header = insn.extra.scratchMsgHeader; uint32_t size = typeSize(regType); - assert(size <= 4); - uint32_t regNum = (stride(dst.hstride)*size*simdWidth) > 32 ? 2 : 1; + uint32_t regSize = stride(dst.hstride)*size; + const GenRegister msg = GenRegister::ud8grf(header, 0); - this->scratchRead(GenRegister::retype(dst, GEN_TYPE_UD), msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD); + GenRegister payload = msg; + payload.nr = header + 1; + + p->push(); + assert(regSize == 4 || regSize == 8); + if(regSize == 4) { + uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1; + this->scratchRead(GenRegister::ud8grf(dst.nr, dst.subnr), msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD); + } else { + uint32_t regNum = (regSize/2*simdWidth) > 32 ? 
2 : 1; + this->scratchRead(payload, msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD); + storeBottomHalf(dst, payload); + this->scratchRead(payload, msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD); + storeTopHalf(dst, payload); + } + p->pop(); } // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 54e5ebe..75ee906 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -670,18 +670,9 @@ namespace gbe return vector; } - // FIXME, there is a risk need to be fixed here. - // as the instruction we spill here is the gen ir level not the final - // single instruction. If it will be translated to multiple instructions - // at gen_context stage, and as the destination registers and source registers - // may be spilled to the same register based on current implementation, - // then the source register may be modified within the final instruction and - // may lead to incorrect result. 
bool Selection::Opaque::spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool) { GBE_ASSERT(registerPool != 0); - const uint32_t dstStart = registerPool + 1; - const uint32_t srcStart = registerPool + 1; for (auto &block : blockList) for (auto &insn : block.insnList) { @@ -693,17 +684,19 @@ namespace gbe const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum; struct RegSlot { RegSlot(ir::Register _reg, uint8_t _srcID, - bool _isTmp, uint32_t _addr) - : reg(_reg), srcID(_srcID), isTmpReg(_isTmp), addr(_addr) + uint8_t _poolOffset, bool _isTmp, uint32_t _addr) + : reg(_reg), srcID(_srcID), poolOffset(_poolOffset), isTmpReg(_isTmp), addr(_addr) {}; ir::Register reg; union { uint8_t srcID; uint8_t dstID; }; + uint8_t poolOffset; bool isTmpReg; int32_t addr; }; + uint8_t poolOffset = 1; // keep one for scratch message header vector regSet; for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { const GenRegister selReg = insn.src(srcID); @@ -712,18 +705,27 @@ namespace gbe if(it != spilledRegs.end() && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) { - struct RegSlot regSlot(reg, srcID, + ir::RegisterFamily family = getRegisterFamily(reg); + if(family == ir::FAMILY_QWORD && poolOffset == 1) { + poolOffset += 1; // qword register fill could not share the scratch read message payload register + } + struct RegSlot regSlot(reg, srcID, poolOffset, it->second.isTmpReg, it->second.addr); + if(family == ir::FAMILY_QWORD) { + poolOffset += 2; + } else { + poolOffset += 1; + } regSet.push_back(regSlot); } } - if (regSet.size() > 5) + if (poolOffset > RESERVED_REG_NUM_FOR_SPILL) { + std::cerr << "Instruction (#" << (uint32_t)insn.opcode << ") src too large pooloffset " << (uint32_t)poolOffset << std::endl; return false; - + } while(!regSet.empty()) { - uint32_t scratchID = regSet.size() - 1; struct RegSlot regSlot = regSet.back(); regSet.pop_back(); const GenRegister selReg = insn.src(regSlot.srcID); @@ -732,7 +734,7 @@ namespace gbe 
SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0); unspill->state = GenInstructionState(ctx.getSimdWidth()); unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE, - srcStart + scratchID, 0, + registerPool + regSlot.poolOffset, 0, selReg.type, selReg.vstride, selReg.width, selReg.hstride); unspill->extra.scratchOffset = regSlot.addr; @@ -742,7 +744,7 @@ namespace gbe GenRegister src = insn.src(regSlot.srcID); // change nr/subnr, keep other register settings - src.nr = srcStart + scratchID; src.subnr = 0; src.physical = 1; + src.nr = registerPool + regSlot.poolOffset; src.subnr = 0; src.physical = 1; insn.src(regSlot.srcID) = src; }; @@ -756,7 +758,6 @@ namespace gbe instruction. Thus the registerPool + 1 still contain valid data. */ - for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { const GenRegister selReg = insn.dst(dstID); const ir::Register reg = selReg.reg(); @@ -764,18 +765,24 @@ namespace gbe if(it != spilledRegs.end() && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) { - struct RegSlot regSlot(reg, dstID, + ir::RegisterFamily family = getRegisterFamily(reg); + if(family == ir::FAMILY_QWORD && poolOffset == 1) { + poolOffset += 1; // qword register spill could not share the scratch write message payload register + } + struct RegSlot regSlot(reg, dstID, poolOffset, it->second.isTmpReg, it->second.addr); + if(family == ir::FAMILY_QWORD) poolOffset +=2; + else poolOffset += 1; regSet.push_back(regSlot); } } - if (regSet.size() > 5) + if (poolOffset > RESERVED_REG_NUM_FOR_SPILL){ + std::cerr << "Instruction (#" << (uint32_t)insn.opcode << ") dst too large pooloffset " << (uint32_t)poolOffset << std::endl; return false; - + } while(!regSet.empty()) { - uint32_t scratchID = regSet.size() - 1; struct RegSlot regSlot = regSet.back(); regSet.pop_back(); const GenRegister selReg = insn.dst(regSlot.dstID); @@ -784,7 +791,7 @@ namespace gbe SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1); spill->state = 
GenInstructionState(ctx.getSimdWidth()); spill->src(0) = GenRegister(GEN_GENERAL_REGISTER_FILE, - dstStart + scratchID, 0, + registerPool + regSlot.poolOffset, 0, selReg.type, selReg.vstride, selReg.width, selReg.hstride); spill->extra.scratchOffset = regSlot.addr; @@ -794,9 +801,8 @@ namespace gbe GenRegister dst = insn.dst(regSlot.dstID); // change nr/subnr, keep other register settings - dst.physical =1; dst.nr = dstStart + scratchID; dst.subnr = 0; + dst.physical =1; dst.nr = registerPool + regSlot.poolOffset; dst.subnr = 0; insn.dst(regSlot.dstID)= dst; - scratchID++; } } return true; diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 2aafdb1..726b78c 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -33,7 +33,6 @@ #include #include -#define RESERVED_REG_NUM_FOR_SPILL 6 namespace gbe { @@ -234,7 +233,7 @@ namespace gbe uint32_t grfOffset = allocateReg(interval, regSize, regSize); if (grfOffset == 0) { /* this register is going to be spilled. */ - GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD)); + GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)); return false; } insertNewReg(reg, grfOffset); @@ -617,7 +616,8 @@ namespace gbe ir::RegisterFamily family; getRegAttrib(reg, regSize, &family); - if (regSize == GEN_REG_SIZE && family == ir::FAMILY_DWORD /*&& !isVector*/) { + if ((regSize == GEN_REG_SIZE && family == ir::FAMILY_DWORD) + || (regSize == 2*GEN_REG_SIZE && family == ir::FAMILY_QWORD)) { GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end()); offsetReg.insert(std::make_pair(grfOffset, reg)); spillCandidate.insert(intervals[reg]); @@ -639,7 +639,8 @@ namespace gbe if (!spillTag.isTmpReg) { // FIXME, we can optimize scratch allocation according to // the interval information. 
- spillTag.addr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D) + ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg); + spillTag.addr = ctx.allocateScratchMem(getFamilySize(family) * ctx.getSimdWidth()); } else spillTag.addr = -1; @@ -682,6 +683,7 @@ namespace gbe auto vectorIt = vectorMap.find(reg); bool isVector = vectorIt != vectorMap.end(); bool needRestart = false; + ir::RegisterFamily family = ctx.sel->getRegisterFamily(reg); if (isVector && (vectorCanSpill(vectorIt->second.first))) { const SelectionVector *vector = vectorIt->second.first; @@ -690,11 +692,12 @@ namespace gbe == spilledRegs.end()); spillSet.insert(vector->reg[id].reg()); reg = vector->reg[id].reg(); - size -= GEN_REG_SIZE; + family = ctx.sel->getRegisterFamily(reg); + size -= family == ir::FAMILY_QWORD ? 2*GEN_REG_SIZE : GEN_REG_SIZE; } } else if (!isVector) { spillSet.insert(reg); - size -= GEN_REG_SIZE; + size -= family == ir::FAMILY_QWORD ? 2*GEN_REG_SIZE : GEN_REG_SIZE; } else needRestart = true; // is a vector which could not be spilled. @@ -702,7 +705,8 @@ namespace gbe break; if (!needRestart) { uint32_t offset = RA.find(reg)->second; - auto nextRegIt = offsetReg.find(offset + GEN_REG_SIZE); + uint32_t nextOffset = (family == ir::FAMILY_QWORD) ? 
(offset + 2*GEN_REG_SIZE) : (offset + GEN_REG_SIZE); + auto nextRegIt = offsetReg.find(nextOffset); if (nextRegIt != offsetReg.end()) reg = nextRegIt->second; else diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp index 84b0f9c..bccccc8 100644 --- a/backend/src/backend/gen_reg_allocation.hpp +++ b/backend/src/backend/gen_reg_allocation.hpp @@ -27,6 +27,7 @@ #include "ir/register.hpp" #include "backend/gen_register.hpp" +#define RESERVED_REG_NUM_FOR_SPILL 8 namespace gbe { diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index ba69589..83fe21d 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -259,7 +259,8 @@ static void cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker) { int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque); - + /* Per HW Spec, it only allows 12KB scratch memory per HW thread now */ + assert(scratch_sz <= 12*1024); cl_gpgpu_set_scratch(gpgpu, scratch_sz); } -- 2.7.4