From 5616922e1907ae7809e123d0edae570b46ece90d Mon Sep 17 00:00:00 2001 From: Ruiling Song Date: Fri, 7 Mar 2014 13:48:48 +0800 Subject: [PATCH] GBE: Optimize byte/short load/store using untyped read/write Scatter/gather are much worse than untyped read/write. So if we can pack load/store of char/short to use untyped message, just do it. v2: add some assert in splitReg() Signed-off-by: Ruiling Song Reviewed-by: "Yang, Rong R" --- backend/src/backend/gen_context.cpp | 29 +++++ backend/src/backend/gen_context.hpp | 2 + .../src/backend/gen_insn_gen7_schedule_info.hxx | 2 + backend/src/backend/gen_insn_selection.cpp | 120 +++++++++++++++------ backend/src/backend/gen_insn_selection.hxx | 2 + backend/src/backend/gen_register.hpp | 25 +++++ backend/src/ir/instruction.hpp | 3 +- backend/src/llvm/llvm_gen_backend.cpp | 112 +++++++++++-------- 8 files changed, 217 insertions(+), 78 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 7104e81..9689ac5 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -1776,6 +1776,35 @@ namespace gbe p->BYTE_SCATTER(src, bti, elemSize); } + void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) { + const GenRegister src = ra->genReg(insn.src(0)); + for(uint32_t i = 0; i < insn.dstNum; i++) { + p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i)); + } + } + + void GenContext::emitPackByteInstruction(const SelectionInstruction &insn) { + const GenRegister dst = ra->genReg(insn.dst(0)); + p->push(); + if(simdWidth == 8) { + for(uint32_t i = 0; i < insn.srcNum; i++) + p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i))); + } else { + // when destination expands two registers, the source must span two registers. 
+ p->curr.execWidth = 8; + for(uint32_t i = 0; i < insn.srcNum; i++) { + GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i); + GenRegister src = ra->genReg(insn.src(i)); + + p->curr.quarterControl = 0; + p->MOV(dsti, src); + p->curr.quarterControl = 1; + p->MOV(GenRegister::Qn(dsti,1), GenRegister::Qn(src, 1)); + } + } + p->pop(); + } + void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) { const GenRegister dst = ra->genReg(insn.dst(0)); const GenRegister src = ra->genReg(insn.src(0)); diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 6cfc295..a853731 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -142,6 +142,8 @@ namespace gbe void emitAtomicInstruction(const SelectionInstruction &insn); void emitByteGatherInstruction(const SelectionInstruction &insn); void emitByteScatterInstruction(const SelectionInstruction &insn); + void emitPackByteInstruction(const SelectionInstruction &insn); + void emitUnpackByteInstruction(const SelectionInstruction &insn); void emitDWordGatherInstruction(const SelectionInstruction &insn); void emitSampleInstruction(const SelectionInstruction &insn); void emitTypedWriteInstruction(const SelectionInstruction &insn); diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx index 13cbd41..9eb04de 100644 --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx @@ -30,6 +30,8 @@ DECL_GEN7_SCHEDULE(UntypedWrite, 80, 1, 1) DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1) DECL_GEN7_SCHEDULE(ByteScatter, 80, 1, 1) DECL_GEN7_SCHEDULE(DWordGather, 80, 1, 1) +DECL_GEN7_SCHEDULE(PackByte, 20, 1, 1) +DECL_GEN7_SCHEDULE(UnpackByte, 20, 1, 1) DECL_GEN7_SCHEDULE(Sample, 80, 1, 1) DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1) DECL_GEN7_SCHEDULE(SpillReg, 80, 1, 1) diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp index 46f0123..48b3069 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -529,6 +529,10 @@ namespace gbe void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti); /*! DWord scatter (for constant cache read) */ void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti); + /*! Unpack the uint to char4 */ + void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum); + /*! pack the char4 to uint */ + void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum); /*! Extended math function (2 arguments) */ void MATH(Reg dst, uint32_t function, Reg src0, Reg src1); /*! Extended math function (1 argument) */ @@ -1114,6 +1118,18 @@ namespace gbe insn->dst(0) = dst; insn->extra.function = bti; } + void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1); + insn->src(0) = src; + for(uint32_t i = 0; i < elemNum; i++) + insn->dst(i) = dst[i]; + } + void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum); + for(uint32_t i = 0; i < elemNum; i++) + insn->src(i) = src[i]; + insn->dst(0) = dst; + } void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) { SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2); @@ -2415,26 +2431,50 @@ namespace gbe const ir::LoadInstruction &insn, const uint32_t elemSize, GenRegister address, - GenRegister value, uint32_t bti) const { using namespace ir; - GBE_ASSERT(insn.getValueNum() == 1); + const uint32_t valueNum = insn.getValueNum(); const uint32_t simdWidth = sel.ctx.getSimdWidth(); + if(valueNum > 1) { + vector dst(valueNum); + const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType())); + + if(elemSize == 
GEN_BYTE_SCATTER_WORD) { + for(uint32_t i = 0; i < valueNum; i++) + dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16); + } else if(elemSize == GEN_BYTE_SCATTER_BYTE) { + for(uint32_t i = 0; i < valueNum; i++) + dst[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8); + } - // We need a temporary register if we read bytes or words - Register dst = Register(value.value.reg); - if (elemSize == GEN_BYTE_SCATTER_WORD || - elemSize == GEN_BYTE_SCATTER_BYTE) { - dst = sel.reg(FAMILY_DWORD); - sel.BYTE_GATHER(GenRegister::fxgrf(simdWidth, dst), address, elemSize, bti); - } + uint32_t tmpRegNum = typeSize*valueNum / 4; + vector tmp(tmpRegNum); + for(uint32_t i = 0; i < tmpRegNum; i++) { + tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); + } - // Repack bytes or words using a converting mov instruction - if (elemSize == GEN_BYTE_SCATTER_WORD) - sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst)); - else if (elemSize == GEN_BYTE_SCATTER_BYTE) - sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst)); + sel.UNTYPED_READ(address, tmp.data(), tmpRegNum, bti); + for(uint32_t i = 0; i < tmpRegNum; i++) { + sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize); + } + } else { + GBE_ASSERT(insn.getValueNum() == 1); + const GenRegister value = sel.selReg(insn.getValue(0)); + // We need a temporary register if we read bytes or words + Register dst = Register(value.value.reg); + if (elemSize == GEN_BYTE_SCATTER_WORD || + elemSize == GEN_BYTE_SCATTER_BYTE) { + dst = sel.reg(FAMILY_DWORD); + sel.BYTE_GATHER(GenRegister::fxgrf(simdWidth, dst), address, elemSize, bti); + } + + // Repack bytes or words using a converting mov instruction + if (elemSize == GEN_BYTE_SCATTER_WORD) + sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst)); + else if (elemSize == GEN_BYTE_SCATTER_BYTE) + sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst)); + } } void 
emitIndirectMove(Selection::Opaque &sel, @@ -2469,8 +2509,7 @@ namespace gbe else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD) this->emitDWordGather(sel, insn, address, 0x2); else { - const GenRegister value = sel.selReg(insn.getValue(0)); - this->emitByteGather(sel, insn, elemSize, address, value, 0x2); + this->emitByteGather(sel, insn, elemSize, address, 0x2); } } else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) @@ -2478,8 +2517,7 @@ namespace gbe else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD) this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00); else { - const GenRegister value = sel.selReg(insn.getValue(0)); - this->emitByteGather(sel, insn, elemSize, address, value, space == MEM_LOCAL ? 0xfe : 0x01); + this->emitByteGather(sel, insn, elemSize, address, space == MEM_LOCAL ? 0xfe : 0x01); } return true; } @@ -2535,22 +2573,43 @@ namespace gbe const ir::StoreInstruction &insn, const uint32_t elemSize, GenRegister addr, - GenRegister value, uint32_t bti) const { using namespace ir; const uint32_t simdWidth = sel.ctx.getSimdWidth(); - const GenRegister dst = value; - - GBE_ASSERT(insn.getValueNum() == 1); - if (elemSize == GEN_BYTE_SCATTER_WORD) { - value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); - sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UW)); - } else if (elemSize == GEN_BYTE_SCATTER_BYTE) { - value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); - sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UB)); + uint32_t valueNum = insn.getValueNum(); + + if(valueNum > 1) { + const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType())); + vector value(valueNum); + + if(elemSize == GEN_BYTE_SCATTER_WORD) { + for(uint32_t i = 0; i < valueNum; i++) + value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16); + } else if(elemSize == GEN_BYTE_SCATTER_BYTE) { + for(uint32_t i = 0; i < valueNum; i++) + value[i] = sel.selReg(insn.getValue(i), 
ir::TYPE_U8); + } + + uint32_t tmpRegNum = typeSize*valueNum / 4; + vector tmp(tmpRegNum); + for(uint32_t i = 0; i < tmpRegNum; i++) { + tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); + sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize); + } + + sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti); + } else { + const GenRegister value = sel.selReg(insn.getValue(0)); + GBE_ASSERT(insn.getValueNum() == 1); + const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); + if (elemSize == GEN_BYTE_SCATTER_WORD) { + sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW)); + } else if (elemSize == GEN_BYTE_SCATTER_BYTE) { + sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB)); + } + sel.BYTE_SCATTER(addr, tmp, elemSize, bti); } - sel.BYTE_SCATTER(addr, value, elemSize, bti); } INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn) const @@ -2566,8 +2625,7 @@ namespace gbe this->emitUntypedWrite(sel, insn, bti); else { const GenRegister address = sel.selReg(insn.getAddress()); - const GenRegister value = sel.selReg(insn.getValue(0)); - this->emitByteScatter(sel, insn, elemSize, address, value, bti); + this->emitByteScatter(sel, insn, elemSize, address, bti); } return true; } diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index e44b9d4..564dbc5 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -58,6 +58,8 @@ DECL_SELECTION_IR(WRITE64, Write64Instruction) DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction) DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction) DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction) +DECL_SELECTION_IR(PACK_BYTE, PackByteInstruction) +DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction) DECL_SELECTION_IR(SAMPLE, SampleInstruction) DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction) DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction) diff --git 
a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index 8794318..25cb428 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -255,6 +255,31 @@ namespace gbe return r; } + // split a DWORD register into unpacked Byte or Short register + static INLINE GenRegister splitReg(GenRegister reg, uint32_t count, uint32_t sub_part) { + GenRegister r = reg; + GBE_ASSERT(count == 4 || count == 2); + GBE_ASSERT(reg.type == GEN_TYPE_UD || reg.type == GEN_TYPE_D); + + if(reg.hstride != GEN_HORIZONTAL_STRIDE_0) { + GBE_ASSERT(reg.hstride == GEN_HORIZONTAL_STRIDE_1); + r.hstride = count == 4 ? GEN_HORIZONTAL_STRIDE_4 : GEN_HORIZONTAL_STRIDE_2; + } + if(count == 4) { + r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UB : GEN_TYPE_B; + r.vstride = GEN_VERTICAL_STRIDE_32; + } else { + r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UW : GEN_TYPE_W; + r.vstride = GEN_VERTICAL_STRIDE_16; + } + + r.subnr += sub_part*typeSize(r.type); + r.nr += r.subnr / 32; + r.subnr %= 32; + + return r; + } + INLINE bool isint64(void) const { if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE) return true; diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index f9f5e68..457b5b4 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -178,7 +178,8 @@ namespace ir { template INLINE bool isMemberOf(void) const { return T::isClassOf(*this); } - static const uint32_t MAX_SRC_NUM = 16; + /*! 
max_src for store instruction (vec16 + addr) */ + static const uint32_t MAX_SRC_NUM = 17; static const uint32_t MAX_DST_NUM = 16; protected: BasicBlock *parent; //!< The basic block containing the instruction diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 4b692e2..dcc1497 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -578,7 +578,10 @@ namespace gbe void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;} void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;} template void visitLoadOrStore(T &I); - + // batch vec4/8/16 load/store + INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum, + Value *llvmValue, const ir::Register ptr, + const ir::AddressSpace addrSpace, Type * elemType, bool isLoad); void visitInstruction(Instruction &I) {NOT_SUPPORTED;} }; @@ -2774,6 +2777,61 @@ namespace gbe } void GenWriter::regAllocateStoreInst(StoreInst &I) {} + void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum, + Value *llvmValues, const ir::Register ptr, + const ir::AddressSpace addrSpace, + Type * elemType, bool isLoad) { + const ir::RegisterFamily pointerFamily = ctx.getPointerFamily(); + uint32_t totalSize = elemNum * getFamilySize(getFamily(type)); + uint32_t msgNum = totalSize > 16 ? 
totalSize / 16 : 1; + const uint32_t perMsgNum = elemNum / msgNum; + + for (uint32_t msg = 0; msg < msgNum; ++msg) { + // Build the tuple data in the vector + vector tupleData; // put registers here + for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) { + ir::Register reg; + if(regTranslator.isUndefConst(llvmValues, perMsgNum*msg+elemID)) { // undef test must use the same element index as getRegister() below + Value *v = Constant::getNullValue(elemType); + reg = this->getRegister(v); + } else + reg = this->getRegister(llvmValues, perMsgNum*msg+elemID); + + tupleData.push_back(reg); + } + const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum); + + // We may need to update to offset the pointer + ir::Register addr; + if (msg == 0) + addr = ptr; + else { + const ir::Register offset = ctx.reg(pointerFamily); + ir::ImmediateIndex immIndex; + ir::Type immType; + // each message can read/write 16 bytes + const int32_t stride = 16; + if (pointerFamily == ir::FAMILY_DWORD) { + immIndex = ctx.newImmediate(int32_t(msg*stride)); + immType = ir::TYPE_S32; + } else { + immIndex = ctx.newImmediate(int64_t(msg*stride)); + immType = ir::TYPE_S64; + } + + addr = ctx.reg(pointerFamily); + ctx.LOADI(immType, offset, immIndex); + ctx.ADD(immType, addr, ptr, offset); + } + + // Emit the instruction + if (isLoad) + ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true); + else + ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true); + } + } + extern int OCL_SIMD_WIDTH; template INLINE void GenWriter::emitLoadOrStore(T &I) @@ -2811,12 +2869,14 @@ namespace gbe // count here. 
if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3)) elemNum = 3; + // The code is going to be fairly different from types to types (based on // size of each vector element) const ir::Type type = getType(ctx, elemType); const ir::RegisterFamily pointerFamily = ctx.getPointerFamily(); + const ir::RegisterFamily dataFamily = getFamily(type); - if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) { + if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) { // One message is enough here. Nothing special to do if (elemNum <= 4) { // Build the tuple data in the vector @@ -2842,51 +2902,11 @@ namespace gbe // Not supported by the hardware. So, we split the message and we use // strided loads and stores else { - // We simply use several uint4 loads - const uint32_t msgNum = elemNum / 4; - for (uint32_t msg = 0; msg < msgNum; ++msg) { - // Build the tuple data in the vector - vector tupleData; // put registers here - for (uint32_t elemID = 0; elemID < 4; ++elemID) { - ir::Register reg; - if(regTranslator.isUndefConst(llvmValues, elemID)) { - Value *v = Constant::getNullValue(elemType); - reg = this->getRegister(v); - } else - reg = this->getRegister(llvmValues, 4*msg+elemID); - - tupleData.push_back(reg); - } - const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], 4); - - // We may need to update to offset the pointer - ir::Register addr; - if (msg == 0) - addr = ptr; - else { - const ir::Register offset = ctx.reg(pointerFamily); - ir::ImmediateIndex immIndex; - ir::Type immType; - if (pointerFamily == ir::FAMILY_DWORD) { - immIndex = ctx.newImmediate(int32_t(msg*sizeof(uint32_t[4]))); - immType = ir::TYPE_S32; - } else { - immIndex = ctx.newImmediate(int64_t(msg*sizeof(uint64_t[4]))); - immType = ir::TYPE_S64; - } - - addr = ctx.reg(pointerFamily); - ctx.LOADI(immType, offset, immIndex); - ctx.ADD(immType, addr, ptr, offset); - } - - // Emit the instruction - if (isLoad) - ctx.LOAD(type, 
tuple, addr, addrSpace, 4, true); - else - ctx.STORE(type, tuple, addr, addrSpace, 4, true); - } + emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad); } + } + else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) { + emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad); } else { for (uint32_t elemID = 0; elemID < elemNum; elemID++) { if(regTranslator.isUndefConst(llvmValues, elemID)) -- 2.7.4