From 455892e4f7823c947824adae3c070ff7d9a11a3a Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Sat, 3 Aug 2013 01:53:44 +0800 Subject: [PATCH] GBE: refactor double support. There are two major issues in double support: 1. It doesn't work in SIMD16 mode. 2. The usage of vectors is incorrect: we only need to allocate the temporary registers to contiguous registers. If you look at the previous implementation of READ_FLOAT64/WRITE_FLOAT64 in gen_encoder.cpp, you can easily see that it contains a lot of duplicated code, and considering that the SIMD16 code path never worked correctly, it is very difficult to build on that code. So I chose to refactor those two major functions and refine other parts of the instruction selection stage to fix the above two major problems with cleaner code. Now it works well in both SIMD16 and SIMD8 modes. Another minor improvement is for READ_FLOAT64 in SIMD8 mode: this patch saves one send instruction by reading all 8 doubles into registers at once. Signed-off-by: Zhigang Gong Reviewed-by: Homer Hsing --- backend/src/backend/gen_context.cpp | 21 ++- backend/src/backend/gen_encoder.cpp | 231 +++++++++++++---------------- backend/src/backend/gen_encoder.hpp | 7 +- backend/src/backend/gen_insn_selection.cpp | 65 +++++--- backend/src/backend/gen_reg_allocation.cpp | 10 +- backend/src/backend/gen_register.hpp | 25 ++++ backend/src/llvm/llvm_gen_backend.cpp | 4 +- 7 files changed, 201 insertions(+), 162 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index e33d8da..655b1d7 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -354,12 +354,18 @@ namespace gbe p->pop(); } + // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and + // then follow the real destination registers. + // For SIMD16, we allocate elemNum temporary registers from dst(0). 
void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) { - const GenRegister dst = ra->genReg(insn.dst(0)); + const uint32_t elemNum = insn.extra.elem; + const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum; + const GenRegister dst = ra->genReg(insn.dst(tmpRegSize)); + const GenRegister tmp = ra->genReg(insn.dst(0)); const GenRegister src = ra->genReg(insn.src(0)); + const GenRegister tempAddr = ra->genReg(insn.src(1)); const uint32_t bti = insn.extra.function; - const uint32_t elemNum = insn.extra.elem; - p->READ_FLOAT64(dst, src, bti, elemNum); + p->READ_FLOAT64(dst, tmp, tempAddr, src, bti, elemNum); } void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) { @@ -370,11 +376,16 @@ namespace gbe p->UNTYPED_READ(dst, src, bti, elemNum); } + // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and + // then follow the real destination registers. + // For SIMD16, we allocate elemNum temporary registers from dst(0). void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) { const GenRegister src = ra->genReg(insn.src(0)); - const uint32_t bti = insn.extra.function; const uint32_t elemNum = insn.extra.elem; - p->WRITE_FLOAT64(src, bti, elemNum); + const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? 
elemNum * 2 : elemNum; + const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1)); + const uint32_t bti = insn.extra.function; + p->WRITE_FLOAT64(src, data, bti, elemNum); } void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) { diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index f84c6dd..b2be32f 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -356,103 +356,69 @@ namespace gbe 0 }; - static int dst_type(int exec_width) { - if (exec_width == 8) - return GEN_TYPE_UD; - if (exec_width == 16) - return GEN_TYPE_UW; - NOT_IMPLEMENTED; - return 0; - } - - void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) { - int w = curr.execWidth; - dst = GenRegister::h2(dst); - dst.type = GEN_TYPE_UD; - src.type = GEN_TYPE_UD; - GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD); - GenRegister imm4 = GenRegister::immud(4); - GenInstruction *insn; - insn = next(GEN_OPCODE_SEND); - setHeader(insn); - setDst(insn, GenRegister::uw16grf(r.nr, 0)); - setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); - setSrc1(insn, GenRegister::immud(0)); - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8); - push(); - curr.quarterControl = 0; - curr.nibControl = 0; - MOV(dst, r); - if (w == 8) - curr.nibControl = 1; - else - curr.quarterControl = 1; - MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2)); - pop(); - ADD(src, src, imm4); - insn = next(GEN_OPCODE_SEND); - setHeader(insn); - setDst(insn, GenRegister::uw16grf(r.nr, 0)); - setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); - setSrc1(insn, GenRegister::immud(0)); - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8); + void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, 
uint32_t elemNum) { + GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD); + src = GenRegister::retype(src, GEN_TYPE_UD); + addr = GenRegister::retype(addr, GEN_TYPE_UD); + tmp = GenRegister::retype(tmp, GEN_TYPE_UD); + uint32_t originSimdWidth = curr.execWidth; + uint32_t originPredicate = curr.predicate; + uint32_t originMask = curr.noMask; push(); - curr.quarterControl = 0; - curr.nibControl = 0; - MOV(GenRegister::suboffset(dst, 1), r); - if (w == 8) - curr.nibControl = 1; - else - curr.quarterControl = 1; - MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2)); + for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1; + channels < originSimdWidth; channels += 8, currQuarter++) { + curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = GEN_MASK_DISABLE; + curr.execWidth = 8; + /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode + which is what we want here. */ + MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels)); + ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4)); + MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4)); + ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4)); + // Let's use SIMD16 to read all bytes for 8 doubles data at one time. + curr.execWidth = 16; + this->UNTYPED_READ(tmp, addr, bti, elemNum); + if (originSimdWidth == 16) + curr.quarterControl = currQuarter; + curr.predicate = originPredicate; + curr.noMask = originMask; + // Back to simd8 for correct predication flag. 
+ curr.execWidth = 8; + MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF)); + } pop(); } - void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) { - int w = curr.execWidth; - GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD); - r.type = GEN_TYPE_UD; - GenRegister hdr = GenRegister::h2(r); - GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0); - src.hstride = GEN_HORIZONTAL_STRIDE_2; - GenRegister data = GenRegister::offset(r, w / 8); - GenRegister imm4 = GenRegister::immud(4); - MOV(r, GenRegister::ud8grf(msg.nr, 0)); + void GenEncoder::WRITE_FLOAT64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) { + GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD); + msg = GenRegister::retype(msg, GEN_TYPE_UD); + int originSimdWidth = curr.execWidth; + int originPredicate = curr.predicate; + int originMask = curr.noMask; push(); - curr.quarterControl = 0; - curr.nibControl = 0; - MOV(data, src); - if (w == 8) - curr.nibControl = 1; - else - curr.quarterControl = 1; - MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w)); - pop(); - GenInstruction *insn; - insn = next(GEN_OPCODE_SEND); - setHeader(insn); - setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth))); - setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0)); - setSrc1(insn, GenRegister::immud(0)); - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0); - - ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4); - push(); - curr.quarterControl = 0; - curr.nibControl = 0; - MOV(data, GenRegister::suboffset(src, 1)); - if (w == 8) - curr.nibControl = 1; - else - curr.quarterControl = 1; - MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1)); + for (uint32_t half = 0; half < 2; half++) { + curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = GEN_MASK_DISABLE; + 
curr.execWidth = 8; + MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half)); + if (originSimdWidth == 16) { + MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half)); + curr.execWidth = 16; + } + if (half == 1) + ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4)); + curr.predicate = originPredicate; + curr.noMask = originMask; + this->UNTYPED_WRITE(msg, bti, elemNum); + } + /* Let's restore the original message(addr) register. */ + /* XXX could be optimized if we don't allocate the address to the header + position of the message. */ + curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = GEN_MASK_DISABLE; + ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(-4)); pop(); - insn = next(GEN_OPCODE_SEND); - setHeader(insn); - setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth))); - setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0)); - setSrc1(insn, GenRegister::immud(0)); - setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0); } void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) { @@ -470,7 +436,7 @@ namespace gbe NOT_IMPLEMENTED; this->setHeader(insn); - this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); + this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); this->setSrc1(insn, GenRegister::immud(0)); setDPUntypedRW(this, @@ -601,25 +567,53 @@ namespace gbe return &this->store.back(); } - INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) { - if (dst.isdf() && src.isdf()) { + INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, + GenRegister src0, GenRegister src1 = GenRegister::null()) { int w = p->curr.execWidth; p->push(); - p->curr.quarterControl = 0; p->curr.nibControl = 0; 
GenInstruction *insn = p->next(opcode); p->setHeader(insn); p->setDst(insn, dst); - p->setSrc0(insn, src); + p->setSrc0(insn, src0); + if (!GenRegister::isNull(src1)) + p->setSrc1(insn, src1); if (w == 8) p->curr.nibControl = 1; // second 1/8 mask - else // w == 16 - p->curr.quarterControl = 1; // second 1/4 mask insn = p->next(opcode); p->setHeader(insn); p->setDst(insn, GenRegister::suboffset(dst, w / 2)); - p->setSrc0(insn, GenRegister::suboffset(src, w / 2)); + p->setSrc0(insn, GenRegister::suboffset(src0, w / 2)); + if (!GenRegister::isNull(src1)) + p->setSrc1(insn, GenRegister::suboffset(src1, w / 2)); p->pop(); + } + + // Double register accessing is a little special, + // Per Gen spec, then only supported mode is SIMD8 and, it only + // handles four doubles each time. + // We need to lower down SIMD16 to two SIMD8 and lower down SIMD8 + // to two SIMD1x4. + INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, + GenRegister src0, GenRegister src1 = GenRegister::null()) { + if (p->curr.execWidth == 8) + _handleDouble(p, opcode, dst, src0, src1); + else if (p->curr.execWidth == 16) { + p->push(); + p->curr.execWidth = 8; + p->curr.quarterControl = GEN_COMPRESSION_Q1; + _handleDouble(p, opcode, dst, src0, src1); + p->curr.quarterControl = GEN_COMPRESSION_Q2; + if (!GenRegister::isNull(src1)) + src1 = GenRegister::offset(src1, 2); + _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1); + p->pop(); + } + } + + INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) { + if (dst.isdf() && src.isdf()) { + handleDouble(p, opcode, dst, src); } else if (needToSplitAlu1(p, dst, src) == false) { GenInstruction *insn = p->next(opcode); p->setHeader(insn); @@ -653,25 +647,7 @@ namespace gbe GenRegister src1) { if (dst.isdf() && src0.isdf() && src1.isdf()) { - int w = p->curr.execWidth; - p->push(); - p->curr.quarterControl = 0; - p->curr.nibControl = 0; - GenInstruction *insn = 
p->next(opcode); - p->setHeader(insn); - p->setDst(insn, dst); - p->setSrc0(insn, src0); - p->setSrc1(insn, src1); - if (w == 8) - p->curr.nibControl = 1; // second 1/8 mask - else // w == 16 - p->curr.quarterControl = 1; // second 1/4 mask - insn = p->next(opcode); - p->setHeader(insn); - p->setDst(insn, GenRegister::suboffset(dst, w / 2)); - p->setSrc0(insn, GenRegister::suboffset(src0, w / 2)); - p->setSrc1(insn, GenRegister::suboffset(src1, w / 2)); - p->pop(); + handleDouble(p, opcode, dst, src0, src1); } else if (needToSplitAlu2(p, dst, src0, src1) == false) { GenInstruction *insn = p->next(opcode); p->setHeader(insn); @@ -808,7 +784,16 @@ namespace gbe r.width = GEN_WIDTH_1; r.hstride = GEN_HORIZONTAL_STRIDE_0; push(); + uint32_t width = curr.execWidth; + curr.execWidth = 8; + curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = 1; + curr.quarterControl = GEN_COMPRESSION_Q1; MOV(dest, r); + if (width == 16) { + curr.quarterControl = GEN_COMPRESSION_Q2; + MOV(GenRegister::offset(dest, 2), r); + } pop(); } @@ -839,14 +824,8 @@ namespace gbe void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) { int w = curr.execWidth; if (src0.isdf()) { - push(); - curr.execWidth = 16; - MOV(dest, src0); - if (w == 16) { - curr.quarterControl = 1; - MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4)); - } - pop(); + GBE_ASSERT(0); // MOV DF is called from convert instruction, + // We should never convert a df to a df. 
} else { GenRegister r0 = GenRegister::h2(r); push(); diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index d3a7165..86e1a71 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -118,10 +118,11 @@ namespace gbe ALU2(LINE) ALU2(PLN) ALU3(MAD) - ALU2(MOV_DF); + //ALU2(MOV_DF); #undef ALU1 #undef ALU2 #undef ALU3 + void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null()); void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value); /*! Barrier message (to synchronize threads of a workgroup) */ void BARRIER(GenRegister src); @@ -142,9 +143,9 @@ namespace gbe /*! Atomic instructions */ void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum); /*! Read 64-bits float arrays */ - void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum); + void READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum); /*! Write 64-bits float arrays */ - void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum); + void WRITE_FLOAT64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum); /*! Untyped read (upto 4 channels) */ void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum); /*! Untyped write (upto 4 channels) */ diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index d4be8bf..83d9b00 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -466,9 +466,9 @@ namespace gbe /*! Atomic instruction */ void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti); /*! 
Read 64 bits float array */ - void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti); + void READ_FLOAT64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti); /*! Write 64 bits float array */ - void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti); + void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t valueNum, uint32_t bti); /*! Untyped read (up to 4 elements) */ void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti); /*! Untyped write (up to 4 elements) */ @@ -760,12 +760,16 @@ namespace gbe void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); } void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); } + /* elemNum contains all the temporary register and the + real destination registers.*/ void Selection::Opaque::READ_FLOAT64(Reg addr, + Reg tempAddr, const GenRegister *dst, uint32_t elemNum, + uint32_t valueNum, uint32_t bti) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1); + SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 2); SelectionVector *srcVector = this->appendVector(); SelectionVector *dstVector = this->appendVector(); @@ -773,11 +777,12 @@ namespace gbe for (uint32_t elemID = 0; elemID < elemNum; ++elemID) insn->dst(elemID) = dst[elemID]; insn->src(0) = addr; + insn->src(1) = tempAddr; insn->extra.function = bti; - insn->extra.elem = elemNum; + insn->extra.elem = valueNum; - // Sends require contiguous allocation - dstVector->regNum = elemNum; + // Only the temporary registers need contiguous allocation + dstVector->regNum = elemNum - valueNum; dstVector->isSrc = 0; dstVector->reg = &insn->dst(0); @@ -814,9 +819,12 @@ namespace gbe srcVector->reg = &insn->src(0); } + /* elemNum contains all the temporary register and the + real data registers.*/ void Selection::Opaque::WRITE_FLOAT64(Reg addr, 
const GenRegister *src, uint32_t elemNum, + uint32_t valueNum, uint32_t bti) { SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1); @@ -827,10 +835,10 @@ namespace gbe for (uint32_t elemID = 0; elemID < elemNum; ++elemID) insn->src(elemID+1) = src[elemID]; insn->extra.function = bti; - insn->extra.elem = elemNum; + insn->extra.elem = valueNum; - // Sends require contiguous allocation for the sources - vector->regNum = elemNum+1; + // Only the addr + temporary registers need to be contiguous. + vector->regNum = (elemNum - valueNum) + 1; vector->reg = &insn->src(0); vector->isSrc = 1; } @@ -1871,13 +1879,18 @@ namespace gbe { using namespace ir; const uint32_t valueNum = insn.getValueNum(); - vector dst(valueNum); - for (uint32_t dstID = 0; dstID < valueNum; ++dstID) - dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F); - dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); - if (sel.ctx.getSimdWidth() == 16) - dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); - sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti); + uint32_t dstID; + /* XXX support scalar only right now. */ + GBE_ASSERT(valueNum == 1); + + // The first 16 DWORD register space is for temporary usage at encode stage. + uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum; + GenRegister dst[valueNum + tmpRegNum]; + for (dstID = 0; dstID < tmpRegNum ; ++dstID) + dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD)); + for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID) + dst[dstID] = sel.selReg(insn.getValue(valueID)); + sel.READ_FLOAT64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, valueNum + tmpRegNum, valueNum, bti); } void emitByteGather(Selection::Opaque &sel, @@ -1971,15 +1984,19 @@ namespace gbe const uint32_t valueNum = insn.getValueNum(); const uint32_t addrID = ir::StoreInstruction::addressIndex; GenRegister addr; - vector value(valueNum); - + uint32_t srcID; + /* XXX support scalar only right now. 
*/ + GBE_ASSERT(valueNum == 1); addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F); - for (uint32_t valueID = 0; valueID < valueNum; ++valueID) - value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F); - value.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); - if (sel.ctx.getSimdWidth() == 16) - value.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); - sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti); + // The first 16 DWORD register space is for temporary usage at encode stage. + uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum; + GenRegister src[valueNum + tmpRegNum]; + for (srcID = 0; srcID < tmpRegNum; ++srcID) + src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD)); + + for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID) + src[srcID] = sel.selReg(insn.getValue(valueID)); + sel.WRITE_FLOAT64(addr, src, valueNum + tmpRegNum, valueNum, bti); } void emitByteScatter(Selection::Opaque &sel, diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index e7c96ac..4ba03ea 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -474,7 +474,12 @@ namespace gbe if (it != vectorMap.end()) { const SelectionVector *vector = it->second.first; const uint32_t simdWidth = ctx.getSimdWidth(); - const uint32_t alignment = simdWidth * sizeof(uint32_t); + + const ir::RegisterData regData = ctx.sel->getRegisterData(reg); + const ir::RegisterFamily family = regData.family; + const uint32_t typeSize = familyVectorSize[family]; + const uint32_t alignment = simdWidth*typeSize; + const uint32_t size = vector->regNum * alignment; uint32_t grfOffset; while ((grfOffset = ctx.allocate(size, alignment)) == 0) { @@ -483,7 +488,8 @@ namespace gbe } for (uint32_t regID = 0; regID < vector->regNum; ++regID, grfOffset += alignment) { const ir::Register reg = vector->reg[regID].reg(); - GBE_ASSERT(RA.contains(reg) == 
false); + GBE_ASSERT(RA.contains(reg) == false + && ctx.sel->getRegisterData(reg).family == family); RA.insert(std::make_pair(reg, grfOffset)); } } diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index fedb743..7e48837 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -553,6 +553,11 @@ namespace gbe GEN_HORIZONTAL_STRIDE_1); } + static INLINE bool isNull(GenRegister reg) { + return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE + && reg.nr == GEN_ARF_NULL); + } + static INLINE GenRegister acc(void) { return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_ACCUMULATOR, @@ -832,6 +837,26 @@ namespace gbe GEN_HORIZONTAL_STRIDE_2); } + static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) { + return GenRegister(GEN_GENERAL_REGISTER_FILE, + nr, + subnr, + GEN_TYPE_UD, + GEN_VERTICAL_STRIDE_8, + GEN_WIDTH_4, + GEN_HORIZONTAL_STRIDE_1); + } + + static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) { + return GenRegister(GEN_GENERAL_REGISTER_FILE, + nr, + subnr, + GEN_TYPE_UD, + GEN_VERTICAL_STRIDE_8, + GEN_WIDTH_4, + GEN_HORIZONTAL_STRIDE_2); + } + static INLINE GenRegister mask(uint32_t subnr) { return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr); } diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index c8c5484..b5963ad 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -2371,8 +2371,8 @@ namespace gbe // Scalar is easy. 
We neednot build register tuples if (isScalarType(llvmType) == true) { const ir::Type type = getType(ctx, llvmType); - if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16 - OCL_SIMD_WIDTH = 8; + //if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16 + // OCL_SIMD_WIDTH = 8; const ir::Register values = this->getRegister(llvmValues); if (isLoad) ctx.LOAD(type, ptr, addrSpace, dwAligned, values); -- 2.7.4