case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
case SEL_OP_CONVI64_TO_I:
{
- int execWidth = p->curr.execWidth;
- GenRegister xsrc = src.bottom_half(), xdst = dst;
- p->push();
- p->curr.execWidth = 8;
- for(int i = 0; i < execWidth/4; i ++) {
- p->curr.chooseNib(i);
- p->MOV(xdst, xsrc);
- xdst = GenRegister::suboffset(xdst, 4);
- xsrc = GenRegister::suboffset(xsrc, 4);
- }
- p->pop();
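+ // Truncation keeps only the low 32 bits; bottom_half() now addresses them as one contiguous run, so a single full-width MOV suffices.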
+ p->MOV(dst, src.bottom_half());
break;
}
case SEL_OP_MOV_DF:
p->MOV_DF(dst, src, tmp);
break;
case SEL_OP_CONVI_TO_I64: {
- GenRegister middle;
- if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
+ GenRegister middle = src;
+ if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
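+ // Byte/word sources are first widened to a dword temporary; the 64-bit conversion below then only moves dwords.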
middle = tmp;
- middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
+ middle.type = GEN_TYPE_D;
p->MOV(middle, src);
- } else {
- middle = src;
}
- int execWidth = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->curr.chooseNib(nib);
- p->MOV(dst.bottom_half(), middle);
- if(middle.is_signed_int())
- p->ASR(dst.top_half(), middle, GenRegister::immud(31));
- else
- p->MOV(dst.top_half(), GenRegister::immd(0));
- dst = GenRegister::suboffset(dst, 4);
- middle = GenRegister::suboffset(middle, 4);
- }
- p->pop();
+
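+ // Write the low dword, then fill the high dword: the sign bit replicated (ASR by 31) for signed sources, zero otherwise.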
+ p->MOV(dst.bottom_half(), middle);
+ if(src.is_signed_int())
+ p->ASR(dst.top_half(this->simdWidth), middle, GenRegister::immud(31));
+ else
+ p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
break;
}
default:
GenRegister tmp = ra->genReg(insn.dst(1));
switch (insn.opcode) {
case SEL_OP_I64ADD: {
- GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
- y = GenRegister::suboffset(x, p->curr.execWidth);
+ tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+ GenRegister x = tmp.bottom_half();
+ GenRegister y = tmp.top_half(this->simdWidth);
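+ // Reuse the two halves of the qword temporary as 32-bit scratch for the carry chain.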
+
loadBottomHalf(x, src0);
loadBottomHalf(y, src1);
addWithCarry(x, x, y);
break;
}
case SEL_OP_I64SUB: {
- GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
- y = GenRegister::suboffset(x, p->curr.execWidth);
+ tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+ GenRegister x = tmp.bottom_half();
+ GenRegister y = tmp.top_half(this->simdWidth);
+
loadBottomHalf(x, src0);
loadBottomHalf(y, src1);
subWithBorrow(x, x, y);
case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
case SEL_OP_SEL_INT64:
{
- GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
- xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
- xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
- int execWidth = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->curr.chooseNib(nib);
- p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
- p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
- xdst = GenRegister::suboffset(xdst, 4);
- xsrc0 = GenRegister::suboffset(xsrc0, 4);
- xsrc1 = GenRegister::suboffset(xsrc1, 4);
- }
- p->pop();
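+ // With each half stored as one contiguous simdWidth-wide run, one full-width SEL per half replaces the per-nib loop.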
+ p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->SEL(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
}
break;
case SEL_OP_AND: p->AND(dst, src0, src1, insn.extra.function); break;
case SEL_OP_XOR: p->XOR(dst, src0, src1, insn.extra.function); break;
case SEL_OP_I64AND:
{
- GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
- xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
- xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
- int execWidth = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->curr.chooseNib(nib);
- p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
- p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
- xdst = GenRegister::suboffset(xdst, 4),
- xsrc0 = GenRegister::suboffset(xsrc0, 4),
- xsrc1 = GenRegister::suboffset(xsrc1, 4);
- }
- p->pop();
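+ // The same one-op-per-half pattern applies to the 64-bit AND/OR/XOR cases below.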
+ p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->AND(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
}
break;
case SEL_OP_I64OR:
{
- GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
- xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
- xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
- int execWidth = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->curr.chooseNib(nib);
- p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
- p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
- xdst = GenRegister::suboffset(xdst, 4),
- xsrc0 = GenRegister::suboffset(xsrc0, 4),
- xsrc1 = GenRegister::suboffset(xsrc1, 4);
- }
- p->pop();
+ p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->OR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
}
break;
case SEL_OP_I64XOR:
{
- GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
- xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
- xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
- int execWidth = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->curr.chooseNib(nib);
- p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
- p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
- xdst = GenRegister::suboffset(xdst, 4),
- xsrc0 = GenRegister::suboffset(xsrc0, 4),
- xsrc1 = GenRegister::suboffset(xsrc1, 4);
- }
- p->pop();
+ p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->XOR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
}
break;
case SEL_OP_SHR: p->SHR(dst, src0, src1); break;
GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
- int execWidth = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->curr.chooseNib(nib);
- p->MOV(xdst.top_half(), xsrc0.bottom_half());
- p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
- xdst = GenRegister::suboffset(xdst, 4);
- xsrc0 = GenRegister::suboffset(xsrc0, 4);
- xsrc1 = GenRegister::suboffset(xsrc1, 4);
- }
- p->pop();
+ p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half());
+ p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
}
break;
default: NOT_IMPLEMENTED;
}
void GenContext::collectShifter(GenRegister dest, GenRegister src) {
- int execWidth = p->curr.execWidth;
p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->AND(dest, src.bottom_half(), GenRegister::immud(63));
- dest = GenRegister::suboffset(dest, 4);
- src = GenRegister::suboffset(src, 4);
- }
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->AND(dest, src.bottom_half(), GenRegister::immud(63));
p->pop();
}
}
void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
- int execWidth = p->curr.execWidth;
- src = src.top_half();
- p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->curr.execWidth = 8;
- p->MOV(dest, src);
- p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
- if (execWidth == 16) {
- p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
- p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
- }
- p->pop();
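+ // top_half(simdWidth) addresses the contiguous high-dword run directly, so the per-quarter copies are no longer needed.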
+ p->MOV(dest, src.top_half(this->simdWidth));
}
void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
- int execWidth = p->curr.execWidth;
- dest = dest.top_half();
- p->push();
- p->curr.noMask = 0;
- p->curr.execWidth = 8;
- p->MOV(dest, src);
- p->curr.nibControl = 1;
- p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
- if (execWidth == 16) {
- p->curr.quarterControl = 1;
- p->curr.nibControl = 0;
- p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
- p->curr.nibControl = 1;
- p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
- }
- p->pop();
+ p->MOV(dest.top_half(this->simdWidth), src);
}
void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
- int execWidth = p->curr.execWidth;
- src = src.bottom_half();
- p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->curr.execWidth = 8;
- p->MOV(dest, src);
- p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
- if (execWidth == 16) {
- p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
- p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
- }
- p->pop();
+ p->MOV(dest, src.bottom_half());
}
void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
- int execWidth = p->curr.execWidth;
- dest = dest.bottom_half();
- p->push();
- p->curr.execWidth = 8;
- p->curr.noMask = 0;
- p->MOV(dest, src);
- p->curr.nibControl = 1;
- p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
- if (execWidth == 16) {
- p->curr.quarterControl = 1;
- p->curr.nibControl = 0;
- p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
- p->curr.nibControl = 1;
- p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
- }
- p->pop();
+ p->MOV(dest.bottom_half(), src);
}
void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
p->pop();
}
- // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
- // then follow the real destination registers.
- // For SIMD16, we allocate elemNum temporary registers from dst(0).
void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
const uint32_t elemNum = insn.extra.elem;
- const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
- const GenRegister tempAddr = ra->genReg(insn.dst(tmpRegSize + 1));
- const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
- const GenRegister tmp = ra->genReg(insn.dst(0));
+ const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.getbti();
- p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
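+ // A 64-bit read is now a plain untyped read of 2*elemNum dword channels; no temporary address registers are needed.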
+ p->UNTYPED_READ(dst, src, bti, elemNum*2);
}
void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
p->UNTYPED_READ(dst, src, bti, elemNum);
}
- // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
- // then follow the real destination registers.
- // For SIMD16, we allocate elemNum temporary registers from dst(0).
void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.dst(0));
const uint32_t elemNum = insn.extra.elem;
- const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
- const GenRegister data = ra->genReg(insn.src(1));
const uint32_t bti = insn.getbti();
- p->MOV(src, addr);
- p->WRITE64(src, data, bti, elemNum, sel->isScalarReg(data.reg()));
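+ // Likewise, a 64-bit write is now a plain untyped write of 2*elemNum dword channels.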
+ p->UNTYPED_WRITE(src, bti, elemNum*2);
}
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID, int jump_width) :
stateNum(0), gen(gen), deviceID(deviceID), jump_width(jump_width)
{
+ this->simdWidth = simdWidth;
this->curr.execWidth = simdWidth;
this->curr.quarterControl = GEN_COMPRESSION_Q1;
this->curr.noMask = 0;
};
- void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
- GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
- src = GenRegister::retype(src, GEN_TYPE_UD);
- addr = GenRegister::retype(addr, GEN_TYPE_UD);
- tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
- uint32_t originSimdWidth = curr.execWidth;
- uint32_t originPredicate = curr.predicate;
- uint32_t originMask = curr.noMask;
- push();
- for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
- channels < originSimdWidth; channels += 8, currQuarter++) {
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = GEN_MASK_DISABLE;
- curr.execWidth = 8;
- /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
- which is what we want here. */
- MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
- ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
- MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
- ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
- // Let's use SIMD16 to read all bytes for 8 doubles data at one time.
- curr.execWidth = 16;
- this->UNTYPED_READ(tmp, addr, bti, elemNum);
- if (originSimdWidth == 16)
- curr.quarterControl = currQuarter;
- curr.predicate = originPredicate;
- curr.noMask = originMask;
- // Back to simd8 for correct predication flag.
- curr.execWidth = 8;
- MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
- }
- pop();
- }
-
- void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
- GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
- GenRegister unpacked;
- msg = GenRegister::retype(msg, GEN_TYPE_UD);
- int originSimdWidth = curr.execWidth;
- int originPredicate = curr.predicate;
- int originMask = curr.noMask;
- push();
- for (uint32_t half = 0; half < 2; half++) {
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = GEN_MASK_DISABLE;
- curr.execWidth = 8;
- if (is_scalar) {
- unpacked = data32;
- unpacked.subnr += half * 4;
- } else
- unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
- MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
- if (originSimdWidth == 16) {
- if (is_scalar) {
- unpacked = data32;
- unpacked.subnr += half * 4;
- } else
- unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
- MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
- curr.execWidth = 16;
- }
- if (half == 1)
- ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
- curr.predicate = originPredicate;
- curr.noMask = originMask;
- this->UNTYPED_WRITE(msg, bti, elemNum);
- }
- pop();
- }
-
void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 && elemNum <= 4);
if (dst.isdf() && src.isdf()) {
handleDouble(p, opcode, dst, src);
} else if (dst.isint64() && src.isint64()) { // handle int64
- int execWidth = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- for (int nib = 0; nib < execWidth / 4; nib ++) {
- p->curr.chooseNib(nib);
- p->MOV(dst.bottom_half(), src.bottom_half());
- p->MOV(dst.top_half(), src.top_half());
- dst = GenRegister::suboffset(dst, 4);
- src = GenRegister::suboffset(src, 4);
- }
- p->pop();
+ p->MOV(dst.bottom_half(), src.bottom_half());
+ p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
} else if (needToSplitAlu1(p, dst, src) == false) {
if(compactAlu1(p, opcode, dst, src, condition, false))
return;
void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
- int execWidth = curr.execWidth;
- push();
- curr.execWidth = 8;
- for(int nib = 0; nib < execWidth/4; nib ++) {
- curr.chooseNib(nib);
- MOV(dest.top_half(), u1);
- MOV(dest.bottom_half(), u0);
- dest = GenRegister::suboffset(dest, 4);
- }
- pop();
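+ // Split the 64-bit immediate into two dword immediates and broadcast each into its half.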
+ MOV(dest.bottom_half(), u0);
+ MOV(dest.top_half(this->simdWidth), u1);
}
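+ // For instance, LOAD_INT64_IMM(dest, 0x123456789LL) emits
+ //   MOV dest.bottom_half(), 0x23456789:d
+ //   MOV dest.top_half(simdWidth), 0x1:d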
void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
uint32_t deviceID;
/*! The constant for jump. */
const int jump_width;
+ /*! SIMD width for this codegen */
+ uint32_t simdWidth;
////////////////////////////////////////////////////////////////////////
// Encoding functions
////////////////////////////////////////////////////////////////////////
void WAIT(void);
/*! Atomic instructions */
virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
- /*! Read 64-bits float/int arrays */
- void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
- /*! Write 64-bits float/int arrays */
- void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
/*! Untyped read (up to 4 channels) */
virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped write (up to 4 channels) */
/*! Atomic instruction */
void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
/*! Read a 64-bit float/int array */
- void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
+ void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Write a 64-bit float/int array */
- void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
+ void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Untyped write (up to 4 elements) */
void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
- /* elemNum contains all the temporary register and the
- real destination registers.*/
void Selection::Opaque::READ64(Reg addr,
- Reg tempAddr,
const GenRegister *dst,
uint32_t elemNum,
- uint32_t valueNum,
uint32_t bti)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + 1, 1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
// Regular instruction to encode
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->dst(elemID) = dst[elemID];
- /* temporary addr register is to be modified, set it to dst registers.*/
- insn->dst(elemNum) = tempAddr;
insn->src(0) = addr;
insn->setbti(bti);
- insn->extra.elem = valueNum;
+ insn->extra.elem = elemNum;
- // Only the temporary registers need contiguous allocation
- dstVector->regNum = elemNum - valueNum;
+ dstVector->regNum = elemNum;
dstVector->isSrc = 0;
dstVector->reg = &insn->dst(0);
- // Source cannot be scalar (yet)
srcVector->regNum = 1;
srcVector->isSrc = 1;
srcVector->reg = &insn->src(0);
dstVector->regNum = elemNum;
dstVector->isSrc = 0;
dstVector->reg = &insn->dst(0);
- // Source cannot be scalar (yet)
+
srcVector->regNum = 1;
srcVector->isSrc = 1;
srcVector->reg = &insn->src(0);
}
- /* elemNum contains all the temporary register and the
- real data registers.*/
void Selection::Opaque::WRITE64(Reg addr,
const GenRegister *src,
uint32_t srcNum,
- const GenRegister *dst,
- uint32_t dstNum,
uint32_t bti)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
SelectionVector *vector = this->appendVector();
// Regular instruction to encode
insn->src(0) = addr;
for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
insn->src(elemID + 1) = src[elemID];
- for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
- insn->dst(elemID) = dst[elemID];
+
insn->setbti(bti);
insn->extra.elem = srcNum;
- // Only the addr + temporary registers need to be contiguous.
- vector->regNum = dstNum;
- vector->reg = &insn->dst(0);
+ vector->regNum = srcNum + 1;
+ vector->reg = &insn->src(0);
vector->isSrc = 1;
}
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
- uint32_t dstID;
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- // The first 16 DWORD register space is for temporary usage at encode stage.
- uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
- GenRegister dst[valueNum + tmpRegNum];
- for (dstID = 0; dstID < tmpRegNum ; ++dstID)
- dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
- for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
- dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
- sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
+ GenRegister dst[valueNum];
+ for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
+ sel.READ64(addr, dst, valueNum, bti);
}
void emitByteGather(Selection::Opaque &sel,
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
- uint32_t srcID;
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- addr = GenRegister::retype(addr, GEN_TYPE_F);
- // The first 16 DWORD register space is for temporary usage at encode stage.
- uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+ addr = GenRegister::retype(addr, GEN_TYPE_UD);
GenRegister src[valueNum];
- GenRegister dst[tmpRegNum + 1];
- /* dst 0 is for the temporary address register. */
- dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
- for (srcID = 0; srcID < tmpRegNum; ++srcID)
- dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
- sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
+ sel.WRITE64(addr, src, valueNum, bti);
}
void emitByteScatter(Selection::Opaque &sel,
sel.curr.noMask = 1;
}
+ // The low and high halves of long/ulong values are stored separately,
+ // so they must be handled separately here; change this back once the
+ // hardware supports a native long type.
+ const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
+ const int simdWidth = sel.curr.execWidth;
+
for(int i = 0; i < narrowNum; i++, index++) {
GenRegister narrowReg, wideReg;
if(narrowDst) {
GBE_ASSERT(multiple == 8);
}
}
- if(index % multiple) {
+
+ if(!isInt64 && index % multiple) {
wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
wideReg.subphysical = 1;
}
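+ // For int64 the two halves are stored as separate simdWidth-wide runs (one dword apart for scalars): pick the run for this element, then offset within it.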
+ if(isInt64) {
+ // offset to next half
+ wideReg.subphysical = 1;
+ if(i >= multiple/2)
+ wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
+ if(index % (multiple/2))
+ wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
+ }
+
GenRegister xdst = narrowDst ? narrowReg : wideReg;
GenRegister xsrc = narrowDst ? wideReg : narrowReg;
- if((srcType == TYPE_S64 || srcType == TYPE_U64 || srcType == TYPE_DOUBLE) ||
- (dstType == TYPE_S64 || dstType == TYPE_U64 || dstType == TYPE_DOUBLE)) {
- const int simdWidth = sel.curr.execWidth;
+ if(isInt64) {
+ sel.MOV(xdst, xsrc);
+ } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
sel.push();
sel.curr.execWidth = 8;
xdst.subphysical = 1;
continue;
uint32_t alignment;
- ir::RegisterFamily family;
- getRegAttrib(reg, alignment, &family);
- const uint32_t size = vector->regNum * alignment;
- const uint32_t grfOffset = allocateReg(interval, size, alignment);
+ uint32_t size = 0;
+ for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+ getRegAttrib(vector->reg[regID].reg(), alignment, NULL);
+ size += alignment;
+ }
+ // FIXME: workaround for a scheduling limitation that requires 2*GEN_REG_SIZE alignment under SIMD16.
+ const uint32_t maxAlignment = ctx.getSimdWidth()/8*GEN_REG_SIZE;
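+ // i.e. one GRF of alignment under SIMD8, two GRFs under SIMD16.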
+ const uint32_t grfOffset = allocateReg(interval, size, maxAlignment);
if(grfOffset == 0) {
- GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
+ ir::RegisterFamily family;
for(int i = vector->regNum-1; i >= 0; i--) {
+ family = ctx.sel->getRegisterFamily(vector->reg[i].reg());
+ // We currently only support DWORD/QWORD spills.
+ if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)
+ return false;
if (!spillReg(vector->reg[i].reg()))
return false;
}
continue;
}
+ uint32_t subOffset = 0;
for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
const ir::Register reg = vector->reg[regID].reg();
- GBE_ASSERT(RA.contains(reg) == false
- && ctx.sel->getRegisterData(reg).family == family);
- insertNewReg(reg, grfOffset + alignment * regID, true);
- ctx.splitBlock(grfOffset, alignment * regID); //splitBlock will not split if regID == 0
+ GBE_ASSERT(RA.contains(reg) == false);
+ getRegAttrib(reg, alignment, NULL);
+ // Check that every sub-register is aligned correctly.
+ GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
+ insertNewReg(reg, grfOffset + subOffset, true);
+ ctx.splitBlock(grfOffset, subOffset); //splitBlock will not split if regID == 0
+ subOffset += alignment;
}
}
// Case 2: This is a regular scalar register, allocate it alone
return false;
}
- INLINE GenRegister top_half(void) const {
- GenRegister r = bottom_half();
- r.subnr += 4;
- r.nr += r.subnr / 32;
- r.subnr %= 32;
- return r;
+ INLINE GenRegister top_half(int simdWidth) const {
+ GBE_ASSERT(isint64());
+ GenRegister reg = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
+
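+ // Vector (strided) register: the high dwords start simdWidth elements after the low ones; stride-0 (scalar) register: the high dword immediately follows the low one.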
+ if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg);
+ reg.nr += reg.subnr / 32;
+ reg.subnr %= 32;
+ } else {
+ reg.subnr += typeSize(reg.type);
+ reg.nr += reg.subnr/32;
+ reg.subnr %= 32;
+ }
+ return reg;
}
INLINE GenRegister bottom_half(void) const {
GBE_ASSERT(isint64());
- GenRegister r = h2(*this);
- r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
- if(r.vstride != GEN_VERTICAL_STRIDE_0)
- r.vstride = GEN_VERTICAL_STRIDE_16;
+ GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
return r;
}
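+ // Layout sketch (an assumed example, not part of this patch): a packed
+ // SIMD8 UL value based at r2 keeps the lanes' low dwords in r2.0:ud..r2.7:ud
+ // (bottom_half()) and their high dwords in r3.0:ud..r3.7:ud (top_half(8)),
+ // since subnr advances by 8 * sizeof(uint32_t) = GEN_REG_SIZE bytes.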
uint32_t srcElemNum = 0, dstElemNum = 0 ;
ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
+ // Long and double are not compatible in register storage, and double is
+ // not supported yet, so simply assert here.
+ GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
+ GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
+
if(srcElemNum > 1 || dstElemNum > 1) {
// Build the tuple data in the vector
vector<ir::Register> srcTupleData;