[GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
- [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 },
- [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 },
- [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
- [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+ [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
[GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
[GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF ||
inst->header.opcode == GEN_OPCODE_ELSE ||
inst->header.opcode == GEN_OPCODE_ENDIF ||
- inst->header.opcode == GEN_OPCODE_WHILE)) {
- // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count);
- assert(0);
+ inst->header.opcode == GEN_OPCODE_WHILE ||
+ inst->header.opcode == GEN_OPCODE_BRD ||
+ inst->header.opcode == GEN_OPCODE_JMPI)) {
+ format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
} else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK ||
inst->header.opcode == GEN_OPCODE_CONTINUE ||
- inst->header.opcode == GEN_OPCODE_HALT)) {
- // XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
- assert(0);
- } else if (inst->header.opcode == GEN_OPCODE_JMPI) {
+ inst->header.opcode == GEN_OPCODE_HALT ||
+ inst->header.opcode == GEN_OPCODE_BRC)) {
+ format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
+ }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
format (file, " %d", inst->bits3.d);
- }
+ }*/
if (opcode[inst->header.opcode].nsrc > 0) {
pad (file, 32);
const LabelIndex label = pair.first;
const int32_t insnID = pair.second;
const int32_t targetID = labelPos.find(label)->second;
- p->patchJMPI(insnID, (targetID-insnID-1) * 2);
+ p->patchJMPI(insnID, (targetID - insnID) * 2);
+ }
+ for (auto pair : branchPos3) {
+ const LabelPair labelPair = pair.first;
+ const int32_t insnID = pair.second;
+ const int32_t jip = labelPos.find(labelPair.l0)->second + labelPair.offset0;
+ const int32_t uip = labelPos.find(labelPair.l1)->second + labelPair.offset1;
+ assert((jip - insnID) * 2 < 32767 && (jip - insnID) * 2 > -32768);
+ assert((uip - insnID) * 2 < 32767 && (uip - insnID) * 2 > -32768);
+ p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * 2));
}
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
- const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
- const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
- uint32_t execWidth = p->curr.execWidth;
+ const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- /* clear all the bit in f0.0. */
- p->curr.execWidth = 1;
- p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000));
- /* clear the barrier mask bits to all zero0*/
- p->curr.noMask = 0;
- p->curr.useFlag(0, 0);
- p->curr.execWidth = execWidth;
- /* set all the active lane to 1. Inactive lane remains 0. */
- p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
- p->curr.noMask = 1;
- p->curr.execWidth = 1;
- p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
- p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF));
- p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
+ p->curr.noMask = 0;
+ p->MOV(blockip, GenRegister::immuw(0));
p->pop();
}
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
const uint32_t perThreadSize = perLaneSize * this->simdWidth;
- //const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
GBE_ASSERT(perLaneSize > 0);
GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
for (int i = 0; i < w / 8; i ++) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
p->curr.accWrEnable = 1;
p->MACH(tmp, src0, src1);
int execWidth = p->curr.execWidth;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
for (int nib = 0; nib < execWidth / 4; nib ++) {
p->AND(dest, src.bottom_half(), GenRegister::immud(63));
void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
p->SHR(sign, high, GenRegister::immud(31));
p->push();
+ p->curr.noMask = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->OR(a, e, f);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, a, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
I64FullMult(e, f, g, h, a, b, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NZ, i, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->push();
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_NZ, e, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(b, one);
case SEL_OP_I64SHL:
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
case SEL_OP_I64SHR:
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
f.type = GEN_TYPE_D;
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
collectShifter(a, y);
loadBottomHalf(e, x);
loadTopHalf(f, x);
p->push();
p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32)); //high == 0
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst, low);
p->pop();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32)); //exp>23 && high!=0
p->pop();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0
p->SHL(high, low, tmp);
p->MOV(low, GenRegister::immud(0));
- p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2);
+ p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2);
p->curr.predicate = GEN_PREDICATE_NONE;
p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31)); //update dst where high != 0
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000));
p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
- p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2);
+ p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2);
p->pop();
p->MOV(tmp_high, high);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
p->curr.predicate = GEN_PREDICATE_NORMAL;
UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
p->curr.predicate = GEN_PREDICATE_NORMAL;
if(dst.is_signed_int()) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr());
p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
f1.width = GEN_WIDTH_1;
GenRegister f2 = GenRegister::suboffset(f1, 1);
GenRegister f3 = GenRegister::suboffset(f1, 2);
- GenRegister f4 = GenRegister::suboffset(f1, 3);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
- saveFlag(f4, flag, subFlag);
+ p->curr.noMask = 1;
loadTopHalf(tmp0, src0);
loadTopHalf(tmp1, src1);
switch(insn.extra.function) {
NOT_IMPLEMENTED;
}
p->curr.execWidth = 1;
- p->AND(f1, f1, f4);
p->MOV(GenRegister::flag(flag, subFlag), f1);
p->pop();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MOV(dst, GenRegister::immd(0));
+ p->curr.noMask = 0;
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dst, GenRegister::immd(-1));
p->pop();
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
p->MOV(a, GenRegister::immud(0x80000000u));
p->MOV(b, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
p->ADD(c, c, d);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
if(! dst.is_signed_int()) {
p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
src = src.top_half();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
int execWidth = p->curr.execWidth;
dest = dest.top_half();
p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->curr.nibControl = 1;
src = src.bottom_half();
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.execWidth = 8;
p->MOV(dest, src);
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
dest = dest.bottom_half();
p->push();
p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
p->MOV(dest, src);
p->curr.nibControl = 1;
p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
loadBottomHalf(d, y);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c);
I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d);
p->ADD(e, e, f);
// condition <- (c,d)==0 && (a,b)>=(e,f)
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->MOV(l, zero);
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_EQ, a, e);
p->ADD(m, m, one);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
p->curr.noMask = 1;
p->AND(flagReg, flagReg, emaskReg);
- p->curr.predicate = GEN_PREDICATE_NORMAL;
// under condition, jump back to start point
if (simdWidth == 8)
p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else
NOT_IMPLEMENTED;
int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
+ p->curr.noMask = 1;
p->JMPI(zero);
- p->patchJMPI(p->n_instruction()-2, jip);
+ p->patchJMPI(p->n_instruction() - 2, jip + 2);
p->pop();
// end of loop
}
if(x.is_signed_int()) {
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
p->curr.predicate = GEN_PREDICATE_NORMAL;
}
void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
- NOT_IMPLEMENTED;
+ p->NOP();
}
void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
const GenRegister fenceDst = ra->genReg(insn.dst(0));
uint32_t barrierType = insn.extra.barrierType;
const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
- GenRegister blockIP;
- uint32_t exeWidth = p->curr.execWidth;
- ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex();
-
- if (exeWidth == 16)
- blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
- else if (exeWidth == 8)
- blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
- p->push();
- /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions
- after the barrier, If there is any lane still remains zero. */
- p->MOV(blockIP, GenRegister::immuw(0xFFFF));
- p->curr.noMask = 1;
- p->curr.execWidth = 1;
- this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
- if (exeWidth == 16)
- p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
- else if (exeWidth == 8)
- p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
- else
- NOT_IMPLEMENTED;
- p->curr.inversePredicate = 1;
- // If not all channel is set to 1, the barrier is still waiting for other lanes to complete,
- // jump to next basic block.
- p->JMPI(GenRegister::immud(0));
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
- p->pop();
-
- p->push();
- p->curr.useFlag(0, 0);
- /* Restore the blockIP to current label. */
- p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
if (barrierType == ir::syncGlobalBarrier) {
p->FENCE(fenceDst);
p->MOV(fenceDst, fenceDst);
}
- p->curr.predicate = GEN_PREDICATE_NONE;
- // As only the payload.2 is used and all the other regions are ignored
- // SIMD8 mode here is safe.
- p->curr.execWidth = 8;
- p->curr.physicalFlag = 0;
- p->curr.noMask = 1;
- // Copy barrier id from r0.
- p->AND(src, barrierId, GenRegister::immud(0x0f000000));
- // A barrier is OK to start the thread synchronization *and* SLM fence
- p->BARRIER(src);
- // Now we wait for the other threads
- p->curr.execWidth = 1;
- p->WAIT();
- // we executed the barrier then restore the barrier soft mask to initial value.
- p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
+ p->push();
+ // As only the payload.2 is used and all the other regions are ignored
+ // SIMD8 mode here is safe.
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ // Copy barrier id from r0.
+ p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+ // A barrier is OK to start the thread synchronization *and* SLM fence
+ p->BARRIER(src);
+ p->curr.execWidth = 1;
+ // Now we wait for the other threads
+ p->WAIT();
p->pop();
}
uint32_t end_of_thread:1;
} gen7_msg_gw;
+ struct {
+ uint32_t jip:16;
+ uint32_t uip:16;
+ } gen7_branch;
+
int d;
uint32_t ud;
float f;
GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
push();
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
curr.execWidth = 1;
MOV(r, GenRegister::immud(u.u[1]));
MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
push();
curr.execWidth = 8;
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
MOV(r0, src0);
MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
curr.predicate = GEN_PREDICATE_NORMAL;
push();
curr.execWidth = 8;
curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
MOV(r0, GenRegister::suboffset(src0, 8));
MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
curr.predicate = GEN_PREDICATE_NORMAL;
#define ALU2_BRA(OP) \
void GenEncoder::OP(GenRegister src) { \
- alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(), src); \
+ alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \
}
ALU2_BRA(IF)
GBE_ASSERT(insnID < this->store.size());
GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
insn.header.opcode == GEN_OPCODE_BRD ||
- insn.header.opcode == GEN_OPCODE_ENDIF);
- if ( jumpDistance > -32769 && jumpDistance < 32768 ) {
- this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ insn.header.opcode == GEN_OPCODE_ENDIF ||
+ insn.header.opcode == GEN_OPCODE_IF ||
+ insn.header.opcode == GEN_OPCODE_BRC);
+
+ if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768)) {
+ int offset = 0;
+ if (insn.header.opcode == GEN_OPCODE_IF) {
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ return;
+ }
+ else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+ offset = -2;
+ /*assert(jumpDistance > -32769 && jumpDistance < 32768);*/
+ }
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset));
} else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
// For the conditional jump distance out of S15 range, we need to use an
// inverted jmp followed by a add ip, ip, distance to implement.
// for all the branching instruction. And need to adjust the distance
// for those branch instruction's start point and end point contains
// this instruction.
+ GenInstruction &insn2 = this->store[insnID+1];
+ GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
insn.header.opcode = GEN_OPCODE_ADD;
this->setDst(&insn, GenRegister::ip());
this->setSrc0(&insn, GenRegister::ip());
- this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8));
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
} else {
insn.header.predicate_inverse ^= 1;
this->setSrc1(&insn, GenRegister::immd(2));
insn2.header.opcode = GEN_OPCODE_ADD;
this->setDst(&insn2, GenRegister::ip());
this->setSrc0(&insn2, GenRegister::ip());
- this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8));
+ this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
}
}
*
* Also, there is some extra kludge to handle the predicates for JMPI.
*
- * See TODO for a better idea for branching and masking
- *
* TODO:
* =====
*
* interesting approach which consists in traversing the dominator tree in post
* order
*
- * About masking and branching, a much better idea (that I found later unfortunately)
- * is to replace the use of the flag by uses of if/endif to enclose the basic
- * block. So, instead of using predication, we use auto-masking. The very cool
- * consequence is that we can reintegrate back the structured branches.
- * Basically, we will be able to identify branches that can be mapped to
- * structured branches and mix nicely unstructured branches (which will use
- * jpmi, if/endif to mask the blocks) and structured branches (which are pretty
- * fast)
+ * We already use if/endif to enclose each basic block. We will continue to
+ * identify the blocks that can be mapped to structured branches and handle
+ * them entirely with structured branch instructions.
*/
#include "backend/gen_insn_selection.hpp"
INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
/*! indicate whether a register is a scalar/uniform register. */
INLINE bool isScalarReg(const ir::Register ®) const {
-#if 0
- printf("reg %d ", reg.value());
- printf("uniform: %d ", getRegisterData(reg).isUniform());
- if (ctx.getFunction().getArg(reg) != NULL) { printf("true function arg\n"); return true; }
- if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push location.\n"); return true; }
- if (reg == ir::ocl::groupid0 ||
- reg == ir::ocl::groupid1 ||
- reg == ir::ocl::groupid2 ||
- reg == ir::ocl::barrierid ||
- reg == ir::ocl::threadn ||
- reg == ir::ocl::numgroup0 ||
- reg == ir::ocl::numgroup1 ||
- reg == ir::ocl::numgroup2 ||
- reg == ir::ocl::lsize0 ||
- reg == ir::ocl::lsize1 ||
- reg == ir::ocl::lsize2 ||
- reg == ir::ocl::gsize0 ||
- reg == ir::ocl::gsize1 ||
- reg == ir::ocl::gsize2 ||
- reg == ir::ocl::goffset0 ||
- reg == ir::ocl::goffset1 ||
- reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim ||
- reg == ir::ocl::emask ||
- reg == ir::ocl::notemask ||
- reg == ir::ocl::barriermask
- ) {
- printf("special reg.\n");
- return true;
- }
- return false;
-#endif
const ir::RegisterData ®Data = getRegisterData(reg);
return regData.isUniform();
}
}
void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
insn->src(0) = src;
insn->index = uint16_t(jip);
}
for (uint32_t regID = 0; regID < this->regNum; ++regID)
this->regDAG[regID] = NULL;
+ this->block->hasBarrier = false;
+ this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
+ bb.getLastInstruction()->getOpcode() == OP_RET;
+ if (!this->block->hasBranch)
+ this->block->endifOffset = -1;
+
// Build the DAG on the fly
uint32_t insnNum = 0;
const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+ if (insn.getOpcode() == OP_SYNC)
+ this->block->hasBarrier = true;
// Build a selectionDAG node for instruction
SelectionDAG *dag = this->newSelectionDAG(insn);
void Selection::Opaque::matchBasicBlock(uint32_t insnNum)
{
// Bottom up code generation
+ bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
// Process all possible patterns for this instruction
SelectionDAG &dag = *insnDAG[insnID];
// Start a new code fragment
this->startBackwardGeneration();
+ // If there is no branch at the end of this block, we must still close it with an ENDIF.
// Try all the patterns from best to worst
+
do {
if ((*it)->emit(*this, dag))
break;
} while (it != end);
GBE_ASSERT(it != end);
+ if (needEndif) {
+ const ir::BasicBlock *curr = insn.getParent();
+ const ir::BasicBlock *next = curr->getNextBlock();
+ this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+ needEndif = false;
+ }
+
// Output the code in the current basic block
this->endBackwardGeneration();
}
const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
sel.push();
+ sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.execWidth = simdWidth;
sel.SEL_CMP(genCmp, tmp, src0, src1);
const Type type = insn.getType();
const Immediate imm = insn.getImmediate();
const GenRegister dst = sel.selReg(insn.getDst(0), type);
- GenRegister flagReg;
sel.push();
if (sel.isScalarOrBool(insn.getDst(0)) == true) {
{
using namespace ir;
const ir::Register reg = sel.reg(FAMILY_DWORD);
- const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
const uint32_t params = insn.getParameters();
- sel.push();
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
- sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask);
- sel.MOV(GenRegister::flag(1, 1), barrierMask);
- sel.pop();
-
// A barrier is OK to start the thread synchronization *and* SLM fence
- sel.push();
- //sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.flag = 1;
- sel.curr.subFlag = 1;
- sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
- sel.pop();
+ sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
return true;
}
GenRegister tmpDst;
if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)
- tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL);
+ tmpDst = sel.selReg(dst, TYPE_BOOL);
else
tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32);
sel.push();
sel.curr.flag = 1;
sel.curr.subFlag = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
if (type == TYPE_S64 || type == TYPE_U64) {
GenRegister tmp[3];
for(int i=0; i<3; i++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
- sel.push();
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
- sel.pop();
- sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst);
} else if(opcode == OP_ORD) {
sel.push();
- sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0));
+ sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
sel.pop();
- sel.curr.predicate = GEN_PREDICATE_NORMAL;
-
- sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
- sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
} else
sel.CMP(getGenCompare(opcode), src0, src1, tmpDst);
sel.pop();
if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16))
sel.MOV(sel.selReg(dst, TYPE_U16), GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg));
- else
- sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst);
return true;
}
};
markAllChildren(dag);
}
- // Since we cannot predicate the select instruction with our current mask,
- // we need to perform the selection in two steps (one to select, one to
- // update the destination register)
- const RegisterFamily family = getFamily(type);
- const GenRegister tmp = sel.selReg(sel.reg(family), type);
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Register pred = insn.getPredicate();
sel.push();
sel.curr.flag = 1;
sel.curr.subFlag = 1;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
- sel.curr.noMask = 0;
+ //sel.curr.noMask = 0;
sel.curr.predicate = GEN_PREDICATE_NORMAL;
if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
- sel.SEL_INT64(tmp, src0, src1);
+ sel.SEL_INT64(dst, src0, src1);
else
- sel.SEL(tmp, src0, src1);
+ sel.SEL(dst, src0, src1);
sel.pop();
- // Update the destination register properly now
- sel.MOV(dst, tmp);
return true;
}
};
DECL_CTOR(TernaryInstruction, 1, 1);
};
+
/*! Label instruction pattern */
DECL_PATTERN(LabelInstruction)
{
const uint32_t simdWidth = sel.ctx.getSimdWidth();
sel.LABEL(label);
- // Do not emit any code for the "returning" block. There is no need for it
- if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
+ // Do not emit any code for the "returning" block. There is no need for it
+ if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
return true;
+ LabelIndex jip;
+ const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex();
+ if (sel.ctx.hasJIP(&insn))
+ jip = sel.ctx.getLabelIndex(&insn);
+ else
+ jip = nextLabel;
+
// Emit the mask computation at the head of each basic block
sel.push();
+ sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.flag = 0;
- sel.curr.subFlag = 0;
sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1);
sel.pop();
- // If it is required, insert a JUMP to bypass the block
- if (sel.ctx.hasJIP(&insn)) {
- const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+ if (sel.block->hasBarrier) {
+ // If this block has barrier, we don't execute the block until all lanes
+ // are 1s. Set each reached lane to 1, then check all lanes. If there is any
+ // lane not reached, we jump to jip. And no need to issue if/endif for
+ // this block, as it will always execute with all lanes activated.
sel.push();
-
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
sel.curr.predicate = GEN_PREDICATE_NONE;
- GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
- GenRegister flagReg = GenRegister::flag(0, 0);
- sel.AND(flagReg, flagReg, emaskReg);
-
+ sel.curr.noMask = 1;
+ sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
if (simdWidth == 8)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
else
NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
sel.curr.inversePredicate = 1;
- sel.curr.flag = 0;
- sel.curr.subFlag = 0;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ // FIXME: if the last BRA is an unconditional jump, we don't need to update the label here.
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
+ sel.pop();
+ }
+ else {
+ if (sel.ctx.hasJIP(&insn)) {
+ // If it is required, insert a JUMP to bypass the block
+ sel.push();
+ if (simdWidth == 8)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else if (simdWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else
+ NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.inversePredicate = 1;
+ sel.JMPI(GenRegister::immd(0), jip);
+ sel.pop();
+ }
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ // It's easier to set the jip to a relative position over the next block.
+ sel.IF(GenRegister::immd(0), nextLabel, nextLabel, sel.block->endifOffset, sel.block->endifOffset);
+ sel.pop();
}
+
return true;
}
DECL_CTOR(LabelInstruction, 1, 1);
/*! Branch instruction pattern */
DECL_PATTERN(BranchInstruction)
{
-
void emitForwardBranch(Selection::Opaque &sel,
const ir::BranchInstruction &insn,
ir::LabelIndex dst,
{
using namespace ir;
const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
- const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
// We will not emit any jump if we must go the next block anyway
const BasicBlock *curr = insn.getParent();
const BasicBlock *next = curr->getNextBlock();
const LabelIndex nextLabel = next->getLabelIndex();
-
if (insn.isPredicated() == true) {
const Register pred = insn.getPredicateIndex();
-
sel.push();
// we don't need to set next label to the pcip
// as if there is no backward jump latter, then obviously everything will work fine.
sel.curr.flag = 0;
sel.curr.subFlag = 0;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
sel.pop();
-
- if (nextLabel == jip) return;
} else {
// Update the PcIPs
+ const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
- // Do not emit branch when we go to the next block anyway
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
if (nextLabel == jip) return;
+ // Branch to the jump target
sel.push();
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ // FIXME: this adjustment only keeps the endif offset correct;
+ // a JMPI still occupies 2 instructions.
+ sel.block->endifOffset -= 2;
}
}
// that actually take the branch
const LabelIndex next = bb.getNextBlock()->getLabelIndex();
sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
-
+ GBE_ASSERT(jip == dst);
sel.push();
sel.curr.flag = 0;
sel.curr.subFlag = 0;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0));
- // Re-update the PcIPs for the branches that takes the backward jump
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
- // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
- // will check those bits as well.
sel.curr.predicate = GEN_PREDICATE_NONE;
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
sel.curr.execWidth = 1;
- sel.curr.noMask = 1;
- GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
- sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1), emaskReg);
-
- // Branch to the jump target
- if (simdWidth == 8)
- sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
- else if (simdWidth == 16)
+ if (simdWidth == 16)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
else
- NOT_SUPPORTED;
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ sel.curr.noMask = 1;
sel.JMPI(GenRegister::immd(0), jip);
+ sel.block->endifOffset = -3;
sel.pop();
-
} else {
-
+ const LabelIndex next = bb.getNextBlock()->getLabelIndex();
// Update the PcIPs
sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
// Branch to the jump target
sel.push();
sel.curr.execWidth = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.JMPI(GenRegister::immd(0), jip);
sel.pop();
+ sel.block->endifOffset = -3;
}
}
/*! Translate IR compare to Gen compare */
uint32_t getGenCompare(ir::Opcode opcode);
+ #define GEN_MAX_LABEL 0xFFFF
+
/*! Selection opcodes properly encoded from 0 to n for fast jump tables
* generations
*/
void append(SelectionInstruction *insn);
/*! Append a new selection instruction at the beginning of the block */
void prepend(SelectionInstruction *insn);
+ int endifOffset;
+ bool hasBarrier;
+ bool hasBranch;
};
/*! Owns the selection engine */
DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
-DECL_SELECTION_IR(BRC, BinaryInstruction)
+DECL_SELECTION_IR(BRC, UnaryInstruction)
DECL_SELECTION_IR(BRD, UnaryInstruction)
DECL_SELECTION_IR(IF, UnaryInstruction)
DECL_SELECTION_IR(ENDIF, UnaryInstruction)
this->noMask = 0;
this->flag = 0;
this->subFlag = 0;
- this->predicate = GEN_PREDICATE_NORMAL;
+ this->predicate = GEN_PREDICATE_NONE;
this->inversePredicate = 0;
this->physicalFlag = 1;
this->flagIndex = 0;
GEN_HORIZONTAL_STRIDE_1);
}
+ static INLINE GenRegister nullud(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_NULL,
+ 0,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+
static INLINE bool isNull(GenRegister reg) {
return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
&& reg.nr == GEN_ARF_NULL);
{
driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
assert(driver->bufmgr);
+ //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
}
memset(desc, 0, sizeof(*desc));
ker_bo = (drm_intel_bo *) kernel->bo;
desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
- desc->desc1.single_program_flow = 1;
+ desc->desc1.single_program_flow = 0;
desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
desc->desc5.rounding_mode = 0; /* round to nearest even */