From 4850c558cbec9234250a6e81601dd70259e10f74 Mon Sep 17 00:00:00 2001 From: Zhigang Gong Date: Tue, 18 Mar 2014 15:28:44 +0800 Subject: [PATCH] GBE: Disable SPF and use JMPI + IF/ENDIF to handle each block. When SPF (single program flow) is enabled, we always need to use f0 as the predicate of almost every instruction. This brings some trouble when we want to get a two-level mask mechanism, for example with the SEL instruction and some BOOL operations. We have to use more than one instruction to do that, which simply introduces 100% overhead for those instructions. v2: fix the wrong assertion. Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" Reviewed-by: "Song, Ruiling" --- backend/src/backend/gen/gen_mesa_disasm.c | 31 ++-- backend/src/backend/gen_context.cpp | 141 ++++++++--------- backend/src/backend/gen_defs.hpp | 5 + backend/src/backend/gen_encoder.cpp | 29 +++- backend/src/backend/gen_insn_selection.cpp | 237 +++++++++++++---------------- backend/src/backend/gen_insn_selection.hpp | 5 + backend/src/backend/gen_insn_selection.hxx | 2 +- backend/src/backend/gen_register.hpp | 13 +- src/intel/intel_driver.c | 1 + src/intel/intel_gpgpu.c | 2 +- 10 files changed, 236 insertions(+), 230 deletions(-) diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index 84ef0c8..e58ef31 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -100,13 +100,13 @@ static const struct { [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 }, [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 }, - [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 }, - [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 }, - [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, - [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, + [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 }, + [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 }, [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, @@ -1126,17 +1126,18 @@ int gen_disasm (FILE *file, const void *opaque_insn) } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF || inst->header.opcode == GEN_OPCODE_ELSE || inst->header.opcode == GEN_OPCODE_ENDIF || - inst->header.opcode == GEN_OPCODE_WHILE)) { - // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count); - assert(0); + inst->header.opcode == GEN_OPCODE_WHILE || + inst->header.opcode == GEN_OPCODE_BRD || + inst->header.opcode == GEN_OPCODE_JMPI)) { + format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip); } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK || inst->header.opcode == GEN_OPCODE_CONTINUE || - inst->header.opcode == GEN_OPCODE_HALT)) { - // XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
- assert(0); - } else if (inst->header.opcode == GEN_OPCODE_JMPI) { + inst->header.opcode == GEN_OPCODE_HALT || + inst->header.opcode == GEN_OPCODE_BRC)) { + format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip); + }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) { format (file, " %d", inst->bits3.d); - } + }*/ if (opcode[inst->header.opcode].nsrc > 0) { pad (file, 32); diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index c46127a..19bc9d2 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -87,33 +87,29 @@ namespace gbe const LabelIndex label = pair.first; const int32_t insnID = pair.second; const int32_t targetID = labelPos.find(label)->second; - p->patchJMPI(insnID, (targetID-insnID-1) * 2); + p->patchJMPI(insnID, (targetID - insnID) * 2); + } + for (auto pair : branchPos3) { + const LabelPair labelPair = pair.first; + const int32_t insnID = pair.second; + const int32_t jip = labelPos.find(labelPair.l0)->second + labelPair.offset0; + const int32_t uip = labelPos.find(labelPair.l1)->second + labelPair.offset1; + assert((jip - insnID) * 2 < 32767 && (jip - insnID) * 2 > -32768); + assert((uip - insnID) * 2 < 32767 && (uip - insnID) * 2 > -32768); + p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * 2)); } } void GenContext::clearFlagRegister(void) { // when group size not aligned to simdWidth, flag register need clear to // make prediction(any8/16h) work correctly - const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask)); - const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)); - uint32_t execWidth = p->curr.execWidth; + const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); p->push(); - p->curr.predicate = GEN_PREDICATE_NONE; - p->curr.noMask = 1; - /* clear all the bit in f0.0. */ - p->curr.execWidth = 1; - p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000)); - /* clear the barrier mask bits to all zero0*/ - p->curr.noMask = 0; - p->curr.useFlag(0, 0); - p->curr.execWidth = execWidth; - /* set all the active lane to 1. Inactive lane remains 0. 
*/ - p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0)); - p->curr.noMask = 1; - p->curr.execWidth = 1; - p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW)); - p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF)); - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg); + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NONE; + p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL)); + p->curr.noMask = 0; + p->MOV(blockip, GenRegister::immuw(0)); p->pop(); } @@ -148,7 +144,6 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); const uint32_t perThreadSize = perLaneSize * this->simdWidth; - //const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER); GBE_ASSERT(perLaneSize > 0); GBE_ASSERT(isPowerOf<2>(perLaneSize) == true); GBE_ASSERT(isPowerOf<2>(perThreadSize) == true); @@ -325,6 +320,7 @@ namespace gbe for (int i = 0; i < w / 8; i ++) { p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1); p->curr.accWrEnable = 1; p->MACH(tmp, src0, src1); @@ -500,6 +496,7 @@ namespace gbe int execWidth = p->curr.execWidth; p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.execWidth = 8; for (int nib = 0; nib < execWidth / 4; nib ++) { p->AND(dest, src.bottom_half(), GenRegister::immud(63)); @@ -539,6 +536,7 @@ namespace gbe void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) { p->SHR(sign, high, GenRegister::immud(31)); p->push(); + p->curr.noMask = 1; p->curr.predicate = GEN_PREDICATE_NONE; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0)); @@ -574,6 +572,7 @@ namespace gbe I64FullMult(e, f, g, h, a, b, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -626,6 +625,7 @@ namespace gbe p->OR(a, e, f); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, a, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -639,6 +639,7 @@ namespace gbe I64FullMult(e, f, g, h, a, b, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NZ, i, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -670,6 +671,7 @@ namespace gbe p->push(); p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_NZ, e, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(b, one); @@ -793,6 +795,7 @@ namespace gbe case SEL_OP_I64SHL: p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; collectShifter(a, y); loadBottomHalf(e, x); loadTopHalf(f, x); @@ -820,6 +823,7 @@ namespace gbe case SEL_OP_I64SHR: p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; collectShifter(a, y); loadBottomHalf(e, x); loadTopHalf(f, x); @@ -848,6 +852,7 @@ namespace gbe f.type = GEN_TYPE_D; p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; collectShifter(a, y); 
loadBottomHalf(e, x); loadTopHalf(f, x); @@ -894,6 +899,7 @@ namespace gbe p->push(); p->curr.useFlag(flag.flag_nr(), flag.flag_subnr()); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32)); //high == 0 p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dst, low); @@ -911,6 +917,7 @@ namespace gbe p->pop(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23)); p->curr.predicate = GEN_PREDICATE_NORMAL; p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32)); //exp>23 && high!=0 @@ -936,6 +943,7 @@ namespace gbe p->pop(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23)); p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0 @@ -956,7 +964,7 @@ namespace gbe p->SHL(high, low, tmp); p->MOV(low, GenRegister::immud(0)); - p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2); + p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2); p->curr.predicate = GEN_PREDICATE_NONE; p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31)); //update dst where high != 0 p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -970,7 +978,7 @@ namespace gbe p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000)); p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0)); p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe)); - p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2); + p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2); p->pop(); @@ -994,6 +1002,7 @@ namespace gbe p->MOV(tmp_high, high); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(f0.flag_nr(), f0.flag_subnr()); p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1006,6 +1015,7 @@ namespace gbe UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(f0.flag_nr(), f0.flag_subnr()); p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1039,6 +1049,7 @@ namespace gbe if(dst.is_signed_int()) { p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr()); p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0)); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1066,11 +1077,10 @@ namespace gbe f1.width = GEN_WIDTH_1; GenRegister f2 = GenRegister::suboffset(f1, 1); GenRegister f3 = GenRegister::suboffset(f1, 2); - GenRegister f4 = GenRegister::suboffset(f1, 3); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; - saveFlag(f4, flag, subFlag); + p->curr.noMask = 1; loadTopHalf(tmp0, src0); loadTopHalf(tmp1, src1); switch(insn.extra.function) { @@ -1130,12 +1140,13 @@ namespace gbe NOT_IMPLEMENTED; } p->curr.execWidth = 1; - p->AND(f1, f1, f4); p->MOV(GenRegister::flag(flag, subFlag), f1); p->pop(); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->MOV(dst, GenRegister::immd(0)); + p->curr.noMask = 0; p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dst, GenRegister::immd(-1)); p->pop(); @@ -1163,6 +1174,7 @@ namespace gbe p->ADD(c, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); if(! 
dst.is_signed_int()) { p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); @@ -1176,6 +1188,7 @@ namespace gbe p->MOV(a, GenRegister::immud(0x80000000u)); p->MOV(b, GenRegister::immud(0)); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0)); p->curr.predicate = GEN_PREDICATE_NORMAL; p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u)); @@ -1209,6 +1222,7 @@ namespace gbe p->ADD(c, c, d); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); if(! dst.is_signed_int()) { p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); @@ -1238,6 +1252,7 @@ namespace gbe src = src.top_half(); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.execWidth = 8; p->MOV(dest, src); p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1252,6 +1267,7 @@ namespace gbe int execWidth = p->curr.execWidth; dest = dest.top_half(); p->push(); + p->curr.predicate = GEN_PREDICATE_NORMAL; p->curr.execWidth = 8; p->MOV(dest, src); p->curr.nibControl = 1; @@ -1271,6 +1287,7 @@ namespace gbe src = src.bottom_half(); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.execWidth = 8; p->MOV(dest, src); p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1286,6 +1303,7 @@ namespace gbe dest = dest.bottom_half(); p->push(); p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NORMAL; p->MOV(dest, src); p->curr.nibControl = 1; p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1369,6 +1387,7 @@ namespace gbe loadBottomHalf(d, y); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c); I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d); p->ADD(e, e, f); @@ -1443,6 +1462,7 @@ namespace gbe // condition <- (c,d)==0 && (a,b)>=(e,f) p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->MOV(l, zero); p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_EQ, a, e); @@ -1477,6 +1497,7 @@ namespace gbe p->ADD(m, m, one); p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64)); @@ -1484,7 +1505,6 @@ namespace gbe p->curr.noMask = 1; p->AND(flagReg, flagReg, emaskReg); - p->curr.predicate = GEN_PREDICATE_NORMAL; // under condition, jump back to start point if (simdWidth == 8) p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; @@ -1493,8 +1513,9 @@ namespace gbe else NOT_IMPLEMENTED; int jip = -(int)(p->n_instruction() - loop_start + 1) * 2; + p->curr.noMask = 1; p->JMPI(zero); - p->patchJMPI(p->n_instruction()-2, jip); + p->patchJMPI(p->n_instruction() - 2, jip + 2); p->pop(); // end of loop } @@ -1502,6 +1523,7 @@ namespace gbe if(x.is_signed_int()) { p->push(); p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); p->CMP(GEN_CONDITIONAL_NEQ, k, zero); p->curr.predicate = GEN_PREDICATE_NORMAL; @@ -1534,7 +1556,7 @@ namespace gbe } void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) { - NOT_IMPLEMENTED; + p->NOP(); } void GenContext::emitWaitInstruction(const SelectionInstruction &insn) { @@ -1546,59 +1568,24 @@ namespace gbe const GenRegister fenceDst = ra->genReg(insn.dst(0)); 
uint32_t barrierType = insn.extra.barrierType; const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); - GenRegister blockIP; - uint32_t exeWidth = p->curr.execWidth; - ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex(); - - if (exeWidth == 16) - blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip)); - else if (exeWidth == 8) - blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); - p->push(); - /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions - after the barrier, If there is any lane still remains zero. */ - p->MOV(blockIP, GenRegister::immuw(0xFFFF)); - p->curr.noMask = 1; - p->curr.execWidth = 1; - this->branchPos2.push_back(std::make_pair(label, p->n_instruction())); - if (exeWidth == 16) - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; - else if (exeWidth == 8) - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; - else - NOT_IMPLEMENTED; - p->curr.inversePredicate = 1; - // If not all channel is set to 1, the barrier is still waiting for other lanes to complete, - // jump to next basic block. - p->JMPI(GenRegister::immud(0)); - p->curr.predicate = GEN_PREDICATE_NONE; - p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask))); - p->pop(); - - p->push(); - p->curr.useFlag(0, 0); - /* Restore the blockIP to current label. */ - p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex())); if (barrierType == ir::syncGlobalBarrier) { p->FENCE(fenceDst); p->MOV(fenceDst, fenceDst); } - p->curr.predicate = GEN_PREDICATE_NONE; - // As only the payload.2 is used and all the other regions are ignored - // SIMD8 mode here is safe. - p->curr.execWidth = 8; - p->curr.physicalFlag = 0; - p->curr.noMask = 1; - // Copy barrier id from r0. - p->AND(src, barrierId, GenRegister::immud(0x0f000000)); - // A barrier is OK to start the thread synchronization *and* SLM fence - p->BARRIER(src); - // Now we wait for the other threads - p->curr.execWidth = 1; - p->WAIT(); - // we executed the barrier then restore the barrier soft mask to initial value. - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask))); + p->push(); + // As only the payload.2 is used and all the other regions are ignored + // SIMD8 mode here is safe. + p->curr.execWidth = 8; + p->curr.physicalFlag = 0; + p->curr.noMask = 1; + // Copy barrier id from r0. 
+ p->AND(src, barrierId, GenRegister::immud(0x0f000000)); + // A barrier is OK to start the thread synchronization *and* SLM fence + p->BARRIER(src); + p->curr.execWidth = 1; + // Now we wait for the other threads + p->WAIT(); p->pop(); } diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp index 7c49497..e731174 100644 --- a/backend/src/backend/gen_defs.hpp +++ b/backend/src/backend/gen_defs.hpp @@ -896,6 +896,11 @@ struct GenInstruction uint32_t end_of_thread:1; } gen7_msg_gw; + struct { + uint32_t jip:16; + uint32_t uip:16; + } gen7_branch; + int d; uint32_t ud; float f; diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index fc7e53d..06aa769 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -837,6 +837,7 @@ namespace gbe GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD); push(); curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = 1; curr.execWidth = 1; MOV(r, GenRegister::immud(u.u[1])); MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0])); @@ -907,6 +908,7 @@ namespace gbe push(); curr.execWidth = 8; curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = 1; MOV(r0, src0); MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4)); curr.predicate = GEN_PREDICATE_NORMAL; @@ -920,6 +922,7 @@ namespace gbe push(); curr.execWidth = 8; curr.predicate = GEN_PREDICATE_NONE; + curr.noMask = 1; MOV(r0, GenRegister::suboffset(src0, 8)); MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12)); curr.predicate = GEN_PREDICATE_NORMAL; @@ -1058,7 +1061,7 @@ namespace gbe #define ALU2_BRA(OP) \ void GenEncoder::OP(GenRegister src) { \ - alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(), src); \ + alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \ } ALU2_BRA(IF) @@ -1071,9 +1074,21 @@ namespace gbe GBE_ASSERT(insnID < this->store.size()); GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI || insn.header.opcode == GEN_OPCODE_BRD || - insn.header.opcode == GEN_OPCODE_ENDIF); - if ( jumpDistance > -32769 && jumpDistance < 32768 ) { - this->setSrc1(&insn, GenRegister::immd(jumpDistance)); + insn.header.opcode == GEN_OPCODE_ENDIF || + insn.header.opcode == GEN_OPCODE_IF || + insn.header.opcode == GEN_OPCODE_BRC); + + if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768)) { + int offset = 0; + if (insn.header.opcode == GEN_OPCODE_IF) { + this->setSrc1(&insn, GenRegister::immd(jumpDistance)); + return; + } + else if (insn.header.opcode == GEN_OPCODE_JMPI) { + offset = -2; + /*assert(jumpDistance > -32769 && jumpDistance < 32768);*/ + } + this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset)); } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) { // For the conditional jump distance out of S15 range, we need to use an // inverted jmp followed by a add ip, ip, distance to implement. @@ -1085,10 +1100,12 @@ namespace gbe // for all the branching instruction. And need to adjust the distance // for those branch instruction's start point and end point contains // this instruction. 
+ GenInstruction &insn2 = this->store[insnID+1]; + GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP); insn.header.opcode = GEN_OPCODE_ADD; this->setDst(&insn, GenRegister::ip()); this->setSrc0(&insn, GenRegister::ip()); - this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8)); + this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8)); } else { insn.header.predicate_inverse ^= 1; this->setSrc1(&insn, GenRegister::immd(2)); @@ -1099,7 +1116,7 @@ namespace gbe insn2.header.opcode = GEN_OPCODE_ADD; this->setDst(&insn2, GenRegister::ip()); this->setSrc0(&insn2, GenRegister::ip()); - this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8)); + this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8)); } } diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index d86e04c..147c3e6 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -76,8 +76,6 @@ * * Also, there is some extra kludge to handle the predicates for JMPI. * - * See TODO for a better idea for branching and masking - * * TODO: * ===== * @@ -92,14 +90,9 @@ * interesting approach which consists in traversing the dominator tree in post * order * - * About masking and branching, a much better idea (that I found later unfortunately) - * is to replace the use of the flag by uses of if/endif to enclose the basic - * block. So, instead of using predication, we use auto-masking. The very cool - * consequence is that we can reintegrate back the structured branches. - * Basically, we will be able to identify branches that can be mapped to - * structured branches and mix nicely unstructured branches (which will use - * jpmi, if/endif to mask the blocks) and structured branches (which are pretty - * fast) + * We already use if/endif to enclose each basic block. We will continue to identify + * those blocks which could match to structured branching and use pure structured + * instruction to handle them completely. */ #include "backend/gen_insn_selection.hpp" @@ -320,38 +313,6 @@ namespace gbe INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool); /*! indicate whether a register is a scalar/uniform register. 
*/ INLINE bool isScalarReg(const ir::Register ®) const { -#if 0 - printf("reg %d ", reg.value()); - printf("uniform: %d ", getRegisterData(reg).isUniform()); - if (ctx.getFunction().getArg(reg) != NULL) { printf("true function arg\n"); return true; } - if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push location.\n"); return true; } - if (reg == ir::ocl::groupid0 || - reg == ir::ocl::groupid1 || - reg == ir::ocl::groupid2 || - reg == ir::ocl::barrierid || - reg == ir::ocl::threadn || - reg == ir::ocl::numgroup0 || - reg == ir::ocl::numgroup1 || - reg == ir::ocl::numgroup2 || - reg == ir::ocl::lsize0 || - reg == ir::ocl::lsize1 || - reg == ir::ocl::lsize2 || - reg == ir::ocl::gsize0 || - reg == ir::ocl::gsize1 || - reg == ir::ocl::gsize2 || - reg == ir::ocl::goffset0 || - reg == ir::ocl::goffset1 || - reg == ir::ocl::goffset2 || - reg == ir::ocl::workdim || - reg == ir::ocl::emask || - reg == ir::ocl::notemask || - reg == ir::ocl::barriermask - ) { - printf("special reg.\n"); - return true; - } - return false; -#endif const ir::RegisterData ®Data = getRegisterData(reg); return regData.isUniform(); } @@ -992,7 +953,7 @@ namespace gbe } void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1); + SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1); insn->src(0) = src; insn->index = uint16_t(jip); } @@ -1412,9 +1373,17 @@ namespace gbe for (uint32_t regID = 0; regID < this->regNum; ++regID) this->regDAG[regID] = NULL; + this->block->hasBarrier = false; + this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA || + bb.getLastInstruction()->getOpcode() == OP_RET; + if (!this->block->hasBranch) + this->block->endifOffset = -1; + // Build the DAG on the fly uint32_t insnNum = 0; const_cast(bb).foreach([&](const Instruction &insn) { + if (insn.getOpcode() == OP_SYNC) + this->block->hasBarrier = true; // Build a selectionDAG node for instruction SelectionDAG *dag = this->newSelectionDAG(insn); @@ -1465,6 +1434,7 @@ namespace gbe void Selection::Opaque::matchBasicBlock(uint32_t insnNum) { // Bottom up code generation + bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier; for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) { // Process all possible patterns for this instruction SelectionDAG &dag = *insnDAG[insnID]; @@ -1476,8 +1446,10 @@ namespace gbe // Start a new code fragment this->startBackwardGeneration(); + // If there is no branch at the end of this block. 
// Try all the patterns from best to worst + do { if ((*it)->emit(*this, dag)) break; @@ -1485,6 +1457,13 @@ namespace gbe } while (it != end); GBE_ASSERT(it != end); + if (needEndif) { + const ir::BasicBlock *curr = insn.getParent(); + const ir::BasicBlock *next = curr->getNextBlock(); + this->ENDIF(GenRegister::immd(0), next->getLabelIndex()); + needEndif = false; + } + // Output the code in the current basic block this->endBackwardGeneration(); } @@ -2133,6 +2112,7 @@ namespace gbe const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type); sel.push(); + sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.execWidth = simdWidth; sel.SEL_CMP(genCmp, tmp, src0, src1); @@ -2329,7 +2309,6 @@ namespace gbe const Type type = insn.getType(); const Immediate imm = insn.getImmediate(); const GenRegister dst = sel.selReg(insn.getDst(0), type); - GenRegister flagReg; sel.push(); if (sel.isScalarOrBool(insn.getDst(0)) == true) { @@ -2371,24 +2350,10 @@ namespace gbe { using namespace ir; const ir::Register reg = sel.reg(FAMILY_DWORD); - const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL); const uint32_t params = insn.getParameters(); - sel.push(); - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.noMask = 1; - sel.curr.execWidth = 1; - sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask); - sel.MOV(GenRegister::flag(1, 1), barrierMask); - sel.pop(); - // A barrier is OK to start the thread synchronization *and* SLM fence - sel.push(); - //sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.flag = 1; - sel.curr.subFlag = 1; - sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params); - sel.pop(); + sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params); return true; } @@ -2696,7 +2661,7 @@ namespace gbe GenRegister tmpDst; if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16) - tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL); + tmpDst = sel.selReg(dst, TYPE_BOOL); else tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32); @@ -2724,36 +2689,23 @@ namespace gbe sel.push(); sel.curr.flag = 1; sel.curr.subFlag = 1; - sel.curr.predicate = GEN_PREDICATE_NONE; if (type == TYPE_S64 || type == TYPE_U64) { GenRegister tmp[3]; for(int i=0; i<3; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); - sel.push(); - sel.curr.execWidth = 1; - sel.curr.noMask = 1; - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0)); - sel.pop(); - sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst); } else if(opcode == OP_ORD) { sel.push(); - sel.curr.execWidth = 1; - sel.curr.noMask = 1; - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0)); + sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); sel.pop(); - sel.curr.predicate = GEN_PREDICATE_NORMAL; - - sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); - sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); } else sel.CMP(getGenCompare(opcode), src0, src1, tmpDst); sel.pop(); if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)) sel.MOV(sel.selReg(dst, TYPE_U16), GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg)); - else - sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst); return true; } }; @@ -2979,11 +2931,6 @@ namespace gbe markAllChildren(dag); } - // Since we cannot predicate the select instruction with our current mask, - // we need to perform the selection in two steps (one to select, one to - // update the 
destination register) - const RegisterFamily family = getFamily(type); - const GenRegister tmp = sel.selReg(sel.reg(family), type); const uint32_t simdWidth = sel.ctx.getSimdWidth(); const Register pred = insn.getPredicate(); sel.push(); @@ -2992,16 +2939,14 @@ namespace gbe sel.curr.flag = 1; sel.curr.subFlag = 1; sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); - sel.curr.noMask = 0; + //sel.curr.noMask = 0; sel.curr.predicate = GEN_PREDICATE_NORMAL; if(type == ir::TYPE_S64 || type == ir::TYPE_U64) - sel.SEL_INT64(tmp, src0, src1); + sel.SEL_INT64(dst, src0, src1); else - sel.SEL(tmp, src0, src1); + sel.SEL(dst, src0, src1); sel.pop(); - // Update the destination register properly now - sel.MOV(dst, tmp); return true; } }; @@ -3041,6 +2986,7 @@ namespace gbe DECL_CTOR(TernaryInstruction, 1, 1); }; + /*! Label instruction pattern */ DECL_PATTERN(LabelInstruction) { @@ -3053,42 +2999,75 @@ namespace gbe const uint32_t simdWidth = sel.ctx.getSimdWidth(); sel.LABEL(label); - // Do not emit any code for the "returning" block. There is no need for it - if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock()) + // Do not emit any code for the "returning" block. There is no need for it + if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock()) return true; + LabelIndex jip; + const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex(); + if (sel.ctx.hasJIP(&insn)) + jip = sel.ctx.getLabelIndex(&insn); + else + jip = nextLabel; + // Emit the mask computation at the head of each basic block sel.push(); + sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.flag = 0; - sel.curr.subFlag = 0; sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1); sel.pop(); - // If it is required, insert a JUMP to bypass the block - if (sel.ctx.hasJIP(&insn)) { - const LabelIndex jip = sel.ctx.getLabelIndex(&insn); + if (sel.block->hasBarrier) { + // If this block has barrier, we don't execute the block until all lanes + // are 1s. Set each reached lane to 1, then check all lanes. If there is any + // lane not reached, we jump to jip. And no need to issue if/endif for + // this block, as it will always excute with all lanes activated. sel.push(); - - sel.curr.noMask = 1; - sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL)); sel.curr.predicate = GEN_PREDICATE_NONE; - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); - GenRegister flagReg = GenRegister::flag(0, 0); - sel.AND(flagReg, flagReg, emaskReg); - + sel.curr.noMask = 1; + sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL)); if (simdWidth == 8) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; else if (simdWidth == 16) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; else NOT_IMPLEMENTED; + sel.curr.noMask = 1; + sel.curr.execWidth = 1; sel.curr.inversePredicate = 1; - sel.curr.flag = 0; - sel.curr.subFlag = 0; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); + // FIXME, if the last BRA is unconditional jump, we don't need to update the label here. 
+ sel.push(); + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label)); + sel.pop(); + } + else { + if (sel.ctx.hasJIP(&insn)) { + // If it is required, insert a JUMP to bypass the block + sel.push(); + if (simdWidth == 8) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; + else if (simdWidth == 16) + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; + else + NOT_IMPLEMENTED; + sel.curr.noMask = 1; + sel.curr.execWidth = 1; + sel.curr.inversePredicate = 1; + sel.JMPI(GenRegister::immd(0), jip); + sel.pop(); + } + sel.push(); + sel.curr.predicate = GEN_PREDICATE_NORMAL; + // It's easier to set the jip to a relative position over next block. + sel.IF(GenRegister::immd(0), nextLabel, nextLabel, sel.block->endifOffset, sel.block->endifOffset); + sel.pop(); } + return true; } DECL_CTOR(LabelInstruction, 1, 1); @@ -3225,7 +3204,6 @@ namespace gbe /*! Branch instruction pattern */ DECL_PATTERN(BranchInstruction) { - void emitForwardBranch(Selection::Opaque &sel, const ir::BranchInstruction &insn, ir::LabelIndex dst, @@ -3233,16 +3211,13 @@ namespace gbe { using namespace ir; const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16); - const LabelIndex jip = sel.ctx.getLabelIndex(&insn); // We will not emit any jump if we must go the next block anyway const BasicBlock *curr = insn.getParent(); const BasicBlock *next = curr->getNextBlock(); const LabelIndex nextLabel = next->getLabelIndex(); - if (insn.isPredicated() == true) { const Register pred = insn.getPredicateIndex(); - sel.push(); // we don't need to set next label to the pcip // as if there is no backward jump latter, then obviously everything will work fine. @@ -3250,22 +3225,30 @@ namespace gbe sel.curr.flag = 0; sel.curr.subFlag = 0; sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); + sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), nextLabel); + sel.block->endifOffset = -1; sel.pop(); - - if (nextLabel == jip) return; } else { // Update the PcIPs + const LabelIndex jip = sel.ctx.getLabelIndex(&insn); sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); - - // Do not emit branch when we go to the next block anyway + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), nextLabel); + sel.block->endifOffset = -1; if (nextLabel == jip) return; + // Branch to the jump target sel.push(); sel.curr.execWidth = 1; sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); + // FIXME just for the correct endif offset. + // JMPI still has 2 instruction. + sel.block->endifOffset -= 2; } } @@ -3290,37 +3273,32 @@ namespace gbe // that actually take the branch const LabelIndex next = bb.getNextBlock()->getLabelIndex(); sel.MOV(ip, GenRegister::immuw(uint16_t(next))); - + GBE_ASSERT(jip == dst); sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 0; + sel.curr.predicate = GEN_PREDICATE_NONE; sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); - // Re-update the PcIPs for the branches that takes the backward jump + sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); - - // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16 - // will check those bits as well. 
sel.curr.predicate = GEN_PREDICATE_NONE; + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), next); sel.curr.execWidth = 1; - sel.curr.noMask = 1; - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); - sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1), emaskReg); - - // Branch to the jump target - if (simdWidth == 8) - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; - else if (simdWidth == 16) + if (simdWidth == 16) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; else - NOT_SUPPORTED; + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; + sel.curr.noMask = 1; sel.JMPI(GenRegister::immd(0), jip); + sel.block->endifOffset = -3; sel.pop(); - } else { - + const LabelIndex next = bb.getNextBlock()->getLabelIndex(); // Update the PcIPs sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); - + if (!sel.block->hasBarrier) + sel.ENDIF(GenRegister::immd(0), next); // Branch to the jump target sel.push(); sel.curr.execWidth = 1; @@ -3328,6 +3306,7 @@ namespace gbe sel.curr.predicate = GEN_PREDICATE_NONE; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); + sel.block->endifOffset = -3; } } diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp index 04fbb9f..8557768 100644 --- a/backend/src/backend/gen_insn_selection.hpp +++ b/backend/src/backend/gen_insn_selection.hpp @@ -42,6 +42,8 @@ namespace gbe /*! Translate IR compare to Gen compare */ uint32_t getGenCompare(ir::Opcode opcode); + #define GEN_MAX_LABEL 0xFFFF + /*! Selection opcodes properly encoded from 0 to n for fast jump tables * generations */ @@ -190,6 +192,9 @@ namespace gbe void append(SelectionInstruction *insn); /*! Append a new selection instruction at the beginning of the block */ void prepend(SelectionInstruction *insn); + int endifOffset; + bool hasBarrier; + bool hasBranch; }; /*! 
Owns the selection engine */ diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index d318f8e..ddc9d5e 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -80,7 +80,7 @@ DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction) DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction) DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction) DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction) -DECL_SELECTION_IR(BRC, BinaryInstruction) +DECL_SELECTION_IR(BRC, UnaryInstruction) DECL_SELECTION_IR(BRD, UnaryInstruction) DECL_SELECTION_IR(IF, UnaryInstruction) DECL_SELECTION_IR(ENDIF, UnaryInstruction) diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index 25cb428..051f16d 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -118,7 +118,7 @@ namespace gbe this->noMask = 0; this->flag = 0; this->subFlag = 0; - this->predicate = GEN_PREDICATE_NORMAL; + this->predicate = GEN_PREDICATE_NONE; this->inversePredicate = 0; this->physicalFlag = 1; this->flagIndex = 0; @@ -657,6 +657,17 @@ namespace gbe GEN_HORIZONTAL_STRIDE_1); } + static INLINE GenRegister nullud(void) { + return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, + GEN_ARF_NULL, + 0, + GEN_TYPE_UD, + GEN_VERTICAL_STRIDE_8, + GEN_WIDTH_8, + GEN_HORIZONTAL_STRIDE_1); + } + + static INLINE bool isNull(GenRegister reg) { return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE && reg.nr == GEN_ARF_NULL); diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index 5e474de..c2308da 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -135,6 +135,7 @@ intel_driver_memman_init(intel_driver_t *driver) { driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE); assert(driver->bufmgr); + //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1); drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr); } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index e95b050..4819e9e 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -695,7 +695,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) memset(desc, 0, sizeof(*desc)); ker_bo = (drm_intel_bo *) kernel->bo; desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */ - desc->desc1.single_program_flow = 1; + desc->desc1.single_program_flow = 0; desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */ desc->desc5.rounding_mode = 0; /* round to nearest even */ -- 2.7.4
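
A note on the branch offset encoding used by the new branchPos3 patching loop in gen_context.cpp above: JIP and UIP are expressed as signed distances from the branch instruction, counted in half-instruction units (hence the "* 2"), and packed into one 32-bit word with JIP in the low 16 bits and UIP in the high 16 bits. The standalone C++ sketch below only models that arithmetic; the helper name packBranchOffsets and the sample instruction positions are illustrative and are not part of the patch (the low word is masked here so a negative JIP cannot spill into the UIP field).

    #include <cassert>
    #include <cstdint>

    // Minimal model of the branchPos3 patching arithmetic (gen_context.cpp).
    // Distances are relative to the branch instruction and counted in half
    // instructions (one Gen instruction == 2 units), matching the "* 2" above.
    static uint32_t packBranchOffsets(int32_t insnID, int32_t jipTarget, int32_t uipTarget) {
      const int32_t jip = (jipTarget - insnID) * 2;
      const int32_t uip = (uipTarget - insnID) * 2;
      assert(jip < 32767 && jip > -32768); // same range checks as the patch
      assert(uip < 32767 && uip > -32768);
      return (uint32_t(uip) << 16) | (uint32_t(jip) & 0xFFFF);
    }

    int main(void) {
      // Hypothetical example: a branch at instruction 10, with its JIP target
      // at instruction 14 and its UIP target at instruction 22.
      const uint32_t packed = packBranchOffsets(10, 14, 22);
      assert((packed & 0xFFFF) == 8); // JIP field: (14 - 10) * 2
      assert((packed >> 16) == 24);   // UIP field: (22 - 10) * 2
      return 0;
    }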