From 858414e497141b54c0df18510cb764d3770a2d9c Mon Sep 17 00:00:00 2001
From: Zhigang Gong
Date: Fri, 28 Mar 2014 00:38:29 +0800
Subject: [PATCH] GBE: Optimize the bool register allocation/processing.

Previously, we had a global flag allocation implementation. After some
analysis, I found that global flag allocation is not the best solution here.
For cross-block references of a bool value, we have to combine it with the
current emask, so there is no obvious advantage in allocating a dedicated
physical flag register for that cross-block usage. We just need to allocate
physical flags within each BB, handling the following cases:

1. The bool's liveness never extends beyond this BB, and the bool is only used
   as a dst register or a pred register. Such a bool can be kept in a physical
   flag, provided enough physical flags are available. We already identify
   those bools at the instruction selection stage and put them in the
   flagBooleans set.
2. The bool is defined in another BB and used in this BB; we then need to
   prepend an instruction at the position where we use it.
3. The bool is defined in this BB but is also used as some instruction's
   source register rather than as the pred register. We have to keep the
   normal GRF (UW8/UW16) register for this bool. For some CMP instructions,
   we need to append a SEL instruction to convert the flag into the GRF
   register.
4. Even for a spilled flag, if there is only one spilled flag, we still try to
   reuse the temporary flag register later. This requires that every
   instruction gets its flag at the instruction selection stage and does not
   use a physical flag number directly at the gen_context stage; otherwise the
   algorithm here may break.

We track all the validated bool values to avoid redundant validation of the
same flag. If there are not enough physical flags, we have to spill a
previously allocated physical flag, and the spilling policy is to spill the
allocated flag whose live interval ends last.
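For illustration only, the per-BB allocation and spilling policy above can be
summarized by the following minimal, standalone C++ sketch. The names
(BoolInterval, allocateBlockFlags) are made up for this sketch and are not the
real gen_reg_allocation.cpp types; only the policy follows the patch: the flag
sub-registers f1.0, f1.1 and f0.0 are handed out in liveness-start order,
f0.1 is reserved as the shared temporary, and when no flag is free we spill
the allocated interval that ends last back to a GRF.

  // Standalone sketch of the per-BB flag allocation policy (not the real code).
  #include <algorithm>
  #include <cstdint>
  #include <map>
  #include <vector>

  struct BoolInterval {      // liveness of one boolean within a single BB
    uint32_t reg;            // virtual register id
    int32_t minID, maxID;    // first/last instruction ID that uses the bool
  };

  static std::map<uint32_t, uint32_t>
  allocateBlockFlags(std::vector<BoolInterval> intervals) {
    std::map<uint32_t, uint32_t> allocated;         // reg -> flag index
    std::map<uint32_t, const BoolInterval *> live;  // flag index -> interval
    std::vector<uint32_t> freeFlags = {2, 3, 0};    // f1.0, f1.1, f0.0 (f0.1 stays the temporary)
    std::sort(intervals.begin(), intervals.end(),
              [](const BoolInterval &a, const BoolInterval &b) { return a.minID < b.minID; });
    for (const BoolInterval &cur : intervals) {
      // Expire every interval that ended before this one starts and recycle its flag.
      for (auto it = live.begin(); it != live.end();) {
        if (it->second->maxID < cur.minID) {
          freeFlags.push_back(it->first);
          it = live.erase(it);
        } else {
          ++it;
        }
      }
      if (freeFlags.empty()) {
        // Spilling policy: evict the allocated interval whose live range ends last.
        auto victim = std::max_element(
            live.begin(), live.end(),
            [](const auto &a, const auto &b) { return a.second->maxID < b.second->maxID; });
        allocated.erase(victim->second->reg);       // this bool falls back to a GRF
        freeFlags.push_back(victim->first);
        live.erase(victim);
      }
      const uint32_t flag = freeFlags.back();
      freeFlags.pop_back();
      allocated[cur.reg] = flag;
      live[flag] = &cur;
    }
    return allocated;  // flag register = index / 2, sub-flag = index & 1
  }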
Let's look at a real example of the improvement from this patch. Taking
compiler_vect_compare as the example, before this patch the instructions are
as below:

( 24) cmp.g.f1.1(8) null g110<8,8,1>D 0D { align1 WE_normal 1Q };
( 26) cmp.g.f1.1(8) null g111<8,8,1>D 0D { align1 WE_normal 2Q };
( 28) (+f1.1) sel(16) g109<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 30) cmp.g.f1.1(8) null g112<8,8,1>D 0D { align1 WE_normal 1Q };
( 32) cmp.g.f1.1(8) null g113<8,8,1>D 0D { align1 WE_normal 2Q };
( 34) (+f1.1) sel(16) g108<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 36) cmp.g.f1.1(8) null g114<8,8,1>D 0D { align1 WE_normal 1Q };
( 38) cmp.g.f1.1(8) null g115<8,8,1>D 0D { align1 WE_normal 2Q };
( 40) (+f1.1) sel(16) g107<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 42) cmp.g.f1.1(8) null g116<8,8,1>D 0D { align1 WE_normal 1Q };
( 44) cmp.g.f1.1(8) null g117<8,8,1>D 0D { align1 WE_normal 2Q };
( 46) (+f1.1) sel(16) g106<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 48) mov(16) g104<1>F -nanF { align1 WE_normal 1H };
( 50) cmp.ne.f1.1(16) null g109<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 52) (+f1.1) sel(16) g96<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 54) cmp.ne.f1.1(16) null g108<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 56) (+f1.1) sel(16) g98<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 58) cmp.ne.f1.1(16) null g107<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 60) (+f1.1) sel(16) g100<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 62) cmp.ne.f1.1(16) null g106<8,8,1>UW 0x0UW { align1 WE_normal 1H switch };
( 64) (+f1.1) sel(16) g102<1>D g104<8,8,1>D 0D { align1 WE_normal 1H };
( 66) add(16) g94<1>D g1.3<0,1,0>D g120<8,8,1>D { align1 WE_normal 1H };
( 68) send(16) null g94<8,8,1>UD data (bti: 1, rgba: 0, SIMD16, legacy, Untyped Surface Write) mlen 10 rlen 0 { align1 WE_normal 1H };
( 70) mov(16) g2<1>UW 0x1UW { align1 WE_normal 1H };
( 72) endif(16) 2 null { align1 WE_normal 1H };

After this patch, it becomes:

( 24) cmp.g(8) null g110<8,8,1>D 0D { align1 WE_normal 1Q };
( 26) cmp.g(8) null g111<8,8,1>D 0D { align1 WE_normal 2Q };
( 28) cmp.g.f1.1(8) null g112<8,8,1>D 0D { align1 WE_normal 1Q };
( 30) cmp.g.f1.1(8) null g113<8,8,1>D 0D { align1 WE_normal 2Q };
( 32) cmp.g.f0.1(8) null g114<8,8,1>D 0D { align1 WE_normal 1Q };
( 34) cmp.g.f0.1(8) null g115<8,8,1>D 0D { align1 WE_normal 2Q };
( 36) (+f0.1) sel(16) g109<1>UW g1.2<0,1,0>UW g1<0,1,0>UW { align1 WE_normal 1H };
( 38) cmp.g.f1.0(8) null g116<8,8,1>D 0D { align1 WE_normal 1Q };
( 40) cmp.g.f1.0(8) null g117<8,8,1>D 0D { align1 WE_normal 2Q };
( 42) mov(16) g106<1>F -nanF { align1 WE_normal 1H };
( 44) (+f0) sel(16) g98<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 46) (+f1.1) sel(16) g100<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 48) (+f0.1) sel(16) g102<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 50) (+f1) sel(16) g104<1>D g106<8,8,1>D 0D { align1 WE_normal 1H };
( 52) add(16) g96<1>D g1.3<0,1,0>D g120<8,8,1>D { align1 WE_normal 1H };
( 54) send(16) null g96<8,8,1>UD data (bti: 1, rgba: 0, SIMD16, legacy, Untyped Surface Write) mlen 10 rlen 0 { align1 WE_normal 1H };
( 56) mov(16) g2<1>UW 0x1UW { align1 WE_normal 1H };
( 58) endif(16) 2 null { align1 WE_normal 1H };

It reduces the instruction count from 25 to 18, saving about 28% of the
instructions.

v2: Fix some minor bugs.
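The validation step for booleans that do not own a physical flag in this BB
(cases 2 and 4 above) can be pictured with the simplified sketch below; SelInsn
and ValidationCmp are made-up stand-ins for the real selection IR, not its
actual classes. A predicated user of such a bool gets a cmp.ne against the
bool's GRF copy prepended to load f0.1, and the last validated register is
remembered so back-to-back uses of the same bool are not revalidated.

  // Illustrative model of the temporary-flag validation and reuse tracking.
  #include <cstdint>
  #include <vector>

  struct SelInsn {
    uint16_t flagIndex;  // virtual boolean register feeding the predicate
    bool predicated;     // reads the flag as a predicate
    bool modifiesFlag;   // also writes the flag (modFlag == 1)
  };

  // One "cmp.ne f0.1 null grf(boolReg) 0" that must be prepended before the user.
  struct ValidationCmp { uint16_t boolReg; };

  static std::vector<ValidationCmp> validateTempFlag(const std::vector<SelInsn> &insns) {
    std::vector<ValidationCmp> prepended;
    uint16_t validTempFlagReg = 0;  // 0 means nothing currently validated in f0.1
    for (const SelInsn &insn : insns) {
      if (insn.predicated && validTempFlagReg != insn.flagIndex)
        prepended.push_back({insn.flagIndex});  // reload f0.1 from the bool's GRF copy
      // A flag-writing instruction invalidates whatever f0.1 held before.
      validTempFlagReg = insn.modifiesFlag ? 0 : insn.flagIndex;
    }
    return prepended;
  }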
Signed-off-by: Zhigang Gong Reviewed-by: "Yang, Rong R" Reviewed-by: "Song, Ruiling" --- backend/src/backend/gen_context.cpp | 46 ++-- backend/src/backend/gen_encoder.cpp | 35 ++- backend/src/backend/gen_encoder.hpp | 10 +- backend/src/backend/gen_insn_selection.cpp | 136 ++++++++--- backend/src/backend/gen_reg_allocation.cpp | 375 ++++++++++++++++++----------- backend/src/backend/gen_register.hpp | 11 +- 6 files changed, 406 insertions(+), 207 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index c9e0835..3224c06 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -190,7 +190,7 @@ namespace gbe const GenRegister dst = ra->genReg(insn.dst(0)); const GenRegister src = ra->genReg(insn.src(0)); switch (insn.opcode) { - case SEL_OP_MOV: p->MOV(dst, src); break; + case SEL_OP_MOV: p->MOV(dst, src, insn.extra.function); break; case SEL_OP_FBH: p->FBH(dst, src); break; case SEL_OP_FBL: p->FBL(dst, src); break; case SEL_OP_NOT: p->NOT(dst, src); break; @@ -407,9 +407,9 @@ namespace gbe p->pop(); } break; - case SEL_OP_AND: p->AND(dst, src0, src1); break; - case SEL_OP_OR: p->OR (dst, src0, src1); break; - case SEL_OP_XOR: p->XOR(dst, src0, src1); break; + case SEL_OP_AND: p->AND(dst, src0, src1, insn.extra.function); break; + case SEL_OP_OR: p->OR (dst, src0, src1, insn.extra.function); break; + case SEL_OP_XOR: p->XOR(dst, src0, src1, insn.extra.function); break; case SEL_OP_I64AND: { GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL), @@ -566,7 +566,9 @@ namespace gbe GenRegister g = ra->genReg(insn.dst(7)); GenRegister h = ra->genReg(insn.dst(8)); GenRegister i = ra->genReg(insn.dst(9)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); loadTopHalf(a, x); loadBottomHalf(b, x); loadTopHalf(c, y); @@ -613,7 +615,9 @@ namespace gbe GenRegister g = ra->genReg(insn.dst(7)); GenRegister h = ra->genReg(insn.dst(8)); GenRegister i = ra->genReg(insn.dst(9)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1); loadTopHalf(a, x); loadBottomHalf(b, x); @@ -797,7 +801,9 @@ namespace gbe GenRegister e = ra->genReg(insn.dst(5)); GenRegister f = ra->genReg(insn.dst(6)); a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD; - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(7))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(7))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); GenRegister zero = GenRegister::immud(0); switch(insn.opcode) { case SEL_OP_I64SHL: @@ -1001,7 +1007,9 @@ namespace gbe GenRegister mantissa = ra->genReg(insn.dst(4)); GenRegister tmp = ra->genReg(insn.dst(5)); GenRegister tmp_high = ra->genReg(insn.dst(6)); - GenRegister f0 = checkFlagRegister(ra->genReg(insn.dst(7))); + //GenRegister f0 = checkFlagRegister(ra->genReg(insn.dst(7))); + // We just simply use the temporary flag here. 
+ GenRegister f0 = GenRegister::flag(0, 1); loadTopHalf(high, src); loadBottomHalf(low, src); if(!src.is_signed_int()) { @@ -1039,7 +1047,9 @@ namespace gbe GenRegister dst = ra->genReg(insn.dst(0)); GenRegister high = ra->genReg(insn.dst(1)); GenRegister tmp = ra->genReg(insn.dst(2)); - GenRegister flag0 = checkFlagRegister(ra->genReg(insn.dst(3))); + //GenRegister flag0 = checkFlagRegister(ra->genReg(insn.dst(3))); + // We just simply use the temporary flag here. + GenRegister flag0 = GenRegister::flag(0, 1); if(dst.is_signed_int()) high = GenRegister::retype(high, GEN_TYPE_D); @@ -1160,7 +1170,9 @@ namespace gbe GenRegister c = ra->genReg(insn.dst(3)); GenRegister d = ra->genReg(insn.dst(4)); GenRegister e = ra->genReg(insn.dst(5)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); loadTopHalf(a, x); loadBottomHalf(b, x); loadTopHalf(c, y); @@ -1208,7 +1220,9 @@ namespace gbe GenRegister c = ra->genReg(insn.dst(3)); GenRegister d = ra->genReg(insn.dst(4)); GenRegister e = ra->genReg(insn.dst(5)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6))); + // We just simply use the temporary flag here. + GenRegister flagReg = GenRegister::flag(0, 1); loadTopHalf(a, x); loadBottomHalf(b, x); loadTopHalf(c, y); @@ -1266,7 +1280,7 @@ namespace gbe int execWidth = p->curr.execWidth; dest = dest.top_half(); p->push(); - p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.noMask = 0; p->curr.execWidth = 8; p->MOV(dest, src); p->curr.nibControl = 1; @@ -1302,7 +1316,7 @@ namespace gbe dest = dest.bottom_half(); p->push(); p->curr.execWidth = 8; - p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.noMask = 0; p->MOV(dest, src); p->curr.nibControl = 1; p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); @@ -1414,7 +1428,9 @@ namespace gbe GenRegister k = ra->genReg(insn.dst(11)); GenRegister l = ra->genReg(insn.dst(12)); GenRegister m = ra->genReg(insn.dst(13)); - GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(14))); + //GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(14))); + // We just simply use the temporary flag here. 
+ GenRegister flagReg = GenRegister::flag(0, 1); GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1), imm31 = GenRegister::immud(31); @@ -1511,7 +1527,7 @@ namespace gbe int jip = -(int)(p->n_instruction() - loop_start + 1) * 2; p->curr.noMask = 1; p->JMPI(zero); - p->patchJMPI(p->n_instruction() - 2, jip + 2); + p->patchJMPI(p->n_instruction() - 1, jip + 2); p->pop(); // end of loop } diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index e8670b9..9df031e 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -661,7 +661,8 @@ namespace gbe } } - INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) { + INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, + GenRegister src, uint32_t condition = 0) { if (dst.isdf() && src.isdf()) { handleDouble(p, opcode, dst, src); } else if (dst.isint64() && src.isint64()) { // handle int64 @@ -678,6 +679,11 @@ namespace gbe p->pop(); } else if (needToSplitAlu1(p, dst, src) == false) { GenInstruction *insn = p->next(opcode); + if (condition != 0) { + GBE_ASSERT(opcode == GEN_OPCODE_MOV || + opcode == GEN_OPCODE_NOT); + insn->header.destreg_or_condmod = condition; + } p->setHeader(insn); p->setDst(insn, dst); p->setSrc0(insn, src); @@ -706,12 +712,19 @@ namespace gbe uint32_t opcode, GenRegister dst, GenRegister src0, - GenRegister src1) + GenRegister src1, + uint32_t condition = 0) { if (dst.isdf() && src0.isdf() && src1.isdf()) { handleDouble(p, opcode, dst, src0, src1); } else if (needToSplitAlu2(p, dst, src0, src1) == false) { GenInstruction *insn = p->next(opcode); + if (condition != 0) { + GBE_ASSERT(opcode == GEN_OPCODE_OR || + opcode == GEN_OPCODE_XOR || + opcode == GEN_OPCODE_AND); + insn->header.destreg_or_condmod = condition; + } p->setHeader(insn); p->setDst(insn, dst); p->setSrc0(insn, src0); @@ -817,15 +830,21 @@ namespace gbe #undef NO_SWIZZLE #define ALU1(OP) \ - void GenEncoder::OP(GenRegister dest, GenRegister src0) { \ - alu1(this, GEN_OPCODE_##OP, dest, src0); \ + void GenEncoder::OP(GenRegister dest, GenRegister src0, uint32_t condition) { \ + alu1(this, GEN_OPCODE_##OP, dest, src0, condition); \ } #define ALU2(OP) \ void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1) { \ - alu2(this, GEN_OPCODE_##OP, dest, src0, src1); \ + alu2(this, GEN_OPCODE_##OP, dest, src0, src1, 0); \ } +#define ALU2_MOD(OP) \ + void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition) { \ + alu2(this, GEN_OPCODE_##OP, dest, src0, src1, condition); \ + } + + #define ALU3(OP) \ void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2) { \ alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \ @@ -947,9 +966,9 @@ namespace gbe ALU1(F32TO16) ALU2(SEL) ALU1(NOT) - ALU2(AND) - ALU2(OR) - ALU2(XOR) + ALU2_MOD(AND) + ALU2_MOD(OR) + ALU2_MOD(XOR) ALU2(SHR) ALU2(SHL) ALU2(RSR) diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index f5e8548..50662fb 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -86,8 +86,9 @@ namespace gbe // Encoding functions //////////////////////////////////////////////////////////////////////// -#define ALU1(OP) void OP(GenRegister dest, GenRegister src0); +#define ALU1(OP) void OP(GenRegister dest, GenRegister src0, uint32_t condition = 0); #define ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1); +#define ALU2_MOD(OP) 
void OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition = 0); #define ALU3(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2); ALU1(MOV) ALU1(FBH) @@ -103,9 +104,9 @@ namespace gbe ALU1(F32TO16) ALU2(SEL) ALU1(NOT) - ALU2(AND) - ALU2(OR) - ALU2(XOR) + ALU2_MOD(AND) + ALU2_MOD(OR) + ALU2_MOD(XOR) ALU2(SHR) ALU2(SHL) ALU2(RSR) @@ -126,6 +127,7 @@ namespace gbe ALU1(BRD) #undef ALU1 #undef ALU2 +#undef ALU2_MOD #undef ALU3 void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null()); void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value); diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index aec0459..975c71a 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -148,7 +148,9 @@ namespace gbe SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) : parent(NULL), opcode(op), dstNum(dst), srcNum(src) - {} + { + extra.function = 0; + } void SelectionInstruction::prepend(SelectionInstruction &other) { gbe::prepend(&other, this); @@ -225,6 +227,7 @@ namespace gbe GBE_ASSERT(insn.getSrcNum() < 127); for (uint32_t childID = 0; childID < childNum; ++childID) this->child[childID] = NULL; + computeBool = false; } /*! Mergeable are non-root instructions with valid sources */ INLINE void setAsMergeable(uint32_t which) { mergeable|=(1<setAsMergeable(srcID); dag->child[srcID] = child; + // Check whether this bool is used as a normal source + // oprand other than BRA/SEL. + if (getRegisterFamily(reg) == FAMILY_BOOL) { + if (insn.getOpcode() != OP_BRA && + (insn.getOpcode() != OP_SEL || + (insn.getOpcode() == OP_SEL && srcID != 0))) + child->computeBool = true; + } } else dag->child[srcID] = NULL; } @@ -1686,8 +1699,16 @@ namespace gbe if (dst.isdf()) { ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD); sel.MOV_DF(dst, src, sel.selReg(r)); - } else - sel.MOV(dst, src); + } else { + sel.push(); + if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) { + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t)(insn.getDst(0)); + sel.curr.modFlag = 1; + } + sel.MOV(dst, src); + sel.pop(); + } break; case ir::OP_RNDD: sel.RNDD(dst, src); break; case ir::OP_RNDE: sel.RNDE(dst, src); break; @@ -1842,6 +1863,16 @@ namespace gbe } // Output the binary instruction + if (sel.curr.execWidth != 1 && + sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL) { + GBE_ASSERT(insn.getOpcode() == OP_AND || + insn.getOpcode() == OP_OR || + insn.getOpcode() == OP_XOR); + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t)(insn.getDst(0)); + sel.curr.modFlag = 1; + } + switch (opcode) { case OP_ADD: if (type == Type::TYPE_U64 || type == Type::TYPE_S64) { @@ -2316,6 +2347,11 @@ namespace gbe switch (type) { case TYPE_BOOL: + if (!sel.isScalarOrBool(insn.getDst(0))) { + sel.curr.modFlag = 1; + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) insn.getDst(0); + } sel.MOV(dst, imm.data.b ? 
GenRegister::immuw(0xffff) : GenRegister::immuw(0)); break; case TYPE_U32: @@ -2656,13 +2692,20 @@ namespace gbe const Type type = insn.getType(); const Register dst = insn.getDst(0); GenRegister tmpDst; + const BasicBlock *curr = insn.getParent(); + const ir::Liveness &liveness = sel.ctx.getLiveness(); + const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(curr); + bool needStoreBool = false; + if (liveOut.contains(dst) || dag.computeBool) + needStoreBool = true; + if(type == TYPE_S64 || type == TYPE_U64 || type == TYPE_DOUBLE || type == TYPE_FLOAT || - type == TYPE_U32 || type == TYPE_S32 ) + type == TYPE_U32 || type == TYPE_S32 /*|| + (!needStoreBool)*/) tmpDst = GenRegister::nullud(); else tmpDst = sel.selReg(dst, TYPE_BOOL); - // Look for immediate values for the right source GenRegister src0, src1; SelectionDAG *dag0 = dag.child[0]; @@ -2685,35 +2728,42 @@ namespace gbe } sel.push(); - sel.curr.flag = 1; - sel.curr.subFlag = 1; + sel.curr.physicalFlag = 0; + sel.curr.modFlag = 1; + sel.curr.flagIndex = (uint16_t)dst; + sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean. if (type == TYPE_S64 || type == TYPE_U64) { GenRegister tmp[3]; for(int i=0; i<3; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); + sel.curr.flagGen = 1; sel.I64CMP(getGenCompare(opcode), src0, src1, tmp); } else if(opcode == OP_ORD) { sel.push(); sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.curr.flagGen = 1; sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); sel.pop(); - } else + } else { + if((type == TYPE_S64 || type == TYPE_U64 || + type == TYPE_DOUBLE || type == TYPE_FLOAT || + type == TYPE_U32 || type == TYPE_S32)) + sel.curr.flagGen = 1; sel.CMP(getGenCompare(opcode), src0, src1, tmpDst); + } +#if 0 + if((type == TYPE_S64 || type == TYPE_U64 || + type == TYPE_DOUBLE || type == TYPE_FLOAT || + type == TYPE_U32 || type == TYPE_S32) /*&& + needStoreBool*/) { + sel.curr.predicate = GEN_PREDICATE_NORMAL; + sel.SEL(sel.selReg(dst, TYPE_U16), + sel.selReg(ir::ocl::one, TYPE_U16), + sel.selReg(ir::ocl::zero, TYPE_U16)); + } +#endif sel.pop(); - - if(type == TYPE_S64 || type == TYPE_U64 || - type == TYPE_DOUBLE || type == TYPE_FLOAT || - type == TYPE_U32 || type == TYPE_S32 ) { - sel.push(); - sel.curr.flag = 1; - sel.curr.subFlag = 1; - sel.curr.predicate = GEN_PREDICATE_NORMAL; - sel.SEL(sel.selReg(dst, TYPE_U16), - sel.selReg(ir::ocl::one, TYPE_U16), - sel.selReg(ir::ocl::zero, TYPE_U16)); - sel.pop(); - } return true; } }; @@ -2941,13 +2991,11 @@ namespace gbe const Register pred = insn.getPredicate(); sel.push(); - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.curr.execWidth = simdWidth; - sel.curr.flag = 1; - sel.curr.subFlag = 1; - sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); - //sel.curr.noMask = 0; + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) pred; sel.curr.predicate = GEN_PREDICATE_NORMAL; + if (!dag0) + sel.curr.externFlag = 1; if(type == ir::TYPE_S64 || type == ir::TYPE_U64) sel.SEL_INT64(dst, src0, src1); else @@ -3214,8 +3262,15 @@ namespace gbe }; /*! 
Branch instruction pattern */ - DECL_PATTERN(BranchInstruction) + class BranchInstructionPattern : public SelectionPattern { + public: + BranchInstructionPattern(void) : SelectionPattern(1,1) { + for (uint32_t op = 0; op < ir::OP_INVALID; ++op) + if (ir::isOpcodeFrom(ir::Opcode(op)) == true) + this->opcodes.push_back(ir::Opcode(op)); + } + void emitForwardBranch(Selection::Opaque &sel, const ir::BranchInstruction &insn, ir::LabelIndex dst, @@ -3234,11 +3289,11 @@ namespace gbe // we don't need to set next label to the pcip // as if there is no backward jump latter, then obviously everything will work fine. // If there is backward jump latter, then all the pcip will be updated correctly there. - sel.curr.flag = 0; - sel.curr.subFlag = 0; - sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) pred; sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); + sel.curr.predicate = GEN_PREDICATE_NONE; if (!sel.block->hasBarrier) sel.ENDIF(GenRegister::immd(0), nextLabel); sel.block->endifOffset = -1; @@ -3285,10 +3340,8 @@ namespace gbe sel.MOV(ip, GenRegister::immuw(uint16_t(next))); GBE_ASSERT(jip == dst); sel.push(); - sel.curr.flag = 0; - sel.curr.subFlag = 0; - sel.curr.predicate = GEN_PREDICATE_NONE; - sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), GenRegister::immuw(0)); + sel.curr.physicalFlag = 0; + sel.curr.flagIndex = (uint16_t) pred; sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); sel.block->endifOffset = -1; @@ -3320,8 +3373,9 @@ namespace gbe } } - INLINE bool emitOne(Selection::Opaque &sel, const ir::BranchInstruction &insn) const { + INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; + const ir::BranchInstruction &insn = cast(dag.insn); const Opcode opcode = insn.getOpcode(); if (opcode == OP_RET) sel.EOT(); @@ -3329,17 +3383,25 @@ namespace gbe const LabelIndex dst = insn.getLabelIndex(); const LabelIndex src = insn.getParent()->getLabelIndex(); + sel.push(); + if (insn.isPredicated() == true) { + if (dag.child[0] == NULL) + sel.curr.externFlag = 1; + } + // We handle foward and backward branches differently if (uint32_t(dst) <= uint32_t(src)) this->emitBackwardBranch(sel, insn, dst, src); else this->emitForwardBranch(sel, insn, dst, src); + sel.pop(); } else NOT_IMPLEMENTED; + + markAllChildren(dag); return true; } - DECL_CTOR(BranchInstruction, 1, 1); }; /*! Sort patterns */ diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 3f2e746..c6d7d58 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -153,8 +153,13 @@ namespace gbe vector vectors; /*! The set of booleans that will go to GRF (cannot be kept into flags) */ set grfBooleans; + /*! The set of booleans which be held in flags, don't need to allocate grf */ + set flagBooleans; /*! All the register intervals */ vector intervals; + /*! All the boolean register intervals on the corresponding BB*/ + typedef map RegIntervalMap; + map boolIntervalsMap; /*! Intervals sorting based on starting point positions */ vector starting; /*! 
Intervals sorting based on ending point positions */ @@ -365,154 +370,219 @@ namespace gbe return ret; } - void GenRegAllocator::Opaque::allocateFlags(Selection &selection) { - - // Store the registers allocated in the map - map allocatedFlags; - GenRegInterval spill = ir::Register(ir::RegisterFile::MAX_INDEX); - // we have two flags we use for booleans f1.0 and f1.1 - const uint32_t flagNum = 2; - uint32_t freeFlags[] = {0,1}; - uint32_t freeNum = flagNum; - - // Perform the linear scan allocator on the flag registers only. We only use - // two flags registers for the booleans right now: f1.0 and f1.1 - const uint32_t regNum = ctx.sel->getRegNum(); - uint32_t endID = 0; // interval to expire - for (uint32_t startID = 0; startID < regNum; ++startID) { - const GenRegInterval &interval = *this->starting[startID]; - const ir::Register reg = interval.reg; - if (ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL) - continue; // Not a flag. We don't care - if (grfBooleans.contains(reg)) - continue; // Cannot use a flag register - if (interval.maxID == -INT_MAX) - continue; // Unused register - if (freeNum != 0) { - spill = interval; - allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); + void GenRegAllocator::Opaque::allocateFlags(Selection &selection) { + // Previously, we have a global flag allocation implemntation. + // After some analysis, I found the global flag allocation is not + // the best solution here. + // As for the cross block reference of bool value, we have to + // combine it with current emask. There is no obvious advantage to + // allocate deadicate physical flag register for those cross block usage. + // We just need to allocate physical flag within each BB. We need to handle + // the following cases: + // + // 1. The bool's liveness never beyond this BB. And the bool is only used as + // a dst register or a pred register. This bool value could be + // allocated in physical flag only if there is enough physical flag. + // We already identified those bool at the instruction select stage, and + // put them in the flagBooleans set. + // 2. The bool is defined in another BB and used in this BB, then we need + // to prepend an instruction at the position where we use it. + // 3. The bool is defined in this BB but is also used as some instruction's + // source registers rather than the pred register. We have to keep the normal + // grf (UW8/UW16) register for this bool. For some CMP instruction, we need to + // append a SEL instruction convert the flag to the grf register. + // 4. Even for the spilling flag, if there is only one spilling flag, we will also + // try to reuse the temporary flag register latter. This requires all the + // instructions should got it flag at the instruction selection stage. And should + // not use the flag physical number directly at the gen_context stage. Otherwise, + // may break the algorithm here. + // We will track all the validated bool value and to avoid any redundant + // validation for the same flag. But if there is no enough physical flag, + // we have to spill the previous allocated physical flag. And the spilling + // policy is to spill the allocate flag which live to the last time end point. 
+ + // we have three flags we use for booleans f0.0 , f1.0 and f1.1 + for (auto &block : *selection.blockList) { + // Store the registers allocated in the map + map allocatedFlags; + map allocatedFlagIntervals; + + const uint32_t flagNum = 3; + uint32_t freeFlags[] = {2, 3, 0}; + uint32_t freeNum = flagNum; + if (boolIntervalsMap.find(&block) == boolIntervalsMap.end()) + continue; + const auto boolsMap = boolIntervalsMap[&block]; + vector flagStarting; + vector flagEnding; + GBE_ASSERT(boolsMap->size() > 0); + uint32_t regNum = boolsMap->size(); + flagStarting.resize(regNum); + flagEnding.resize(regNum); + uint32_t id = 0; + for (auto &interval : *boolsMap) { + flagStarting[id] = flagEnding[id] = &interval.second; + id++; } - else { + std::sort(flagStarting.begin(), flagStarting.end(), cmp); + std::sort(flagEnding.begin(), flagEnding.end(), cmp); + + uint32_t endID = 0; // interval to expire + for (uint32_t startID = 0; startID < regNum; ++startID) { + const GenRegInterval *interval = flagStarting[startID]; + const ir::Register reg = interval->reg; + GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL); + if (freeNum != 0) { + allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); + allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum])); + } else { // Try to expire one register - while (endID != ending.size()) { - const GenRegInterval *toExpire = this->ending[endID]; - const ir::Register reg = toExpire->reg; + while (endID != flagEnding.size()) { + const GenRegInterval *toExpire = flagEnding[endID]; // Dead code produced by the insn selection -> we skip it if (toExpire->minID > toExpire->maxID) { endID++; continue; } // We cannot expire this interval and the next ones - if (toExpire->maxID >= interval.minID) + if (toExpire->maxID >= interval->minID) break; - // Must be a boolean allocated with a flag register - if (ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL || grfBooleans.contains(reg)) { + // We reuse a flag from a previous interval (the oldest one) + auto it = allocatedFlags.find(toExpire->reg); + if (it == allocatedFlags.end()) { endID++; continue; } - // We reuse a flag from a previous interval (the oldest one) - auto it = allocatedFlags.find(toExpire->reg); - GBE_ASSERT(it != allocatedFlags.end()); freeFlags[freeNum++] = it->second; endID++; break; } - - // We need to spill one of the previous boolean values - if (freeNum == 0) { - GBE_ASSERT(uint16_t(spill.reg) != ir::RegisterFile::MAX_INDEX); - // We spill the last inserted boolean and use its flag instead for - // this one - if (spill.maxID > interval.maxID) { - auto it = allocatedFlags.find(spill.reg); - GBE_ASSERT(it != allocatedFlags.end()); - allocatedFlags.insert(std::make_pair(reg, it->second)); - allocatedFlags.erase(spill.reg); - grfBooleans.insert(spill.reg); - spill = interval; + if (freeNum != 0) { + allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); + allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum])); + } + else { + // FIXME we may sort the allocated flags before do the spilling in the furture. 
+ int32_t spill = -1; + const GenRegInterval *spillInterval = NULL; + int32_t maxID = 0; + for (auto &it : allocatedFlagIntervals) { + if (it.first->maxID <= interval->minID) + continue; + if (it.first->maxID > maxID && it.second != 0) { + maxID = it.first->maxID; + spill = it.second; + spillInterval = it.first; + } } - // We will use a grf for the current register - else { - grfBooleans.insert(reg); + if (spill != -1) { + allocatedFlags.insert(std::make_pair(reg, spill)); + allocatedFlagIntervals.insert(std::make_pair(interval, spill)); + allocatedFlags.erase(spillInterval->reg); + allocatedFlagIntervals.erase(spillInterval); + // We spill this flag booleans register, so erase it from the flag boolean set. + if (flagBooleans.contains(spillInterval->reg)) + flagBooleans.erase(spillInterval->reg); + } else { + GBE_ASSERT(0); } } - else - allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); - } - } - - // Now, we traverse all the selection instructions and we patch them to make - // them use flag registers - for (auto &block : *selection.blockList) - for (auto &insn : block.insnList) { - const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum; - - // Patch the source booleans - for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { - const GenRegister selReg = insn.src(srcID); - const ir::Register reg = selReg.reg(); - if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL) - continue; - auto it = allocatedFlags.find(reg); - if (it == allocatedFlags.end()) - continue; - // Use a flag register for it now - insn.src(srcID) = GenRegister::flag(1,it->second); - } - - // Patch the destination booleans - for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { - const GenRegister selReg = insn.dst(dstID); - const ir::Register reg = selReg.reg(); - if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL) - continue; - auto it = allocatedFlags.find(reg); - if (it == allocatedFlags.end()) - continue; - // Use a flag register for it now - insn.dst(dstID) = GenRegister::flag(1,it->second); + } } + delete boolsMap; - // Patch the predicate now. Note that only compares actually modify it (it - // is called a "conditional modifier"). 
The other instructions just read - // it - if (insn.state.physicalFlag == 0) { - auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex)); - // Just patch it if we can use a flag directly - if (it != allocatedFlags.end()) { - insn.state.flag = 1; - insn.state.subFlag = it->second; - insn.state.physicalFlag = 1; - } - // When we let the boolean in a GRF, use f0.1 as a temporary - else { - // Mov the GRF to the flag such that the flag can be read - SelectionInstruction *mov0 = selection.create(SEL_OP_MOV,1,1); - mov0->state = GenInstructionState(1); - mov0->state.predicate = GEN_PREDICATE_NONE; - mov0->state.noMask = 1; - mov0->src(0) = GenRegister::uw1grf(ir::Register(insn.state.flagIndex)); - mov0->dst(0) = GenRegister::flag(0,1); - - // Do not prepend if the flag is not read (== used only as a - // conditional modifier) - if (insn.state.predicate != GEN_PREDICATE_NONE) - insn.prepend(*mov0); - - // We can use f0.1 (our "backdoor" flag) - insn.state.flag = 0; - insn.state.subFlag = 1; - insn.state.physicalFlag = 1; - - // Compare instructions update the flags so we must copy it back to - // the GRF - if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) { - SelectionInstruction *mov1 = selection.create(SEL_OP_MOV,1,1); - mov1->state = mov0->state; - mov1->dst(0) = mov0->src(0); - mov1->src(0) = mov0->dst(0); - insn.append(*mov1); + // Now, we traverse all the selection instructions and we patch them to make + // them use flag registers + set validatedFlags; + uint16_t validTempFlagReg = 0; + for (auto &insn : block.insnList) { + // Patch the predicate now. Note that only compares actually modify it (it + // is called a "conditional modifier"). The other instructions just read + // it + if (insn.state.physicalFlag == 0) { + auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex)); + if (it != allocatedFlags.end()) { + insn.state.flag = it->second / 2; + insn.state.subFlag = it->second & 1; + insn.state.physicalFlag = 1; + // modFlag is for the LOADI/MOV/AND/OR/XOR instructions which will modify a + // flag register. We set the condition for them to save one instruction if possible. + if (insn.state.modFlag == 1 && + (insn.opcode == SEL_OP_MOV || + insn.opcode == SEL_OP_AND || + insn.opcode == SEL_OP_OR || + insn.opcode == SEL_OP_XOR)) + insn.extra.function = GEN_CONDITIONAL_NEQ; + if ((insn.state.externFlag && + insn.state.predicate != GEN_PREDICATE_NONE && + validatedFlags.find(insn.state.flagIndex) == validatedFlags.end())) { + // This is an external bool, we need to validate it if it is not validated yet. + SelectionInstruction *cmp0 = selection.create(SEL_OP_CMP, 1, 2); + cmp0->state = GenInstructionState(insn.state.execWidth); + cmp0->state.flag = insn.state.flag; + cmp0->state.subFlag = insn.state.subFlag; + cmp0->src(0) = GenRegister::uw8grf(ir::Register(insn.state.flagIndex)); + cmp0->src(1) = GenRegister::immuw(0); + cmp0->dst(0) = GenRegister::null(); + cmp0->extra.function = GEN_CONDITIONAL_NEQ; + insn.prepend(*cmp0); + validatedFlags.insert(insn.state.flagIndex); + } + } else { + // This bool doesn't have a deadicated flag, we use temporary flag here. + // each time we need to validate it from the grf register. + // We track the last temporary validate register, if it's the same as + // current, we can avoid the revalidation. 
+ insn.state.flag = 0; + insn.state.subFlag = 1; + insn.state.physicalFlag = 1; + if ((insn.state.predicate != GEN_PREDICATE_NONE) + && validTempFlagReg != insn.state.flagIndex) { + SelectionInstruction *cmp0 = selection.create(SEL_OP_CMP, 1, 2); + cmp0->state = GenInstructionState(insn.state.execWidth); + cmp0->state.flag = insn.state.flag; + cmp0->state.subFlag = insn.state.subFlag; + cmp0->src(0) = GenRegister::uw8grf(ir::Register(insn.state.flagIndex)); + cmp0->src(1) = GenRegister::immuw(0); + cmp0->dst(0) = GenRegister::null(); + cmp0->extra.function = GEN_CONDITIONAL_NEQ; + insn.prepend(*cmp0); + } + if (insn.state.modFlag == 0) + validTempFlagReg = insn.state.flagIndex; + else + validTempFlagReg = 0; + } + if (insn.opcode == SEL_OP_CMP && + flagBooleans.contains((ir::Register)(insn.dst(0).value.reg))) { + // This is a CMP for a pure flag booleans, we don't need to write result to + // the grf. And latter, we will not allocate grf for it. + insn.dst(0) = GenRegister::null(); + } + // If the instruction requires to generate (CMP for long/int/float..) + // the flag value to the register, and it's not a pure flag boolean, + // we need to use SEL instruction to generate the flag value to the UW8 + // register. + if (insn.state.flagGen == 1 && + !flagBooleans.contains((ir::Register)(insn.state.flagIndex))) { + SelectionInstruction *sel0 = selection.create(SEL_OP_SEL, 1, 2); + sel0->state = GenInstructionState(ctx.getSimdWidth()); + sel0->state.flag = insn.state.flag; + sel0->state.subFlag = insn.state.subFlag; + sel0->state.predicate = GEN_PREDICATE_NORMAL; + sel0->src(0) = GenRegister::uw1grf(ir::ocl::one); + sel0->src(1) = GenRegister::uw1grf(ir::ocl::zero); + sel0->dst(0) = GenRegister::uw8grf((ir::Register)insn.state.flagIndex); + insn.append(*sel0); + // We use the zero one after the liveness analysis, we have to update + // the liveness data manually here. + GenRegInterval &interval0 = intervals[ir::ocl::zero]; + GenRegInterval &interval1 = intervals[ir::ocl::one]; + interval0.minID = std::min(interval0.minID, (int32_t)insn.ID); + interval0.maxID = std::max(interval0.maxID, (int32_t)insn.ID); + interval1.minID = std::min(interval1.minID, (int32_t)insn.ID); + interval1.maxID = std::max(interval1.maxID, (int32_t)insn.ID); } } } @@ -530,6 +600,9 @@ namespace gbe if (RA.contains(reg)) continue; // already allocated + if (flagBooleans.contains(reg)) + continue; + // Case 1: the register belongs to a vector, allocate all the registers in // one piece auto it = vectorMap.find(reg); @@ -621,6 +694,8 @@ namespace gbe INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg) { auto it = RA.find(reg); + if (flagBooleans.contains(reg)) + return false; GBE_ASSERT(it != RA.end()); // offset less than 32 means it is not managed by our reg allocator. if (it->second < 32) @@ -803,6 +878,7 @@ namespace gbe int32_t firstID = insnID; // Update the intervals of each used register. 
Note that we do not // register allocate R0, so we skip all sub-registers in r0 + RegIntervalMap *boolsMap = new RegIntervalMap; for (auto &insn : block.insnList) { const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum; insn.ID = insnID; @@ -831,23 +907,33 @@ namespace gbe this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID); } - // Flag registers can only go to src[0] -#if 0 - const SelectionOpcode opcode = SelectionOpcode(insn.opcode); - if (opcode == SEL_OP_AND || opcode == SEL_OP_OR || opcode == SEL_OP_XOR - || opcode == SEL_OP_I64AND || opcode == SEL_OP_I64OR || opcode == SEL_OP_I64XOR) { - if (insn.src(1).physical == 0) { - const ir::Register reg = insn.src(1).reg(); - if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL) - grfBooleans.insert(reg); - } - } -#endif // OK, a flag is used as a predicate or a conditional modifier if (insn.state.physicalFlag == 0) { const ir::Register reg = ir::Register(insn.state.flagIndex); this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID); this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID); + // Check whether this is a pure flag booleans candidate. + if (insn.state.grfFlag == 0) + flagBooleans.insert(reg); + GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL); + // update the bool register's per-BB's interval data + if (boolsMap->find(reg) == boolsMap->end()) { + GenRegInterval boolInterval(reg); + boolsMap->insert(std::make_pair(reg, boolInterval)); + } + boolsMap->find(reg)->second.minID = std::min(boolsMap->find(reg)->second.minID, insnID); + boolsMap->find(reg)->second.maxID = std::max(boolsMap->find(reg)->second.maxID, insnID); + if (&insn == block.insnList.back() && + insn.opcode == SEL_OP_JMPI && + insn.state.predicate != GEN_PREDICATE_NONE) { + // If this is the last instruction and is a predicated JMPI. + // We must extent its liveness before any other instrution. + // As we need to allocate f0 to it, and need to keep the f0 + // unchanged during the block. The root cause is this instruction + // is out-of the if/endif region, so we have to borrow the f0 + // to get correct bits for all channels. + boolsMap->find(reg)->second.minID = 0; + } } lastID = insnID; insnID++; @@ -856,12 +942,17 @@ namespace gbe // All registers alive at the begining of the block must update their intervals. 
const ir::BasicBlock *bb = block.bb; for (auto reg : ctx.getLiveIn(bb)) - this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID); + this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID); // All registers alive at the end of the block must have their intervals // updated as well for (auto reg : ctx.getLiveOut(bb)) this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID); + + if (boolsMap->size() > 0) + boolIntervalsMap.insert(std::make_pair(&block, boolsMap)); + else + delete boolsMap; } this->intervals[ocl::retVal].minID = INT_MAX; @@ -870,6 +961,9 @@ namespace gbe // Allocate all the vectors first since they need to be contiguous this->allocateVector(selection); + // First we try to put all booleans registers into flags + this->allocateFlags(selection); + // Sort both intervals in starting point and ending point increasing orders const uint32_t regNum = ctx.sel->getRegNum(); this->starting.resize(regNum); @@ -889,9 +983,6 @@ namespace gbe break; } - // First we try to put all booleans registers into flags - //this->allocateFlags(selection); - // Allocate all the GRFs now (regular register and boolean that are not in // flag registers) return this->allocateGRFs(selection); diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index 051f16d..0480dd8 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -118,6 +118,10 @@ namespace gbe this->noMask = 0; this->flag = 0; this->subFlag = 0; + this->grfFlag = 1; + this->externFlag = 0; + this->modFlag = 0; + this->flagGen = 0; this->predicate = GEN_PREDICATE_NONE; this->inversePredicate = 0; this->physicalFlag = 1; @@ -125,9 +129,14 @@ namespace gbe this->saturate = GEN_MATH_SATURATE_NONE; } uint32_t physicalFlag:1; //!< Physical or virtual flag register - uint32_t flag:1; //!< Only if physical flag + uint32_t flag:1; //!< Only if physical flag, uint32_t subFlag:1; //!< Only if physical flag uint32_t flagIndex:16; //!< Only if virtual flag (index of the register) + uint32_t grfFlag:1; //!< Only if virtual flag, 0 means we do not need to allocate GRF. + uint32_t externFlag:1; //!< Only if virtual flag, 1 means this flag is from external BB. + uint32_t modFlag:1; //!< Only if virtual flag, 1 means will modify flag. + uint32_t flagGen:1; //!< Only if virtual flag, 1 means the gen_context stage may need to + //!< generate the flag. uint32_t execWidth:5; uint32_t quarterControl:1; uint32_t nibControl:1; -- 2.7.4