From aa5a21a9067982e6ffdc490afc82752243b48757 Mon Sep 17 00:00:00 2001
From: Mike Danes
Date: Mon, 2 Jan 2017 11:05:31 +0200
Subject: [PATCH] Move compare "decomposition" code

In the original implementation, long compares were decomposed in
LowerCompare and the rest of the work was done in TreeNodeInfoInitCmp;
this change restores that ordering.

This enables the and-cmp to test transform for the high-part comparison
of a long compare on 32-bit targets. The low part is unfortunately not
handled because the CMP and the AND end up in different basic blocks.
Code like

    mov  eax, dword ptr [edi]
    mov  edi, dword ptr [edi+4]
    and  eax, 0xD1FFAB1E
    and  edi, 0xD1FFAB1E
    jne  SHORT G_M14457_IG11
    test eax, eax
    je   G_M14457_IG15

becomes

    mov  eax, dword ptr [edi]
    and  eax, 0xD1FFAB1E
    test dword ptr [edi+4], 0xD1FFAB1E
    jne  SHORT G_M14457_IG11
    test eax, eax
    je   G_M14457_IG15
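For reference, a minimal C-style sketch (hypothetical; `HasAnyFlag` is
illustrative only and not part of this change) of the kind of source that
produces the pattern above on a 32-bit target:

    // Hypothetical repro: `value` and `mask` are 64 bits wide, so on a
    // 32-bit target the JIT decomposes the AND and the compare into one
    // 32-bit AND/compare pair per half. The high half's pair stays in
    // the block ending at `jne` and can be folded into `test`; the low
    // half's compare is moved into the split-off block, out of reach of
    // the and-cmp to test transform.
    bool HasAnyFlag(long long value, long long mask)
    {
        return (value & mask) != 0;
    }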
---
 src/jit/lower.cpp | 481 +++++++++++++++++++++++++++---------------------------
 1 file changed, 241 insertions(+), 240 deletions(-)

diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp
index d74976b..2533782 100644
--- a/src/jit/lower.cpp
+++ b/src/jit/lower.cpp
@@ -1936,6 +1936,247 @@ GenTree* Lowering::LowerTailCallViaHelper(GenTreeCall* call, GenTree* callTarget
 
 void Lowering::LowerCompare(GenTree* cmp)
 {
+#ifndef _TARGET_64BIT_
+    LIR::Use cmpUse;
+
+    if ((cmp->gtGetOp1()->TypeGet() == TYP_LONG) && BlockRange().TryGetUse(cmp, &cmpUse) &&
+        cmpUse.User()->OperIs(GT_JTRUE))
+    {
+        // For 32-bit targets any comparison that feeds a `GT_JTRUE` node must be lowered
+        // such that the liveness of the operands to the comparison is properly visible to
+        // the rest of the backend. As such, a 64-bit comparison is lowered from something
+        // like this:
+        //
+        // ------------ BB02 [004..014) -> BB02 (cond), preds={BB02,BB01} succs={BB03,BB02}
+        // N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148
+        //
+        //                /--* t6 int
+        // N002 ( 2, 3) [000007] ---------U-- t7 = * cast long <- ulong <- uint $3c0
+        //
+        // N003 ( 3, 10) [000009] ------------ t9 = lconst long 0x0000000000000003 $101
+        //
+        //                /--* t7 long
+        //                +--* t9 long
+        // N004 ( 9, 17) [000010] N------N-U-- t10 = * < int $149
+        //
+        //                /--* t10 int
+        // N005 ( 11, 19) [000011] ------------ * jmpTrue void
+        //
+        // To something like this:
+        //
+        // ------------ BB02 [004..014) -> BB03 (cond), preds={BB06,BB07,BB01} succs={BB06,BB03}
+        // [000099] ------------ t99 = const int 0
+        //
+        // [000101] ------------ t101 = const int 0
+        //
+        //                /--* t99 int
+        //                +--* t101 int
+        // N004 ( 9, 17) [000010] N------N-U-- t10 = * > int $149
+        //
+        //                /--* t10 int
+        // N005 ( 11, 19) [000011] ------------ * jmpTrue void
+        //
+        //
+        // ------------ BB06 [???..???) -> BB02 (cond), preds={BB02} succs={BB07,BB02}
+        // [000105] -------N-U-- jcc void cond=<
+        //
+        //
+        // ------------ BB07 [???..???) -> BB02 (cond), preds={BB06} succs={BB03,BB02}
+        // N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148
+        //
+        // N003 ( 3, 10) [000009] ------------ t9 = const int 3
+        //
+        //                /--* t6 int
+        //                +--* t9 int
+        // [000106] N------N-U-- t106 = * < int
+        //
+        //                /--* t106 int
+        // [000107] ------------ * jmpTrue void
+        //
+        // Which will eventually generate code similar to the following:
+        //
+        // 33DB   xor  ebx, ebx
+        // 85DB   test ebx, ebx
+        // 7707   ja   SHORT G_M50523_IG04
+        // 72E7   jb   SHORT G_M50523_IG03
+        // 83F803 cmp  eax, 3
+        // 72E2   jb   SHORT G_M50523_IG03
+        //
+
+        GenTree* src1   = cmp->gtGetOp1();
+        GenTree* src2   = cmp->gtGetOp2();
+        unsigned weight = m_block->getBBWeight(comp);
+
+        LIR::Use loSrc1(BlockRange(), &(src1->gtOp.gtOp1), src1);
+        LIR::Use loSrc2(BlockRange(), &(src2->gtOp.gtOp1), src2);
+
+        // TODO-CQ-32bit: We should move more code to the new basic block; currently we're only
+        // moving constants and lclvars. In particular, it would be nice to move GT_AND nodes, as
+        // that would enable the and-cmp to test transform that happens later in this function.
+        // Though that's not exactly ideal either: the and-cmp to test transform should really
+        // run before this code, but:
+        //   - it would need to run before decomposition, otherwise it won't recognize the 0
+        //     constant, because after decomposition it is packed in a GT_LONG
+        //   - this code would also need to handle GT_TEST_EQ/GT_TEST_NE
+
+        if (!loSrc1.Def()->OperIs(GT_CNS_INT, GT_LCL_VAR))
+        {
+            loSrc1.ReplaceWithLclVar(comp, weight);
+        }
+
+        if (!loSrc2.Def()->OperIs(GT_CNS_INT, GT_LCL_VAR))
+        {
+            loSrc2.ReplaceWithLclVar(comp, weight);
+        }
+
+        BasicBlock* jumpDest = m_block->bbJumpDest;
+        BasicBlock* nextDest = m_block->bbNext;
+        BasicBlock* newBlock = comp->fgSplitBlockAtEnd(m_block);
+
+        cmp->gtType     = TYP_INT;
+        cmp->gtOp.gtOp1 = src1->gtOp.gtOp2;
+        cmp->gtOp.gtOp2 = src2->gtOp.gtOp2;
+
+        if (cmp->OperIs(GT_EQ, GT_NE))
+        {
+            // 64-bit equality comparisons (no matter the polarity) require two 32-bit comparisons:
+            // one for the upper 32 bits and one for the lower 32 bits. As such, we update the flow
+            // graph like so:
+            //
+            //     Before:
+            //         BB0: cond
+            //          /     \
+            //       false    true
+            //         |        |
+            //        BB1      BB2
+            //
+            //     After:
+            //         BB0: cond(hi)
+            //          /      \
+            //       false     true
+            //         |         |
+            //         |   BB3: cond(lo)
+            //         |     /      \
+            //         |  false     true
+            //          \   /         |
+            //           BB1         BB2
+            //
+
+            BlockRange().Remove(loSrc1.Def());
+            BlockRange().Remove(loSrc2.Def());
+            GenTree* loCmp   = comp->gtNewOperNode(cmp->OperGet(), TYP_INT, loSrc1.Def(), loSrc2.Def());
+            loCmp->gtFlags   = cmp->gtFlags;
+            GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
+            LIR::AsRange(newBlock).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
+
+            m_block->bbJumpKind = BBJ_COND;
+
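+            // When the original compare is GT_EQ, the high compare is inverted to GT_NE: if the
+            // high halves differ the result is known to be false, so BB0 jumps directly to the
+            // false successor (the original fall-through block); only when the high halves are
+            // equal does control fall into BB3 to compare the low halves.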
+            if (cmp->OperIs(GT_EQ))
+            {
+                cmp->gtOper         = GT_NE;
+                m_block->bbJumpDest = nextDest;
+                nextDest->bbFlags |= BBF_JMP_TARGET;
+                comp->fgAddRefPred(nextDest, m_block);
+            }
+            else
+            {
+                m_block->bbJumpDest = jumpDest;
+                comp->fgAddRefPred(jumpDest, m_block);
+            }
+
+            assert(newBlock->bbJumpKind == BBJ_COND);
+            assert(newBlock->bbJumpDest == jumpDest);
+        }
+        else
+        {
+            // 64-bit ordinal comparisons are more complicated: they require two comparisons for
+            // the upper 32 bits and one comparison for the lower 32 bits. We update the flow
+            // graph as follows:
+            //
+            //     Before:
+            //         BB0: cond
+            //          /     \
+            //       false    true
+            //         |        |
+            //        BB1      BB2
+            //
+            //     After:
+            //         BB0: (!cond(hi) && !eq(hi))
+            //           /            \
+            //         true          false
+            //          |              |
+            //          |     BB3: (cond(hi) && !eq(hi))
+            //          |         /            \
+            //          |      false           true
+            //          |        |               |
+            //          |     BB4: cond(lo)      |
+            //          |      /      \          |
+            //          |   false     true       |
+            //           \   /          \       /
+            //            BB1            BB2
+            //
+            // Note that the actual comparisons used to implement "(!cond(hi) && !eq(hi))" and
+            // "(cond(hi) && !eq(hi))" differ based on the original condition, and each consists
+            // of a single node. The switch statement below performs the necessary mapping.
+            //
+
+            genTreeOps hiCmpOper;
+            genTreeOps loCmpOper;
+
+            switch (cmp->OperGet())
+            {
+                case GT_LT:
+                    cmp->gtOper = GT_GT;
+                    hiCmpOper   = GT_LT;
+                    loCmpOper   = GT_LT;
+                    break;
+                case GT_LE:
+                    cmp->gtOper = GT_GT;
+                    hiCmpOper   = GT_LT;
+                    loCmpOper   = GT_LE;
+                    break;
+                case GT_GT:
+                    cmp->gtOper = GT_LT;
+                    hiCmpOper   = GT_GT;
+                    loCmpOper   = GT_GT;
+                    break;
+                case GT_GE:
+                    cmp->gtOper = GT_LT;
+                    hiCmpOper   = GT_GT;
+                    loCmpOper   = GT_GE;
+                    break;
+                default:
+                    unreached();
+            }
+
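+            // For example, a signed GT_LT (x < y) maps to: a GT_GT compare of the high halves
+            // in BB0 (hi(x) > hi(y) means the result is false, jump to BB1), a GT_LT jcc in
+            // BB3 that reuses the condition flags set by BB0's compare (hi(x) < hi(y) means
+            // the result is true, jump to BB2), and an unsigned GT_LT compare of the low
+            // halves in BB4, which is reached only when the high halves are equal.
+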
+            BasicBlock* newBlock2 = comp->fgSplitBlockAtEnd(newBlock);
+
+            GenTree* hiJcc = new (comp, GT_JCC) GenTreeJumpCC(hiCmpOper);
+            hiJcc->gtFlags = cmp->gtFlags;
+            LIR::AsRange(newBlock).InsertAfter(nullptr, hiJcc);
+
+            BlockRange().Remove(loSrc1.Def());
+            BlockRange().Remove(loSrc2.Def());
+            GenTree* loCmp   = comp->gtNewOperNode(loCmpOper, TYP_INT, loSrc1.Def(), loSrc2.Def());
+            loCmp->gtFlags   = cmp->gtFlags | GTF_UNSIGNED;
+            GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
+            LIR::AsRange(newBlock2).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
+
+            m_block->bbJumpKind = BBJ_COND;
+            m_block->bbJumpDest = nextDest;
+            nextDest->bbFlags |= BBF_JMP_TARGET;
+            comp->fgAddRefPred(nextDest, m_block);
+
+            newBlock->bbJumpKind = BBJ_COND;
+            newBlock->bbJumpDest = jumpDest;
+            comp->fgAddRefPred(jumpDest, newBlock);
+
+            assert(newBlock2->bbJumpKind == BBJ_COND);
+            assert(newBlock2->bbJumpDest == jumpDest);
+        }
+
+        BlockRange().Remove(src1);
+        BlockRange().Remove(src2);
+    }
+#endif
+
 #ifdef _TARGET_XARCH_
 #ifdef _TARGET_AMD64_
     if (cmp->gtGetOp1()->TypeGet() != cmp->gtGetOp2()->TypeGet())
@@ -2112,247 +2353,7 @@ void Lowering::LowerCompare(GenTree* cmp)
             cmp->gtFlags |= GTF_UNSIGNED;
         }
     }
-
 #endif // _TARGET_XARCH_
-
-#ifndef _TARGET_64BIT_
-    if (cmp->gtGetOp1()->TypeGet() != TYP_LONG)
-    {
-        return;
-    }
-
-    LIR::Use cmpUse;
-
-    if (!BlockRange().TryGetUse(cmp, &cmpUse) || cmpUse.User()->OperGet() != GT_JTRUE)
-    {
-        return;
-    }
-
-    // For 32-bit targets any comparison that feeds a `GT_JTRUE` node must be lowered
-    // such that the liveness of the operands to the is properly visible to the rest
-    // of the backend. As such, a 64-bit comparison is lowered from something like this:
-    //
-    // ------------ BB02 [004..014) -> BB02 (cond), preds={BB02,BB01} succs={BB03,BB02}
-    // N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148
-    //
-    //                /--* t6 int
-    // N002 ( 2, 3) [000007] ---------U-- t7 = * cast long <- ulong <- uint $3c0
-    //
-    // N003 ( 3, 10) [000009] ------------ t9 = lconst long 0x0000000000000003 $101
-    //
-    //                /--* t7 long
-    //                +--* t9 long
-    // N004 ( 9, 17) [000010] N------N-U-- t10 = * < int $149
-    //
-    //                /--* t10 int
-    // N005 ( 11, 19) [000011] ------------ * jmpTrue void
-    //
-    // To something like this:
-    //
-    // ------------ BB02 [004..014) -> BB03 (cond), preds={BB06,BB07,BB01} succs={BB06,BB03}
-    // [000099] ------------ t99 = const int 0
-    //
-    // [000101] ------------ t101 = const int 0
-    //
-    //                /--* t99 int
-    //                +--* t101 int
-    // N004 ( 9, 17) [000010] N------N-U-- t10 = * > int $149
-    //
-    //                /--* t10 int
-    // N005 ( 11, 19) [000011] ------------ * jmpTrue void
-    //
-    //
-    // ------------ BB06 [???..???) -> BB02 (cond), preds={BB02} succs={BB07,BB02}
-    // [000105] -------N-U-- jcc void cond=<
-    //
-    //
-    // ------------ BB07 [???..???) -> BB02 (cond), preds={BB06} succs={BB03,BB02}
-    // N001 ( 1, 1) [000006] ------------ t6 = lclVar int V02 loc0 u:5 $148
-    //
-    // N003 ( 3, 10) [000009] ------------ t9 = const int 3
-    //
-    //                /--* t6 int
-    //                +--* t9 int
-    // [000106] N------N-U-- t106 = * < int
-    //
-    //                /--* t106 int
-    // [000107] ------------ * jmpTrue void
-    //
-    // Which will eventually generate code similar to the following:
-    //
-    // 33DB   xor  ebx, ebx
-    // 85DB   test ebx, ebx
-    // 7707   ja   SHORT G_M50523_IG04
-    // 72E7   jb   SHORT G_M50523_IG03
-    // 83F803 cmp  eax, 3
-    // 72E2   jb   SHORT G_M50523_IG03
-    //
-
-    GenTree* src1 = cmp->gtGetOp1();
-    GenTree* src2 = cmp->gtGetOp2();
-    unsigned weight = m_block->getBBWeight(comp);
-
-    LIR::Use loSrc1(BlockRange(), &(src1->gtOp.gtOp1), src1);
-    LIR::Use loSrc2(BlockRange(), &(src2->gtOp.gtOp1), src2);
-
-    if (loSrc1.Def()->OperGet() != GT_CNS_INT && loSrc1.Def()->OperGet() != GT_LCL_VAR)
-    {
-        loSrc1.ReplaceWithLclVar(comp, weight);
-    }
-
-    if (loSrc2.Def()->OperGet() != GT_CNS_INT && loSrc2.Def()->OperGet() != GT_LCL_VAR)
-    {
-        loSrc2.ReplaceWithLclVar(comp, weight);
-    }
-
-    BasicBlock* jumpDest = m_block->bbJumpDest;
-    BasicBlock* nextDest = m_block->bbNext;
-    BasicBlock* newBlock = comp->fgSplitBlockAtEnd(m_block);
-
-    cmp->gtType = TYP_INT;
-    cmp->gtOp.gtOp1 = src1->gtOp.gtOp2;
-    cmp->gtOp.gtOp2 = src2->gtOp.gtOp2;
-
-    if (cmp->OperGet() == GT_EQ || cmp->OperGet() == GT_NE)
-    {
-        // 64-bit equality comparisons (no matter the polarity) require two 32-bit comparisons: one for the upper 32
-        // bits and one for the lower 32 bits. As such, we update the flow graph like so:
-        //
-        //     Before:
-        //         BB0: cond
-        //          /     \
-        //       false    true
-        //         |        |
-        //        BB1      BB2
-        //
-        //     After:
-        //         BB0: cond(hi)
-        //          /      \
-        //       false     true
-        //         |         |
-        //         |   BB3: cond(lo)
-        //         |     /      \
-        //         |  false     true
-        //          \   /         |
-        //           BB1         BB2
-        //
-
-        BlockRange().Remove(loSrc1.Def());
-        BlockRange().Remove(loSrc2.Def());
-        GenTree* loCmp = comp->gtNewOperNode(cmp->OperGet(), TYP_INT, loSrc1.Def(), loSrc2.Def());
-        loCmp->gtFlags = cmp->gtFlags;
-        GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
-        LIR::AsRange(newBlock).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
-
-        m_block->bbJumpKind = BBJ_COND;
-
-        if (cmp->OperGet() == GT_EQ)
-        {
-            cmp->gtOper = GT_NE;
-            m_block->bbJumpDest = nextDest;
-            nextDest->bbFlags |= BBF_JMP_TARGET;
-            comp->fgAddRefPred(nextDest, m_block);
-        }
-        else
-        {
-            m_block->bbJumpDest = jumpDest;
-            comp->fgAddRefPred(jumpDest, m_block);
-        }
-
-        assert(newBlock->bbJumpKind == BBJ_COND);
-        assert(newBlock->bbJumpDest == jumpDest);
-    }
-    else
-    {
-        // 64-bit ordinal comparisons are more complicated: they require two comparisons for the upper 32 bits and one
-        // comparison for the lower 32 bits. We update the flowgraph as such:
-        //
-        //     Before:
-        //         BB0: cond
-        //          /     \
-        //       false    true
-        //         |        |
-        //        BB1      BB2
-        //
-        //     After:
-        //         BB0: (!cond(hi) && !eq(hi))
-        //           /            \
-        //         true          false
-        //          |              |
-        //          |     BB3: (cond(hi) && !eq(hi))
-        //          |         /            \
-        //          |      false           true
-        //          |        |               |
-        //          |     BB4: cond(lo)      |
-        //          |      /      \          |
-        //          |   false     true       |
-        //           \   /          \       /
-        //            BB1            BB2
-        //
-        //
-        // Note that the actual comparisons used to implement "(!cond(hi) && !eq(hi))" and "(cond(hi) && !eq(hi))"
-        // differ based on the original condition, and all consist of a single node. The switch statement below
-        // performs the necessary mapping.
-        //
-
-        genTreeOps hiCmpOper;
-        genTreeOps loCmpOper;
-
-        switch (cmp->OperGet())
-        {
-            case GT_LT:
-                cmp->gtOper = GT_GT;
-                hiCmpOper = GT_LT;
-                loCmpOper = GT_LT;
-                break;
-            case GT_LE:
-                cmp->gtOper = GT_GT;
-                hiCmpOper = GT_LT;
-                loCmpOper = GT_LE;
-                break;
-            case GT_GT:
-                cmp->gtOper = GT_LT;
-                hiCmpOper = GT_GT;
-                loCmpOper = GT_GT;
-                break;
-            case GT_GE:
-                cmp->gtOper = GT_LT;
-                hiCmpOper = GT_GT;
-                loCmpOper = GT_GE;
-                break;
-            default:
-                unreached();
-        }
-
-        BasicBlock* newBlock2 = comp->fgSplitBlockAtEnd(newBlock);
-
-        GenTree* hiJcc = new (comp, GT_JCC) GenTreeJumpCC(hiCmpOper);
-        hiJcc->gtFlags = cmp->gtFlags;
-        LIR::AsRange(newBlock).InsertAfter(nullptr, hiJcc);
-
-        BlockRange().Remove(loSrc1.Def());
-        BlockRange().Remove(loSrc2.Def());
-        GenTree* loCmp = comp->gtNewOperNode(loCmpOper, TYP_INT, loSrc1.Def(), loSrc2.Def());
-        loCmp->gtFlags = cmp->gtFlags | GTF_UNSIGNED;
-        GenTree* loJtrue = comp->gtNewOperNode(GT_JTRUE, TYP_VOID, loCmp);
-        LIR::AsRange(newBlock2).InsertAfter(nullptr, loSrc1.Def(), loSrc2.Def(), loCmp, loJtrue);
-
-        m_block->bbJumpKind = BBJ_COND;
-        m_block->bbJumpDest = nextDest;
-        nextDest->bbFlags |= BBF_JMP_TARGET;
-        comp->fgAddRefPred(nextDest, m_block);
-
-        newBlock->bbJumpKind = BBJ_COND;
-        newBlock->bbJumpDest = jumpDest;
-        comp->fgAddRefPred(jumpDest, newBlock);
-
-        assert(newBlock2->bbJumpKind == BBJ_COND);
-        assert(newBlock2->bbJumpDest == jumpDest);
-    }
-
-    BlockRange().Remove(src1);
-    BlockRange().Remove(src2);
-#endif
 }
 
 // Lower "jmp <method>" tail call to insert PInvoke method epilog if required.
-- 
2.7.4