Simplify SIMD EQ/NE optimization
authorMike Danes <onemihaid@hotmail.com>
Sun, 17 Sep 2017 09:49:10 +0000 (12:49 +0300)
committerMike Danes <onemihaid@hotmail.com>
Mon, 2 Oct 2017 05:30:07 +0000 (08:30 +0300)
src/jit/codegenxarch.cpp
src/jit/lower.cpp
src/jit/lowerxarch.cpp
src/jit/lsra.cpp
src/jit/lsraxarch.cpp
src/jit/simdcodegenxarch.cpp

index 58c6c10..43327af 100644 (file)
@@ -6282,27 +6282,6 @@ void CodeGen::genCompareInt(GenTreePtr treeNode)
         return;
     }
 
-#ifdef FEATURE_SIMD
-    // If we have GT_JTRUE(GT_EQ/NE(GT_SIMD((in)Equality, v1, v2), true/false)),
-    // then we don't need to generate code for GT_EQ/GT_NE, since SIMD (in)Equality intrinsic
-    // would set or clear Zero flag.
-    if ((targetReg == REG_NA) && tree->OperIs(GT_EQ, GT_NE))
-    {
-        // Is it a SIMD (in)Equality that doesn't need to materialize result into a register?
-        if ((op1->gtRegNum == REG_NA) && op1->IsSIMDEqualityOrInequality())
-        {
-            // Must be comparing against true or false.
-            assert(op2->IsIntegralConst(0) || op2->IsIntegralConst(1));
-            assert(op2->isContainedIntOrIImmed());
-
-            // In this case SIMD (in)Equality will set or clear
-            // Zero flag, based on which GT_JTRUE would generate
-            // the right conditional jump.
-            return;
-        }
-    }
-#endif // FEATURE_SIMD
-
     genConsumeOperands(tree);
 
     assert(!op1->isContainedIntOrIImmed());
index d82676e..0b9f9b2 100644 (file)
@@ -5606,56 +5606,6 @@ void Lowering::ContainCheckJTrue(GenTreeOp* node)
     // The compare does not need to be generated into a register.
     GenTree* cmp                   = node->gtGetOp1();
     cmp->gtLsraInfo.isNoRegCompare = true;
-
-#ifdef FEATURE_SIMD
-    assert(node->OperIs(GT_JTRUE));
-
-    // Say we have the following IR
-    //   simdCompareResult = GT_SIMD((In)Equality, v1, v2)
-    //   integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
-    //   GT_JTRUE(integerCompareResult)
-    //
-    // In this case we don't need to generate code for GT_EQ_/NE, since SIMD (In)Equality
-    // intrinsic will set or clear the Zero flag.
-    genTreeOps cmpOper = cmp->OperGet();
-    if (cmpOper == GT_EQ || cmpOper == GT_NE)
-    {
-        GenTree* cmpOp1 = cmp->gtGetOp1();
-        GenTree* cmpOp2 = cmp->gtGetOp2();
-
-        if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
-        {
-            // We always generate code for a SIMD equality comparison, though it produces no value.
-            // Neither the GT_JTRUE nor the immediate need to be evaluated.
-            MakeSrcContained(cmp, cmpOp2);
-            cmpOp1->gtLsraInfo.isNoRegCompare = true;
-            // We have to reverse compare oper in the following cases:
-            // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it.
-            //    Therefore, if compare oper is == or != against false(0), we will
-            //    be checking opposite of what is required.
-            //
-            // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it.
-            //    Therefore, if compare oper is == or != against true(1), we will
-            //    be checking opposite of what is required.
-            GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
-            if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
-            {
-                if (cmpOp2->IsIntegralConst(0))
-                {
-                    cmp->SetOper(GenTree::ReverseRelop(cmpOper));
-                }
-            }
-            else
-            {
-                assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
-                if (cmpOp2->IsIntegralConst(1))
-                {
-                    cmp->SetOper(GenTree::ReverseRelop(cmpOper));
-                }
-            }
-        }
-    }
-#endif // FEATURE_SIMD
 }
 
 #endif // !LEGACY_BACKEND
index bb45279..834cd34 100644 (file)
@@ -794,6 +794,73 @@ void Lowering::LowerSIMD(GenTreeSIMD* simdNode)
         // the addr of SIMD vector with the given index.
         simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
     }
+    else if (simdNode->IsSIMDEqualityOrInequality())
+    {
+        LIR::Use simdUse;
+
+        if (BlockRange().TryGetUse(simdNode, &simdUse))
+        {
+            //
+            // Try to transform JTRUE(EQ|NE(SIMD<OpEquality|OpInEquality>(x, y), 0|1)) into
+            // JCC(SIMD<OpEquality|OpInEquality>(x, y)). SIMD<OpEquality|OpInEquality>(x, y)
+            // is expected to set the Zero flag appropriately.
+            // All the involved nodes must form a continuous range, there's no other way to
+            // guarantee that condition flags aren't changed between the SIMD node and the JCC
+            // node.
+            //
+
+            bool     transformed = false;
+            GenTree* simdUser    = simdUse.User();
+
+            if (simdUser->OperIs(GT_EQ, GT_NE) && simdUser->gtGetOp2()->IsCnsIntOrI() &&
+                (simdNode->gtNext == simdUser->gtGetOp2()) && (simdUser->gtGetOp2()->gtNext == simdUser))
+            {
+                ssize_t relopOp2Value = simdUser->gtGetOp2()->AsIntCon()->IconValue();
+
+                if ((relopOp2Value == 0) || (relopOp2Value == 1))
+                {
+                    GenTree* jtrue = simdUser->gtNext;
+
+                    if ((jtrue != nullptr) && jtrue->OperIs(GT_JTRUE) && (jtrue->gtGetOp1() == simdUser))
+                    {
+                        if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) != simdUser->OperIs(GT_EQ))
+                        {
+                            relopOp2Value ^= 1;
+                        }
+
+                        jtrue->ChangeOper(GT_JCC);
+                        GenTreeCC* jcc = jtrue->AsCC();
+                        jcc->gtFlags |= GTF_USE_FLAGS;
+                        jcc->gtCondition = (relopOp2Value == 0) ? GT_NE : GT_EQ;
+
+                        BlockRange().Remove(simdUser->gtGetOp2());
+                        BlockRange().Remove(simdUser);
+                        transformed = true;
+                    }
+                }
+            }
+
+            if (!transformed)
+            {
+                //
+                // The code generated for SIMD SIMD<OpEquality|OpInEquality>(x, y) nodes sets
+                // the Zero flag like integer compares do so we can simply use SETCC<EQ|NE>
+                // to produce the desired result. This avoids the need for subsequent phases
+                // to have to handle 2 cases (set flags/set destination register).
+                //
+
+                genTreeOps condition = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? GT_EQ : GT_NE;
+                GenTreeCC* setcc     = new (comp, GT_SETCC) GenTreeCC(GT_SETCC, condition, simdNode->TypeGet());
+                setcc->gtFlags |= GTF_USE_FLAGS;
+                BlockRange().InsertAfter(simdNode, setcc);
+                simdUse.ReplaceWith(comp, setcc);
+            }
+        }
+
+        simdNode->gtFlags |= GTF_SET_FLAGS;
+        simdNode->SetUnusedValue();
+        simdNode->gtLsraInfo.isNoRegCompare = true;
+    }
 #endif
     ContainCheckSIMD(simdNode);
 }
index db6e699..1ace2f0 100644 (file)
@@ -4842,7 +4842,7 @@ void LinearScan::buildIntervals()
             TreeNodeInfoInit(node);
 
             // If the node produces an unused value, mark it as a local def-use
-            if (node->IsValue() && node->IsUnusedValue())
+            if (node->IsValue() && node->IsUnusedValue() && !node->gtLsraInfo.isNoRegCompare)
             {
                 node->gtLsraInfo.isLocalDefUse = true;
                 node->gtLsraInfo.dstCount      = 0;
index 8ac8e23..4fff111 100644 (file)
@@ -254,20 +254,6 @@ void LinearScan::TreeNodeInfoInit(GenTree* tree)
 
             GenTree* cmp = tree->gtGetOp1();
             assert(cmp->gtLsraInfo.dstCount == 0);
-
-#ifdef FEATURE_SIMD
-            GenTree* cmpOp1 = cmp->gtGetOp1();
-            GenTree* cmpOp2 = cmp->gtGetOp2();
-            if (cmpOp1->IsSIMDEqualityOrInequality() && cmpOp2->isContained())
-            {
-                genTreeOps cmpOper = cmp->OperGet();
-
-                // We always generate code for a SIMD equality comparison, but the compare itself produces no value.
-                // Neither the SIMD node nor the immediate need to be evaluated into a register.
-                assert(cmpOp1->gtLsraInfo.dstCount == 0);
-                assert(cmpOp2->gtLsraInfo.dstCount == 0);
-            }
-#endif // FEATURE_SIMD
         }
         break;
 
@@ -2136,41 +2122,26 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree)
 
         case SIMDIntrinsicOpEquality:
         case SIMDIntrinsicOpInEquality:
-
-            // On SSE4/AVX, we can generate optimal code for (in)equality
-            // against zero using ptest. We can safely do this optimization
-            // for integral vectors but not for floating-point for the reason
-            // that we have +0.0 and -0.0 and +0.0 == -0.0
             if (simdTree->gtGetOp2()->isContained())
             {
+                // If the second operand is contained then ContainCheckSIMD has determined
+                // that PTEST can be used. We only need a single source register and no
+                // internal registers.
                 info->srcCount = 1;
             }
             else
             {
-                info->srcCount = 2;
-                // Need one SIMD register as scratch.
-                // See genSIMDIntrinsicRelOp() for details on code sequence generated and
-                // the need for one scratch register.
-                //
-                // Note these intrinsics produce a BOOL result, hence internal float
-                // registers reserved are guaranteed to be different from target
-                // integer register without explicitly specifying.
+                // Can't use PTEST so we need 2 source registers, 1 internal SIMD register
+                // (to hold the result of PCMPEQD or other similar SIMD compare instruction)
+                // and one internal INT register (to hold the result of PMOVMSKB).
+                info->srcCount           = 2;
                 info->internalFloatCount = 1;
                 info->setInternalCandidates(this, allSIMDRegs());
+                info->internalIntCount = 1;
+                info->addInternalCandidates(this, allRegs(TYP_INT));
             }
-            if (info->isNoRegCompare)
-            {
-                info->dstCount = 0;
-                // Codegen of SIMD (in)Equality uses target integer reg only for setting flags.
-                // A target reg is not needed on AVX when comparing against Vector Zero.
-                // In all other cases we need to reserve an int type internal register if we
-                // don't have a target register on the compare.
-                if (!compiler->canUseAVX() || !simdTree->gtGetOp2()->IsIntegralConstVector(0))
-                {
-                    info->internalIntCount = 1;
-                    info->addInternalCandidates(this, allRegs(TYP_INT));
-                }
-            }
+            // These SIMD nodes only set the condition flags.
+            info->dstCount = 0;
             break;
 
         case SIMDIntrinsicDotProduct:
index b7c9b29..429cff0 100644 (file)
@@ -2049,6 +2049,9 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
         case SIMDIntrinsicOpEquality:
         case SIMDIntrinsicOpInEquality:
         {
+            // We're only setting condition flags, if a 0/1 value is desired then Lowering should have inserted a SETCC.
+            assert(targetReg == REG_NA);
+
             var_types simdType = op1->TypeGet();
             // TODO-1stClassStructs: Temporary to minimize asmDiffs
             if (simdType == TYP_DOUBLE)
@@ -2064,9 +2067,9 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
             }
 
             // On SSE4/AVX, we can generate optimal code for (in)equality against zero using ptest.
-            if ((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0))
+            if (op2->isContained())
             {
-                assert(op2->isContained());
+                assert((compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) && op2->IsIntegralConstVector(0));
                 inst_RV_RV(INS_ptest, op1->gtRegNum, op1->gtRegNum, simdType, emitActualTypeSize(simdType));
             }
             else
@@ -2103,22 +2106,7 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                     inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
                 }
 
-                regNumber intReg;
-                if (targetReg == REG_NA)
-                {
-                    // If we are not materializing result into a register,
-                    // we would have reserved an int type internal register.
-                    intReg = simdNode->GetSingleTempReg(RBM_ALLINT);
-                }
-                else
-                {
-                    // We can use targetReg for setting flags.
-                    intReg = targetReg;
-
-                    // Must have not reserved any int type internal registers.
-                    assert(simdNode->AvailableTempRegCount(RBM_ALLINT) == 0);
-                }
-
+                regNumber intReg = simdNode->GetSingleTempReg(RBM_ALLINT);
                 inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
                 // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
                 // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
@@ -2147,27 +2135,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                 }
                 getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask);
             }
-
-            if (targetReg != REG_NA)
-            {
-                // If we need to materialize result into a register,  targetReg needs to
-                // be set to 1 on true and zero on false.
-                // Equality:
-                //   cmp targetReg, 0xFFFFFFFF or 0xFFFF
-                //   sete targetReg
-                //   movzx targetReg, targetReg
-                //
-                // InEquality:
-                //   cmp targetReg, 0xFFFFFFFF or 0xFFFF
-                //   setne targetReg
-                //   movzx targetReg, targetReg
-                //
-                assert(simdNode->TypeGet() == TYP_INT);
-                inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg,
-                        TYP_INT, EA_1BYTE);
-                // Set the higher bytes to 0
-                inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
-            }
         }
         break;