Optimize codegen when SIMD (in)Equality that produces bool result is compared against...
authorsivarv <sivarv@microsoft.com>
Wed, 28 Sep 2016 23:29:14 +0000 (16:29 -0700)
committersivarv <sivarv@microsoft.com>
Thu, 29 Sep 2016 23:36:38 +0000 (16:36 -0700)
src/jit/codegenxarch.cpp
src/jit/gentree.cpp
src/jit/gentree.h
src/jit/lowerxarch.cpp
src/jit/lsra.cpp
src/jit/simdcodegenxarch.cpp
tests/src/JIT/SIMD/VectorIntEquals.cs

index 269995f8e86a555b1cd899f5abf61d681a526070..6045b83bd1aceebcf66a60ee7e1385616c0a9948 100644 (file)
@@ -7570,18 +7570,42 @@ void CodeGen::genCompareInt(GenTreePtr treeNode)
 {
     assert(treeNode->OperIsCompare());
 
-    GenTreeOp* tree    = treeNode->AsOp();
-    GenTreePtr op1     = tree->gtOp1;
-    GenTreePtr op2     = tree->gtOp2;
-    var_types  op1Type = op1->TypeGet();
-    var_types  op2Type = op2->TypeGet();
+    GenTreeOp* tree      = treeNode->AsOp();
+    GenTreePtr op1       = tree->gtOp1;
+    GenTreePtr op2       = tree->gtOp2;
+    var_types  op1Type   = op1->TypeGet();
+    var_types  op2Type   = op2->TypeGet();
+    regNumber  targetReg = treeNode->gtRegNum;
+
+#ifdef FEATURE_SIMD
+    // If we have GT_JTRUE(GT_EQ/NE(GT_SIMD((in)Equality, v1, v2), true/false)),
+    // then we don't need to generate code for GT_EQ/GT_NE, since SIMD (in)Equality intrinsic
+    // would set or clear Zero flag.
+    //
+    // Is treeNode == or != that doesn't need to materialize result into a register?
+    if ((tree->OperGet() == GT_EQ || tree->OperGet() == GT_NE) && (targetReg == REG_NA))
+    {
+        // Is it a SIMD (in)Equality that doesn't need to
+        // materialize result into a register?
+        if (op1->gtRegNum == REG_NA && op1->IsSIMDEqualityOrInequality())
+        {
+            // Must be comparing against true or false.
+            assert(op2->IsIntegralConst(0) || op2->IsIntegralConst(1));
+            assert(op2->isContainedIntOrIImmed());
+
+            // In this case SIMD (in)Equality will set or clear
+            // Zero flag, based on which GT_JTRUE would generate
+            // the right conditional jump.
+            return;
+        }
+    }
+#endif // FEATURE_SIMD
 
     genConsumeOperands(tree);
 
     instruction ins;
     emitAttr    cmpAttr;
 
-    regNumber targetReg = treeNode->gtRegNum;
     // TODO-CQ: We should be able to support swapping op1 and op2 to generate cmp reg, imm.
     // https://github.com/dotnet/coreclr/issues/7270
     assert(!op1->isContainedIntOrIImmed()); // We no longer support
index a60574120116460d3fe13cde4bb8bf35f99d4e6f..8c2dd967c8bbcc939e8319663512d329f6d57413 100644 (file)
@@ -15615,6 +15615,12 @@ bool GenTree::isContained() const
         return false;
     }
 
+    // these either produce a result in register or set flags reg.
+    if (IsSIMDEqualityOrInequality())
+    {
+        return false;
+    }
+
     // TODO-Cleanup : this is not clean, would be nice to have some way of marking this.
     switch (OperGet())
     {
index a4c023aaee6342d9270eb3eb6625c7a183bdb9ac..fb4dfa3cf7d0003c9368884930df14d0a5a34003 100644 (file)
@@ -1494,6 +1494,8 @@ public:
 
     inline bool IsBoxedValue();
 
+    inline bool IsSIMDEqualityOrInequality() const;
+
     static bool OperIsList(genTreeOps gtOper)
     {
         return gtOper == GT_LIST;
@@ -4913,6 +4915,21 @@ inline bool GenTree::IsBoxedValue()
     return (gtOper == GT_BOX) && (gtFlags & GTF_BOX_VALUE);
 }
 
+inline bool GenTree::IsSIMDEqualityOrInequality() const
+{
+#ifdef FEATURE_SIMD
+    if (gtOper == GT_SIMD)
+    {
+        // Has to cast away const-ness since AsSIMD() method is non-const.
+        GenTreeSIMD* simdNode = const_cast<GenTree*>(this)->AsSIMD();
+        return (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality ||
+                simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
+    }
+#endif
+
+    return false;
+}
+
 inline GenTreePtr GenTree::MoveNext()
 {
     assert(OperIsAnyList());
index 408398bb16d95b63c501c008a62e97007b77fa49..9489dab8b10058f44d106f25ddae934e96534d20 100644 (file)
@@ -328,10 +328,83 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
             break;
 
         case GT_JTRUE:
+        {
             info->srcCount = 0;
             info->dstCount = 0;
-            l->clearDstCount(tree->gtOp.gtOp1);
-            break;
+
+            GenTree* cmp = tree->gtGetOp1();
+            l->clearDstCount(cmp);
+
+#ifdef FEATURE_SIMD
+            // Say we have the following IR
+            //   simdCompareResult = GT_SIMD((In)Equality, v1, v2)
+            //   integerCompareResult = GT_EQ/NE(simdCompareResult, true/false)
+            //   GT_JTRUE(integerCompareResult)
+            //
+            // In this case we don't need to generate code fo GT_EQ_/NE, since SIMD (In)Equality
+            // intrinsic would set or clear Zero flag.
+
+            genTreeOps cmpOper = cmp->OperGet();
+            if (cmpOper == GT_EQ || cmpOper == GT_NE)
+            {
+                GenTree* cmpOp1 = cmp->gtGetOp1();
+                GenTree* cmpOp2 = cmp->gtGetOp2();
+
+                if (cmpOp1->IsSIMDEqualityOrInequality() && (cmpOp2->IsIntegralConst(0) || cmpOp2->IsIntegralConst(1)))
+                {
+                    // clear dstCount on SIMD node to indicate that
+                    // result doesn't need to be materialized into a register.
+                    l->clearOperandCounts(cmp);
+                    l->clearDstCount(cmpOp1);
+                    l->clearOperandCounts(cmpOp2);
+
+                    // Codegen of SIMD (in)Equality uses target integer reg
+                    // only for setting flags.  Target reg is not needed on AVX
+                    // when comparing against Vector Zero.  In all other cases
+                    // we need to reserve an int type internal register, since we
+                    // have cleared dstCount.
+                    if (compiler->canUseAVX() && cmpOp1->gtGetOp2()->IsIntegralConstVector(0))
+                    {
+                        // We don't need an internal register,since we use vptest
+                        // for setting flags.
+                    }
+                    else
+                    {
+                        ++(cmpOp1->gtLsraInfo.internalIntCount);
+                        regMaskTP internalCandidates = cmpOp1->gtLsraInfo.getInternalCandidates(l);
+                        internalCandidates |= l->allRegs(TYP_INT);
+                        cmpOp1->gtLsraInfo.setInternalCandidates(l, internalCandidates);
+                    }
+
+                    // We would have to reverse compare oper in the following cases:
+                    // 1) SIMD Equality: Sets Zero flag on equal otherwise clears it.
+                    //    Therefore, if compare oper is == or != against false(0), we will
+                    //    be checking opposite of what is required.
+                    //
+                    // 2) SIMD inEquality: Clears Zero flag on true otherwise sets it.
+                    //    Therefore, if compare oper is == or != against true(1), we will
+                    //    be checking opposite of what is required.
+                    GenTreeSIMD* simdNode = cmpOp1->AsSIMD();
+                    if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality)
+                    {
+                        if (cmpOp2->IsIntegralConst(0))
+                        {
+                            cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+                        }
+                    }
+                    else
+                    {
+                        assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpInEquality);
+                        if (cmpOp2->IsIntegralConst(1))
+                        {
+                            cmp->SetOper(GenTree::ReverseRelop(cmpOper));
+                        }
+                    }
+                }
+            }
+#endif // FEATURE_SIMD
+        }
+        break;
 
         case GT_JCC:
             info->srcCount = 0;
index addccf38018b4a8400e6890cbc3d9970513e6f8f..774d1fb9e864ecfc7ee327f8bf1cefe7540b66ef 100644 (file)
@@ -3290,7 +3290,7 @@ static int ComputeOperandDstCount(GenTree* operand)
         // If an operand has no destination registers but does have source registers, it must be a store
         // or a compare.
         assert(operand->OperIsStore() || operand->OperIsBlkOp() || operand->OperIsPutArgStk() ||
-               operand->OperIsCompare());
+               operand->OperIsCompare() || operand->IsSIMDEqualityOrInequality());
         return 0;
     }
     else if (!operand->OperIsFieldListHead() && (operand->OperIsStore() || operand->TypeGet() == TYP_VOID))
index 58a8ac71881b0d4f13f47173f5c01ad7519700fc..d04347cea35dac8f9ce9029d9d9de9c077a93530 100644 (file)
@@ -1032,11 +1032,10 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 //
 void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 {
-    GenTree*  op1       = simdNode->gtGetOp1();
-    GenTree*  op2       = simdNode->gtGetOp2();
-    var_types baseType  = simdNode->gtSIMDBaseType;
-    regNumber targetReg = simdNode->gtRegNum;
-    assert(targetReg != REG_NA);
+    GenTree*       op1        = simdNode->gtGetOp1();
+    GenTree*       op2        = simdNode->gtGetOp2();
+    var_types      baseType   = simdNode->gtSIMDBaseType;
+    regNumber      targetReg  = simdNode->gtRegNum;
     var_types      targetType = simdNode->TypeGet();
     InstructionSet iset       = compiler->getSIMDInstructionSet();
 
@@ -1050,6 +1049,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
         case SIMDIntrinsicEqual:
         case SIMDIntrinsicGreaterThan:
         {
+            assert(targetReg != REG_NA);
+
             // SSE2: vector<(u)long> relation op should be implemented in terms of TYP_INT comparison operations
             assert(((iset == InstructionSet_AVX) || (baseType != TYP_LONG)) && (baseType != TYP_ULONG));
 
@@ -1092,6 +1093,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
         case SIMDIntrinsicLessThan:
         case SIMDIntrinsicLessThanOrEqual:
         {
+            assert(targetReg != REG_NA);
+
             // Int vectors use ">" and ">=" with swapped operands
             assert(varTypeIsFloating(baseType));
 
@@ -1114,8 +1117,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
         case SIMDIntrinsicOpEquality:
         case SIMDIntrinsicOpInEquality:
         {
-            assert(genIsValidIntReg(targetReg));
-
             var_types simdType = op1->TypeGet();
             // TODO-1stClassStructs: Temporary to minimize asmDiffs
             if (simdType == TYP_DOUBLE)
@@ -1139,16 +1140,15 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
             }
             else
             {
+                // We need two additional XMM registers as scratch.
+                regMaskTP floatRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLFLOAT);
+                assert(floatRsvdRegs != RBM_NONE);
+                assert(genCountBits(floatRsvdRegs) == 2);
 
-                // We need two additional XMM register as scratch
-                assert(simdNode->gtRsvdRegs != RBM_NONE);
-                assert(genCountBits(simdNode->gtRsvdRegs) == 2);
-
-                regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
-                regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
-                tmpRegsMask &= ~tmpReg1Mask;
-                regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
-                regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
+                regMaskTP tmpRegMask = genFindLowestBit(floatRsvdRegs);
+                floatRsvdRegs &= ~tmpRegMask;
+                regNumber tmpReg1 = genRegNumFromMask(tmpRegMask);
+                regNumber tmpReg2 = genRegNumFromMask(floatRsvdRegs);
 
                 // tmpReg1 = (op1Reg == op2Reg)
                 // Call this value of tmpReg1 as 'compResult' for further reference below.
@@ -1219,11 +1219,36 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                 // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
                 inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
 
-                // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
+                regNumber intReg;
+                if (targetReg == REG_NA)
+                {
+                    // If we are not materializing result into a register,
+                    // we would have reserved an int type internal register.
+                    regMaskTP intRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLINT);
+                    assert(genCountBits(intRsvdRegs) == 1);
+                    intReg = genRegNumFromMask(intRsvdRegs);
+                }
+                else
+                {
+                    // We can use targetReg for setting flags.
+                    intReg = targetReg;
+
+                    // Must have not reserved any int type internal registers.
+                    assert(genCountBits(simdNode->gtRsvdRegs & RBM_ALLINT) == 0);
+                }
+
+                // intReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
                 // (Note that for mov_xmm2i, the int register is always in the reg2 position.
-                inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
+                inst_RV_RV(INS_mov_xmm2i, tmpReg1, intReg, TYP_INT);
+
+                //   cmp intReg, 0xFFFFFFFF
+                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, 0xFFFFFFFF);
+            }
 
-                // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
+            if (targetReg != REG_NA)
+            {
+                // If we need to materialize result into a register,  targetReg needs to
+                // be set to 1 on true and zero on false.
                 // Equality:
                 //   cmp targetReg, 0xFFFFFFFF
                 //   sete targetReg
@@ -1234,14 +1259,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                 //   setne targetReg
                 //   movzx targetReg, targetReg
                 //
-                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
+                assert(simdNode->TypeGet() == TYP_INT);
+                inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg,
+                        TYP_INT, EA_1BYTE);
+                // Set the higher bytes to 0
+                inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
             }
-
-            inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT,
-                    EA_1BYTE);
-            assert(simdNode->TypeGet() == TYP_INT);
-            // Set the higher bytes to 0
-            inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
         }
         break;
 
index c5d818027c663cd3059b58511a567ff63e10c4e1..fae49aac1dcbfd050bcfa06e405f91fc160312c4 100644 (file)
@@ -39,6 +39,46 @@ internal partial class VectorTest
             return Fail;
         }
 
+        if (Vector<int>.Zero.Equals(B))
+        {
+            return Fail;
+        }
+
+        if (!(A == B))
+        {
+            return Fail;
+        }
+
+        if (A == Vector<int>.Zero)
+        {
+            return Fail;
+        }
+
+        if (!(A != Vector<int>.Zero))
+        {
+            return Fail;
+        }
+
+        if (A != B)
+        {
+            return Fail;
+        }
+
+        if (!(A != C))
+        {
+            return Fail;
+        }
+
+        if (!(Vector<int>.Zero != A))
+        {
+            return Fail;
+        }
+
+        if (Vector<int>.Zero != Vector<int>.Zero)
+        {
+            return Fail;
+        }
+
         return Pass;
     }