Change vector equality to use pmovmskb
author    Mike Danes <onemihaid@hotmail.com>
Mon, 21 Nov 2016 18:30:28 +0000 (20:30 +0200)
committer Mike Danes <onemihaid@hotmail.com>
Wed, 23 Nov 2016 06:12:48 +0000 (08:12 +0200)
This change replaces the rather long shuffle-based compare sequence with
pmovmskb, which is available in both SSE2 and AVX2. The following code is
now generated:

C4E16D76D1           vpcmpeqd ymm2, ymm1
C4E17DD7C2           vpmovmskb eax, ymm2
83F8FF               cmp      eax, -1
0F94C0               sete     al
0FB6C0               movzx    rax, al
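
For illustration, the 16-byte (SSE2) form of this pattern can be written with the
standard _mm_cmpeq_epi32 and _mm_movemask_epi8 intrinsics. This is a minimal
sketch, not JIT code, and the function name is made up:

#include <emmintrin.h> // SSE2 intrinsics

// Compare elementwise, collect the per-byte sign bits with pmovmskb and
// check that all 16 bits are set (an XMM register has 16 bytes).
bool AllElementsEqual(__m128i a, __m128i b)
{
    __m128i eq   = _mm_cmpeq_epi32(a, b);  // pcmpeqd: all-ones/all-zeroes per element
    int     mask = _mm_movemask_epi8(eq);  // pmovmskb: MSB of each byte -> one bit
    return mask == 0xFFFF;                 // 16 bytes -> 16 mask bits, all set
}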

src/jit/compiler.hpp
src/jit/instrsxarch.h
src/jit/lowerxarch.cpp
src/jit/simdcodegenxarch.cpp

src/jit/compiler.hpp
index 704cf18..e8358fd 100644
@@ -473,10 +473,17 @@ inline unsigned Compiler::funGetFuncIdx(BasicBlock* block)
 
 #endif // !FEATURE_EH_FUNCLETS
 
-/*****************************************************************************
- *
- *  Map a register mask to a register number
- */
+//------------------------------------------------------------------------------
+// genRegNumFromMask : Maps a single register mask to a register number.
+//
+// Arguments:
+//    mask - the register mask
+//
+// Return Value:
+//    The number of the register contained in the mask.
+//
+// Assumptions:
+//    The mask contains one and only one register.
 
 inline regNumber genRegNumFromMask(regMaskTP mask)
 {
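
As a side note on the contract documented above: with exactly one bit set, the
register number is simply the index of that bit (log2 of the mask). A rough
sketch of that computation, not the JIT's actual implementation:

#include <cassert>

// Illustrative only: with exactly one bit set in the mask, the register
// number is the zero-based index of that bit.
unsigned RegNumFromSingleBitMask(unsigned long long mask)
{
    assert((mask != 0) && ((mask & (mask - 1)) == 0)); // exactly one bit set
    unsigned index = 0;
    while ((mask & 1) == 0)
    {
        mask >>= 1;
        index++;
    }
    return index;
}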
src/jit/instrsxarch.h
index 4b32cd4..4317334 100644
@@ -178,6 +178,7 @@ INST3(FIRST_SSE2_INSTRUCTION, "FIRST_SSE2_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CO
 // These are the SSE instructions used on x86
 INST3( mov_i2xmm,   "movd"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, PCKDBL(0x6E)) // Move int reg to a xmm reg. reg1=xmm reg, reg2=int reg 
 INST3( mov_xmm2i,   "movd"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, PCKDBL(0x7E)) // Move xmm reg to an int reg. reg1=xmm reg, reg2=int reg 
+INST3( pmovmskb,    "pmovmskb"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, PCKDBL(0xD7)) // Move the MSB bits of all bytes in a xmm reg to an int reg
 INST3( movq,        "movq"        , 0, IUM_WR, 0, 0, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E))
 INST3( movsdsse2,   "movsd"       , 0, IUM_WR, 0, 0, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10))
 
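
For reference, pmovmskb gathers the most significant bit of each byte of the source
vector into the low bits of an integer register. A small reference model of the
128-bit form, purely illustrative and not emitter code:

// Bit i of the result is the most significant bit of byte i of the source.
unsigned PmovmskbModel(const unsigned char src[16])
{
    unsigned mask = 0;
    for (int i = 0; i < 16; i++)
    {
        mask |= (unsigned)(src[i] >> 7) << i;
    }
    return mask;
}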
src/jit/lowerxarch.cpp
index 2fee13f..b9d2a21 100644
@@ -2754,14 +2754,14 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             else
             {
 
-                // Need two SIMD registers as scratch.
+                // Need one SIMD register as scratch.
                 // See genSIMDIntrinsicRelOp() for details on code sequence generate and
-                // the need for two scratch registers.
+                // the need for one scratch register.
                 //
                 // Note these intrinsics produce a BOOL result, hence internal float
                 // registers reserved are guaranteed to be different from target
                 // integer register without explicitly specifying.
-                info->internalFloatCount = 2;
+                info->internalFloatCount = 1;
                 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
             }
             break;
src/jit/simdcodegenxarch.cpp
index a55d344..20db803 100644
@@ -1141,15 +1141,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
             }
             else
             {
-                // We need two additional XMM registers as scratch.
-                regMaskTP floatRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLFLOAT);
-                assert(floatRsvdRegs != RBM_NONE);
-                assert(genCountBits(floatRsvdRegs) == 2);
-
-                regMaskTP tmpRegMask = genFindLowestBit(floatRsvdRegs);
-                floatRsvdRegs &= ~tmpRegMask;
-                regNumber tmpReg1 = genRegNumFromMask(tmpRegMask);
-                regNumber tmpReg2 = genRegNumFromMask(floatRsvdRegs);
+                // We need one additional SIMD register to store the result of the SIMD compare.
+                regNumber tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLFLOAT);
 
                 // tmpReg1 = (op1Reg == op2Reg)
                 // Call this value of tmpReg1 as 'compResult' for further reference below.
@@ -1180,54 +1173,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                     inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
                 }
 
-                // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
-                if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
-                {
-                    // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
-                    //
-                    // Generated code sequence
-                    // - vextractf128 tmpReg2, tmpReg1, 0x01
-                    //       tmpReg2[128..255] <- 0
-                    //       tmpReg2[0..127]   <- tmpReg1[128..255]
-                    // - vandps tmpReg1, tempReg2
-                    //       This will zero-out upper portion of tmpReg1 and
-                    //       lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
-                    getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
-                    inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
-                }
-                // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
-                if (simdType != TYP_SIMD8)
-                {
-                    // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
-                    // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
-                    getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
-
-                    // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
-                    //
-                    // Note that what we have computed is as follows at this point:
-                    // tmpReg1[0] = compResult[0] & compResult[2]
-                    // tmpReg1[1] = compResult[1] & compResult[3]
-                    inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
-                }
-                // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
-                // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
-
-                // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
-                // tmpReg2[0] = compResult[1] & compResult[3]
-                getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
-
-                // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
-                // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
-                inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
-
                 regNumber intReg;
                 if (targetReg == REG_NA)
                 {
                     // If we are not materializing result into a register,
                     // we would have reserved an int type internal register.
-                    regMaskTP intRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLINT);
-                    assert(genCountBits(intRsvdRegs) == 1);
-                    intReg = genRegNumFromMask(intRsvdRegs);
+                    intReg = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLINT);
                 }
                 else
                 {
@@ -1238,12 +1189,18 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                     assert(genCountBits(simdNode->gtRsvdRegs & RBM_ALLINT) == 0);
                 }
 
-                // intReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
-                // (Note that for mov_xmm2i, the int register is always in the reg2 position.
-                inst_RV_RV(INS_mov_xmm2i, tmpReg1, intReg, TYP_INT);
-
-                //   cmp intReg, 0xFFFFFFFF
-                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, 0xFFFFFFFF);
+                inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
+                // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
+                // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
+                // subset of each component's ones/zeroes. In the end we need to know if the result is
+                // "all ones" where the number of ones is given by the vector byte size, not by the
+                // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
+                // for SSE registers we need to compare to 0x0000FFFF.
+                // Note that -1 is used instead of 0xFFFFFFFF because on x64 the emitter doesn't
+                // recognize that 0xFFFFFFFF can be encoded as a sign-extended byte and emits the
+                // longer 3DFFFFFFFF encoding instead of 83F8FF.
+                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg,
+                                          emitActualTypeSize(simdType) == 32 ? -1 : 0x0000FFFF);
             }
 
             if (targetReg != REG_NA)
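
Following the reasoning in the comment above, the 32-byte (AVX2) form of the same
check needs all 32 mask bits set, i.e. a compare against 0xFFFFFFFF (emitted as -1).
A minimal sketch with the standard AVX2 intrinsics; the function name is illustrative:

#include <immintrin.h> // AVX2 intrinsics

// 32 bytes produce 32 mask bits; all of them must be set for equality.
bool AllElementsEqualAvx2(__m256i a, __m256i b)
{
    __m256i eq   = _mm256_cmpeq_epi32(a, b);  // vpcmpeqd
    int     mask = _mm256_movemask_epi8(eq);  // vpmovmskb
    return mask == -1;                        // all 32 bits set (0xFFFFFFFF)
}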
@@ -1251,12 +1208,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                 // If we need to materialize result into a register,  targetReg needs to
                 // be set to 1 on true and zero on false.
                 // Equality:
-                //   cmp targetReg, 0xFFFFFFFF
+                //   cmp targetReg, 0xFFFFFFFF or 0xFFFF
                 //   sete targetReg
                 //   movzx targetReg, targetReg
                 //
                 // InEquality:
-                //   cmp targetReg, 0xFFFFFFFF
+                //   cmp targetReg, 0xFFFFFFFF or 0xFFFF
                 //   setne targetReg
                 //   movzx targetReg, targetReg
                 //