From b1643d1efb331a85e832d94f8128c75c588b1003 Mon Sep 17 00:00:00 2001
From: Mike Danes
Date: Mon, 21 Nov 2016 20:30:28 +0200
Subject: [PATCH] Change vector equality to use pmovmskb

This change replaces the rather long shuffle-based compare sequence with
pmovmskb, which is available in SSE2 and AVX2. The following code is now
generated:

C4E16D76D1 vpcmpeqd  ymm2, ymm1
C4E17DD7C2 vpmovmskb eax, ymm2
83F8FF     cmp       eax, -1
0F94C0     sete      al
0FB6C0     movzx     rax, al
---
 src/jit/compiler.hpp         | 15 ++++++---
 src/jit/instrsxarch.h        |  1 +
 src/jit/lowerxarch.cpp       |  6 ++--
 src/jit/simdcodegenxarch.cpp | 77 ++++++++++----------------------------------
 4 files changed, 32 insertions(+), 67 deletions(-)

diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp
index 704cf18..e8358fd 100644
--- a/src/jit/compiler.hpp
+++ b/src/jit/compiler.hpp
@@ -473,10 +473,17 @@ inline unsigned Compiler::funGetFuncIdx(BasicBlock* block)
 
 #endif // !FEATURE_EH_FUNCLETS
 
-/*****************************************************************************
- *
- *  Map a register mask to a register number
- */
+//------------------------------------------------------------------------------
+// genRegNumFromMask : Maps a single register mask to a register number.
+//
+// Arguments:
+//    mask - the register mask
+//
+// Return Value:
+//    The number of the register contained in the mask.
+//
+// Assumptions:
+//    The mask contains one and only one register.
 
 inline regNumber genRegNumFromMask(regMaskTP mask)
 {
diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h
index 4b32cd4..4317334 100644
--- a/src/jit/instrsxarch.h
+++ b/src/jit/instrsxarch.h
@@ -178,6 +178,7 @@ INST3(FIRST_SSE2_INSTRUCTION, "FIRST_SSE2_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CO
 // These are the SSE instructions used on x86
 INST3( mov_i2xmm, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6E)) // Move int reg to a xmm reg. reg1=xmm reg, reg2=int reg
 INST3( mov_xmm2i, "movd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7E)) // Move xmm reg to an int reg. reg1=xmm reg, reg2=int reg
+INST3( pmovmskb, "pmovmskb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xD7)) // Move the MSBs of all bytes in a xmm reg to an int reg
 INST3( movq, "movq" , 0, IUM_WR, 0, 0, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E))
 INST3( movsdsse2, "movsd" , 0, IUM_WR, 0, 0, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10))
diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp
index 2fee13f..b9d2a21 100644
--- a/src/jit/lowerxarch.cpp
+++ b/src/jit/lowerxarch.cpp
@@ -2754,14 +2754,14 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
 else
 {
- // Need two SIMD registers as scratch.
+ // Need one SIMD register as scratch.
 // See genSIMDIntrinsicRelOp() for details on code sequence generate and
- // the need for two scratch registers.
+ // the need for one scratch register.
 //
 // Note these intrinsics produce a BOOL result, hence internal float
 // registers reserved are guaranteed to be different from target
 // integer register without explicitly specifying.
- info->internalFloatCount = 2;
+ info->internalFloatCount = 1;
 info->setInternalCandidates(lsra, lsra->allSIMDRegs());
 }
 break;
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index a55d344..20db803 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -1141,15 +1141,8 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 }
 else
 {
- // We need two additional XMM registers as scratch.
- regMaskTP floatRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLFLOAT);
- assert(floatRsvdRegs != RBM_NONE);
- assert(genCountBits(floatRsvdRegs) == 2);
-
- regMaskTP tmpRegMask = genFindLowestBit(floatRsvdRegs);
- floatRsvdRegs &= ~tmpRegMask;
- regNumber tmpReg1 = genRegNumFromMask(tmpRegMask);
- regNumber tmpReg2 = genRegNumFromMask(floatRsvdRegs);
+ // We need one additional SIMD register to store the result of the SIMD compare.
+ regNumber tmpReg1 = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLFLOAT);
 
 // tmpReg1 = (op1Reg == op2Reg)
 // Call this value of tmpReg1 as 'compResult' for further reference below.
@@ -1180,54 +1173,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
 }
- // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
- if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
- {
- // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
- //
- // Generated code sequence
- // - vextractf128 tmpReg2, tmpReg1, 0x01
- //       tmpReg2[128..255] <- 0
- //       tmpReg2[0..127] <- tmpReg1[128..255]
- // - vandps tmpReg1, tempReg2
- //       This will zero-out upper portion of tmpReg1 and
- //       lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
- getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
- }
- // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
- if (simdType != TYP_SIMD8)
- {
- // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
- // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
- //
- // Note that what we have computed is as follows at this point:
- // tmpReg1[0] = compResult[0] & compResult[2]
- // tmpReg1[1] = compResult[1] & compResult[3]
- inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
- }
- // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
- // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
-
- // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
- // tmpReg2[0] = compResult[1] & compResult[3]
- getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
-
- // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
- // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
-
 regNumber intReg;
 if (targetReg == REG_NA)
 {
 // If we are not materializing result into a register,
 // we would have reserved an int type internal register.
- regMaskTP intRsvdRegs = (simdNode->gtRsvdRegs & RBM_ALLINT);
- assert(genCountBits(intRsvdRegs) == 1);
- intReg = genRegNumFromMask(intRsvdRegs);
+ intReg = genRegNumFromMask(simdNode->gtRsvdRegs & RBM_ALLINT);
 }
 else
 {
@@ -1238,12 +1189,18 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 assert(genCountBits(simdNode->gtRsvdRegs & RBM_ALLINT) == 0);
 }
- // intReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
- // (Note that for mov_xmm2i, the int register is always in the reg2 position.
- inst_RV_RV(INS_mov_xmm2i, tmpReg1, intReg, TYP_INT);
-
- // cmp intReg, 0xFFFFFFFF
- getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, 0xFFFFFFFF);
+ inst_RV_RV(INS_pmovmskb, intReg, tmpReg1, simdType, emitActualTypeSize(simdType));
+ // There's no pmovmskw/pmovmskd/pmovmskq but they're not needed anyway. Vector compare
+ // instructions produce "all ones"/"all zeroes" components and pmovmskb extracts a
+ // subset of each component's ones/zeroes. In the end we need to know if the result is
+ // "all ones" where the number of ones is given by the vector byte size, not by the
+ // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
+ // for SSE registers we need to compare to 0x0000FFFF.
+ // Note that -1 is used instead of 0xFFFFFFFF because on x64 the emitter doesn't recognize
+ // that 0xFFFFFFFF can be encoded as a sign-extended 8-bit immediate, and emits the longer
+ // 3DFFFFFFFF encoding instead of 83F8FF.
+ getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg,
+                           emitActualTypeSize(simdType) == 32 ? -1 : 0x0000FFFF);
 }
 if (targetReg != REG_NA)
 {
@@ -1251,12 +1208,12 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 // If we need to materialize result into a register, targetReg needs to
 // be set to 1 on true and zero on false.
 // Equality:
- //   cmp targetReg, 0xFFFFFFFF
+ //   cmp targetReg, 0xFFFFFFFF or 0xFFFF
 //   sete targetReg
 //   movzx targetReg, targetReg
 //
 // InEquality:
- //   cmp targetReg, 0xFFFFFFFF
+ //   cmp targetReg, 0xFFFFFFFF or 0xFFFF
 //   setne targetReg
 //   movzx targetReg, targetReg
 //
-- 
2.7.4
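The new sequence is easiest to follow outside the emitter. Below is a minimal
sketch written with compiler intrinsics, assuming an x64 target with SSE2/AVX2
support; the helper names VectorEqualsSse2 and VectorEqualsAvx2 are invented
for illustration and are not part of the patch.

    // Illustrative only: the pcmpeqd + pmovmskb + cmp pattern the JIT now emits.
    #include <immintrin.h>

    // 16-byte (SSE2) vectors: pmovmskb yields one bit per byte, so 16 set bits
    // (0x0000FFFF) means every 32-bit lane compared equal.
    bool VectorEqualsSse2(__m128i a, __m128i b)
    {
        __m128i eq   = _mm_cmpeq_epi32(a, b);   // per-lane all-ones / all-zeroes
        int     mask = _mm_movemask_epi8(eq);   // pmovmskb: MSB of each of the 16 bytes
        return mask == 0x0000FFFF;
    }

    // 32-byte (AVX2) vectors: all 32 mask bits must be set, i.e. 0xFFFFFFFF,
    // which is compared as -1 so the cmp can use the short imm8 form (83F8FF).
    bool VectorEqualsAvx2(__m256i a, __m256i b)
    {
        __m256i eq   = _mm256_cmpeq_epi32(a, b);  // vpcmpeqd
        int     mask = _mm256_movemask_epi8(eq);  // vpmovmskb
        return mask == -1;
    }

The comparison constant depends on the register width, not on the element
count, which is why the patch checks emitActualTypeSize(simdType) rather than
the SIMD base type.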
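For contrast, the shuffle-based reduction that this patch removes corresponds
roughly to the following 16-byte sketch (the 32-byte path additionally did a
vextractf128 + andps step first). _mm_and_si128 stands in for the andps/pand
the JIT emitted, and the helper name is again invented for the example.

    #include <emmintrin.h>
    #include <cstdint>

    // Rough intrinsics equivalent of the removed sequence: AND the compare
    // result down to lane 0, move it to an integer register, compare with -1.
    bool VectorEqualsOldSse2(__m128i a, __m128i b)
    {
        __m128i cmp = _mm_cmpeq_epi32(a, b);          // compResult
        __m128i t   = _mm_shuffle_epi32(cmp, 0x4E);   // pshufd (1,0,3,2): swap 8-byte halves
        cmp = _mm_and_si128(cmp, t);                  // lane0 &= lane2, lane1 &= lane3
        t   = _mm_shuffle_epi32(cmp, 0x1);            // pshufd 0x1: bring lane1 into lane0
        cmp = _mm_and_si128(cmp, t);                  // lane0 = AND of all four lanes
        return (uint32_t)_mm_cvtsi128_si32(cmp) == 0xFFFFFFFFu;  // movd + cmp
    }

The pmovmskb version replaces the two shuffles, two ANDs, and the movd with a
single instruction, at the cost of needing a width-dependent compare constant.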