add jit intrinsic support for vector conversion/narrow/widen on AMD64 and x86, except...
author: helloguo <xiangyang.guo@intel.com>
Mon, 3 Apr 2017 17:28:09 +0000 (10:28 -0700)
committer: helloguo <xiangyang.guo@intel.com>
Wed, 10 May 2017 22:56:43 +0000 (15:56 -0700)
13 files changed:
src/jit/codegenlinear.h
src/jit/emitfmtsxarch.h
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/instrsxarch.h
src/jit/lsraxarch.cpp
src/jit/simd.cpp
src/jit/simd.h
src/jit/simdcodegenxarch.cpp
src/jit/simdintrinsiclist.h
tests/src/JIT/SIMD/VectorConvert.cs
tests/src/JIT/SIMD/VectorConvert_r.csproj
tests/src/JIT/SIMD/VectorConvert_ro.csproj

index 3bd0eac..5cead6d 100644 (file)
@@ -80,6 +80,17 @@ void genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode);
 void genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode);
 void genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode);
 void genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode);
+void genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
+                           var_types       simdType,
+                           var_types       baseType,
+                           regNumber       tmpReg,
+                           regNumber       tmpIntReg,
+                           regNumber       targetReg);
+void genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode);
+void genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode);
+void genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode);
+void genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg);
+void genSIMDIntrinsicWiden(GenTreeSIMD* simdNode);
 void genSIMDIntrinsic(GenTreeSIMD* simdNode);
 void genSIMDCheck(GenTree* treeNode);
 
index 49afcb5..6d15fcf 100644 (file)
@@ -109,7 +109,7 @@ IF_DEF(RRW_RRW,     IS_R1_RW|IS_R2_RW,          NONE)     // r/w    reg , r/w re
 IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW,          SCNS)     // r/w    reg , r/w  reg2 , const
 
 IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE)     // write  reg , read reg2 , read reg3
-
+IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write  reg , read reg2 , read reg3, const
 //----------------------------------------------------------------------------
 // The following formats are used for direct addresses (e.g. static data members)
 //----------------------------------------------------------------------------
index 7608130..659c260 100644 (file)
@@ -94,7 +94,10 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins)
             ins == INS_vinsertf128 || ins == INS_punpckldq || ins == INS_phaddd || ins == INS_pminub ||
             ins == INS_pminsw || ins == INS_pminsb || ins == INS_pminsd || ins == INS_pminuw || ins == INS_pminud ||
             ins == INS_pmaxub || ins == INS_pmaxsw || ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw ||
-            ins == INS_pmaxud);
+            ins == INS_pmaxud || ins == INS_vinserti128 || ins == INS_punpckhbw || ins == INS_punpcklbw ||
+            ins == INS_punpckhqdq || ins == INS_punpcklqdq || ins == INS_punpckhwd || ins == INS_punpcklwd ||
+            ins == INS_punpckhdq || ins == INS_packssdw || ins == INS_packsswb || ins == INS_packuswb ||
+            ins == INS_packusdw || ins == INS_vperm2i128);
 }
 
 // Returns true if the AVX instruction is a move operator that requires 3 operands.
@@ -105,8 +108,8 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins)
 // to indicate whether a 3-operand instruction.
 bool emitter::IsThreeOperandMoveAVXInstruction(instruction ins)
 {
-    return IsAVXInstruction(ins) &&
-           (ins == INS_movlpd || ins == INS_movlps || ins == INS_movhpd || ins == INS_movhps || ins == INS_movss);
+    return IsAVXInstruction(ins) && (ins == INS_movlpd || ins == INS_movlps || ins == INS_movhpd || ins == INS_movhps ||
+                                     ins == INS_movss || ins == INS_movlhps);
 }
 
 // ------------------------------------------------------------------------------
@@ -206,6 +209,14 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att
 // Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix
 bool TakesRexWPrefix(instruction ins, emitAttr attr)
 {
+    // Because the current implementation of AVX does not have a way to distinguish between the register
+    // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are
+    // required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE),
+    // and here we must special case these by the opcode.
+    if (ins == INS_vpermq)
+    {
+        return true;
+    }
 #ifdef _TARGET_AMD64_
     // movsx should always sign extend out to 8 bytes just because we don't track
     // whether the dest should be 4 bytes or 8 bytes (attr indicates the size
@@ -342,7 +353,6 @@ unsigned RegEncoding(regNumber reg)
 // AVX:  specific bits within VEX prefix need to be set in bit-inverted form.
 emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 {
-#ifdef _TARGET_AMD64_
     if (UseAVX() && IsAVXInstruction(ins))
     {
         // W-bit is available only in 3-byte VEX prefix that starts with byte C4.
@@ -351,7 +361,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
         // W-bit is the only bit that is added in non bit-inverted form.
         return code | 0x00008000000000ULL;
     }
-
+#ifdef _TARGET_AMD64_
     return code | 0x4800000000ULL;
 #else
     assert(!"UNREACHED");
@@ -3810,6 +3820,40 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg,
     emitCurIGsize += sz;
 }
 
+/**********************************************************************************
+* emitIns_R_R_R_I: Add an instruction with three register operands and an immediate.
+*
+* Arguments:
+*    ins       - the instruction to add
+*    attr      - the emitter attribute for instruction
+*    targetReg - the target (destination) register
+*    reg1      - the first source register
+*    reg2      - the second source register
+*    ival      - the immediate value
+*/
+
+void emitter::emitIns_R_R_R_I(
+    instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, int ival)
+{
+    assert(IsSSEOrAVXInstruction(ins));
+    assert(IsThreeOperandAVXInstruction(ins));
+    // Currently the VEX prefix is only emitted in its three-byte form.
+    // size = vex + opcode + ModR/M + 1-byte-cns = 3 + 1 + 1 + 1 = 6
+    // TODO-XArch-CQ: We should create function which can calculate all kinds of AVX instructions size in future
+    UNATIVE_OFFSET sz = 6;
+
+    instrDesc* id = emitNewInstrCns(attr, ival);
+    id->idIns(ins);
+    id->idInsFmt(IF_RWR_RRD_RRD_CNS);
+    id->idReg1(targetReg);
+    id->idReg2(reg1);
+    id->idReg3(reg2);
+
+    id->idCodeSize(sz);
+    dispIns(id);
+    emitCurIGsize += sz;
+}
+
 #endif
 /*****************************************************************************
  *
@@ -6995,6 +7039,15 @@ void emitter::emitDispIns(
             printf("%s, ", emitRegName(id->idReg2(), attr));
             printf("%s", emitRegName(id->idReg3(), attr));
             break;
+        case IF_RWR_RRD_RRD_CNS:
+            assert(IsAVXInstruction(ins));
+            assert(IsThreeOperandAVXInstruction(ins));
+            printf("%s, ", emitRegName(id->idReg1(), attr));
+            printf("%s, ", emitRegName(id->idReg2(), attr));
+            printf("%s, ", emitRegName(id->idReg3(), attr));
+            val = emitGetInsSC(id);
+            goto PRINT_CONSTANT;
+            break;
 #endif
         case IF_RRW_RRW_CNS:
             printf("%s,", emitRegName(id->idReg1(), attr));
@@ -9514,7 +9567,34 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
 
         assert(id->idGCref() == GCT_NONE);
         assert(valInByte);
-        assert(ins == INS_psrldq || ins == INS_pslldq);
+        // The left and right shifts use the same encoding, and are distinguished by the Reg/Opcode field.
+        regNumber regOpcode;
+        switch (ins)
+        {
+            case INS_psrldq:
+                regOpcode = (regNumber)3;
+                break;
+            case INS_pslldq:
+                regOpcode = (regNumber)7;
+                break;
+            case INS_psrld:
+            case INS_psrlw:
+            case INS_psrlq:
+                regOpcode = (regNumber)2;
+                break;
+            case INS_pslld:
+            case INS_psllw:
+            case INS_psllq:
+                regOpcode = (regNumber)6;
+                break;
+            case INS_psrad:
+                regOpcode = (regNumber)4;
+                break;
+            default:
+                assert(!"Invalid instruction for SSE2 instruction of the form: opcode reg, immed8");
+                regOpcode = REG_NA;
+                break;
+        }
 
         // Get the 'base' opcode.
         code = insCodeMI(ins);
@@ -9528,14 +9608,6 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id)
             code = insEncodeReg3456(ins, reg, size, code);
         }
 
-        // In case of psrldq
-        // Reg/Opcode = 3
-        // R/M = reg1
-        //
-        // In case of pslldq
-        // Reg/Opcode = 7
-        // R/M = reg1
-        regNumber regOpcode = (regNumber)((ins == INS_psrldq) ? 3 : 7);
         unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8;
 
         // Output the REX prefix
@@ -10659,6 +10731,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             dst = emitOutputRRR(dst, id);
             sz  = emitSizeOfInsDsc(id);
             break;
+        case IF_RWR_RRD_RRD_CNS:
+            dst = emitOutputRRR(dst, id);
+            sz  = emitSizeOfInsDsc(id);
+            dst += emitOutputByte(dst, emitGetInsSC(id));
+            break;
 #endif
 
         case IF_RRW_RRW_CNS:
@@ -10690,6 +10767,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             assert(code & 0x00FF0000);
 
 #ifdef FEATURE_AVX_SUPPORT
+            if (TakesRexWPrefix(ins, size))
+            {
+                code = AddRexWPrefix(ins, code);
+            }
+
             if (TakesVexPrefix(ins))
             {
                 if (IsThreeOperandBinaryAVXInstruction(ins))
@@ -10718,11 +10800,16 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
             if (Is4ByteAVXInstruction(ins))
             {
-                // We just need to output the last byte of the opcode.
                 assert((code & 0xFF) == 0);
-                assert((code & 0xFF00) != 0xC000);
-                dst += emitOutputByte(dst, (code >> 8) & 0xFF);
-                code = 0;
+                if ((code & 0xFF00) == 0xC000)
+                {
+                    dst += emitOutputWord(dst, code | regcode);
+                }
+                else
+                {
+                    dst += emitOutputByte(dst, (code >> 8) & 0xFF);
+                    dst += emitOutputByte(dst, 0xC0 | (regcode >> 8));
+                }
             }
             else if (code & 0xFF000000)
             {
@@ -10732,27 +10819,25 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 if (Is4ByteSSE4Instruction(ins))
                 {
                     dst += emitOutputWord(dst, code);
-                    code = 0;
+                    dst += emitOutputByte(dst, 0xC0 | (regcode >> 8));
+                }
+                else
+                {
+                    assert((code & 0xFF00) == 0xC000);
+                    dst += emitOutputWord(dst, code | regcode);
                 }
             }
             else if (code & 0x00FF0000)
             {
                 dst += emitOutputByte(dst, code >> 16);
                 code &= 0x0000FFFF;
-            }
-
-            // Note that regcode is shifted by 8-bits above to align with RM byte.
-            if (code != 0)
-            {
                 assert((code & 0xFF00) == 0xC000);
                 dst += emitOutputWord(dst, code | regcode);
             }
             else
             {
-                // This case occurs for SSE4/AVX instructions.
-                // Note that regcode is left shifted by 8-bits.
-                assert(Is4ByteAVXInstruction(ins) || Is4ByteSSE4Instruction(ins));
-                dst += emitOutputByte(dst, 0xC0 | (regcode >> 8));
+                assert((code & 0xFF00) == 0xC000);
+                dst += emitOutputWord(dst, code | regcode);
             }
 
             dst += emitOutputByte(dst, emitGetInsSC(id));
index faeba7d..d439f7e 100644 (file)
@@ -360,6 +360,8 @@ void emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg
 
 #ifdef FEATURE_AVX_SUPPORT
 void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3);
+
+void emitIns_R_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, int ival);
 #endif
 
 void emitIns_S(instruction ins, emitAttr attr, int varx, int offs);
index 729bece..0952770 100644 (file)
@@ -200,10 +200,13 @@ INST3( movapd,      "movapd"      , 0, IUM_WR, 0, 0, PCKDBL(0x29), BAD_CODE, PCK
 INST3( movaps,      "movaps"      , 0, IUM_WR, 0, 0, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28))
 INST3( movupd,      "movupd"      , 0, IUM_WR, 0, 0, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10))
 INST3( movups,      "movups"      , 0, IUM_WR, 0, 0, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10))
+INST3( movlhps,     "movlhps"     , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, PCKFLT(0x16))
 
 INST3( shufps,      "shufps"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, PCKFLT(0xC6))
 INST3( shufpd,      "shufpd"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, PCKDBL(0xC6))
-       
+
+INST3( punpckhdq,   "punpckhdq"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, PCKDBL(0x6A))
+
 // SSE 2 arith
 INST3( addps,  "addps",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x58))    // Add packed singles
 INST3( addss,  "addss",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x58))    // Add scalar singles
@@ -289,8 +292,19 @@ INST3( pand,        "pand"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,
 INST3( pandn,       "pandn"       , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xDF))   // Packed bit-wise AND NOT of two xmm regs
 INST3( por,         "por"         , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xEB))   // Packed bit-wise OR of two xmm regs
 INST3( pxor,        "pxor"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xEF))   // Packed bit-wise XOR of two xmm regs
+
+// Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode,
+// which is handled in emitxarch.cpp.
 INST3( psrldq,      "psrldq"      , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x73),  BAD_CODE    )   // Shift right logical of xmm reg by given number of bytes
 INST3( pslldq,      "pslldq"      , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x73),  BAD_CODE    )   // Shift left logical of xmm reg by given number of bytes
+INST3( psllq,       "psllq"       , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x73),  BAD_CODE    )   // Packed shift left logical of 64-bit integers
+INST3( psrlq,       "psrlq"       , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x73),  BAD_CODE    )   // Packed shift right logical of 64-bit integers
+INST3( pslld,       "pslld"       , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x72),  BAD_CODE    )   // Packed shift left logical of 32-bit integers
+INST3( psrld,       "psrld"       , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x72),  BAD_CODE    )   // Packed shift right logical of 32-bit integers
+INST3( psllw,       "psllw"       , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x71),  BAD_CODE    )   // Packed shift left logical of 16-bit integers
+INST3( psrlw,       "psrlw"       , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x71),  BAD_CODE    )   // Packed shift right logical of 16-bit integers
+INST3( psrad,       "psrad"       , 0, IUM_WR, 0, 0, BAD_CODE,     PCKDBL(0x72),  BAD_CODE    )   // Packed shift right arithmetic of 32-bit integers
+
 INST3( pmaxub,      "pmaxub"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xDE))   // packed maximum unsigned bytes
 INST3( pminub,      "pminub"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xDA))   // packed minimum unsigned bytes
 INST3( pmaxsw,      "pmaxsw"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xEE))   // packed maximum signed words
@@ -306,14 +320,24 @@ INST3( pshufd,      "pshufd"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,
 INST3( pextrw,      "pextrw"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xC5))   // Extract 16-bit value into a r32 with zero extended to 32-bits
 INST3( pinsrw,      "pinsrw"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0xC4))   // packed insert word
 
+INST3( punpckhbw,   "punpckhbw"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x68))   // Packed logical (unsigned) widen ubyte to ushort (hi)
+INST3( punpcklbw,   "punpcklbw"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x60))   // Packed logical (unsigned) widen ubyte to ushort (lo)
+INST3( punpckhqdq,  "punpckhqdq"  , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x6D))   // Packed logical (unsigned) widen uint to ulong (hi)
+INST3( punpcklqdq,  "punpcklqdq"  , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x6C))   // Packed logical (unsigned) widen uint to ulong (lo)
+INST3( punpckhwd,   "punpckhwd"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x69))   // Packed logical (unsigned) widen ushort to uint (hi)
+INST3( punpcklwd,   "punpcklwd"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x61))   // Packed logical (unsigned) widen ushort to uint (lo)
+
+INST3( packssdw,    "packssdw"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x6B))   // Pack (narrow) int to short with saturation
+INST3( packsswb,    "packsswb"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x63))   // Pack (narrow) short to byte with saturation
+INST3( packuswb,    "packuswb"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE,      PCKDBL(0x67))   // Pack (narrow) short to unsigned byte with saturation
 #endif // !LEGACY_BACKEND
 INST3(LAST_SSE2_INSTRUCTION, "LAST_SSE2_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 #ifndef LEGACY_BACKEND
 INST3(FIRST_SSE4_INSTRUCTION, "FIRST_SSE4_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 //    enum           name           FP updmode rf wf    MR            MI        RM
-INST3( dpps,         "dpps"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x40))   // Packed bit-wise AND NOT of two xmm regs
-INST3( dppd,         "dppd"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x41))   // Packed bit-wise AND NOT of two xmm regs
+INST3( dpps,         "dpps"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x40))   // Packed dot product of two float vector regs
+INST3( dppd,         "dppd"        , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x41))   // Packed dot product of two double vector regs
 INST3( insertps,     "insertps"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x21))   // Insert packed single precision float value
 INST3( pcmpeqq,      "pcmpeqq"     , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x29))   // Packed compare 64-bit integers for equality
INST3( pcmpgtq,      "pcmpgtq"     , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x37))   // Packed compare 64-bit integers for greater than
@@ -331,6 +355,11 @@ INST3( pmaxsb,       "pmaxsb"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SS
 INST3( pmaxsd,       "pmaxsd"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x3D))   // packed maximum 32-bit signed integers
 INST3( pmaxuw,       "pmaxuw"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x3E))   // packed maximum 16-bit unsigned integers
 INST3( pmaxud,       "pmaxud"      , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x3F))   // packed maximum 32-bit unsigned integers
+INST3( pmovsxbw,     "pmovsxbw"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x20))   // Packed sign extend byte to short
+INST3( pmovsxwd,     "pmovsxwd"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x23))   // Packed sign extend short to int
+INST3( pmovsxdq,     "pmovsxdq"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x25))   // Packed sign extend int to long
+INST3( packusdw,     "packusdw"    , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x2B))   // Pack (narrow) int to unsigned short with saturation
+
 INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 
 INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
@@ -342,9 +371,12 @@ INST3( vpbroadcastw, "pbroadcastw" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SS
 INST3( vpbroadcastd, "pbroadcastd" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x58))   // Broadcast int32 value from reg/memory to entire ymm register
 INST3( vpbroadcastq, "pbroadcastq" , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE38(0x59))   // Broadcast int64 value from reg/memory to entire ymm register
 INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19),  BAD_CODE, BAD_CODE)      // Extract 128-bit packed floating point values
+INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39),  BAD_CODE, BAD_CODE)      // Extract 128-bit packed integer values
 INST3( vinsertf128,  "insertf128"  , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x18))   // Insert 128-bit packed floating point values
+INST3( vinserti128,  "inserti128"  , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x38))   // Insert 128-bit packed integer values
 INST3( vzeroupper,   "zeroupper"   , 0, IUM_WR, 0, 0, 0xC577F8,     BAD_CODE, BAD_CODE)      // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix)
-
+INST3( vperm2i128,   "perm2i128"   , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x46))   // Permute 128-bit halves of input register
+INST3( vpermq,       "permq"       , 0, IUM_WR, 0, 0, BAD_CODE,     BAD_CODE, SSE3A(0x00))   // Permute 64-bit elements of input register
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION",  0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE)
 #endif // !LEGACY_BACKEND
 //    enum     name            FP  updmode rf wf R/M,R/M[reg]  R/M,icon
index 002e3d8..987ac72 100644 (file)
@@ -2676,6 +2676,90 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
             info->srcCount = 1;
             break;
 
+        case SIMDIntrinsicConvertToSingle:
+            info->srcCount = 1;
+            if (simdTree->gtSIMDBaseType == TYP_UINT)
+            {
+                // We need an internal register different from targetReg.
+                info->isInternalRegDelayFree = true;
+                info->internalIntCount       = 1;
+                info->internalFloatCount     = 2;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT));
+            }
+            break;
+
+        case SIMDIntrinsicConvertToUInt32:
+        case SIMDIntrinsicConvertToInt32:
+            info->srcCount = 1;
+            break;
+
+        case SIMDIntrinsicWidenLo:
+        case SIMDIntrinsicWidenHi:
+            info->srcCount = 1;
+            if (varTypeIsIntegral(simdTree->gtSIMDBaseType))
+            {
+                // We need an internal register different from targetReg.
+                info->isInternalRegDelayFree = true;
+                info->internalFloatCount     = 1;
+                info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            }
+            break;
+
+        case SIMDIntrinsicConvertToInt64:
+        case SIMDIntrinsicConvertToUInt64:
+            // We need an internal register different from targetReg.
+            info->isInternalRegDelayFree = true;
+            info->srcCount               = 1;
+            info->internalIntCount       = 1;
+            if (comp->getSIMDInstructionSet() == InstructionSet_AVX)
+            {
+                info->internalFloatCount = 2;
+            }
+            else
+            {
+                info->internalFloatCount = 1;
+            }
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT));
+            break;
+
+        case SIMDIntrinsicConvertToDouble:
+            // We need an internal register different from targetReg.
+            info->isInternalRegDelayFree = true;
+            info->srcCount               = 1;
+            info->internalIntCount       = 1;
+#ifdef _TARGET_X86_
+            if (simdTree->gtSIMDBaseType == TYP_LONG)
+            {
+                info->internalFloatCount = 3;
+            }
+            else
+#endif
+                if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) || (simdTree->gtSIMDBaseType == TYP_ULONG))
+            {
+                info->internalFloatCount = 2;
+            }
+            else
+            {
+                info->internalFloatCount = 1;
+            }
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT));
+            break;
+
+        case SIMDIntrinsicNarrow:
+            // We need an internal register different from targetReg.
+            info->isInternalRegDelayFree = true;
+            info->srcCount               = 2;
+            if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) && (simdTree->gtSIMDBaseType != TYP_DOUBLE))
+            {
+                info->internalFloatCount = 2;
+            }
+            else
+            {
+                info->internalFloatCount = 1;
+            }
+            info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+            break;
+
         case SIMDIntrinsicShuffleSSE2:
             info->srcCount = 2;
             // Second operand is an integer constant and marked as contained.
index 4ba7832..bbb9a57 100644 (file)
@@ -2609,6 +2609,10 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
 
         // Unary operators that take and return a Vector.
         case SIMDIntrinsicCast:
+        case SIMDIntrinsicConvertToSingle:
+        case SIMDIntrinsicConvertToDouble:
+        case SIMDIntrinsicConvertToInt32:
+        case SIMDIntrinsicConvertToUInt32:
         {
             op1 = impSIMDPopStack(simdType, instMethod);
 
@@ -2617,6 +2621,61 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
         }
         break;
 
+        case SIMDIntrinsicConvertToInt64:
+        case SIMDIntrinsicConvertToUInt64:
+        {
+#ifdef _TARGET_AMD64_
+            op1 = impSIMDPopStack(simdType, instMethod);
+
+            simdTree = gtNewSIMDNode(simdType, op1, nullptr, simdIntrinsicID, baseType, size);
+            retVal   = simdTree;
+#else
+            JITDUMP("SIMD Conversion to Int64/UInt64 is not supported on this platform\n");
+            return nullptr;
+#endif
+        }
+        break;
+
+        case SIMDIntrinsicNarrow:
+        {
+            assert(!instMethod);
+            op2 = impSIMDPopStack(simdType);
+            op1 = impSIMDPopStack(simdType);
+            // op1 and op2 are two input Vector<T>.
+            simdTree = gtNewSIMDNode(simdType, op1, op2, simdIntrinsicID, baseType, size);
+            retVal   = simdTree;
+        }
+        break;
+
+        case SIMDIntrinsicWiden:
+        {
+            GenTree* dstAddrHi = impSIMDPopStack(TYP_BYREF);
+            GenTree* dstAddrLo = impSIMDPopStack(TYP_BYREF);
+            op1                = impSIMDPopStack(simdType);
+            GenTree* dupOp1    = fgInsertCommaFormTemp(&op1, gtGetStructHandleForSIMD(simdType, baseType));
+
+            // Widen the lower half and assign it to dstAddrLo.
+            simdTree = gtNewSIMDNode(simdType, op1, nullptr, SIMDIntrinsicWidenLo, baseType, size);
+            GenTree* loDest =
+                new (this, GT_BLK) GenTreeBlk(GT_BLK, simdType, dstAddrLo, getSIMDTypeSizeInBytes(clsHnd));
+            GenTree* loAsg = gtNewBlkOpNode(loDest, simdTree, getSIMDTypeSizeInBytes(clsHnd),
+                                            false, // not volatile
+                                            true); // copyBlock
+            loAsg->gtFlags |= ((simdTree->gtFlags | dstAddrLo->gtFlags) & GTF_ALL_EFFECT);
+
+            // Widen the upper half and assign it to dstAddrHi.
+            simdTree = gtNewSIMDNode(simdType, dupOp1, nullptr, SIMDIntrinsicWidenHi, baseType, size);
+            GenTree* hiDest =
+                new (this, GT_BLK) GenTreeBlk(GT_BLK, simdType, dstAddrHi, getSIMDTypeSizeInBytes(clsHnd));
+            GenTree* hiAsg = gtNewBlkOpNode(hiDest, simdTree, getSIMDTypeSizeInBytes(clsHnd),
+                                            false, // not volatile
+                                            true); // copyBlock
+            hiAsg->gtFlags |= ((simdTree->gtFlags | dstAddrHi->gtFlags) & GTF_ALL_EFFECT);
+
+            retVal = gtNewOperNode(GT_COMMA, simdType, loAsg, hiAsg);
+        }
+        break;
+
         case SIMDIntrinsicHWAccel:
         {
             GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, 1);
index c4a8866..ff522fd 100644 (file)
@@ -33,12 +33,16 @@ struct SIMDIntrinsicInfo
 // SSE2 Shuffle control byte to shuffle vector <W, Z, Y, X>
 // These correspond to shuffle immediate byte in shufps SSE2 instruction.
 #define SHUFFLE_XXXX 0x00 // 00 00 00 00
+#define SHUFFLE_XXZX 0x08 // 00 00 10 00
 #define SHUFFLE_XXWW 0x0F // 00 00 11 11
 #define SHUFFLE_XYZW 0x1B // 00 01 10 11
 #define SHUFFLE_YXYX 0x44 // 01 00 01 00
+#define SHUFFLE_YWXZ 0x72 // 01 11 00 10
 #define SHUFFLE_YYZZ 0x5A // 01 01 10 10
+#define SHUFFLE_ZXXX 0x80 // 10 00 00 00
 #define SHUFFLE_ZXXY 0x81 // 10 00 00 01
 #define SHUFFLE_ZWXY 0xB1 // 10 11 00 01
+#define SHUFFLE_WYZX 0xD8 // 11 01 10 00
 #define SHUFFLE_WWYY 0xF5 // 11 11 01 01
 #define SHUFFLE_ZZXX 0xA0 // 10 10 00 00
 #endif
index 940ba5f..a28c652 100644 (file)
@@ -487,14 +487,151 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type
             result = INS_movaps;
             break;
 
+        case SIMDIntrinsicConvertToSingle:
+            result = INS_cvtdq2ps;
+            break;
+
+        case SIMDIntrinsicConvertToDouble:
+            assert(baseType == TYP_LONG);
+            result = INS_cvtsi2sd;
+            break;
+
+        case SIMDIntrinsicConvertToInt32:
+        case SIMDIntrinsicConvertToUInt32:
+            assert(baseType == TYP_FLOAT);
+            result = INS_cvttps2dq;
+            break;
+
+        case SIMDIntrinsicConvertToInt64:
+        case SIMDIntrinsicConvertToUInt64:
+            assert(baseType == TYP_DOUBLE);
+            result = INS_cvttsd2si;
+            break;
+
+        case SIMDIntrinsicNarrow:
+            // Note that for the integer types the caller must zero the upper bits of
+            // each source element, since the instructions saturate.
+            switch (baseType)
+            {
+                case TYP_INT:
+                case TYP_UINT:
+                    if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4)
+                    {
+                        result = INS_packusdw;
+                    }
+                    else
+                    {
+                        result = INS_packssdw;
+                    }
+                    break;
+                case TYP_SHORT:
+                case TYP_CHAR:
+                    result = INS_packuswb;
+                    break;
+                default:
+                    assert(!"Invalid baseType for SIMDIntrinsicNarrow");
+                    result = INS_invalid;
+                    break;
+            }
+            break;
+
+        case SIMDIntrinsicWidenLo:
+            // Some of these have multiple instruction implementations, with one instruction to widen the lo half,
+            // and another to widen the hi half.
+            switch (baseType)
+            {
+                case TYP_FLOAT:
+                    result = INS_cvtps2pd;
+                    break;
+                case TYP_INT:
+                case TYP_UINT:
+                    result = INS_punpckldq;
+                    break;
+                case TYP_SHORT:
+                case TYP_CHAR:
+                    result = INS_punpcklwd;
+                    break;
+                case TYP_BYTE:
+                case TYP_UBYTE:
+                    result = INS_punpcklbw;
+                    break;
+                default:
+                    assert(!"Invalid baseType for SIMDIntrinsicWidenLo");
+                    result = INS_invalid;
+                    break;
+            }
+            break;
+
+        case SIMDIntrinsicWidenHi:
+            switch (baseType)
+            {
+                case TYP_FLOAT:
+                    // For this case, we actually use the same instruction.
+                    result = INS_cvtps2pd;
+                    break;
+                case TYP_INT:
+                case TYP_UINT:
+                    result = INS_punpckhdq;
+                    break;
+                case TYP_SHORT:
+                case TYP_CHAR:
+                    result = INS_punpckhwd;
+                    break;
+                case TYP_BYTE:
+                case TYP_UBYTE:
+                    result = INS_punpckhbw;
+                    break;
+                default:
+                    assert(!"Invalid baseType for SIMDIntrinsicWidenHi");
+                    result = INS_invalid;
+                    break;
+            }
+            break;
+
         case SIMDIntrinsicShiftLeftInternal:
-            // base type doesn't matter since the entire vector is shifted left
-            result = INS_pslldq;
+            switch (baseType)
+            {
+                case TYP_SIMD16:
+                    // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
+                    result = INS_pslldq;
+                    break;
+                case TYP_UINT:
+                case TYP_INT:
+                    result = INS_pslld;
+                    break;
+                case TYP_SHORT:
+                case TYP_CHAR:
+                case TYP_USHORT:
+                    result = INS_psllw;
+                    break;
+                default:
+                    assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal");
+                    result = INS_invalid;
+                    break;
+            }
             break;
 
         case SIMDIntrinsicShiftRightInternal:
-            // base type doesn't matter since the entire vector is shifted right
-            result = INS_psrldq;
+            switch (baseType)
+            {
+                case TYP_SIMD16:
+                    // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted.
+                    result = INS_psrldq;
+                    break;
+                case TYP_UINT:
+                case TYP_INT:
+                    result = INS_psrld;
+                    break;
+                case TYP_SHORT:
+                case TYP_CHAR:
+                case TYP_USHORT:
+                    result = INS_psrlw;
+                    break;
+                default:
+                    assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal");
+                    result = INS_invalid;
+                    break;
+            }
             break;
 
         case SIMDIntrinsicUpperSave:
@@ -600,9 +737,9 @@ void CodeGen::genSIMDScalarMove(
                 {
                     // There is no guarantee that upper bits of op1Reg are zero.
                     // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
-                    instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+                    instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
                     getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
-                    ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+                    ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
                     getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
                 }
                 else
@@ -700,7 +837,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
             ins                = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
             inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
 
-            ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+            ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
             getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
 
             ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
@@ -871,7 +1008,7 @@ void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
     }
 
     unsigned int baseTypeSize = genTypeSize(baseType);
-    instruction  insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+    instruction  insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
 
     // We will first consume the list items in execution (left to right) order,
     // and record the registers.
@@ -947,6 +1084,681 @@ void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
     genProduceReg(simdNode);
 }
 
+//----------------------------------------------------------------------------------
+// genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float)
+//
+// Arguments:
+//    simdNode - The GT_SIMD node
+//
+// Return Value:
+//    None.
+//
+void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode)
+{
+    SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
+    assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32) ||
+           (intrinsicID == SIMDIntrinsicConvertToUInt32));
+
+    GenTree*  op1       = simdNode->gtGetOp1();
+    var_types baseType  = simdNode->gtSIMDBaseType;
+    regNumber targetReg = simdNode->gtRegNum;
+    assert(targetReg != REG_NA);
+    var_types targetType = simdNode->TypeGet();
+
+    regNumber   op1Reg = genConsumeReg(op1);
+    instruction ins    = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+    if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT)
+    {
+        regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
+        regNumber tmpReg    = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+        regNumber tmpReg2   = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+        assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
+
+        // We will generate the following:
+        //   vmovdqu  tmpReg2, op1Reg           (copy the src and put it into tmpReg2)
+        //   vmovdqu  targetReg, op1Reg         (copy the src and put it into targetReg)
+        //   vpsrld   targetReg, 16             (get upper 16 bits of src and put it into targetReg)
+        //   vpslld   tmpReg2, 16
+        //   vpsrld   tmpReg2, 16               (get lower 16 bits of src and put it into tmpReg2)
+        //   mov      tmpIntReg, 0x5300000053000000
+        //   vmovd    tmpReg, tmpIntReg
+        //   vpbroadcastd tmpReg, tmpReg        (build mask for converting upper 16 bits of src)
+        //   vorps    targetReg, tmpReg
+        //   vsubps   targetReg, tmpReg         (convert upper 16 bits of src and put it into targetReg)
+        //   vcvtdq2ps tmpReg2, tmpReg2         (convert lower 16 bits of src and put it into tmpReg2)
+        //   vaddps   targetReg, tmpReg2        (add upper 16 bits and lower 16 bits)
+        inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType));
+        if (targetReg != op1Reg)
+        {
+            inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType));
+        }
+
+        // prepare upper 16 bits
+        getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), targetReg, 16);
+
+        // prepare lower 16 bits
+        getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16);
+        getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16);
+
+// prepare mask
+#ifdef _TARGET_AMD64_
+        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X5300000053000000);
+        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+        if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+        {
+            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X53000000);
+            inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+        }
+        else
+        {
+            getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X00005300);
+            inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+            getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1);
+            getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3);
+        }
+#endif
+        if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+        {
+            inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+        }
+        else
+        {
+            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType));
+        }
+
+        // convert upper 16 bits
+        inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+        inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+
+        // convert lower 16 bits
+        inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType));
+
+        // add lower 16 bits and upper 16 bits
+        inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType));
+    }
+    else
+    {
+        inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
+    }
+    genProduceReg(simdNode);
+}
+
+//----------------------------------------------------------------------------------
+// genSIMDLo64BitConvert: Generate code to convert lower-most 64-bit item (long <--> double)
+//
+// Arguments:
+//    intrinsicID      the SIMD intrinsic ID
+//    simdType         the SIMD node type
+//    baseType         the base type of value to be converted
+//    tmpReg           the tmp reg
+//    tmpIntReg        the tmp integer reg
+//    targetReg        the target reg
+//
+// Return Value:
+//    None.
+//
+void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID,
+                                    var_types       simdType,
+                                    var_types       baseType,
+                                    regNumber       tmpReg,
+                                    regNumber       tmpIntReg,
+                                    regNumber       targetReg)
+{
+    instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType);
+    if (intrinsicID == SIMDIntrinsicConvertToDouble)
+    {
+        // Note that for mov_xmm2i, the int register is always in the reg2 position
+        inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG);
+        inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType));
+    }
+    else
+    {
+        inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType));
+        inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG);
+    }
+}
+
+//----------------------------------------------------------------------------------
+// genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double)
+//
+// Arguments:
+//    simdNode - The GT_SIMD node
+//
+// Notes:
+//    There are no instructions for converting to/from 64-bit integers, so for these we
+//    do the conversion an element at a time.
+//
+void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode)
+{
+    SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID;
+    assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64) ||
+           (intrinsicID == SIMDIntrinsicConvertToUInt64));
+
+    GenTree*  op1       = simdNode->gtGetOp1();
+    var_types baseType  = simdNode->gtSIMDBaseType;
+    regNumber targetReg = simdNode->gtRegNum;
+    assert(targetReg != REG_NA);
+    var_types      simdType  = simdNode->TypeGet();
+    regNumber      op1Reg    = genConsumeReg(op1);
+    regNumber      tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT);
+    regNumber      tmpReg;
+    regNumber      tmpReg2;
+    regNumber      tmpReg3;
+    InstructionSet iset = compiler->getSIMDInstructionSet();
+
+#ifdef _TARGET_X86_
+    if (baseType == TYP_LONG)
+    {
+        tmpReg  = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+        tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+        tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+        assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg);
+    }
+    else
+#endif
+        if (iset == InstructionSet_AVX || (baseType == TYP_ULONG))
+    {
+        tmpReg  = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+        tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+        tmpReg3 = REG_NA;
+        assert(tmpReg != op1Reg && tmpReg2 != op1Reg);
+    }
+    else
+    {
+        tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+        assert(tmpReg != op1Reg);
+        tmpReg2 = REG_NA;
+        tmpReg3 = REG_NA;
+    }
+
+    if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG))
+    {
+        // We will generate the following
+        //   vmovdqu  tmpReg2, op1Reg               (copy the src and put it into tmpReg2)
+        //   vmovdqu  targetReg, op1Reg             (copy the src and put it into targetReg)
+        //   vpsrlq   targetReg, 32                 (get upper 32 bits of src and put it into targetReg)
+        //   vpsllq   tmpReg2, 32
+        //   vpsrlq   tmpReg2, 32                   (get lower 32 bits of src and put it into tmpReg2)
+        //   mov      tmpIntReg, 0x4530000000000000
+        //   vmovd    tmpReg, tmpIntReg
+        //   vpbroadcastq tmpReg, tmpReg            (build mask for upper 32 bits of src)
+        //   vorpd    targetReg, tmpReg
+        //   vsubpd   targetReg, tmpReg             (convert upper 32 bits of src and put it into targetReg)
+        //   mov      tmpIntReg, 0x4330000000000000
+        //   vmovd    tmpReg, tmpIntReg
+        //   vpbroadcastq tmpReg, tmpReg            (build mask for lower 32 bits of src)
+        //   vorpd    tmpReg2, tmpReg
+        //   vsubpd   tmpReg2, tmpReg               (convert lower 32 bits of src and put it into tmpReg2)
+        //   vaddpd   targetReg, tmpReg2            (add upper 32 bits and lower 32 bits together)
+        inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
+        if (targetReg != op1Reg)
+        {
+            inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType));
+        }
+
+        // prepare upper 32 bits
+        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
+
+        // prepare lower 32 bits
+        getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
+        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
+
+// prepare mask for converting upper 32 bits
+#ifdef _TARGET_AMD64_
+        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4530000000000000);
+        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
+        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+        getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+#endif
+        if (iset == InstructionSet_AVX)
+        {
+            inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+        else
+        {
+            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+
+        // convert upper 32 bits
+        inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+
+// prepare mask for converting lower 32 bits
+#ifdef _TARGET_AMD64_
+        getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4330000000000000);
+        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG);
+#else
+        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
+        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+        getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+#endif
+        if (iset == InstructionSet_AVX)
+        {
+            inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+        else
+        {
+            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+
+        // convert lower 32 bits
+        inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+        inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+
+        // add lower 32 bits and upper 32 bits
+        inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
+    }
+    else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG))
+    {
+#ifdef _TARGET_AMD64_
+        instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+        instruction leftShiftIns  = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+
+        if (iset == InstructionSet_AVX)
+        {
+            // Extract the upper 128 bits (16 bytes) of op1Reg into tmpReg.
+            getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+
+            // Put v[3] (the high-order element) in tmpReg2 and convert it.
+            inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+            getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+            genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
+
+            // Shift the resulting 64-bits left.
+            getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+
+            // Convert v[2], in the lo bits of tmpReg.
+            // For the convert to double, the convert preserves the upper bits in tmpReg2.
+            // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
+            genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2);
+        }
+
+        // Put v[1] in tmpReg.
+        inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
+        getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+        // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
+        genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+
+        // Shift the resulting 64-bits left.
+        getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+        // Convert the lo 64-bits into targetReg
+        genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg);
+
+        // Merge or copy the results (only at this point are we done with op1Reg).
+        if (tmpReg != targetReg)
+        {
+            inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+
+        if (iset == InstructionSet_AVX)
+        {
+            getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01);
+        }
+#else
+        // get the sign bit and put it in tmpReg3
+        inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType));
+        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63);
+        getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63);
+
+        // get the absolute value of src and put it into tmpReg2 and targetReg
+        inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType));
+        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY);
+        getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32);
+        inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
+        inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType));
+        inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType));
+
+        // prepare upper 32 bits
+        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32);
+
+        // prepare lower 32 bits
+        getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32);
+        getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32);
+
+        // prepare mask for converting upper 32 bits
+        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000);
+        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+        getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+
+        if (iset == InstructionSet_AVX)
+        {
+            inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+        else
+        {
+            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+
+        // convert upper 32 bits
+        inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+
+        // prepare mask for converting lower 32 bits
+        getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000);
+        inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT);
+        getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4);
+
+        if (iset == InstructionSet_AVX)
+        {
+            inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+        else
+        {
+            inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+
+        // convert lower 32 bits
+        inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+        inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+
+        // add lower 32 bits and upper 32 bits
+        inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType));
+
+        // add sign bit
+        inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType));
+#endif
+    }
+    else
+    {
+        instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+        instruction leftShiftIns  = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+
+        if (iset == InstructionSet_AVX)
+        {
+            // Extract the upper 128 bits (16 bytes) of op1Reg into tmpReg.
+            getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+
+            // Put v[3] (the high-order element) in tmpReg2 and convert it.
+            inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+            getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+            genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2);
+
+            // Shift the resulting 64-bits left.
+            getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8);
+
+            // Convert v[2], in the lo bits of tmpReg.
+            // For the convert to double, the convert preserves the upper bits in tmpReg2.
+            // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
+            genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+            inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType));
+        }
+
+        // Put v[1] in tmpReg.
+        inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType));
+        getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+        // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it.
+        genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg);
+
+        // Shift the resulting 64-bits left.
+        getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8);
+
+        // Convert the lo 64-bits into targetReg
+        genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg);
+
+        // Merge or copy the results (only at this point are we done with op1Reg).
+        assert(tmpReg != targetReg);
+        inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType));
+        if (iset == InstructionSet_AVX)
+        {
+            getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01);
+        }
+    }
+    genProduceReg(simdNode);
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register
+//
+// Arguments:
+//    simdNode - The GT_SIMD node
+//
+// Notes:
+//    This is used for the WidenHi intrinsic to extract the upper half.
+//    On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes.
+//
+void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg)
+{
+    var_types simdType = simdNode->TypeGet();
+    emitAttr  emitSize = emitActualTypeSize(simdType);
+    if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
+    {
+        instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128;
+        getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01);
+    }
+    else
+    {
+        instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+        if (tgtReg != srcReg)
+        {
+            inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize);
+        }
+        getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8);
+    }
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations
+//
+// Arguments:
+//    simdNode - The GT_SIMD node
+//
+// Notes:
+//    The Widen intrinsics are broken into separate intrinsics for the two results.
+//
+void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode)
+{
+    assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) ||
+           (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi));
+
+    GenTree*  op1       = simdNode->gtGetOp1();
+    var_types baseType  = simdNode->gtSIMDBaseType;
+    regNumber targetReg = simdNode->gtRegNum;
+    assert(targetReg != REG_NA);
+    var_types      simdType = simdNode->TypeGet();
+    InstructionSet iset     = compiler->getSIMDInstructionSet();
+
+    genConsumeOperands(simdNode);
+    regNumber   op1Reg   = op1->gtRegNum;
+    regNumber   srcReg   = op1Reg;
+    emitAttr    emitSize = emitActualTypeSize(simdType);
+    instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+
+    if (baseType == TYP_FLOAT)
+    {
+        if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
+        {
+            genSIMDExtractUpperHalf(simdNode, srcReg, targetReg);
+            srcReg = targetReg;
+        }
+        inst_RV_RV(widenIns, targetReg, srcReg, simdType);
+    }
+    else
+    {
+        // We will generate the following on AVX:
+        // vpermq   targetReg, op1Reg, 0xd4|0xe8
+        // vpxor    tmpReg, tmpReg
+        // vpcmpgt[b|w|d] tmpReg, targetReg             (if basetype is signed)
+        // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg
+        regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+        assert(tmpReg != op1Reg);
+
+        if (iset == InstructionSet_AVX)
+        {
+            // permute op1Reg and put it into targetReg
+            unsigned ival = 0xd4;
+            if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)
+            {
+                ival = 0xe8;
+            }
+            getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival);
+        }
+        else if (targetReg != op1Reg)
+        {
+            inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
+        }
+
+        genSIMDZero(simdType, baseType, tmpReg);
+        if (!varTypeIsUnsigned(baseType))
+        {
+            instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType);
+            inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize);
+        }
+        inst_RV_RV(widenIns, targetReg, tmpReg, simdType);
+    }
+    genProduceReg(simdNode);
+}
+
+//--------------------------------------------------------------------------------
+// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations
+//
+// Arguments:
+//    simdNode - The GT_SIMD node
+//
+// Return Value:
+//    None.
+//
+// Notes:
+//    This intrinsic takes two arguments. The first operand is narrowed to produce the
+//    lower elements of the results, and the second operand produces the high elements.
+//    Narrowing must truncate (not saturate) to match CLR conversion semantics, so the
+//    integer paths below pre-shift each element into range before packing.
+//    The float temp register(s) consumed via ExtractTempReg/GetSingleTempReg are
+//    presumably reserved during lowering/LSRA for this node -- TODO confirm.
+//
+void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode)
+{
+    assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow);
+
+    GenTree*  op1       = simdNode->gtGetOp1();
+    GenTree*  op2       = simdNode->gtGetOp2();
+    var_types baseType  = simdNode->gtSIMDBaseType;
+    regNumber targetReg = simdNode->gtRegNum;
+    assert(targetReg != REG_NA);
+    var_types      simdType = simdNode->TypeGet();
+    emitAttr       emitSize = emitTypeSize(simdType);
+    InstructionSet iset     = compiler->getSIMDInstructionSet();
+
+    genConsumeOperands(simdNode);
+    regNumber op1Reg = op1->gtRegNum;
+    regNumber op2Reg = op2->gtRegNum;
+    if (baseType == TYP_DOUBLE)
+    {
+        regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+        // Narrow each source: cvtpd2ps leaves the converted floats in the low half
+        // of its destination and zeroes the upper half.
+        inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType);
+        inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType);
+        // Now insert the high-order result (in tmpReg) into the upper half of targetReg.
+        if (compiler->canUseAVX())
+        {
+            getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01);
+        }
+        else
+        {
+            // shufps with SHUFFLE_YXYX copies the two low floats of tmpReg into the
+            // two high elements of targetReg, keeping targetReg's own low two floats.
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX);
+        }
+    }
+    else if (varTypeIsLong(baseType))
+    {
+        if (iset == InstructionSet_AVX)
+        {
+            // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg.
+            // We will generate the following:
+            //   vextracti128 tmpReg, op1Reg, 1       (extract elements 2 and 3 into tmpReg)
+            //   vextracti128 tmpReg2, op2Reg, 1      (extract elements 6 and 7 into tmpReg2)
+            //   vinserti128  tmpReg, tmpReg2, 1       (insert elements 6 and 7 into the high half of tmpReg)
+            //   mov          tmpReg2, op1Reg
+            //   vinserti128  tmpReg2, op2Reg, 1      (insert elements 4 and 5 into the high half of tmpReg2)
+            //   pshufd       tmpReg, tmpReg, XXZX    ( -  - 7L 6L  -  - 3L 2L) in tmpReg
+            //   pshufd       tgtReg, tmpReg2, XXZX   ( -  - 5L 4L  -  - 1L 0L) in tgtReg
+            //   punpcklqdq   tgtReg, tmpReg
+            // Note that pshufd and punpcklqdq operate independently on each 128-bit
+            // lane, which is why the inputs are regrouped by the vextracti128 /
+            // vinserti128 sequence first.
+            regNumber tmpReg  = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+            regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+            getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01);
+            getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
+            getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01);
+            inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize);
+            getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01);
+            getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX);
+            getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX);
+            inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize);
+        }
+        else
+        {
+            // We will generate the following:
+            //   pshufd  targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements)
+            //   psrldq  targetReg, 8            (shift them right to get zeros in the high elements)
+            //   pshufd  tmpReg, op2Reg, XXZX    (same as above, but extract into the lower two 32-bit elements)
+            //   pslldq  tmpReg, 8               (now shift these left to get zeros in the low elements)
+            //   por     targetReg, tmpReg
+            regNumber   tmpReg        = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+            instruction shiftLeftIns  = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
+            instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
+            // NOTE(review): this 'emitSize' shadows the function-scope local declared
+            // above; both evaluate emitTypeSize(simdType), so the shadow is redundant
+            // but harmless.
+            emitAttr    emitSize      = emitTypeSize(simdType);
+
+            getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX);
+            getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8);
+            getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX);
+            getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8);
+            // Combine: targetReg has the narrowed op1 elements in the low half and
+            // zeros in the high half; tmpReg has the inverse layout for op2.
+            inst_RV_RV(INS_por, targetReg, tmpReg, simdType);
+        }
+    }
+    else
+    {
+        // We will generate the following:
+        //   mov     targetReg, op1Reg
+        //   mov     tmpReg, op2Reg
+        //   psll?   targetReg, shiftCount
+        //   psrl?   targetReg, shiftCount
+        //   psll?   tmpReg, shiftCount
+        //   psrl?   tmpReg, shiftCount
+        //   <pack>  targetReg, tmpReg
+        // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType),
+        // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to
+        // get CLR type semantics; otherwise it will saturate).
+        //
+        int         shiftCount    = genTypeSize(baseType) * (BITS_IN_BYTE / 2);
+        instruction ins           = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
+        instruction shiftLeftIns  = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+        instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+
+        if (iset == InstructionSet_AVX)
+        {
+            regNumber tmpReg  = simdNode->ExtractTempReg(RBM_ALLFLOAT);
+            regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+            // The AVX instructions generally operate on "lanes", so we have to permute the
+            // inputs so that the destination register has the low 128-bit halves of the two
+            // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs.
+            getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20);
+            getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31);
+            // Shift left then (logical) right by half the element width to zero the
+            // discarded upper bits, so the pack below cannot saturate.
+            getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount);
+            getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount);
+            getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
+            getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount);
+            // Per-lane pack; the vperm2i128s above arranged the lanes so the packed
+            // result lands in the correct element order.
+            inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType));
+        }
+        else
+        {
+            regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT);
+
+            inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize);
+            inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize);
+
+            instruction tmpShiftRight = shiftRightIns;
+            if ((baseType == TYP_INT || baseType == TYP_UINT) && iset == InstructionSet_SSE2)
+            {
+                // Presumably the dword pack available on SSE2 is the signed-saturating
+                // form (the unsigned packusdw is SSE4.1), so use an arithmetic right
+                // shift here: it sign-extends each narrowed value into range so the
+                // signed pack cannot saturate, preserving truncation semantics.
+                // TODO-review: confirm against the Narrow entry in the intrinsic table.
+                tmpShiftRight = INS_psrad;
+            }
+
+            getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount);
+            getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount);
+            getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount);
+            getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount);
+            inst_RV_RV(ins, targetReg, tmpReg, simdType);
+        }
+    }
+    genProduceReg(simdNode);
+}
+
 //--------------------------------------------------------------------------------
 // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
 // add, sub, mul, bit-wise And, AndNot and Or.
@@ -1076,7 +1888,7 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 
         // Extract first and third double word results from tmpReg
         // tmpReg = shuffle(0,0,2,0) of tmpReg
-        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, 0x08);
+        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX);
 
         // targetReg[63:0] = op1[0] * op2[0]
         // targetReg[127:64] = op1[2] * op2[2]
@@ -1085,7 +1897,7 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 
         // Extract first and third double word results from targetReg
         // targetReg = shuffle(0,0,2,0) of targetReg
-        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, 0x08);
+        getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX);
 
         // pack the results into a single vector
         inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
@@ -1125,9 +1937,9 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
         // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
         unsigned shiftCount = 16 - simdNode->gtSIMDSize;
         assert(shiftCount != 0);
-        instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+        instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16);
         getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
-        ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+        ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
         getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
     }
 
@@ -1834,7 +2646,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
 
         if (byteShiftCnt != 0)
         {
-            instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+            instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
             getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
         }
     }
@@ -1904,7 +2716,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
                     inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
                 }
 
-                ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
+                ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
                 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
             }
             else
@@ -2390,6 +3202,27 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
             genSIMDIntrinsicUnOp(simdNode);
             break;
 
+        case SIMDIntrinsicConvertToSingle:
+        case SIMDIntrinsicConvertToInt32:
+        case SIMDIntrinsicConvertToUInt32:
+            genSIMDIntrinsic32BitConvert(simdNode);
+            break;
+
+        case SIMDIntrinsicConvertToDouble:
+        case SIMDIntrinsicConvertToInt64:
+        case SIMDIntrinsicConvertToUInt64:
+            genSIMDIntrinsic64BitConvert(simdNode);
+            break;
+
+        case SIMDIntrinsicWidenLo:
+        case SIMDIntrinsicWidenHi:
+            genSIMDIntrinsicWiden(simdNode);
+            break;
+
+        case SIMDIntrinsicNarrow:
+            genSIMDIntrinsicNarrow(simdNode);
+            break;
+
         case SIMDIntrinsicAdd:
         case SIMDIntrinsicSub:
         case SIMDIntrinsicMul:
index 0160582..2eb4df3 100644 (file)
@@ -119,6 +119,23 @@ SIMD_INTRINSIC("ConditionalSelect",         false,       Select,
 // Cast
 SIMD_INTRINSIC("op_Explicit",               false,       Cast,                     "Cast",                   TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 
+// Convert int/uint to single
+SIMD_INTRINSIC("ConvertToSingle",           false,       ConvertToSingle,          "ConvertToSingle",        TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_INT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert long/ulong to double
+SIMD_INTRINSIC("ConvertToDouble",           false,       ConvertToDouble,          "ConvertToDouble",        TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_LONG, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert single to int
+SIMD_INTRINSIC("ConvertToInt32",            false,       ConvertToInt32,           "ConvertToInt32",         TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert single to uint
+SIMD_INTRINSIC("ConvertToUInt32",           false,       ConvertToUInt32,          "ConvertToUInt32",        TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert double to long
+SIMD_INTRINSIC("ConvertToInt64",            false,       ConvertToInt64,           "ConvertToInt64",         TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Convert double to ulong
+SIMD_INTRINSIC("ConvertToUInt64",           false,       ConvertToUInt64,          "ConvertToUInt64",        TYP_STRUCT,     1,      {TYP_STRUCT, TYP_UNDEF,  TYP_UNDEF},   {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Narrow two input Vector<T>s to a single Vector<T>. The return value's lower elements are the elements from src1, and the upper elements are from src2.
+SIMD_INTRINSIC("Narrow",                    false,       Narrow,                   "Narrow",                 TYP_STRUCT,     2,      {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF},   {TYP_INT, TYP_DOUBLE, TYP_LONG, TYP_CHAR, TYP_SHORT, TYP_UINT, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Widen one input Vector<T> to two Vector<T>s: dest1 contains the lower half of elements in src, and dest2 contains the upper half of elements in src.
+SIMD_INTRINSIC("Widen",                     false,       Widen,                    "Widen",                  TYP_VOID,       3,      {TYP_STRUCT, TYP_BYREF,  TYP_BYREF},   {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+
 // Miscellaneous
 SIMD_INTRINSIC("get_IsHardwareAccelerated", false,       HWAccel,                  "HWAccel",                TYP_BOOL,       0,      {TYP_UNDEF,  TYP_UNDEF,  TYP_UNDEF},   {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
 
@@ -134,7 +151,11 @@ SIMD_INTRINSIC("ShiftRightInternal",        false,       ShiftRightInternal,
 SIMD_INTRINSIC("UpperSave",                 false,       UpperSave,                "UpperSave Internal",     TYP_STRUCT,     2,      {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF},     {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
 SIMD_INTRINSIC("UpperRestore",              false,       UpperRestore,             "UpperRestore Internal",  TYP_STRUCT,     2,      {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF},     {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
 
-SIMD_INTRINSIC(nullptr,                     false,       Invalid,                  "Invalid",                TYP_UNDEF,      0,      {TYP_UNDEF,  TYP_UNDEF,  TYP_UNDEF},   {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+// Internal intrinsics for Widen
+SIMD_INTRINSIC("WidenHi",                   false,       WidenHi,                   "WidenHi",               TYP_VOID,       2,      {TYP_UNDEF, TYP_UNDEF,  TYP_UNDEF},    {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+SIMD_INTRINSIC("WidenLo",                   false,       WidenLo,                   "WidenLo",               TYP_VOID,       2,      {TYP_UNDEF, TYP_UNDEF,  TYP_UNDEF},    {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
+
+SIMD_INTRINSIC(nullptr,                     false,       Invalid,                   "Invalid",               TYP_UNDEF,      0,      {TYP_UNDEF,  TYP_UNDEF,  TYP_UNDEF},   {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
 #undef SIMD_INTRINSIC
 
 #else //_TARGET_XARCH_
index 6c65b22..c2e4eb1 100644 (file)
@@ -559,6 +559,21 @@ partial class VectorTest
                 returnVal = Fail;
             }
         }
+
+        JitLog jitLog = new JitLog();        
+        if (!jitLog.Check("System.Numerics.Vector:ConvertToInt32(struct):struct")) returnVal = Fail;
+        if (!jitLog.Check("System.Numerics.Vector:ConvertToUInt32(struct):struct")) returnVal = Fail;
+        if (!jitLog.Check("System.Numerics.Vector:ConvertToSingle(struct):struct")) returnVal = Fail;
+        // Note: SIMD Conversion to Int64/UInt64 is not supported on x86
+#if !BIT32
+        if (!jitLog.Check("System.Numerics.Vector:ConvertToInt64(struct):struct")) returnVal = Fail;
+        if (!jitLog.Check("System.Numerics.Vector:ConvertToUInt64(struct):struct")) returnVal = Fail;
+#endif // !BIT32
+        if (!jitLog.Check("System.Numerics.Vector:ConvertToDouble(struct):struct")) returnVal = Fail;
+        if (!jitLog.Check("System.Numerics.Vector:Narrow(struct,struct):struct")) returnVal = Fail;
+        if (!jitLog.Check("System.Numerics.Vector:Widen(struct,byref,byref)")) returnVal = Fail;
+        jitLog.Dispose();
+
         return returnVal;
     }
 }
index 01231e2..db6fb24 100644 (file)
@@ -14,6 +14,9 @@
     <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
     <NuGetPackageImportStamp>7a9bfb7d</NuGetPackageImportStamp>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(BuildArch)' == 'x86'">
+    <DefineConstants>BIT32;$(DefineConstants)</DefineConstants>
+  </PropertyGroup>
   <!-- Default configurations to help VS understand the configurations -->
   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />
index f751b88..82206ef 100644 (file)
@@ -14,6 +14,9 @@
     <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
     <NuGetPackageImportStamp>7a9bfb7d</NuGetPackageImportStamp>
   </PropertyGroup>
+  <PropertyGroup Condition="'$(BuildArch)' == 'x86'">
+    <DefineConstants>BIT32;$(DefineConstants)</DefineConstants>
+  </PropertyGroup>
   <!-- Default configurations to help VS understand the configurations -->
   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "></PropertyGroup>
   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' " />