From: helloguo Date: Mon, 3 Apr 2017 17:28:09 +0000 (-0700) Subject: add jit intrinsic support for vector conversion/narrow/widen on AMD64 and x86, except... X-Git-Tag: accepted/tizen/base/20180629.140029~1083^2~709^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=965d5eea7f9b2aac3e6bd5a2f0061b926b7c5f8c;p=platform%2Fupstream%2Fcoreclr.git add jit intrinsic support for vector conversion/narrow/widen on AMD64 and x86, except double->long/ulong conversion on x86 --- diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h index 3bd0eac..5cead6d 100644 --- a/src/jit/codegenlinear.h +++ b/src/jit/codegenlinear.h @@ -80,6 +80,17 @@ void genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode); void genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode); void genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode); void genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode); +void genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID, + var_types simdType, + var_types baseType, + regNumber tmpReg, + regNumber tmpIntReg, + regNumber targetReg); +void genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode); +void genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode); +void genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode); +void genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg); +void genSIMDIntrinsicWiden(GenTreeSIMD* simdNode); void genSIMDIntrinsic(GenTreeSIMD* simdNode); void genSIMDCheck(GenTree* treeNode); diff --git a/src/jit/emitfmtsxarch.h b/src/jit/emitfmtsxarch.h index 49afcb5..6d15fcf 100644 --- a/src/jit/emitfmtsxarch.h +++ b/src/jit/emitfmtsxarch.h @@ -109,7 +109,7 @@ IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w reg , r/w re IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW, SCNS) // r/w reg , r/w reg2 , const IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg , read reg2 , read reg3 - +IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg , read reg2 , read reg3, const 
//---------------------------------------------------------------------------- // The following formats are used for direct addresses (e.g. static data members) //---------------------------------------------------------------------------- diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 7608130..659c260 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -94,7 +94,10 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins) ins == INS_vinsertf128 || ins == INS_punpckldq || ins == INS_phaddd || ins == INS_pminub || ins == INS_pminsw || ins == INS_pminsb || ins == INS_pminsd || ins == INS_pminuw || ins == INS_pminud || ins == INS_pmaxub || ins == INS_pmaxsw || ins == INS_pmaxsb || ins == INS_pmaxsd || ins == INS_pmaxuw || - ins == INS_pmaxud); + ins == INS_pmaxud || ins == INS_vinserti128 || ins == INS_punpckhbw || ins == INS_punpcklbw || + ins == INS_punpckhqdq || ins == INS_punpcklqdq || ins == INS_punpckhwd || ins == INS_punpcklwd || + ins == INS_punpckhdq || ins == INS_packssdw || ins == INS_packsswb || ins == INS_packuswb || + ins == INS_packusdw || ins == INS_vperm2i128); } // Returns true if the AVX instruction is a move operator that requires 3 operands. @@ -105,8 +108,8 @@ bool emitter::IsThreeOperandBinaryAVXInstruction(instruction ins) // to indicate whether a 3-operand instruction. 
bool emitter::IsThreeOperandMoveAVXInstruction(instruction ins) { - return IsAVXInstruction(ins) && - (ins == INS_movlpd || ins == INS_movlps || ins == INS_movhpd || ins == INS_movhps || ins == INS_movss); + return IsAVXInstruction(ins) && (ins == INS_movlpd || ins == INS_movlps || ins == INS_movhpd || ins == INS_movhps || + ins == INS_movss || ins == INS_movlhps); } // ------------------------------------------------------------------------------ @@ -206,6 +209,14 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att // Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix bool TakesRexWPrefix(instruction ins, emitAttr attr) { + // Because the current implementation of AVX does not have a way to distinguish between the register + // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are + // required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE), + // and here we must special case these by the opcode. + if (ins == INS_vpermq) + { + return true; + } #ifdef _TARGET_AMD64_ // movsx should always sign extend out to 8 bytes just because we don't track // whether the dest should be 4 bytes or 8 bytes (attr indicates the size @@ -342,7 +353,6 @@ unsigned RegEncoding(regNumber reg) // AVX: specific bits within VEX prefix need to be set in bit-inverted form. emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) { -#ifdef _TARGET_AMD64_ if (UseAVX() && IsAVXInstruction(ins)) { // W-bit is available only in 3-byte VEX prefix that starts with byte C4. @@ -351,7 +361,7 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) // W-bit is the only bit that is added in non bit-inverted form. 
return code | 0x00008000000000ULL; } - +#ifdef _TARGET_AMD64_ return code | 0x4800000000ULL; #else assert(!"UNREACHED"); @@ -3810,6 +3820,40 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, emitCurIGsize += sz; } +/********************************************************************************** +* emitIns_R_R_R_I: Add an instruction with three register operands and an immediate. +* +* Arguments: +* ins - the instruction to add +* attr - the emitter attribute for instruction +* targetReg - the target (destination) register +* reg1 - the first source register +* reg2 - the second source register +* ival - the immediate value +*/ + +void emitter::emitIns_R_R_R_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, int ival) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + // Currently vex prefix only use three bytes mode. + // size = vex + opcode + ModR/M + 1-byte-cns = 3 + 1 + 1 + 1 = 6 + // TODO-XArch-CQ: We should create function which can calculate all kinds of AVX instructions size in future + UNATIVE_OFFSET sz = 6; + + instrDesc* id = emitNewInstrCns(attr, ival); + id->idIns(ins); + id->idInsFmt(IF_RWR_RRD_RRD_CNS); + id->idReg1(targetReg); + id->idReg2(reg1); + id->idReg3(reg2); + + id->idCodeSize(sz); + dispIns(id); + emitCurIGsize += sz; +} + #endif /***************************************************************************** * @@ -6995,6 +7039,15 @@ void emitter::emitDispIns( printf("%s, ", emitRegName(id->idReg2(), attr)); printf("%s", emitRegName(id->idReg3(), attr)); break; + case IF_RWR_RRD_RRD_CNS: + assert(IsAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + printf("%s, ", emitRegName(id->idReg1(), attr)); + printf("%s, ", emitRegName(id->idReg2(), attr)); + printf("%s, ", emitRegName(id->idReg3(), attr)); + val = emitGetInsSC(id); + goto PRINT_CONSTANT; + break; #endif case IF_RRW_RRW_CNS: printf("%s,", 
emitRegName(id->idReg1(), attr)); @@ -9514,7 +9567,34 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); assert(valInByte); - assert(ins == INS_psrldq || ins == INS_pslldq); + // The left and right shifts use the same encoding, and are distinguished by the Reg/Opcode field. + regNumber regOpcode; + switch (ins) + { + case INS_psrldq: + regOpcode = (regNumber)3; + break; + case INS_pslldq: + regOpcode = (regNumber)7; + break; + case INS_psrld: + case INS_psrlw: + case INS_psrlq: + regOpcode = (regNumber)2; + break; + case INS_pslld: + case INS_psllw: + case INS_psllq: + regOpcode = (regNumber)6; + break; + case INS_psrad: + regOpcode = (regNumber)4; + break; + default: + assert(!"Invalid instruction for SSE2 instruction of the form: opcode reg, immed8"); + regOpcode = REG_NA; + break; + } // Get the 'base' opcode. code = insCodeMI(ins); @@ -9528,14 +9608,6 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) code = insEncodeReg3456(ins, reg, size, code); } - // In case of psrldq - // Reg/Opcode = 3 - // R/M = reg1 - // - // In case of pslldq - // Reg/Opcode = 7 - // R/M = reg1 - regNumber regOpcode = (regNumber)((ins == INS_psrldq) ? 
3 : 7); unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8; // Output the REX prefix @@ -10659,6 +10731,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst = emitOutputRRR(dst, id); sz = emitSizeOfInsDsc(id); break; + case IF_RWR_RRD_RRD_CNS: + dst = emitOutputRRR(dst, id); + sz = emitSizeOfInsDsc(id); + dst += emitOutputByte(dst, emitGetInsSC(id)); + break; #endif case IF_RRW_RRW_CNS: @@ -10690,6 +10767,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(code & 0x00FF0000); #ifdef FEATURE_AVX_SUPPORT + if (TakesRexWPrefix(ins, size)) + { + code = AddRexWPrefix(ins, code); + } + if (TakesVexPrefix(ins)) { if (IsThreeOperandBinaryAVXInstruction(ins)) @@ -10718,11 +10800,16 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (Is4ByteAVXInstruction(ins)) { - // We just need to output the last byte of the opcode. assert((code & 0xFF) == 0); - assert((code & 0xFF00) != 0xC000); - dst += emitOutputByte(dst, (code >> 8) & 0xFF); - code = 0; + if ((code & 0xFF00) == 0xC000) + { + dst += emitOutputWord(dst, code | regcode); + } + else + { + dst += emitOutputByte(dst, (code >> 8) & 0xFF); + dst += emitOutputByte(dst, 0xC0 | (regcode >> 8)); + } } else if (code & 0xFF000000) { @@ -10732,27 +10819,25 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (Is4ByteSSE4Instruction(ins)) { dst += emitOutputWord(dst, code); - code = 0; + dst += emitOutputByte(dst, 0xC0 | (regcode >> 8)); + } + else + { + assert((code & 0xFF00) == 0xC000); + dst += emitOutputWord(dst, code | regcode); } } else if (code & 0x00FF0000) { dst += emitOutputByte(dst, code >> 16); code &= 0x0000FFFF; - } - - // Note that regcode is shifted by 8-bits above to align with RM byte. - if (code != 0) - { assert((code & 0xFF00) == 0xC000); dst += emitOutputWord(dst, code | regcode); } else { - // This case occurs for SSE4/AVX instructions. 
- // Note that regcode is left shifted by 8-bits. - assert(Is4ByteAVXInstruction(ins) || Is4ByteSSE4Instruction(ins)); - dst += emitOutputByte(dst, 0xC0 | (regcode >> 8)); + assert((code & 0xFF00) == 0xC000); + dst += emitOutputWord(dst, code | regcode); } dst += emitOutputByte(dst, emitGetInsSC(id)); diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index faeba7d..d439f7e 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -360,6 +360,8 @@ void emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg #ifdef FEATURE_AVX_SUPPORT void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); + +void emitIns_R_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, int ival); #endif void emitIns_S(instruction ins, emitAttr attr, int varx, int offs); diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index 729bece..0952770 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -200,10 +200,13 @@ INST3( movapd, "movapd" , 0, IUM_WR, 0, 0, PCKDBL(0x29), BAD_CODE, PCK INST3( movaps, "movaps" , 0, IUM_WR, 0, 0, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28)) INST3( movupd, "movupd" , 0, IUM_WR, 0, 0, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10)) INST3( movups, "movups" , 0, IUM_WR, 0, 0, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10)) +INST3( movlhps, "movlhps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x16)) INST3( shufps, "shufps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0xC6)) INST3( shufpd, "shufpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC6)) - + +INST3( punpckhdq, "punpckhdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6A)) + // SSE 2 arith INST3( addps, "addps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x58)) // Add packed singles INST3( addss, "addss", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSEFLT(0x58)) // Add scalar singles @@ -289,8 +292,19 @@ INST3( pand, "pand" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, INST3( pandn, "pandn" , 0, IUM_WR, 0, 0, 
BAD_CODE, BAD_CODE, PCKDBL(0xDF)) // Packed bit-wise AND NOT of two xmm regs INST3( por, "por" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEB)) // Packed bit-wise OR of two xmm regs INST3( pxor, "pxor" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEF)) // Packed bit-wise XOR of two xmm regs + +// Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode, +// which is handled in emitxarch.cpp. INST3( psrldq, "psrldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift right logical of xmm reg by given number of bytes INST3( pslldq, "pslldq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Shift left logical of xmm reg by given number of bytes +INST3( psllq, "psllq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift left logical of 64-bit integers +INST3( psrlq, "psrlq" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x73), BAD_CODE ) // Packed shift right logical of 64-bit integers +INST3( pslld, "pslld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift left logical of 32-bit integers +INST3( psrld, "psrld" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right logical of 32-bit integers +INST3( psllw, "psllw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift left logical of 16-bit integers +INST3( psrlw, "psrlw" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x71), BAD_CODE ) // Packed shift right logical of 16-bit integers +INST3( psrad, "psrad" , 0, IUM_WR, 0, 0, BAD_CODE, PCKDBL(0x72), BAD_CODE ) // Packed shift right arithmetic of 32-bit integers + INST3( pmaxub, "pmaxub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDE)) // packed maximum unsigned bytes INST3( pminub, "pminub" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xDA)) // packed minimum unsigned bytes INST3( pmaxsw, "pmaxsw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xEE)) // packed maximum signed words @@ -306,14 +320,24 @@ INST3( pshufd, "pshufd" 
, 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, INST3( pextrw, "pextrw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC5)) // Extract 16-bit value into a r32 with zero extended to 32-bits INST3( pinsrw, "pinsrw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0xC4)) // packed insert word +INST3( punpckhbw, "punpckhbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x68)) // Packed logical (unsigned) widen ubyte to ushort (hi) +INST3( punpcklbw, "punpcklbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x60)) // Packed logical (unsigned) widen ubyte to ushort (lo) +INST3( punpckhqdq, "punpckhqdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6D)) // Packed logical (unsigned) widen uint to ulong (hi) +INST3( punpcklqdq, "punpcklqdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6C)) // Packed logical (unsigned) widen uint to ulong (lo) +INST3( punpckhwd, "punpckhwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x69)) // Packed logical (unsigned) widen ushort to uint (hi) +INST3( punpcklwd, "punpcklwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x61)) // Packed logical (unsigned) widen ushort to uint (lo) + +INST3( packssdw, "packssdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x6B)) // Pack (narrow) int to short with saturation +INST3( packsswb, "packsswb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x63)) // Pack (narrow) short to byte with saturation +INST3( packuswb, "packuswb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x67)) // Pack (narrow) short to unsigned byte with saturation #endif // !LEGACY_BACKEND INST3(LAST_SSE2_INSTRUCTION, "LAST_SSE2_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) #ifndef LEGACY_BACKEND INST3(FIRST_SSE4_INSTRUCTION, "FIRST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) // enum name FP updmode rf wf MR MI RM -INST3( dpps, "dpps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x40)) // Packed bit-wise AND NOT of two xmm regs -INST3( dppd, "dppd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x41)) // 
Packed bit-wise AND NOT of two xmm regs +INST3( dpps, "dpps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x40)) // Packed dot product of two float vector regs +INST3( dppd, "dppd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x41)) // Packed dot product of two double vector regs INST3( insertps, "insertps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x21)) // Insert packed single precision float value INST3( pcmpeqq, "pcmpeqq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x29)) // Packed compare 64-bit integers for equality INST3( pcmpgtq, "pcmpgtq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x37)) // Packed compare 64-bit integers for equality @@ -331,6 +355,11 @@ INST3( pmaxsb, "pmaxsb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( pmaxsd, "pmaxsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3D)) // packed maximum 32-bit signed integers INST3( pmaxuw, "pmaxuw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3E)) // packed maximum 16-bit unsigned integers INST3( pmaxud, "pmaxud" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x3F)) // packed maximum 32-bit unsigned integers +INST3( pmovsxbw, "pmovsxbw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x20)) // Packed sign extend byte to short +INST3( pmovsxwd, "pmovsxwd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x23)) // Packed sign extend short to int +INST3( pmovsxdq, "pmovsxdq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x25)) // Packed sign extend int to long +INST3( packusdw, "packusdw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x2B)) // Pack (narrow) int to unsigned short with saturation + INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) @@ -342,9 +371,12 @@ INST3( vpbroadcastw, "pbroadcastw" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( vpbroadcastd, "pbroadcastd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x58)) // Broadcast int32 value 
from reg/memory to entire ymm register INST3( vpbroadcastq, "pbroadcastq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x59)) // Broadcast int64 value from reg/memory to entire ymm register INST3( vextractf128, "extractf128" , 0, IUM_WR, 0, 0, SSE3A(0x19), BAD_CODE, BAD_CODE) // Extract 128-bit packed floating point values +INST3( vextracti128, "extracti128" , 0, IUM_WR, 0, 0, SSE3A(0x39), BAD_CODE, BAD_CODE) // Extract 128-bit packed integer values INST3( vinsertf128, "insertf128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x18)) // Insert 128-bit packed floating point values +INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x38)) // Insert 128-bit packed integer values INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) - +INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register +INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit of input register INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) #endif // !LEGACY_BACKEND // enum name FP updmode rf wf R/M,R/M[reg] R/M,icon diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 002e3d8..987ac72 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2676,6 +2676,90 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) info->srcCount = 1; break; + case SIMDIntrinsicConvertToSingle: + info->srcCount = 1; + if (simdTree->gtSIMDBaseType == TYP_UINT) + { + // We need an internal register different from targetReg. 
+ info->isInternalRegDelayFree = true; + info->internalIntCount = 1; + info->internalFloatCount = 2; + info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT)); + } + break; + + case SIMDIntrinsicConvertToUInt32: + case SIMDIntrinsicConvertToInt32: + info->srcCount = 1; + break; + + case SIMDIntrinsicWidenLo: + case SIMDIntrinsicWidenHi: + info->srcCount = 1; + if (varTypeIsIntegral(simdTree->gtSIMDBaseType)) + { + // We need an internal register different from targetReg. + info->isInternalRegDelayFree = true; + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } + break; + + case SIMDIntrinsicConvertToInt64: + case SIMDIntrinsicConvertToUInt64: + // We need an internal register different from targetReg. + info->isInternalRegDelayFree = true; + info->srcCount = 1; + info->internalIntCount = 1; + if (comp->getSIMDInstructionSet() == InstructionSet_AVX) + { + info->internalFloatCount = 2; + } + else + { + info->internalFloatCount = 1; + } + info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT)); + break; + + case SIMDIntrinsicConvertToDouble: + // We need an internal register different from targetReg. + info->isInternalRegDelayFree = true; + info->srcCount = 1; + info->internalIntCount = 1; +#ifdef _TARGET_X86_ + if (simdTree->gtSIMDBaseType == TYP_LONG) + { + info->internalFloatCount = 3; + } + else +#endif + if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) || (simdTree->gtSIMDBaseType == TYP_ULONG)) + { + info->internalFloatCount = 2; + } + else + { + info->internalFloatCount = 1; + } + info->setInternalCandidates(lsra, lsra->allSIMDRegs() | lsra->allRegs(TYP_INT)); + break; + + case SIMDIntrinsicNarrow: + // We need an internal register different from targetReg. 
+ info->isInternalRegDelayFree = true; + info->srcCount = 2; + if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) && (simdTree->gtSIMDBaseType != TYP_DOUBLE)) + { + info->internalFloatCount = 2; + } + else + { + info->internalFloatCount = 1; + } + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + break; + case SIMDIntrinsicShuffleSSE2: info->srcCount = 2; // Second operand is an integer constant and marked as contained. diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp index 4ba7832..bbb9a57 100644 --- a/src/jit/simd.cpp +++ b/src/jit/simd.cpp @@ -2609,6 +2609,10 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, // Unary operators that take and return a Vector. case SIMDIntrinsicCast: + case SIMDIntrinsicConvertToSingle: + case SIMDIntrinsicConvertToDouble: + case SIMDIntrinsicConvertToInt32: + case SIMDIntrinsicConvertToUInt32: { op1 = impSIMDPopStack(simdType, instMethod); @@ -2617,6 +2621,61 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE opcode, } break; + case SIMDIntrinsicConvertToInt64: + case SIMDIntrinsicConvertToUInt64: + { +#ifdef _TARGET_AMD64_ + op1 = impSIMDPopStack(simdType, instMethod); + + simdTree = gtNewSIMDNode(simdType, op1, nullptr, simdIntrinsicID, baseType, size); + retVal = simdTree; +#else + JITDUMP("SIMD Conversion to Int64/UInt64 is not supported on this platform\n"); + return nullptr; +#endif + } + break; + + case SIMDIntrinsicNarrow: + { + assert(!instMethod); + op2 = impSIMDPopStack(simdType); + op1 = impSIMDPopStack(simdType); + // op1 and op2 are two input Vector. + simdTree = gtNewSIMDNode(simdType, op1, op2, simdIntrinsicID, baseType, size); + retVal = simdTree; + } + break; + + case SIMDIntrinsicWiden: + { + GenTree* dstAddrHi = impSIMDPopStack(TYP_BYREF); + GenTree* dstAddrLo = impSIMDPopStack(TYP_BYREF); + op1 = impSIMDPopStack(simdType); + GenTree* dupOp1 = fgInsertCommaFormTemp(&op1, gtGetStructHandleForSIMD(simdType, baseType)); + + // Widen the lower half and assign it to dstAddrLo. 
+ simdTree = gtNewSIMDNode(simdType, op1, nullptr, SIMDIntrinsicWidenLo, baseType, size); + GenTree* loDest = + new (this, GT_BLK) GenTreeBlk(GT_BLK, simdType, dstAddrLo, getSIMDTypeSizeInBytes(clsHnd)); + GenTree* loAsg = gtNewBlkOpNode(loDest, simdTree, getSIMDTypeSizeInBytes(clsHnd), + false, // not volatile + true); // copyBlock + loAsg->gtFlags |= ((simdTree->gtFlags | dstAddrLo->gtFlags) & GTF_ALL_EFFECT); + + // Widen the upper half and assign it to dstAddrHi. + simdTree = gtNewSIMDNode(simdType, dupOp1, nullptr, SIMDIntrinsicWidenHi, baseType, size); + GenTree* hiDest = + new (this, GT_BLK) GenTreeBlk(GT_BLK, simdType, dstAddrHi, getSIMDTypeSizeInBytes(clsHnd)); + GenTree* hiAsg = gtNewBlkOpNode(hiDest, simdTree, getSIMDTypeSizeInBytes(clsHnd), + false, // not volatile + true); // copyBlock + hiAsg->gtFlags |= ((simdTree->gtFlags | dstAddrHi->gtFlags) & GTF_ALL_EFFECT); + + retVal = gtNewOperNode(GT_COMMA, simdType, loAsg, hiAsg); + } + break; + case SIMDIntrinsicHWAccel: { GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, 1); diff --git a/src/jit/simd.h b/src/jit/simd.h index c4a8866..ff522fd 100644 --- a/src/jit/simd.h +++ b/src/jit/simd.h @@ -33,12 +33,16 @@ struct SIMDIntrinsicInfo // SSE2 Shuffle control byte to shuffle vector // These correspond to shuffle immediate byte in shufps SSE2 instruction. 
#define SHUFFLE_XXXX 0x00 // 00 00 00 00 +#define SHUFFLE_XXZX 0x08 // 00 00 10 00 #define SHUFFLE_XXWW 0x0F // 00 00 11 11 #define SHUFFLE_XYZW 0x1B // 00 01 10 11 #define SHUFFLE_YXYX 0x44 // 01 00 01 00 +#define SHUFFLE_YWXZ 0x72 // 01 11 00 10 #define SHUFFLE_YYZZ 0x5A // 01 01 10 10 +#define SHUFFLE_ZXXX 0x80 // 10 00 00 00 #define SHUFFLE_ZXXY 0x81 // 10 00 00 01 #define SHUFFLE_ZWXY 0xB1 // 10 11 00 01 +#define SHUFFLE_WYZX 0xD8 // 11 01 10 00 #define SHUFFLE_WWYY 0xF5 // 11 11 01 01 #define SHUFFLE_ZZXX 0xA0 // 10 10 00 00 #endif diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp index 940ba5f..a28c652 100644 --- a/src/jit/simdcodegenxarch.cpp +++ b/src/jit/simdcodegenxarch.cpp @@ -487,14 +487,151 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type result = INS_movaps; break; + case SIMDIntrinsicConvertToSingle: + result = INS_cvtdq2ps; + break; + + case SIMDIntrinsicConvertToDouble: + assert(baseType == TYP_LONG); + result = INS_cvtsi2sd; + break; + + case SIMDIntrinsicConvertToInt32: + case SIMDIntrinsicConvertToUInt32: + assert(baseType == TYP_FLOAT); + result = INS_cvttps2dq; + break; + + case SIMDIntrinsicConvertToInt64: + case SIMDIntrinsicConvertToUInt64: + assert(baseType == TYP_DOUBLE); + result = INS_cvttsd2si; + break; + + case SIMDIntrinsicNarrow: + // Note that for the integer types the caller must zero the upper bits of + // each source element, since the instructions saturate. 
+ switch (baseType) + { + case TYP_INT: + case TYP_UINT: + if (compiler->getSIMDInstructionSet() >= InstructionSet_SSE3_4) + { + result = INS_packusdw; + } + else + { + result = INS_packssdw; + } + break; + case TYP_SHORT: + case TYP_CHAR: + result = INS_packuswb; + break; + default: + assert(!"Invalid baseType for SIMDIntrinsicNarrow"); + result = INS_invalid; + break; + } + break; + + case SIMDIntrinsicWidenLo: + // Some of these have multiple instruction implementations, with one instruction to widen the lo half, + // and another to widen the hi half. + switch (baseType) + { + case TYP_FLOAT: + result = INS_cvtps2pd; + break; + case TYP_INT: + case TYP_UINT: + result = INS_punpckldq; + break; + case TYP_SHORT: + case TYP_CHAR: + result = INS_punpcklwd; + break; + case TYP_BYTE: + case TYP_UBYTE: + result = INS_punpcklbw; + break; + default: + assert(!"Invalid baseType for SIMDIntrinsicWidenLo"); + result = INS_invalid; + break; + } + break; + + case SIMDIntrinsicWidenHi: + switch (baseType) + { + case TYP_FLOAT: + // For this case, we actually use the same instruction. + result = INS_cvtps2pd; + break; + case TYP_INT: + case TYP_UINT: + result = INS_punpckhdq; + break; + case TYP_SHORT: + case TYP_CHAR: + result = INS_punpckhwd; + break; + case TYP_BYTE: + case TYP_UBYTE: + result = INS_punpckhbw; + break; + default: + assert(!"Invalid baseType for SIMDIntrinsicWidenHi"); + result = INS_invalid; + break; + } + break; + case SIMDIntrinsicShiftLeftInternal: - // base type doesn't matter since the entire vector is shifted left - result = INS_pslldq; + switch (baseType) + { + case TYP_SIMD16: + // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted. 
+ result = INS_pslldq; + break; + case TYP_UINT: + case TYP_INT: + result = INS_pslld; + break; + case TYP_SHORT: + case TYP_CHAR: + case TYP_USHORT: + result = INS_psllw; + break; + default: + assert(!"Invalid baseType for SIMDIntrinsicShiftLeftInternal"); + result = INS_invalid; + break; + } break; case SIMDIntrinsicShiftRightInternal: - // base type doesn't matter since the entire vector is shifted right - result = INS_psrldq; + switch (baseType) + { + case TYP_SIMD16: + // For SSE2, entire vector is shifted, for AVX2, 16-byte chunks are shifted. + result = INS_psrldq; + break; + case TYP_UINT: + case TYP_INT: + result = INS_psrld; + break; + case TYP_SHORT: + case TYP_CHAR: + case TYP_USHORT: + result = INS_psrlw; + break; + default: + assert(!"Invalid baseType for SIMDIntrinsicShiftRightInternal"); + result = INS_invalid; + break; + } break; case SIMDIntrinsicUpperSave: @@ -600,9 +737,9 @@ void CodeGen::genSIMDScalarMove( { // There is no guarantee that upper bits of op1Reg are zero. // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes. 
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); + instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); - ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); + ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12); } else @@ -700,7 +837,7 @@ void CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode) ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT); inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT)); - ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); + ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType); @@ -871,7 +1008,7 @@ void CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode) } unsigned int baseTypeSize = genTypeSize(baseType); - instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); + instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); // We will first consume the list items in execution (left to right) order, // and record the registers. @@ -947,6 +1084,681 @@ void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode) genProduceReg(simdNode); } +//---------------------------------------------------------------------------------- +// genSIMDIntrinsic32BitConvert: Generate code for 32-bit SIMD Convert (int/uint <-> float) +// +// Arguments: +// simdNode - The GT_SIMD node +// +// Return Value: +// None. 
+// +void CodeGen::genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode) +{ + SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID; + assert((intrinsicID == SIMDIntrinsicConvertToSingle) || (intrinsicID == SIMDIntrinsicConvertToInt32) || + (intrinsicID == SIMDIntrinsicConvertToUInt32)); + + GenTree* op1 = simdNode->gtGetOp1(); + var_types baseType = simdNode->gtSIMDBaseType; + regNumber targetReg = simdNode->gtRegNum; + assert(targetReg != REG_NA); + var_types targetType = simdNode->TypeGet(); + + regNumber op1Reg = genConsumeReg(op1); + instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); + if (intrinsicID == SIMDIntrinsicConvertToSingle && baseType == TYP_UINT) + { + regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT); + regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); + regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + assert(tmpReg != op1Reg && tmpReg2 != op1Reg); + + // We will generate the following: + // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2) + // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg) + // vpsrld targetReg, 16 (get upper 16 bits of src and put it into targetReg) + // vpslld tmpReg2, 16 + // vpsrld tmpReg2, 16 (get lower 16 bits of src and put it into tmpReg2) + // mov tmpIntReg, 0x5300000053000000 + // vmovd tmpReg, tmpIntReg + // vpbroadcastd tmpReg, tmpReg (build mask for converting upper 16 bits of src) + // vorps targetReg, tmpReg + // vsubps targetReg, tmpReg (convert upper 16 bits of src and put it into targetReg) + // vcvtdq2ps tmpReg2, tmpReg2 (convert lower 16 bits of src and put it into tmpReg2) + // vaddps targetReg, tmpReg2 (add upper 16 bits and lower 16 bits) + inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(targetType)); + if (targetReg != op1Reg) + { + inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(targetType)); + } + + // prepare upper 16 bits + getEmitter()->emitIns_R_I(INS_psrld, 
emitActualTypeSize(targetType), targetReg, 16); + + // prepare lower 16 bits + getEmitter()->emitIns_R_I(INS_pslld, emitActualTypeSize(targetType), tmpReg2, 16); + getEmitter()->emitIns_R_I(INS_psrld, emitActualTypeSize(targetType), tmpReg2, 16); + +// prepare mask +#ifdef _TARGET_AMD64_ + getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X5300000053000000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); +#else + if (compiler->getSIMDInstructionSet() == InstructionSet_AVX) + { + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X53000000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); + } + else + { + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X00005300); + inst_RV_RV(INS_pxor, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); + getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 1); + getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), tmpReg, tmpIntReg, 3); + } +#endif + if (compiler->getSIMDInstructionSet() == InstructionSet_AVX) + { + inst_RV_RV(INS_vpbroadcastd, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); + } + else + { + inst_RV_RV(INS_movlhps, tmpReg, tmpReg, targetType, emitActualTypeSize(targetType)); + } + + // convert upper 16 bits + inst_RV_RV(INS_orps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); + inst_RV_RV(INS_subps, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); + + // convert lower 16 bits + inst_RV_RV(ins, tmpReg2, tmpReg2, targetType, emitActualTypeSize(targetType)); + + // add lower 16 bits and upper 16 bits + inst_RV_RV(INS_addps, targetReg, tmpReg2, targetType, emitActualTypeSize(targetType)); + } + else + { + inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType)); + } + genProduceReg(simdNode); +} + +//---------------------------------------------------------------------------------- +// genSIMDLo64BitConvert: Generate code to convert 
lower-most 64-bit item (long <--> double) +// +// Arguments: +// intrinsicID the SIMD intrinsic ID +// simdType the SIMD node type +// baseType the base type of value to be converted +// tmpReg the tmp reg +// tmpIntReg the tmp integer reg +// targetReg the target reg +// +// Return Value: +// None. +// +void CodeGen::genSIMDLo64BitConvert(SIMDIntrinsicID intrinsicID, + var_types simdType, + var_types baseType, + regNumber tmpReg, + regNumber tmpIntReg, + regNumber targetReg) +{ + instruction ins = getOpForSIMDIntrinsic(intrinsicID, baseType); + if (intrinsicID == SIMDIntrinsicConvertToDouble) + { + // Note that for mov_xmm2i, the int register is always in the reg2 position + inst_RV_RV(INS_mov_xmm2i, tmpReg, tmpIntReg, TYP_LONG); + inst_RV_RV(ins, targetReg, tmpIntReg, baseType, emitActualTypeSize(baseType)); + } + else + { + inst_RV_RV(ins, tmpIntReg, tmpReg, baseType, emitActualTypeSize(baseType)); + inst_RV_RV(INS_mov_i2xmm, targetReg, tmpIntReg, TYP_LONG); + } +} + +//---------------------------------------------------------------------------------- +// genSIMDIntrinsic64BitConvert: Generate code for 64-bit SIMD Convert (long/ulong <-> double) +// +// Arguments: +// simdNode - The GT_SIMD node +// +// Notes: +// There are no instructions for converting to/from 64-bit integers, so for these we +// do the conversion an element at a time. 
+// +void CodeGen::genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode) +{ + SIMDIntrinsicID intrinsicID = simdNode->gtSIMDIntrinsicID; + assert((intrinsicID == SIMDIntrinsicConvertToDouble) || (intrinsicID == SIMDIntrinsicConvertToInt64) || + (intrinsicID == SIMDIntrinsicConvertToUInt64)); + + GenTree* op1 = simdNode->gtGetOp1(); + var_types baseType = simdNode->gtSIMDBaseType; + regNumber targetReg = simdNode->gtRegNum; + assert(targetReg != REG_NA); + var_types simdType = simdNode->TypeGet(); + regNumber op1Reg = genConsumeReg(op1); + regNumber tmpIntReg = simdNode->GetSingleTempReg(RBM_ALLINT); + regNumber tmpReg; + regNumber tmpReg2; + regNumber tmpReg3; + InstructionSet iset = compiler->getSIMDInstructionSet(); + +#ifdef _TARGET_X86_ + if (baseType == TYP_LONG) + { + tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); + tmpReg2 = simdNode->ExtractTempReg(RBM_ALLFLOAT); + tmpReg3 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + assert(tmpReg != op1Reg && tmpReg2 != op1Reg && tmpReg3 != op1Reg); + } + else +#endif + if (iset == InstructionSet_AVX || (baseType == TYP_ULONG)) + { + tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); + tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + tmpReg3 = REG_NA; + assert(tmpReg != op1Reg && tmpReg2 != op1Reg); + } + else + { + tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + assert(tmpReg != op1Reg); + tmpReg2 = REG_NA; + tmpReg3 = REG_NA; + } + + if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_ULONG)) + { + // We will generate the following + // vmovdqu tmpReg2, op1Reg (copy the src and put it into tmpReg2) + // vmovdqu targetReg, op1Reg (copy the src and put it into targetReg) + // vpsrlq targetReg, 32 (get upper 32 bits of src and put it into targetReg) + // vpsllq tmpReg2, 32 + // vpsrlq tmpReg2, 32 (get lower 32 bits of src and put it into tmpReg2) + // mov tmpIntReg, 0x4530000000000000 + // vmovd tmpReg, tmpIntReg + // vpbroadcastq tmpReg, tmpReg (build mask for upper 32 bits of src) + // vorpd 
targetReg, tmpReg + // vsubpd targetReg, tmpReg (convert upper 32 bits of src and put it into targetReg) + // mov tmpIntReg, 0x4330000000000000 + // vmovd tmpReg, tmpIntReg + // vpbroadcastq tmpReg, tmpReg (build mask for lower 32 bits of src) + // vorpd tmpReg2, tmpReg + // vsubpd tmpReg2, tmpReg (convert lower 32 bits of src and put it into tmpReg2) + // vaddpd targetReg, tmpReg2 (add upper 32 bits and lower 32 bits together) + inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType)); + if (targetReg != op1Reg) + { + inst_RV_RV(INS_movdqu, targetReg, op1Reg, baseType, emitActualTypeSize(simdType)); + } + + // prepare upper 32 bits + getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32); + + // prepare lower 32 bits + getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32); + getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32); + +// prepare mask for converting upper 32 bits +#ifdef _TARGET_AMD64_ + getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4530000000000000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_ULONG); +#else + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); + getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); +#endif + if (iset == InstructionSet_AVX) + { + inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + else + { + inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + + // convert upper 32 bits + inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); + inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); + +// prepare mask for converting lower 32 bits +#ifdef _TARGET_AMD64_ + getEmitter()->emitIns_R_I(INS_mov, EA_8BYTE, tmpIntReg, (ssize_t)0X4330000000000000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, 
tmpIntReg, TYP_ULONG); +#else + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); + getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); +#endif + if (iset == InstructionSet_AVX) + { + inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + else + { + inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + + // convert lower 32 bits + inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); + inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); + + // add lower 32 bits and upper 32 bits + inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType)); + } + else if ((intrinsicID == SIMDIntrinsicConvertToDouble) && (baseType == TYP_LONG)) + { +#ifdef _TARGET_AMD64_ + instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); + instruction leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); + + if (iset == InstructionSet_AVX) + { + // Extract the high 16 bytes (upper 128-bit lane) + getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); + + // Put v[3] (the high-order element) in tmpReg2 and convert it. + inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); + getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); + genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2); + + // Shift the resulting 64-bits left. + getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); + + // Convert v[2], in the lo bits of tmpReg. + // For the convert to double, the convert preserves the upper bits in tmpReg2. + // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits.
+ genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg2); + } + + // Put v[1] in tmpReg. + inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType)); + getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8); + + // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it. + genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); + + // Shift the resulting 64-bits left. + getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8); + + // Convert the lo 64-bits into targetReg + genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, tmpReg); + + // Merge or copy the results (only at this point are we done with op1Reg). + if (tmpReg != targetReg) + { + inst_RV_RV(INS_movaps, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + + if (iset == InstructionSet_AVX) + { + getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg2, 0x01); + } +#else + // get the sign bit and put it in tmpReg3 + inst_RV_RV(INS_movdqu, tmpReg3, op1Reg, baseType, emitActualTypeSize(simdType)); + getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg3, 63); + getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg3, 63); + + // get the absolute value of src and put it into tmpReg2 and targetReg + inst_RV_RV(INS_movdqu, tmpReg2, op1Reg, baseType, emitActualTypeSize(simdType)); + getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(simdType), tmpReg, op1Reg, SHUFFLE_WWYY); + getEmitter()->emitIns_R_I(INS_psrad, emitActualTypeSize(simdType), tmpReg, 32); + inst_RV_RV(INS_pxor, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType)); + inst_RV_RV(INS_psubq, tmpReg2, tmpReg, baseType, emitActualTypeSize(simdType)); + inst_RV_RV(INS_movdqu, targetReg, tmpReg2, baseType, emitActualTypeSize(simdType)); + + // prepare upper 32 bits + 
getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), targetReg, 32); + + // prepare lower 32 bits + getEmitter()->emitIns_R_I(INS_psllq, emitActualTypeSize(simdType), tmpReg2, 32); + getEmitter()->emitIns_R_I(INS_psrlq, emitActualTypeSize(simdType), tmpReg2, 32); + + // prepare mask for converting upper 32 bits + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X45300000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); + getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); + + if (iset == InstructionSet_AVX) + { + inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + else + { + inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + + // convert upper 32 bits + inst_RV_RV(INS_orpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); + inst_RV_RV(INS_subpd, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); + + // prepare mask for converting lower 32 bits + getEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, tmpIntReg, (ssize_t)0X43300000); + inst_RV_RV(INS_mov_i2xmm, tmpReg, tmpIntReg, TYP_UINT); + getEmitter()->emitIns_R_I(INS_pslldq, EA_16BYTE, tmpReg, 4); + + if (iset == InstructionSet_AVX) + { + inst_RV_RV(INS_vpbroadcastq, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + else + { + inst_RV_RV(INS_movlhps, tmpReg, tmpReg, simdType, emitActualTypeSize(simdType)); + } + + // convert lower 32 bits + inst_RV_RV(INS_orpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); + inst_RV_RV(INS_subpd, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); + + // add lower 32 bits and upper 32 bits + inst_RV_RV(INS_addpd, targetReg, tmpReg2, simdType, emitActualTypeSize(simdType)); + + // add sign bit + inst_RV_RV(INS_por, targetReg, tmpReg3, simdType, emitActualTypeSize(simdType)); +#endif + } + else + { + instruction rightShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); + instruction 
leftShiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); + + if (iset == InstructionSet_AVX) + { + // Extract the high 16 bytes (upper 128-bit lane) + getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, op1Reg, 0x01); + + // Put v[3] (the high-order element) in tmpReg2 and convert it. + inst_RV_RV(ins_Copy(simdType), tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); + getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); + genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg2, tmpIntReg, tmpReg2); + + // Shift the resulting 64-bits left. + getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg2, 8); + + // Convert v[2], in the lo bits of tmpReg. + // For the convert to double, the convert preserves the upper bits in tmpReg2. + // For the integer convert, we have to put it in tmpReg and or it in, since movd clears the upper bits. + genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); + inst_RV_RV(INS_por, tmpReg2, tmpReg, simdType, emitActualTypeSize(simdType)); + } + + // Put v[1] in tmpReg. + inst_RV_RV(ins_Copy(simdType), tmpReg, op1Reg, simdType, emitActualTypeSize(simdType)); + getEmitter()->emitIns_R_I(rightShiftIns, emitActualTypeSize(simdType), tmpReg, 8); + + // At this point we have v[1] in the low-order 64-bits of tmpReg. Convert it. + genSIMDLo64BitConvert(intrinsicID, simdType, baseType, tmpReg, tmpIntReg, tmpReg); + + // Shift the resulting 64-bits left. + getEmitter()->emitIns_R_I(leftShiftIns, emitActualTypeSize(simdType), tmpReg, 8); + + // Convert the lo 64-bits into targetReg + genSIMDLo64BitConvert(intrinsicID, simdType, baseType, op1Reg, tmpIntReg, targetReg); + + // Merge or copy the results (only at this point are we done with op1Reg).
+ assert(tmpReg != targetReg); + inst_RV_RV(INS_por, targetReg, tmpReg, simdType, emitActualTypeSize(simdType)); + if (iset == InstructionSet_AVX) + { + getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, targetReg, tmpReg2, 0x01); + } + } + genProduceReg(simdNode); +} + +//-------------------------------------------------------------------------------- +// genSIMDExtractUpperHalf: Generate code to extract the upper half of a SIMD register +// +// Arguments: +// simdNode - The GT_SIMD node +// +// Notes: +// This is used for the WidenHi intrinsic to extract the upper half. +// On SSE*, this is 8 bytes, and on AVX2 it is 16 bytes. +// +void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg) +{ + var_types simdType = simdNode->TypeGet(); + emitAttr emitSize = emitActualTypeSize(simdType); + if (compiler->getSIMDInstructionSet() == InstructionSet_AVX) + { + instruction extractIns = varTypeIsFloating(simdNode->gtSIMDBaseType) ? INS_vextractf128 : INS_vextracti128; + getEmitter()->emitIns_R_R_I(extractIns, EA_32BYTE, tgtReg, srcReg, 0x01); + } + else + { + instruction shiftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); + if (tgtReg != srcReg) + { + inst_RV_RV(ins_Copy(simdType), tgtReg, srcReg, simdType, emitSize); + } + getEmitter()->emitIns_R_I(shiftIns, emitSize, tgtReg, 8); + } +} + +//-------------------------------------------------------------------------------- +// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations +// +// Arguments: +// simdNode - The GT_SIMD node +// +// Notes: +// The Widen intrinsics are broken into separate intrinsics for the two results. 
+// +void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) +{ + assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || + (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); + + GenTree* op1 = simdNode->gtGetOp1(); + var_types baseType = simdNode->gtSIMDBaseType; + regNumber targetReg = simdNode->gtRegNum; + assert(targetReg != REG_NA); + var_types simdType = simdNode->TypeGet(); + InstructionSet iset = compiler->getSIMDInstructionSet(); + + genConsumeOperands(simdNode); + regNumber op1Reg = op1->gtRegNum; + regNumber srcReg = op1Reg; + emitAttr emitSize = emitActualTypeSize(simdType); + instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); + + if (baseType == TYP_FLOAT) + { + if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) + { + genSIMDExtractUpperHalf(simdNode, srcReg, targetReg); + srcReg = targetReg; + } + inst_RV_RV(widenIns, targetReg, srcReg, simdType); + } + else + { + // We will generate the following on AVX: + // vpermq targetReg, op1Reg, 0xd4|0xe8 + // vpxor tmpReg, tmpReg + // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed) + // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg + regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + assert(tmpReg != op1Reg); + + if (iset == InstructionSet_AVX) + { + // permute op1Reg and put it into targetReg + unsigned ival = 0xd4; + if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) + { + ival = 0xe8; + } + getEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, ival); + } + else if (targetReg != op1Reg) + { + inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); + } + + genSIMDZero(simdType, baseType, tmpReg); + if (!varTypeIsUnsigned(baseType)) + { + instruction compareIns = getOpForSIMDIntrinsic(SIMDIntrinsicGreaterThan, baseType); + inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize); + } + inst_RV_RV(widenIns, targetReg, tmpReg, simdType); + } + genProduceReg(simdNode); +} + 
+//-------------------------------------------------------------------------------- +// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations +// +// Arguments: +// simdNode - The GT_SIMD node +// +// Notes: +// This intrinsic takes two arguments. The first operand is narrowed to produce the +// lower elements of the results, and the second operand produces the high elements. +// +void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) +{ + assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); + + GenTree* op1 = simdNode->gtGetOp1(); + GenTree* op2 = simdNode->gtGetOp2(); + var_types baseType = simdNode->gtSIMDBaseType; + regNumber targetReg = simdNode->gtRegNum; + assert(targetReg != REG_NA); + var_types simdType = simdNode->TypeGet(); + emitAttr emitSize = emitTypeSize(simdType); + InstructionSet iset = compiler->getSIMDInstructionSet(); + + genConsumeOperands(simdNode); + regNumber op1Reg = op1->gtRegNum; + regNumber op2Reg = op2->gtRegNum; + if (baseType == TYP_DOUBLE) + { + regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + + inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType); + inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType); + // Now insert the high-order result (in tmpReg) into the upper half of targetReg. + if (compiler->canUseAVX()) + { + getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01); + } + else + { + inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, SHUFFLE_YXYX); + } + } + else if (varTypeIsLong(baseType)) + { + if (iset == InstructionSet_AVX) + { + // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg. 
+ // We will generate the following: + // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg) + // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2) + // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg) + // mov tmpReg2, op1Reg + // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2) + // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg + // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg + // punpcklqdq tgtReg, tmpReg + regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); + regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); + getEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); + getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01); + inst_RV_RV(ins_Copy(simdType), tmpReg2, op1Reg, simdType, emitSize); + getEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); + getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, SHUFFLE_XXZX); + getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, SHUFFLE_XXZX); + inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize); + } + else + { + // We will generate the following: + // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements) + // psrldq targetReg, 8 (shift them right to get zeros in the high elements) + // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements) + // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements) + // por targetReg, tmpReg + regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); + instruction shiftRightIns = 
getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); + emitAttr emitSize = emitTypeSize(simdType); + + getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, SHUFFLE_ZXXX); + getEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8); + getEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, SHUFFLE_XXZX); + getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8); + inst_RV_RV(INS_por, targetReg, tmpReg, simdType); + } + } + else + { + // We will generate the following: + // mov targetReg, op1Reg + // mov tmpReg, op2Reg + // psll? targetReg, shiftCount + // psrl? targetReg, shiftCount + // psll? tmpReg, shiftCount + // psrl? tmpReg, shiftCount + // <pack> targetReg, tmpReg + // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType), + // and <pack> is the appropriate instruction to pack the result (note that we have to truncate to + // get CLR type semantics; otherwise it will saturate). + // + int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2); + instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); + instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); + instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); + + if (iset == InstructionSet_AVX) + { + regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); + regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + + // The AVX instructions generally operate on "lanes", so we have to permute the + // inputs so that the destination register has the low 128-bit halves of the two + // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs.
+ getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20); + getEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31); + getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount); + getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount); + getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); + getEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount); + inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType)); + } + else + { + regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); + + inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdType, emitSize); + inst_RV_RV(ins_Copy(simdType), tmpReg, op2Reg, simdType, emitSize); + + instruction tmpShiftRight = shiftRightIns; + if ((baseType == TYP_INT || baseType == TYP_UINT) && iset == InstructionSet_SSE2) + { + tmpShiftRight = INS_psrad; + } + + getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount); + getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount); + getEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); + getEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount); + inst_RV_RV(ins, targetReg, tmpReg, simdType); + } + } + genProduceReg(simdNode); +} + //-------------------------------------------------------------------------------- // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations // add, sub, mul, bit-wise And, AndNot and Or. 
@@ -1076,7 +1888,7 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) // Extract first and third double word results from tmpReg // tmpReg = shuffle(0,0,2,0) of tmpReg - getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, 0x08); + getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, SHUFFLE_XXZX); // targetReg[63:0] = op1[0] * op2[0] // targetReg[127:64] = op1[2] * op2[2] @@ -1085,7 +1897,7 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) // Extract first and third double word results from targetReg // targetReg = shuffle(0,0,2,0) of targetReg - getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, 0x08); + getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, SHUFFLE_XXZX); // pack the results into a single vector inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType)); @@ -1125,9 +1937,9 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length. 
unsigned shiftCount = 16 - simdNode->gtSIMDSize; assert(shiftCount != 0); - instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); + instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); - ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); + ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount); } @@ -1834,7 +2646,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode) if (byteShiftCnt != 0) { - instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); + instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt); } } @@ -1904,7 +2716,7 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode) inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType)); } - ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); + ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt); } else @@ -2390,6 +3202,27 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicUnOp(simdNode); break; + case SIMDIntrinsicConvertToSingle: + case SIMDIntrinsicConvertToInt32: + case SIMDIntrinsicConvertToUInt32: + genSIMDIntrinsic32BitConvert(simdNode); + break; + + case SIMDIntrinsicConvertToDouble: + case SIMDIntrinsicConvertToInt64: + case SIMDIntrinsicConvertToUInt64: + genSIMDIntrinsic64BitConvert(simdNode); + break; + + case SIMDIntrinsicWidenLo: + case SIMDIntrinsicWidenHi: + genSIMDIntrinsicWiden(simdNode); + break; + + case SIMDIntrinsicNarrow: + genSIMDIntrinsicNarrow(simdNode); + break; + case SIMDIntrinsicAdd: case SIMDIntrinsicSub: case 
SIMDIntrinsicMul: diff --git a/src/jit/simdintrinsiclist.h b/src/jit/simdintrinsiclist.h index 0160582..2eb4df3 100644 --- a/src/jit/simdintrinsiclist.h +++ b/src/jit/simdintrinsiclist.h @@ -119,6 +119,23 @@ SIMD_INTRINSIC("ConditionalSelect", false, Select, // Cast SIMD_INTRINSIC("op_Explicit", false, Cast, "Cast", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG}) +// Convert int/uint to single +SIMD_INTRINSIC("ConvertToSingle", false, ConvertToSingle, "ConvertToSingle", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Convert long/ulong to double +SIMD_INTRINSIC("ConvertToDouble", false, ConvertToDouble, "ConvertToDouble", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_LONG, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Convert single to int +SIMD_INTRINSIC("ConvertToInt32", false, ConvertToInt32, "ConvertToInt32", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Convert single to uint +SIMD_INTRINSIC("ConvertToUInt32", false, ConvertToUInt32, "ConvertToUInt32", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Convert double to long +SIMD_INTRINSIC("ConvertToInt64", false, ConvertToInt64, "ConvertToInt64", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Convert double to ulong +SIMD_INTRINSIC("ConvertToUInt64", false, ConvertToUInt64, "ConvertToUInt64", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_DOUBLE, TYP_UNDEF, 
TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Narrow two input Vectors to a single Vector. The return value's lower elements are the elements from src1, and the upper elements are from src2. +SIMD_INTRINSIC("Narrow", false, Narrow, "Narrow", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_DOUBLE, TYP_LONG, TYP_CHAR, TYP_SHORT, TYP_UINT, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Widen one input Vector to two Vectors: dest1 contains the lower half of elements in src, and dest2 contains the upper half of elements in src. +SIMD_INTRINSIC("Widen", false, Widen, "Widen", TYP_VOID, 3, {TYP_STRUCT, TYP_BYREF, TYP_BYREF}, {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) + // Miscellaneous SIMD_INTRINSIC("get_IsHardwareAccelerated", false, HWAccel, "HWAccel", TYP_BOOL, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) @@ -134,7 +151,11 @@ SIMD_INTRINSIC("ShiftRightInternal", false, ShiftRightInternal, SIMD_INTRINSIC("UpperSave", false, UpperSave, "UpperSave Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) SIMD_INTRINSIC("UpperRestore", false, UpperRestore, "UpperRestore Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -SIMD_INTRINSIC(nullptr, false, Invalid, "Invalid", TYP_UNDEF, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +// Internal intrinsics for Widen +SIMD_INTRINSIC("WidenHi", false, WidenHi, "WidenHi", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, 
TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) +SIMD_INTRINSIC("WidenLo", false, WidenLo, "WidenLo", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_CHAR, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) + +SIMD_INTRINSIC(nullptr, false, Invalid, "Invalid", TYP_UNDEF, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) #undef SIMD_INTRINSIC #else //_TARGET_XARCH_ diff --git a/tests/src/JIT/SIMD/VectorConvert.cs b/tests/src/JIT/SIMD/VectorConvert.cs index 6c65b22..c2e4eb1 100644 --- a/tests/src/JIT/SIMD/VectorConvert.cs +++ b/tests/src/JIT/SIMD/VectorConvert.cs @@ -559,6 +559,21 @@ partial class VectorTest returnVal = Fail; } } + + JitLog jitLog = new JitLog(); + if (!jitLog.Check("System.Numerics.Vector:ConvertToInt32(struct):struct")) returnVal = Fail; + if (!jitLog.Check("System.Numerics.Vector:ConvertToUInt32(struct):struct")) returnVal = Fail; + if (!jitLog.Check("System.Numerics.Vector:ConvertToSingle(struct):struct")) returnVal = Fail; + // Note: SIMD Conversion to Int64/UInt64 is not supported on x86 +#if !BIT32 + if (!jitLog.Check("System.Numerics.Vector:ConvertToInt64(struct):struct")) returnVal = Fail; + if (!jitLog.Check("System.Numerics.Vector:ConvertToUInt64(struct):struct")) returnVal = Fail; +#endif // !BIT32 + if (!jitLog.Check("System.Numerics.Vector:ConvertToDouble(struct):struct")) returnVal = Fail; + if (!jitLog.Check("System.Numerics.Vector:Narrow(struct,struct):struct")) returnVal = Fail; + if (!jitLog.Check("System.Numerics.Vector:Widen(struct,byref,byref)")) returnVal = Fail; + jitLog.Dispose(); + return returnVal; } } diff --git a/tests/src/JIT/SIMD/VectorConvert_r.csproj b/tests/src/JIT/SIMD/VectorConvert_r.csproj index 01231e2..db6fb24 100644 --- a/tests/src/JIT/SIMD/VectorConvert_r.csproj +++ b/tests/src/JIT/SIMD/VectorConvert_r.csproj @@ -14,6 +14,9 @@ ..\..\ 7a9bfb7d + + 
BIT32;$(DefineConstants) + diff --git a/tests/src/JIT/SIMD/VectorConvert_ro.csproj b/tests/src/JIT/SIMD/VectorConvert_ro.csproj index f751b88..82206ef 100644 --- a/tests/src/JIT/SIMD/VectorConvert_ro.csproj +++ b/tests/src/JIT/SIMD/VectorConvert_ro.csproj @@ -14,6 +14,9 @@ ..\..\ 7a9bfb7d + + BIT32;$(DefineConstants) +