1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Intel hardware intrinsic Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifdef FEATURE_HW_INTRINSICS
22 #include "sideeffects.h"
25 #include "gcinfoencoder.h"
27 //------------------------------------------------------------------------
28 // genIsTableDrivenHWIntrinsic:
31 // category - category of a HW intrinsic
34 // returns true if this category can be table-driven in CodeGen
36 static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
38 // TODO - make more categories to the table-driven framework
39 // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
40 const bool tableDrivenCategory =
41 category != HW_Category_Special && category != HW_Category_Scalar && category != HW_Category_Helper;
42 const bool tableDrivenFlag = (flags & (HW_Flag_MultiIns | HW_Flag_SpecialCodeGen)) == 0;
43 return tableDrivenCategory && tableDrivenFlag;
46 //------------------------------------------------------------------------
47 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
50 //    node - The hardware intrinsic node
//
// Notes:
//    Table-driven intrinsics are emitted straight from the intrinsic's
//    instruction-table entry; all others are dispatched to a hand-written
//    per-ISA generator (genSSEIntrinsic, genSSE2Intrinsic, ...).
//    NOTE(review): this extract is missing interleaved lines (braces, breaks,
//    else-clauses), so the control flow shown below is partial — confirm
//    against the full source before relying on branch structure.
52 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
54 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
55 InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID);
56 HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID);
57 HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID);
58 int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
59 int numArgs = Compiler::numArgsOfHWIntrinsic(node);
// Intrinsics flagged HW_Flag_NoCodeGen must never reach the code generator.
61 assert((flags & HW_Flag_NoCodeGen) == 0);
63 if (genIsTableDrivenHWIntrinsic(category, flags))
65 GenTree* op1 = node->gtGetOp1();
66 GenTree* op2 = node->gtGetOp2();
67 regNumber targetReg = node->gtRegNum;
68 var_types targetType = node->TypeGet();
69 var_types baseType = node->gtSIMDBaseType;
71 regNumber op1Reg = REG_NA;
72 regNumber op2Reg = REG_NA;
73 emitter* emit = getEmitter();
// A table-driven intrinsic must map to a valid instruction for its base type.
76 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
77 assert(ins != INS_invalid);
78 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
79 assert(simdSize != 0);
// One-operand shapes (presumably the numArgs == 1 arm — TODO confirm against full source).
85 genConsumeOperands(node);
86 op1Reg = op1->gtRegNum;
87 if (category == HW_Category_MemoryLoad)
89 emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
// SIMD scalar ops that preserve the upper elements pass the source in both positions.
91 else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
93 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
// Intrinsics with a fixed immediate (ival != -1) encode it directly.
95 else if ((ival != -1) && varTypeIsFloating(baseType))
97 emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
101 emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
// Two-operand shapes (presumably the numArgs == 2 arm — TODO confirm against full source).
108 genConsumeOperands(node);
110 op1Reg = op1->gtRegNum;
111 op2Reg = op2->gtRegNum;
113 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
115 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
117 // For non-commutative intrinsics, we should have ensured that op2 was marked
118 // delay free in order to prevent it from getting assigned the same register
119 // as target. However, for commutative intrinsics, we can just swap the operands
120 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
122 noway_assert(node->OperIsCommutative());
127 if (category == HW_Category_MemoryStore)
129 emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
131 else if ((ival != -1) && varTypeIsFloating(baseType))
133 genHWIntrinsic_R_R_RM_I(node, ins);
135 else if (category == HW_Category_MemoryLoad)
// AVX MaskLoad takes (address, mask): the mask register goes in the second operand slot.
137 if (intrinsicID == NI_AVX_MaskLoad)
139 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg);
143 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
146 else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
148 if (intrinsicID == NI_SSE2_Extract)
150 // extract instructions return to GP-registers, so it needs int size as the emitsize
151 simdSize = emitTypeSize(TYP_INT);
// emitSwCase emits the instruction for one concrete immediate value 'i'.
153 auto emitSwCase = [&](unsigned i) {
154 emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i);
// Immediate known at JIT time: emit the single instruction directly.
157 if (op2->IsCnsIntOrI())
159 ssize_t ival = op2->AsIntCon()->IconValue();
160 emitSwCase((unsigned)ival);
164 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
165 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
166 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
167 regNumber baseReg = node->ExtractTempReg();
168 regNumber offsReg = node->GetSingleTempReg();
169 genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase);
174 genHWIntrinsic_R_R_RM(node, ins);
// Three-operand shapes: operands arrive packed in a GT_LIST chain hanging off op1.
181 assert(op1->OperIsList());
182 assert(op1->gtGetOp2()->OperIsList());
183 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
185 GenTreeArgList* argList = op1->AsArgList();
186 op1 = argList->Current();
188 op1Reg = op1->gtRegNum;
190 argList = argList->Rest();
191 op2 = argList->Current();
193 op2Reg = op2->gtRegNum;
195 argList = argList->Rest();
196 GenTree* op3 = argList->Current();
198 regNumber op3Reg = op3->gtRegNum;
200 if (Compiler::isImmHWIntrinsic(intrinsicID, op3))
202 auto emitSwCase = [&](unsigned i) {
203 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i);
205 if (op3->IsCnsIntOrI())
207 ssize_t ival = op3->AsIntCon()->IconValue();
208 emitSwCase((unsigned)ival);
212 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
213 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
214 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
215 regNumber baseReg = node->ExtractTempReg();
216 regNumber offsReg = node->GetSingleTempReg();
217 genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
220 else if (category == HW_Category_MemoryStore)
222 assert(intrinsicID == NI_SSE2_MaskMove);
223 assert(targetReg == REG_NA);
225 // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
226 if (op3Reg != REG_EDI)
228 emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
230 emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
234 emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
// Non-table-driven intrinsics: dispatch to the hand-written per-ISA generator.
249 case InstructionSet_SSE:
250 genSSEIntrinsic(node);
252 case InstructionSet_SSE2:
253 genSSE2Intrinsic(node);
255 case InstructionSet_SSE41:
256 genSSE41Intrinsic(node);
258 case InstructionSet_SSE42:
259 genSSE42Intrinsic(node);
261 case InstructionSet_AVX:
262 case InstructionSet_AVX2:
263 genAvxOrAvx2Intrinsic(node);
265 case InstructionSet_AES:
266 genAESIntrinsic(node);
268 case InstructionSet_BMI1:
269 genBMI1Intrinsic(node);
271 case InstructionSet_BMI2:
272 genBMI2Intrinsic(node);
274 case InstructionSet_FMA:
275 genFMAIntrinsic(node);
277 case InstructionSet_LZCNT:
278 genLZCNTIntrinsic(node);
280 case InstructionSet_PCLMULQDQ:
281 genPCLMULQDQIntrinsic(node);
283 case InstructionSet_POPCNT:
284 genPOPCNTIntrinsic(node);
292 //------------------------------------------------------------------------
293 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
294 // register/memory operand, and that returns a value in register
//
// Arguments:
297 //    node - The hardware intrinsic node
298 //    ins - The instruction being generated
//
// Notes:
//    When op2 is contained (or used from a spill temp), the instruction is
//    emitted with a memory operand (address mode, class-var, or stack slot);
//    otherwise a three-register SIMD form is used.
//    NOTE(review): interleaved lines (braces/breaks) are elided in this
//    extract; some branch structure below is partial.
300 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
302 var_types targetType = node->TypeGet();
303 regNumber targetReg = node->gtRegNum;
304 GenTree* op1 = node->gtGetOp1();
305 GenTree* op2 = node->gtGetOp2();
306 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
307 emitter* emit = getEmitter();
309 // TODO-XArch-CQ: Commutative operations can have op1 be contained
310 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
312 regNumber op1Reg = op1->gtRegNum;
314 assert(targetReg != REG_NA);
315 assert(op1Reg != REG_NA);
317 if (op2->isContained() || op2->isUsedFromSpillTemp())
// Containment must have been legal for this intrinsic/operand pairing.
319 assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
320 assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
322 TempDsc* tmpDsc = nullptr;
323 unsigned varNum = BAD_VAR_NUM;
324 unsigned offset = (unsigned)-1;
326 if (op2->isUsedFromSpillTemp())
328 assert(op2->IsRegOptional());
// Read the operand from its spill-temp stack slot and release the temp.
330 tmpDsc = getSpillTempDsc(op2);
331 varNum = tmpDsc->tdTempNum();
334 compiler->tmpRlsTemp(tmpDsc);
// Contained HW intrinsic: its op1 register holds the address to load from.
336 else if (op2->OperIsHWIntrinsic())
338 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
341 else if (op2->isIndir())
343 GenTreeIndir* memIndir = op2->AsIndir();
344 GenTree* memBase = memIndir->gtOp1;
346 switch (memBase->OperGet())
// Indirection off a local's address: fold to a stack-slot operand below.
348 case GT_LCL_VAR_ADDR:
350 varNum = memBase->AsLclVarCommon()->GetLclNum();
353 // Ensure that all the GenTreeIndir values are set to their defaults.
354 assert(!memIndir->HasIndex());
355 assert(memIndir->Scale() == 1);
356 assert(memIndir->Offset() == 0);
// Static-field address: emit with a class-variable (data section) operand.
361 case GT_CLS_VAR_ADDR:
363 emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
// General address mode: emit with the full indir as the memory operand.
369 emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir);
376 switch (op2->OperGet())
380 GenTreeLclFld* lclField = op2->AsLclFld();
382 varNum = lclField->GetLclNum();
383 offset = lclField->gtLclFld.gtLclOffs;
// A contained lcl var must live on the stack (not be a register candidate).
389 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
390 varNum = op2->AsLclVar()->GetLclNum();
401 // Ensure we got a good varNum and offset.
402 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
403 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
404 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
405 assert(offset != (unsigned)-1);
407 emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset);
// op2 is in a register: use the three-register SIMD form.
411 regNumber op2Reg = op2->gtRegNum;
413 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
415 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
417 // For non-commutative intrinsics, we should have ensured that op2 was marked
418 // delay free in order to prevent it from getting assigned the same register
419 // as target. However, for commutative intrinsics, we can just swap the operands
420 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
422 noway_assert(node->OperIsCommutative());
427 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg);
431 //------------------------------------------------------------------------
432 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
433 // register/memory operand, an immediate operand, and that returns a value in register
//
// Arguments:
436 //    node - The hardware intrinsic node
437 //    ins - The instruction being generated
//
// Notes:
//    Immediate-taking twin of genHWIntrinsic_R_R_RM: identical containment
//    handling, but every emit carries the intrinsic's fixed 'ival' immediate.
//    NOTE(review): interleaved lines (braces/breaks) are elided in this
//    extract; some branch structure below is partial.
439 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
441 var_types targetType = node->TypeGet();
442 regNumber targetReg = node->gtRegNum;
443 GenTree* op1 = node->gtGetOp1();
444 GenTree* op2 = node->gtGetOp2();
445 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
// The immediate comes from the intrinsic's table entry, not from an operand.
446 int ival = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId);
447 emitter* emit = getEmitter();
449 // TODO-XArch-CQ: Commutative operations can have op1 be contained
450 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
452 regNumber op1Reg = op1->gtRegNum;
454 assert(targetReg != REG_NA);
455 assert(op1Reg != REG_NA);
457 if (op2->isContained() || op2->isUsedFromSpillTemp())
// Containment must have been legal for this intrinsic/operand pairing.
459 assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
460 assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
462 TempDsc* tmpDsc = nullptr;
463 unsigned varNum = BAD_VAR_NUM;
464 unsigned offset = (unsigned)-1;
466 if (op2->isUsedFromSpillTemp())
468 assert(op2->IsRegOptional());
// Read the operand from its spill-temp stack slot and release the temp.
470 tmpDsc = getSpillTempDsc(op2);
471 varNum = tmpDsc->tdTempNum();
474 compiler->tmpRlsTemp(tmpDsc);
// Contained HW intrinsic: its op1 register holds the address to load from.
476 else if (op2->OperIsHWIntrinsic())
478 emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
481 else if (op2->isIndir())
483 GenTreeIndir* memIndir = op2->AsIndir();
484 GenTree* memBase = memIndir->gtOp1;
486 switch (memBase->OperGet())
// Indirection off a local's address: fold to a stack-slot operand below.
488 case GT_LCL_VAR_ADDR:
490 varNum = memBase->AsLclVarCommon()->GetLclNum();
493 // Ensure that all the GenTreeIndir values are set to their defaults.
494 assert(!memIndir->HasIndex());
495 assert(memIndir->Scale() == 1);
496 assert(memIndir->Offset() == 0);
// Static-field address: emit with a class-variable (data section) operand.
501 case GT_CLS_VAR_ADDR:
503 emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
// General address mode: emit with the full indir as the memory operand.
510 emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
517 switch (op2->OperGet())
521 GenTreeLclFld* lclField = op2->AsLclFld();
523 varNum = lclField->GetLclNum();
524 offset = lclField->gtLclFld.gtLclOffs;
// A contained lcl var must live on the stack (not be a register candidate).
530 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
531 varNum = op2->AsLclVar()->GetLclNum();
542 // Ensure we got a good varNum and offset.
543 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
544 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
545 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
546 assert(offset != (unsigned)-1);
548 emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
// op2 is in a register: use the three-register-plus-immediate SIMD form.
552 regNumber op2Reg = op2->gtRegNum;
554 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
556 // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
558 // For non-commutative intrinsics, we should have ensured that op2 was marked
559 // delay free in order to prevent it from getting assigned the same register
560 // as target. However, for commutative intrinsics, we can just swap the operands
561 // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
563 noway_assert(node->OperIsCommutative());
568 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
572 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
573 // with non-constant argument
//
// Arguments:
576 // intrinsic - intrinsic ID
577 // nonConstImmReg - the register that contains the non-constant imm8 argument
578 // baseReg - a register for the start of the switch table
579 // offsReg - a register for the offset into the switch table
580 // emitSwCase - the lambda to generate a switch-case
//
// Notes:
583 // generate the jump-table fallback for imm-intrinsics with non-constant argument.
585 // This function can be used for all imm-intrinsics (whether full-range or not),
586 // The compiler front-end (i.e. importer) is responsible to insert a range-check IR
587 // (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
589 template <typename HWIntrinsicSwitchCaseBody>
590 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
591 regNumber nonConstImmReg,
594 HWIntrinsicSwitchCaseBody emitSwCase)
596 assert(nonConstImmReg != REG_NA);
597 emitter* emit = getEmitter();
// One table slot per possible immediate value [0, upper bound]; imm8 caps it at 256.
599 const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1;
600 assert(maxByte <= 256);
601 BasicBlock* jmpTable[256];
603 unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
604 unsigned jmpTableOffs = 0;
606 // Emit the jump table
// Each entry is a temp label whose case body is emitted further below.
607 for (unsigned i = 0; i < maxByte; i++)
609 jmpTable[i] = genCreateTempLabel();
610 emit->emitDataGenData(i, jmpTable[i]);
613 emit->emitDataGenEnd();
615 // Compute and jump to the appropriate offset in the switch table
// offsReg = table[imm] (4-byte entries), then rebase and do an indirect jump.
616 emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
618 emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
619 emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
620 emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
621 emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
623 // Emit the switch table entries
625 BasicBlock* switchTableBeg = genCreateTempLabel();
626 BasicBlock* switchTableEnd = genCreateTempLabel();
628 genDefineTempLabel(switchTableBeg);
// For each case: define its label, emit the case body, then jump past the table.
// NOTE(review): the emitSwCase(i) call inside this loop is elided in this extract — confirm.
630 for (unsigned i = 0; i < maxByte; i++)
632 genDefineTempLabel(jmpTable[i]);
634 emit->emitIns_J(INS_jmp, switchTableEnd);
637 genDefineTempLabel(switchTableEnd);
640 //------------------------------------------------------------------------
641 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
//
// Arguments:
644 //    node - The hardware intrinsic node
//
// Notes:
//    Hand-written codegen for SSE intrinsics that cannot be table-driven.
//    NOTE(review): interleaved lines (switch header, breaks, braces) are
//    elided in this extract; case boundaries below are partial.
646 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
648 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
649 GenTree* op1 = node->gtGetOp1();
650 GenTree* op2 = node->gtGetOp2();
651 GenTree* op3 = nullptr;
652 GenTree* op4 = nullptr;
653 regNumber targetReg = node->gtRegNum;
654 var_types targetType = node->TypeGet();
655 var_types baseType = node->gtSIMDBaseType;
657 regNumber op1Reg = REG_NA;
658 regNumber op2Reg = REG_NA;
659 regNumber op3Reg = REG_NA;
660 regNumber op4Reg = REG_NA;
661 emitter* emit = getEmitter();
663 if ((op1 != nullptr) && !op1->OperIsList())
665 op1Reg = op1->gtRegNum;
666 genConsumeOperands(node);
// Scalar equality: the x86 compare sets PF on unordered (NaN), so combine
// setpo (ordered) AND sete (equal) to get "equal and not NaN".
671 case NI_SSE_CompareEqualOrderedScalar:
672 case NI_SSE_CompareEqualUnorderedScalar:
674 assert(baseType == TYP_FLOAT);
675 op2Reg = op2->gtRegNum;
676 regNumber tmpReg = node->GetSingleTempReg();
677 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
679 // Ensure we aren't overwriting targetReg
680 assert(tmpReg != targetReg);
682 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
683 emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
684 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
685 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
686 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
690 case NI_SSE_CompareGreaterThanOrderedScalar:
691 case NI_SSE_CompareGreaterThanUnorderedScalar:
693 assert(baseType == TYP_FLOAT);
694 op2Reg = op2->gtRegNum;
696 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
697 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
698 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
699 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
703 case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
704 case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
706 assert(baseType == TYP_FLOAT);
707 op2Reg = op2->gtRegNum;
709 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
710 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
711 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
712 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// LessThan swaps the compare operands (a < b <=> b > a) so seta/setae still apply.
716 case NI_SSE_CompareLessThanOrderedScalar:
717 case NI_SSE_CompareLessThanUnorderedScalar:
719 assert(baseType == TYP_FLOAT);
720 op2Reg = op2->gtRegNum;
722 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
723 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
724 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
725 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
729 case NI_SSE_CompareLessThanOrEqualOrderedScalar:
730 case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
732 assert(baseType == TYP_FLOAT);
733 op2Reg = op2->gtRegNum;
735 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
736 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
737 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
738 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// NotEqual is the dual of Equal: setpe (unordered) OR setne (not equal),
// so NaN operands report "not equal".
742 case NI_SSE_CompareNotEqualOrderedScalar:
743 case NI_SSE_CompareNotEqualUnorderedScalar:
745 assert(baseType == TYP_FLOAT);
746 op2Reg = op2->gtRegNum;
747 regNumber tmpReg = node->GetSingleTempReg();
748 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
750 // Ensure we aren't overwriting targetReg
751 assert(tmpReg != targetReg);
753 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
754 emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
755 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
756 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
757 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
// Pure register moves: only emit when source and destination differ.
761 case NI_SSE_ConvertToSingle:
762 case NI_SSE_StaticCast:
764 assert(op2 == nullptr);
765 if (op1Reg != targetReg)
767 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
768 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
773 case NI_SSE_MoveMask:
775 assert(baseType == TYP_FLOAT);
776 assert(op2 == nullptr);
778 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
779 emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
783 case NI_SSE_Prefetch0:
784 case NI_SSE_Prefetch1:
785 case NI_SSE_Prefetch2:
786 case NI_SSE_PrefetchNonTemporal:
788 assert(baseType == TYP_UBYTE);
789 assert(op2 == nullptr);
// Prefetch takes only an address; op1Reg holds the pointer.
791 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
792 emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
796 case NI_SSE_SetScalarVector128:
798 assert(baseType == TYP_FLOAT);
799 assert(op2 == nullptr);
// If source and target alias, save the source first: the xorps below would
// clobber it before the movss can read it.
801 if (op1Reg == targetReg)
803 regNumber tmpReg = node->GetSingleTempReg();
805 // Ensure we aren't overwriting targetReg
806 assert(tmpReg != targetReg);
808 emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
// Zero the vector, then move the scalar into the low element.
812 emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
813 emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
// Zero vector: xorps reg,reg needs no input operands.
817 case NI_SSE_SetZeroVector128:
819 assert(baseType == TYP_FLOAT);
820 assert(op1 == nullptr);
821 assert(op2 == nullptr);
822 emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
826 case NI_SSE_StoreFence:
828 assert(baseType == TYP_VOID);
829 assert(op1 == nullptr);
830 assert(op2 == nullptr);
831 emit->emitIns(INS_sfence);
843 //------------------------------------------------------------------------
844 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
//
// Arguments:
847 //    node - The hardware intrinsic node
//
// Notes:
//    Hand-written codegen for SSE2 intrinsics that cannot be table-driven.
//    NOTE(review): interleaved lines (switch header, breaks, braces) are
//    elided in this extract; case boundaries below are partial.
849 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
851 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
852 GenTree* op1 = node->gtGetOp1();
853 GenTree* op2 = node->gtGetOp2();
854 regNumber targetReg = node->gtRegNum;
855 var_types targetType = node->TypeGet();
856 var_types baseType = node->gtSIMDBaseType;
857 regNumber op1Reg = REG_NA;
858 regNumber op2Reg = REG_NA;
859 emitter* emit = getEmitter();
862 if ((op1 != nullptr) && !op1->OperIsList())
864 op1Reg = op1->gtRegNum;
865 genConsumeOperands(node);
870 // All integer overloads are handled by table codegen
871 case NI_SSE2_CompareLessThan:
873 assert(op1 != nullptr);
874 assert(op2 != nullptr);
875 assert(baseType == TYP_DOUBLE);
// Double compare uses cmppd with the intrinsic's immediate comparison predicate.
876 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
877 op2Reg = op2->gtRegNum;
878 ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
880 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
// Scalar equality: the x86 compare sets PF on unordered (NaN), so combine
// setpo (ordered) AND sete (equal) to get "equal and not NaN".
885 case NI_SSE2_CompareEqualOrderedScalar:
886 case NI_SSE2_CompareEqualUnorderedScalar:
888 assert(baseType == TYP_DOUBLE);
889 op2Reg = op2->gtRegNum;
890 regNumber tmpReg = node->GetSingleTempReg();
891 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
893 // Ensure we aren't overwriting targetReg
894 assert(tmpReg != targetReg);
896 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
897 emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
898 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
899 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
900 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
904 case NI_SSE2_CompareGreaterThanOrderedScalar:
905 case NI_SSE2_CompareGreaterThanUnorderedScalar:
907 assert(baseType == TYP_DOUBLE);
908 op2Reg = op2->gtRegNum;
909 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
911 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
912 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
913 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
917 case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
918 case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
920 assert(baseType == TYP_DOUBLE);
921 op2Reg = op2->gtRegNum;
922 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
924 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
925 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
926 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// LessThan swaps the compare operands (a < b <=> b > a) so seta/setae still apply.
930 case NI_SSE2_CompareLessThanOrderedScalar:
931 case NI_SSE2_CompareLessThanUnorderedScalar:
933 assert(baseType == TYP_DOUBLE);
934 op2Reg = op2->gtRegNum;
935 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
937 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
938 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
939 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
943 case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
944 case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
946 assert(baseType == TYP_DOUBLE);
947 op2Reg = op2->gtRegNum;
948 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
950 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
951 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
952 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// NotEqual: setpe (unordered) OR setne (not equal), so NaN reports "not equal".
956 case NI_SSE2_CompareNotEqualOrderedScalar:
957 case NI_SSE2_CompareNotEqualUnorderedScalar:
959 assert(baseType == TYP_DOUBLE);
960 op2Reg = op2->gtRegNum;
961 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
962 regNumber tmpReg = node->GetSingleTempReg();
964 // Ensure we aren't overwriting targetReg
965 assert(tmpReg != targetReg);
967 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
968 emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
969 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
970 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
971 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
// Two-operand scalar converts go through the shared R_R_RM emitter so op2 may be contained.
975 case NI_SSE2_ConvertScalarToVector128Double:
976 case NI_SSE2_ConvertScalarToVector128Single:
978 assert(baseType == TYP_INT || baseType == TYP_LONG || baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
979 assert(op1 != nullptr);
980 assert(op2 != nullptr);
981 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
982 genHWIntrinsic_R_R_RM(node, ins);
986 case NI_SSE2_ConvertScalarToVector128Int64:
987 case NI_SSE2_ConvertScalarToVector128UInt64:
989 assert(baseType == TYP_LONG || baseType == TYP_ULONG);
990 assert(op1 != nullptr);
991 assert(op2 == nullptr);
992 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
993 emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
// Register-to-register move: only emit when source and destination differ.
997 case NI_SSE2_ConvertToDouble:
999 assert(op2 == nullptr);
1000 if (op1Reg != targetReg)
1002 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1003 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
1008 case NI_SSE2_ConvertToInt32:
1009 case NI_SSE2_ConvertToInt64:
1010 case NI_SSE2_ConvertToUInt32:
1011 case NI_SSE2_ConvertToUInt64:
1013 assert(op2 == nullptr);
1014 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT || baseType == TYP_INT || baseType == TYP_UINT ||
1015 baseType == TYP_LONG || baseType == TYP_ULONG);
1016 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
// Float/double sources size by the integer target; integer sources reverse
// the operand order (xmm -> GP move form).
1017 if (baseType == TYP_DOUBLE || baseType == TYP_FLOAT)
1019 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
1023 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1028 case NI_SSE2_LoadFence:
1030 assert(baseType == TYP_VOID);
1031 assert(op1 == nullptr);
1032 assert(op2 == nullptr);
1033 emit->emitIns(INS_lfence);
1037 case NI_SSE2_MemoryFence:
1039 assert(baseType == TYP_VOID);
1040 assert(op1 == nullptr);
1041 assert(op2 == nullptr);
1042 emit->emitIns(INS_mfence);
1046 case NI_SSE2_MoveMask:
1048 assert(op2 == nullptr);
1049 assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);
1051 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1052 emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
1056 case NI_SSE2_SetScalarVector128:
1058 assert(baseType == TYP_DOUBLE);
1059 assert(op2 == nullptr);
1061 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
// If source and target alias, save the source first: the xorpd below would
// clobber it before the scalar move can read it.
1062 if (op1Reg == targetReg)
1064 regNumber tmpReg = node->GetSingleTempReg();
1066 // Ensure we aren't overwriting targetReg
1067 assert(tmpReg != targetReg);
1069 emit->emitIns_R_R(INS_movapd, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
// Zero the vector, then move the scalar into the low element.
1073 emit->emitIns_SIMD_R_R_R(INS_xorpd, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1074 emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
// Zero vector: xor reg,reg needs no input operands.
1078 case NI_SSE2_SetZeroVector128:
1080 assert(baseType != TYP_FLOAT);
1081 assert(baseType >= TYP_BYTE && baseType <= TYP_DOUBLE);
1082 assert(op1 == nullptr);
1083 assert(op2 == nullptr);
1085 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1086 emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
// Non-temporal store: op1 is the address, op2 the value.
1090 case NI_SSE2_StoreNonTemporal:
1092 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1093 assert(op1 != nullptr);
1094 assert(op2 != nullptr);
1096 op2Reg = op2->gtRegNum;
1097 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1098 emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1107 genProduceReg(node);
1110 //------------------------------------------------------------------------
1111 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1114 // node - The hardware intrinsic node
1116 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1118 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1119 GenTree* op1 = node->gtGetOp1();
1120 GenTree* op2 = node->gtGetOp2();
1121 GenTree* op3 = nullptr;
1122 GenTree* op4 = nullptr;
1123 regNumber targetReg = node->gtRegNum;
1124 var_types targetType = node->TypeGet();
1125 var_types baseType = node->gtSIMDBaseType;
1127 regNumber op1Reg = REG_NA;
1128 regNumber op2Reg = REG_NA;
1129 regNumber op3Reg = REG_NA;
1130 regNumber op4Reg = REG_NA;
1131 emitter* emit = getEmitter();
// When op1 is a plain operand (not a GT_LIST of arguments), record its register
// and mark the operands as consumed so their registers may be reused below.
1133 if ((op1 != nullptr) && !op1->OperIsList())
1135 op1Reg = op1->gtRegNum;
1136 genConsumeOperands(node);
1139 switch (intrinsicID)
// TestAllOnes: build an all-ones vector in tmpReg (pcmpeqd reg,reg,reg always
// produces all ones), then PTEST op1 against it. Per the PTEST definition,
// CF = ((SRC2 AND NOT SRC1) == 0); with SRC2 all ones this means CF is set
// iff op1 is all ones, which setb then captures.
1141 case NI_SSE41_TestAllOnes:
1143 regNumber tmpReg = node->GetSingleTempReg();
1144 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1145 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
// xor pre-zeroes the full 32-bit target because setcc writes only the low byte.
1146 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1147 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1148 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
// TestAllZeros/TestZ: PTEST sets ZF = ((op1 AND op2) == 0); sete materializes ZF.
1152 case NI_SSE41_TestAllZeros:
1153 case NI_SSE41_TestZ:
1155 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1156 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1157 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1158 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
// TestC: PTEST sets CF = ((op2 AND NOT op1) == 0); setb materializes CF.
1162 case NI_SSE41_TestC:
1164 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1165 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1166 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1167 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
// TestMixOnesZeros/TestNotZAndNotC: seta is (CF == 0 && ZF == 0), i.e. neither
// (op1 AND op2) nor (op2 AND NOT op1) is all zero.
1171 case NI_SSE41_TestMixOnesZeros:
1172 case NI_SSE41_TestNotZAndNotC:
1174 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1175 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1176 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1177 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
// Extract: op2 is the element-index immediate. emitSwCase emits the extract for a
// specific immediate value; it is invoked directly for a constant index, or once
// per possible value through the jump-table fallback for a non-constant index.
1181 case NI_SSE41_Extract:
1183 regNumber tmpTargetReg = REG_NA;
1184 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1185 if (baseType == TYP_FLOAT)
1187 tmpTargetReg = node->ExtractTempReg();
1189 auto emitSwCase = [&](unsigned i) {
1190 if (baseType == TYP_FLOAT)
1192 // extract instructions return to GP-registers, so it needs int size as the emitsize
1193 emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, (int)i);
// The managed signature returns a float, so move the raw bits from the GP
// temp back into the XMM target register.
1194 emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1198 emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, (int)i);
1202 if (op2->IsCnsIntOrI())
1204 ssize_t ival = op2->AsIntCon()->IconValue();
1205 emitSwCase((unsigned)ival);
1209 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1210 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1211 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1212 regNumber baseReg = node->ExtractTempReg();
1213 regNumber offsReg = node->GetSingleTempReg();
1214 genHWIntrinsicJumpTableFallback(intrinsicID, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1224 genProduceReg(node);
1227 //------------------------------------------------------------------------
1228 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1231 // node - The hardware intrinsic node
1233 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1235 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1236 GenTree* op1 = node->gtGetOp1();
1237 GenTree* op2 = node->gtGetOp2();
1238 regNumber targetReg = node->gtRegNum;
1239 assert(targetReg != REG_NA);
1240 var_types targetType = node->TypeGet();
1241 var_types baseType = node->gtSIMDBaseType;
1243 regNumber op1Reg = op1->gtRegNum;
1244 regNumber op2Reg = op2->gtRegNum;
1245 genConsumeOperands(node);
1247 switch (intrinsicID)
1249 case NI_SSE42_Crc32:
// crc32 is read-modify-write on its destination: the running CRC (op1) must
// already be in targetReg, so copy it there if LSRA placed it elsewhere. The
// assert guards against clobbering the data operand by that move.
1250 if (op1Reg != targetReg)
1252 assert(op2Reg != targetReg);
1253 inst_RV_RV(INS_mov, targetReg, op1Reg, targetType, emitTypeSize(targetType));
// baseType selects the operand size of the crc32 encoding; the 8- and 16-bit
// source forms still accumulate into a 32-bit result.
1256 if (baseType == TYP_UBYTE || baseType == TYP_USHORT) // baseType is the type of the second argument
1258 assert(targetType == TYP_INT);
1259 inst_RV_RV(INS_crc32, targetReg, op2Reg, baseType, emitTypeSize(baseType));
// 32/64-bit source: the operand size follows the node's own type (INT or LONG).
1263 assert(op1->TypeGet() == op2->TypeGet());
1264 assert(targetType == TYP_INT || targetType == TYP_LONG);
1265 inst_RV_RV(INS_crc32, targetReg, op2Reg, targetType, emitTypeSize(targetType));
1273 genProduceReg(node);
1276 //------------------------------------------------------------------------
1277 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1280 // node - The hardware intrinsic node
1282 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1284 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1285 var_types baseType = node->gtSIMDBaseType;
1286 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1287 var_types targetType = node->TypeGet();
1288 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1289 int numArgs = Compiler::numArgsOfHWIntrinsic(node);
1290 GenTree* op1 = node->gtGetOp1();
1291 GenTree* op2 = node->gtGetOp2();
1292 regNumber op1Reg = REG_NA;
1293 regNumber op2Reg = REG_NA;
1294 regNumber targetReg = node->gtRegNum;
1295 emitter* emit = getEmitter();
// Argument lists (3+ operand intrinsics) are consumed per-operand below;
// everything else is consumed up front.
1297 if ((op1 != nullptr) && !op1->OperIsList())
1299 genConsumeOperands(node);
1302 switch (intrinsicID)
1304 case NI_AVX_SetZeroVector256:
1306 assert(op1 == nullptr);
1307 assert(op2 == nullptr);
1308 // SetZeroVector256 will generate pxor with integral base-typ, but pxor is a AVX2 instruction, so we
1309 // generate xorps on AVX machines.
1310 if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType))
1312 emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
1316 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
// SetAllVector256: broadcast a scalar into every element of a 256-bit vector.
// With AVX2 a single vpbroadcast* suffices; otherwise the scalar is first
// duplicated across one XMM register (shuffle sequence varies by element
// size), then mirrored into the upper lane with vinsertf128.
1321 case NI_AVX_SetAllVector256:
1323 assert(op1 != nullptr);
1324 assert(op2 == nullptr);
1325 op1Reg = op1->gtRegNum;
1326 if (varTypeIsIntegral(baseType))
1328 // If the argument is a integer, it needs to be moved into a XMM register
1329 regNumber tmpXMM = node->ExtractTempReg();
1330 emit->emitIns_R_R(INS_mov_i2xmm, emitActualTypeSize(baseType), tmpXMM, op1Reg);
1334 if (compiler->compSupports(InstructionSet_AVX2))
1336 // generate broadcast instructions if AVX2 is available
1337 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
1341 // duplicate the scalar argument to XMM register
1345 emit->emitIns_SIMD_R_R_I(INS_vpermilps, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1348 emit->emitIns_R_R(INS_movddup, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg);
// Byte elements: pshufb with an all-zero control replicates byte 0 everywhere.
1353 regNumber tmpZeroReg = node->GetSingleTempReg();
1354 emit->emitIns_R_R(INS_pxor, emitTypeSize(TYP_SIMD16), tmpZeroReg, tmpZeroReg);
1355 emit->emitIns_SIMD_R_R_R(INS_pshufb, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, tmpZeroReg);
// 16-bit elements: replicate word 0 across the low half, then 0b01010000
// (80) replicates the low dwords across the register.
1360 emit->emitIns_SIMD_R_R_I(INS_pshuflw, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1361 emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 80);
1365 emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0)
1369 emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 68);
1376 // duplicate the XMM register to YMM register
1377 emit->emitIns_SIMD_R_R_R_I(INS_vinsertf128, emitTypeSize(TYP_SIMD32), targetReg, op1Reg, op1Reg, 1);
1382 case NI_AVX_ExtendToVector256:
1384 // ExtendToVector256 has zero-extend semantics in order to ensure it is deterministic
1385 // We always emit a move to the target register, even when op1Reg == targetReg, in order
1386 // to ensure that Bits MAXVL-1:128 are zeroed.
1388 assert(op2 == nullptr);
1389 regNumber op1Reg = op1->gtRegNum;
1390 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
// GetLowerHalf/StaticCast are register-to-register moves; elide the move
// entirely when source and destination already coincide.
1394 case NI_AVX_GetLowerHalf:
1395 case NI_AVX_StaticCast:
1397 assert(op2 == nullptr);
1398 regNumber op1Reg = op1->gtRegNum;
1400 if (op1Reg != targetReg)
1402 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
// TestC: vptest sets CF = ((op2 AND NOT op1) == 0); setb materializes CF.
// (xor pre-zeroes the target because setcc writes only the low byte.)
1409 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1410 emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1411 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
// TestNotZAndNotC: seta is (CF == 0 && ZF == 0).
1415 case NI_AVX_TestNotZAndNotC:
1417 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1418 emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1419 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
// TestZ: vptest sets ZF = ((op1 AND op2) == 0); sete materializes ZF.
1425 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1426 emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1427 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1431 case NI_AVX_ExtractVector128:
1432 case NI_AVX_InsertVector128:
1433 case NI_AVX2_ExtractVector128:
1434 case NI_AVX2_InsertVector128:
1436 GenTree* lastOp = nullptr;
// Two-operand form: only ExtractVector128 (vector, index) arrives this way;
// InsertVector128 and the store form of Extract always carry three args.
// FIX: the second disjunct was the bare constant `NI_AVX_ExtractVector128`
// (always non-zero, so the assert could never fire); it must compare
// intrinsicID against NI_AVX2_ExtractVector128, matching the parallel
// assert in the numArgs == 2 branch of emitSwCase below.
1439 assert(intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128);
1440 op1Reg = op1->gtRegNum;
1441 op2Reg = op2->gtRegNum;
// Three-operand form: operands arrive as a GT_LIST chain; walk it, consuming
// each operand's register in turn. lastOp is the imm8 lane selector.
1446 assert(numArgs == 3);
1447 assert(op1->OperIsList());
1448 assert(op1->gtGetOp2()->OperIsList());
1449 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
1451 GenTreeArgList* argList = op1->AsArgList();
1452 op1 = argList->Current();
1453 genConsumeRegs(op1);
1454 op1Reg = op1->gtRegNum;
1456 argList = argList->Rest();
1457 op2 = argList->Current();
1458 genConsumeRegs(op2);
1459 op2Reg = op2->gtRegNum;
1461 argList = argList->Rest();
1462 lastOp = argList->Current();
1463 genConsumeRegs(lastOp);
1466 regNumber op3Reg = lastOp->gtRegNum;
1468 auto emitSwCase = [&](unsigned i) {
1469 // TODO-XARCH-Bug the emitter cannot work with imm8 >= 128,
1470 // so clear the 8th bit that is not used by the instructions
1474 if (intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128)
1476 emit->emitIns_AR_R_I(ins, attr, op1Reg, 0, op2Reg, (int)i);
1478 else if (op2->TypeGet() == TYP_I_IMPL)
1480 emit->emitIns_SIMD_R_R_AR_I(ins, attr, targetReg, op1Reg, op2Reg, (int)i);
1484 assert(op2->TypeGet() == TYP_SIMD16);
1485 emit->emitIns_SIMD_R_R_R_I(ins, attr, targetReg, op1Reg, op2Reg, (int)i);
1490 assert(numArgs == 2);
1491 assert(intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128);
1492 emit->emitIns_SIMD_R_R_I(ins, attr, targetReg, op1Reg, (int)i);
1496 if (lastOp->IsCnsIntOrI())
1498 ssize_t ival = lastOp->AsIntCon()->IconValue();
1499 emitSwCase((unsigned)ival);
1503 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1504 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1505 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1506 regNumber baseReg = node->ExtractTempReg();
1507 regNumber offsReg = node->GetSingleTempReg();
1508 genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
1518 genProduceReg(node);
1521 //------------------------------------------------------------------------
1522 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
1525 // node - The hardware intrinsic node
1527 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
// AES intrinsics have no codegen yet; reaching here raises an NYI in debug builds.
1529 NYI("Implement AES intrinsic code generation");
1532 //------------------------------------------------------------------------
1533 // genBMI1Intrinsic: Generates the code for a BMI1 hardware intrinsic node
1536 // node - The hardware intrinsic node
1538 void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
// BMI1 intrinsics have no codegen yet; reaching here raises an NYI in debug builds.
1540 NYI("Implement BMI1 intrinsic code generation");
1543 //------------------------------------------------------------------------
1544 // genBMI2Intrinsic: Generates the code for a BMI2 hardware intrinsic node
1547 // node - The hardware intrinsic node
1549 void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
// BMI2 intrinsics have no codegen yet; reaching here raises an NYI in debug builds.
1551 NYI("Implement BMI2 intrinsic code generation");
1554 //------------------------------------------------------------------------
1555 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
1558 // node - The hardware intrinsic node
1560 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
// FMA intrinsics have no codegen yet; reaching here raises an NYI in debug builds.
1562 NYI("Implement FMA intrinsic code generation");
1565 //------------------------------------------------------------------------
1566 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
1569 // node - The hardware intrinsic node
1571 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
1573 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1574 GenTree* op1 = node->gtGetOp1();
1575 regNumber targetReg = node->gtRegNum;
1576 assert(targetReg != REG_NA);
1577 var_types targetType = node->TypeGet();
1578 regNumber op1Reg = op1->gtRegNum;
1579 genConsumeOperands(node);
1581 assert(intrinsicID == NI_LZCNT_LeadingZeroCount);
1583 inst_RV_RV(INS_lzcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1585 genProduceReg(node);
1588 //------------------------------------------------------------------------
1589 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
1592 // node - The hardware intrinsic node
1594 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
// PCLMULQDQ intrinsics have no codegen yet; reaching here raises an NYI in debug builds.
1596 NYI("Implement PCLMULQDQ intrinsic code generation");
1599 //------------------------------------------------------------------------
1600 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
1603 // node - The hardware intrinsic node
1605 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
1607 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1608 GenTree* op1 = node->gtGetOp1();
1609 regNumber targetReg = node->gtRegNum;
1610 assert(targetReg != REG_NA);
1611 var_types targetType = node->TypeGet();
1612 regNumber op1Reg = op1->gtRegNum;
1613 genConsumeOperands(node);
1615 assert(intrinsicID == NI_POPCNT_PopCount);
1617 inst_RV_RV(INS_popcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1619 genProduceReg(node);
1622 #endif // FEATURE_HW_INTRINSICS