1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Intel hardware intrinsic Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifdef FEATURE_HW_INTRINSICS
22 #include "sideeffects.h"
25 #include "gcinfoencoder.h"
27 //------------------------------------------------------------------------
28 // genIsTableDrivenHWIntrinsic: Decides whether an intrinsic can be emitted by the generic,
// table-driven path of genHWIntrinsic.
//
// Arguments:
31 //    category - category of a HW intrinsic
//    flags    - flags of the HW intrinsic; only HW_Flag_MultiIns is inspected here
//
// Return Value:
34 //    returns true if this category can be table-driven in CodeGen, i.e. the category is none of
//    HW_Category_Special / HW_Category_Scalar / HW_Category_Helper and HW_Flag_MultiIns is clear
//
36 static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
38 // TODO - move more categories to the table-driven framework
39 // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen
40 const bool tableDrivenCategory =
41 category != HW_Category_Special && category != HW_Category_Scalar && category != HW_Category_Helper;
// An intrinsic flagged as expanding to multiple instructions is never table-driven.
42 const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0;
43 return tableDrivenCategory && tableDrivenFlag;
46 //------------------------------------------------------------------------
47 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
//
// Arguments:
50 //    node - The hardware intrinsic node
//
// Notes:
//    Intrinsics accepted by genIsTableDrivenHWIntrinsic are emitted here directly, using the
//    instruction and immediate looked up from the intrinsic tables. The InstructionSet_* cases at
//    the end forward the node to the code generator of the corresponding instruction set.
//
52 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
// Static properties of the intrinsic, looked up by its ID.
54 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
55 InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID);
56 HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID);
57 HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID);
58 int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
59 int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID);
// Intrinsics flagged HW_Flag_NoCodeGen must never reach code generation.
61 assert((flags & HW_Flag_NoCodeGen) == 0);
63 if (genIsTableDrivenHWIntrinsic(category, flags))
65 GenTree* op1 = node->gtGetOp1();
66 GenTree* op2 = node->gtGetOp2();
67 regNumber targetReg = node->gtRegNum;
68 var_types targetType = node->TypeGet();
69 var_types baseType = node->gtSIMDBaseType;
71 regNumber op1Reg = REG_NA;
72 regNumber op2Reg = REG_NA;
73 emitter* emit = getEmitter();
// The instruction is selected from the intrinsic ID and the SIMD base type.
76 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
77 assert(ins != INS_invalid);
78 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
79 assert(simdSize != 0);
// --- Unary shape: only op1 is read (presumably selected by numArgs == 1 -- confirm). ---
84 genConsumeOperands(node);
85 op1Reg = op1->gtRegNum;
86 if (category == HW_Category_MemoryLoad)
// op1Reg is passed as the address register, with displacement 0.
88 emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
90 else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
// op1Reg is passed as both source operands.
92 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
94 else if ((ival != -1) && varTypeIsFloating(baseType))
// Floating-point intrinsic that has a table-provided immediate (ival != -1).
96 emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
100 emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
// --- Binary shape: op1 and op2 are read (presumably numArgs == 2 -- confirm). ---
105 genConsumeOperands(node);
106 op1Reg = op1->gtRegNum;
107 op2Reg = op2->gtRegNum;
108 if (category == HW_Category_MemoryStore)
// op2Reg is passed as the register operand and op1Reg as the address register, displacement 0.
110 emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
112 else if ((ival != -1) && varTypeIsFloating(baseType))
114 genHWIntrinsic_R_R_RM_I(node, ins);
116 else if (category == HW_Category_MemoryLoad)
118 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
120 else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
// op2 is the immediate operand. emitSwCase emits the instruction for one concrete immediate i.
122 auto emitSwCase = [&](unsigned i) {
123 emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i);
126 if (op2->IsCnsIntOrI())
// Constant immediate: emit the single instruction directly.
// NOTE(review): this local `ival` shadows the function-level `int ival` declared above.
128 ssize_t ival = op2->AsIntCon()->IconValue();
129 emitSwCase((unsigned)ival);
133 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
134 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
135 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
136 regNumber baseReg = node->ExtractTempReg();
137 regNumber offsReg = node->GetSingleTempReg();
138 genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase);
// Remaining binary intrinsics: op2 may be a register or a memory operand.
143 genHWIntrinsic_R_R_RM(node, ins);
// --- Ternary shape: the operands arrive as a three-element argument list hanging off op1. ---
148 assert(op1->OperIsList());
149 assert(op1->gtGetOp2()->OperIsList());
150 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
// Walk the list, recording the register of each of the three operands.
152 GenTreeArgList* argList = op1->AsArgList();
153 op1 = argList->Current();
155 op1Reg = op1->gtRegNum;
157 argList = argList->Rest();
158 op2 = argList->Current();
160 op2Reg = op2->gtRegNum;
162 argList = argList->Rest();
163 GenTree* op3 = argList->Current();
165 regNumber op3Reg = op3->gtRegNum;
167 if (Compiler::isImmHWIntrinsic(intrinsicID, op3))
// op3 is the immediate operand. emitSwCase emits the instruction for one concrete immediate i.
169 auto emitSwCase = [&](unsigned i) {
170 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i);
172 if (op3->IsCnsIntOrI())
// NOTE(review): this local `ival` shadows the function-level `int ival` declared above.
174 ssize_t ival = op3->AsIntCon()->IconValue();
175 emitSwCase((unsigned)ival);
179 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
180 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
181 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
182 regNumber baseReg = node->ExtractTempReg();
183 regNumber offsReg = node->GetSingleTempReg();
184 genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
// op3 is a regular register operand.
189 emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
// --- Per-instruction-set code generators (presumably keyed on `isa` -- confirm). ---
204 case InstructionSet_SSE:
205 genSSEIntrinsic(node);
207 case InstructionSet_SSE2:
208 genSSE2Intrinsic(node);
210 case InstructionSet_SSE41:
211 genSSE41Intrinsic(node);
213 case InstructionSet_SSE42:
214 genSSE42Intrinsic(node);
216 case InstructionSet_AVX:
217 genAVXIntrinsic(node);
219 case InstructionSet_AVX2:
220 genAVX2Intrinsic(node);
222 case InstructionSet_AES:
223 genAESIntrinsic(node);
225 case InstructionSet_BMI1:
226 genBMI1Intrinsic(node);
228 case InstructionSet_BMI2:
229 genBMI2Intrinsic(node);
231 case InstructionSet_FMA:
232 genFMAIntrinsic(node);
234 case InstructionSet_LZCNT:
235 genLZCNTIntrinsic(node);
237 case InstructionSet_PCLMULQDQ:
238 genPCLMULQDQIntrinsic(node);
240 case InstructionSet_POPCNT:
241 genPOPCNTIntrinsic(node);
249 //------------------------------------------------------------------------
250 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
251 // register/memory operand, and that returns a value in register
//
// Arguments:
254 //    node - The hardware intrinsic node
255 //    ins - The instruction being generated
//
// Notes:
//    op1 is always read from its register. op2 is read from its register unless it is contained
//    or used from a spill temp; in that case the emitter form is chosen from how op2's memory
//    location is described (spill temp, contained HW intrinsic, indirection, local field/variable).
//
257 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
// NOTE(review): targetType appears to be unused in this function -- confirm and remove.
259 var_types targetType = node->TypeGet();
260 regNumber targetReg = node->gtRegNum;
261 GenTree* op1 = node->gtGetOp1();
262 GenTree* op2 = node->gtGetOp2();
263 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
264 emitter* emit = getEmitter();
266 // TODO-XArch-CQ: Commutative operations can have op1 be contained
267 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
269 regNumber op1Reg = op1->gtRegNum;
271 assert(targetReg != REG_NA);
272 assert(op1Reg != REG_NA);
// op2 is a memory operand.
274 if (op2->isContained() || op2->isUsedFromSpillTemp())
// Containment must be permitted for this intrinsic, and op2 must be containable or reg-optional.
276 assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
277 assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
// Sentinel values; they are checked by the asserts ahead of the stack-based emit below.
// NOTE(review): offset is assigned only for the local-field case in this body; the spill-temp,
// GT_LCL_VAR_ADDR and local-variable cases set only varNum -- confirm offset is set there too.
279 TempDsc* tmpDsc = nullptr;
280 unsigned varNum = BAD_VAR_NUM;
281 unsigned offset = (unsigned)-1;
283 if (op2->isUsedFromSpillTemp())
// Spilled op2: address it through its spill temp number, then release the temp.
285 assert(op2->IsRegOptional());
287 tmpDsc = getSpillTempDsc(op2);
288 varNum = tmpDsc->tdTempNum();
291 compiler->tmpRlsTemp(tmpDsc);
293 else if (op2->OperIsHWIntrinsic())
// op2 is itself a HW intrinsic: the register of its op1 is passed as the address register.
295 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
298 else if (op2->isIndir())
// op2 is an indirection: dispatch on the kind of its address operand.
300 GenTreeIndir* memIndir = op2->AsIndir();
301 GenTree* memBase = memIndir->gtOp1;
303 switch (memBase->OperGet())
305 case GT_LCL_VAR_ADDR:
// Address of a local variable: record its local number.
307 varNum = memBase->AsLclVarCommon()->GetLclNum();
310 // Ensure that all the GenTreeIndir values are set to their defaults.
311 assert(!memIndir->HasIndex());
312 assert(memIndir->Scale() == 1);
313 assert(memIndir->Offset() == 0);
318 case GT_CLS_VAR_ADDR:
// Address of a class variable: emit using its handle, with offset 0.
320 emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
// Other address forms: hand the whole indirection to the emitter.
326 emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir);
// op2 is neither a spill temp, a HW intrinsic nor an indirection: local field or local variable.
333 switch (op2->OperGet())
337 GenTreeLclFld* lclField = op2->AsLclFld();
339 varNum = lclField->GetLclNum();
340 offset = lclField->gtLclFld.gtLclOffs;
// A local variable used from memory must be reg-optional or not a register candidate.
346 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
347 varNum = op2->AsLclVar()->GetLclNum();
358 // Ensure we got a good varNum and offset.
359 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
360 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
361 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
362 assert(offset != (unsigned)-1);
// Stack-based form, using the varNum/offset determined above.
364 emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset);
// op2 lives in a register.
368 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum);
372 //------------------------------------------------------------------------
373 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
374 // register/memory operand, an immediate operand, and that returns a value in register
//
// Arguments:
377 //    node - The hardware intrinsic node
378 //    ins - The instruction being generated
//
// Notes:
//    The immediate is not an operand of the node: it is the table value returned by
//    Compiler::ivalOfHWIntrinsic for the node's intrinsic ID. The handling of op2 follows the same
//    cases as genHWIntrinsic_R_R_RM, using the "_I" emitter forms.
//
380 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
// NOTE(review): targetType appears to be unused in this function -- confirm and remove.
382 var_types targetType = node->TypeGet();
383 regNumber targetReg = node->gtRegNum;
384 GenTree* op1 = node->gtGetOp1();
385 GenTree* op2 = node->gtGetOp2();
386 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
387 int ival = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId);
388 emitter* emit = getEmitter();
390 // TODO-XArch-CQ: Commutative operations can have op1 be contained
391 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
393 regNumber op1Reg = op1->gtRegNum;
395 assert(targetReg != REG_NA);
396 assert(op1Reg != REG_NA);
// op2 is a memory operand.
398 if (op2->isContained() || op2->isUsedFromSpillTemp())
// Containment must be permitted for this intrinsic, and op2 must be containable or reg-optional.
400 assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
401 assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
// Sentinel values; they are checked by the asserts ahead of the stack-based emit below.
// NOTE(review): offset is assigned only for the local-field case in this body; the spill-temp,
// GT_LCL_VAR_ADDR and local-variable cases set only varNum -- confirm offset is set there too.
403 TempDsc* tmpDsc = nullptr;
404 unsigned varNum = BAD_VAR_NUM;
405 unsigned offset = (unsigned)-1;
407 if (op2->isUsedFromSpillTemp())
// Spilled op2: address it through its spill temp number, then release the temp.
409 assert(op2->IsRegOptional());
411 tmpDsc = getSpillTempDsc(op2);
412 varNum = tmpDsc->tdTempNum();
415 compiler->tmpRlsTemp(tmpDsc);
417 else if (op2->OperIsHWIntrinsic())
// op2 is itself a HW intrinsic: the register of its op1 is passed as the address register.
419 emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
422 else if (op2->isIndir())
// op2 is an indirection: dispatch on the kind of its address operand.
424 GenTreeIndir* memIndir = op2->AsIndir();
425 GenTree* memBase = memIndir->gtOp1;
427 switch (memBase->OperGet())
429 case GT_LCL_VAR_ADDR:
// Address of a local variable: record its local number.
431 varNum = memBase->AsLclVarCommon()->GetLclNum();
434 // Ensure that all the GenTreeIndir values are set to their defaults.
435 assert(!memIndir->HasIndex());
436 assert(memIndir->Scale() == 1);
437 assert(memIndir->Offset() == 0);
442 case GT_CLS_VAR_ADDR:
// Address of a class variable: emit using its handle, with offset 0.
444 emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
// Other address forms: hand the whole indirection to the emitter.
451 emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
// op2 is neither a spill temp, a HW intrinsic nor an indirection: local field or local variable.
458 switch (op2->OperGet())
462 GenTreeLclFld* lclField = op2->AsLclFld();
464 varNum = lclField->GetLclNum();
465 offset = lclField->gtLclFld.gtLclOffs;
// A local variable used from memory must be reg-optional or not a register candidate.
471 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
472 varNum = op2->AsLclVar()->GetLclNum();
483 // Ensure we got a good varNum and offset.
484 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
485 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
486 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
487 assert(offset != (unsigned)-1);
// Stack-based form, using the varNum/offset determined above.
489 emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
// op2 lives in a register.
493 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, ival);
497 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
498 // with non-constant argument
//
// Arguments:
501 //    intrinsic - intrinsic ID
502 //    nonConstImmReg - the register that contains the non-constant imm8 argument
503 //    baseReg - a register for the start of the switch table
504 //    offsReg - a register for the offset into the switch table
505 //    emitSwCase - the lambda to generate switch-case
//
// Notes:
508 //    generate the jump-table fallback for imm-intrinsics with non-constant argument.
510 //    This function can be used for all imm-intrinsics (whether full-range or not),
511 //    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
512 //    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
//
// NOTE(review): baseReg and offsReg are used in the body and passed by the callers; confirm that
// both appear in the parameter list between nonConstImmReg and emitSwCase.
514 template <typename HWIntrinsicSwitchCaseBody>
515 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
516 regNumber nonConstImmReg,
519 HWIntrinsicSwitchCaseBody emitSwCase)
521 assert(nonConstImmReg != REG_NA);
522 emitter* emit = getEmitter();
// Number of table entries: one per immediate value from 0 up to the intrinsic's upper bound.
524 const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1;
525 assert(maxByte <= 256);
// One label per possible immediate; 256 entries cover the maxByte bound asserted above.
526 BasicBlock* jmpTable[256];
528 unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
529 unsigned jmpTableOffs = 0;
531 // Emit the jump table
532 for (unsigned i = 0; i < maxByte; i++)
534 jmpTable[i] = genCreateTempLabel();
535 emit->emitDataGenData(i, jmpTable[i]);
538 emit->emitDataGenEnd();
540 // Compute and jump to the appropriate offset in the switch table
// 1) offsReg = address of the jump table data
541 emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
// 2) offsReg = 4-byte table entry at [offsReg + nonConstImmReg * 4]
543 emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
// 3) baseReg = address of compiler->fgFirstBB; 4) offsReg += baseReg; 5) indirect jump via offsReg
544 emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
545 emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
546 emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
548 // Emit the switch table entries
550 BasicBlock* switchTableBeg = genCreateTempLabel();
551 BasicBlock* switchTableEnd = genCreateTempLabel();
553 genDefineTempLabel(switchTableBeg);
// Each case starts at its own label and ends with a jump to the common exit label.
// NOTE(review): emitSwCase(i) is expected to be invoked between the label definition and the
// jump so that every case has a body -- verify.
555 for (unsigned i = 0; i < maxByte; i++)
557 genDefineTempLabel(jmpTable[i]);
559 emit->emitIns_J(INS_jmp, switchTableEnd);
562 genDefineTempLabel(switchTableEnd);
565 //------------------------------------------------------------------------
566 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
//
// Arguments:
569 //    node - The hardware intrinsic node
//
// Notes:
//    Handles the SSE intrinsics listed as cases below: scalar compares that materialize a boolean
//    in a general-purpose register, conversions, MoveMask, prefetches, scalar
//    reciprocal/sqrt, SetScalarVector128, SetZeroVector128 and StoreFence.
//
571 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
573 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
574 GenTree* op1 = node->gtGetOp1();
575 GenTree* op2 = node->gtGetOp2();
576 GenTree* op3 = nullptr;
577 GenTree* op4 = nullptr;
578 regNumber targetReg = node->gtRegNum;
579 var_types targetType = node->TypeGet();
580 var_types baseType = node->gtSIMDBaseType;
582 regNumber op1Reg = REG_NA;
583 regNumber op2Reg = REG_NA;
584 regNumber op3Reg = REG_NA;
585 regNumber op4Reg = REG_NA;
586 emitter* emit = getEmitter();
// op1's register is only read here when op1 exists and is not an argument list.
588 if ((op1 != nullptr) && !op1->OperIsList())
590 op1Reg = op1->gtRegNum;
591 genConsumeOperands(node);
596 case NI_SSE_ConvertScalarToVector128Single:
// Produces a TYP_SIMD16 vector of floats; this intrinsic has no table immediate.
598 assert(node->TypeGet() == TYP_SIMD16);
599 assert(node->gtSIMDBaseType == TYP_FLOAT);
600 assert(Compiler::ivalOfHWIntrinsic(intrinsicID) == -1);
602 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
603 genHWIntrinsic_R_R_RM(node, ins);
607 case NI_SSE_CompareEqualOrderedScalar:
608 case NI_SSE_CompareEqualUnorderedScalar:
610 assert(baseType == TYP_FLOAT);
611 op2Reg = op2->gtRegNum;
612 regNumber tmpReg = node->GetSingleTempReg();
613 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
615 // Ensure we aren't overwriting targetReg
616 assert(tmpReg != targetReg);
// Compare, then compute (parity-odd AND equal): setpo -> targetReg, sete -> tmpReg,
// `and` combines them, and setne turns the and's result into 0/1 in targetReg.
618 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
619 emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
620 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
621 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
622 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
// Widen the byte result within targetReg.
623 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
627 case NI_SSE_CompareGreaterThanOrderedScalar:
628 case NI_SSE_CompareGreaterThanUnorderedScalar:
630 assert(baseType == TYP_FLOAT);
631 op2Reg = op2->gtRegNum;
// Compare (op1, op2), then take the "above" condition as the result.
633 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
634 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
635 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
636 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
640 case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
641 case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
643 assert(baseType == TYP_FLOAT);
644 op2Reg = op2->gtRegNum;
// Compare (op1, op2), then take the "above or equal" condition as the result.
646 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
647 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
648 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
649 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
653 case NI_SSE_CompareLessThanOrderedScalar:
654 case NI_SSE_CompareLessThanUnorderedScalar:
656 assert(baseType == TYP_FLOAT);
657 op2Reg = op2->gtRegNum;
// "op1 < op2" is evaluated as "op2 > op1": the compare operands are swapped and seta is used.
659 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
660 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
661 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
662 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
666 case NI_SSE_CompareLessThanOrEqualOrderedScalar:
667 case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
669 assert(baseType == TYP_FLOAT);
670 op2Reg = op2->gtRegNum;
// "op1 <= op2" is evaluated as "op2 >= op1": the compare operands are swapped and setae is used.
672 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
673 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
674 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
675 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
679 case NI_SSE_CompareNotEqualOrderedScalar:
680 case NI_SSE_CompareNotEqualUnorderedScalar:
682 assert(baseType == TYP_FLOAT);
683 op2Reg = op2->gtRegNum;
684 regNumber tmpReg = node->GetSingleTempReg();
685 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
687 // Ensure we aren't overwriting targetReg
688 assert(tmpReg != targetReg);
// Compare, then compute (parity-even OR not-equal): setpe -> targetReg, setne -> tmpReg,
// `or` combines them, and setne turns the or's result into 0/1 in targetReg.
690 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
691 emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
692 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
693 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
694 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
695 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
699 case NI_SSE_ConvertToSingle:
700 case NI_SSE_StaticCast:
// Unary; an instruction is only emitted when source and target registers differ.
702 assert(op2 == nullptr);
703 if (op1Reg != targetReg)
705 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
706 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
711 case NI_SSE_MoveMask:
713 assert(baseType == TYP_FLOAT);
714 assert(op2 == nullptr);
// The instruction is emitted with the size of TYP_INT.
716 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
717 emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
721 case NI_SSE_Prefetch0:
722 case NI_SSE_Prefetch1:
723 case NI_SSE_Prefetch2:
724 case NI_SSE_PrefetchNonTemporal:
726 assert(baseType == TYP_UBYTE);
727 assert(op2 == nullptr);
// op1Reg is passed as the address register, with displacement 0.
729 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
730 emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
734 case NI_SSE_ReciprocalScalar:
735 case NI_SSE_ReciprocalSqrtScalar:
736 case NI_SSE_SqrtScalar:
738 assert(baseType == TYP_FLOAT);
739 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
// Two emission forms follow: op1Reg passed as both sources, or op2 taken as a register/memory
// operand. NOTE(review): presumably chosen by whether op2 is present -- confirm.
743 emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg);
747 genHWIntrinsic_R_R_RM(node, ins);
752 case NI_SSE_SetScalarVector128:
754 assert(baseType == TYP_FLOAT);
755 assert(op2 == nullptr);
// The target is zeroed below, so a source sharing the target register is copied to a temp first.
757 if (op1Reg == targetReg)
759 regNumber tmpReg = node->GetSingleTempReg();
761 // Ensure we aren't overwriting targetReg
762 assert(tmpReg != targetReg);
764 emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
// NOTE(review): after the copy, the movss below should read the saved value (tmpReg);
// confirm op1Reg is redirected to tmpReg before it is used.
// Zero the target, then movss the scalar from op1Reg into it.
768 emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
769 emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
773 case NI_SSE_SetZeroVector128:
775 assert(baseType == TYP_FLOAT);
776 assert(op1 == nullptr);
777 assert(op2 == nullptr);
// xorps of the target register with itself.
778 emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
782 case NI_SSE_StoreFence:
784 assert(baseType == TYP_VOID);
785 assert(op1 == nullptr);
786 assert(op2 == nullptr);
// No operands: a bare sfence instruction.
787 emit->emitIns(INS_sfence);
799 //------------------------------------------------------------------------
800 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
//
// Arguments:
803 //    node - The hardware intrinsic node
//
// Notes:
//    Handles the SSE2 intrinsics listed as cases below. The scalar compare cases mirror the ones in
//    genSSEIntrinsic but assert a TYP_DOUBLE base type. genProduceReg(node) is called at the end.
//
805 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
807 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
808 GenTree* op1 = node->gtGetOp1();
809 GenTree* op2 = node->gtGetOp2();
810 regNumber targetReg = node->gtRegNum;
811 var_types targetType = node->TypeGet();
812 var_types baseType = node->gtSIMDBaseType;
813 regNumber op1Reg = REG_NA;
814 regNumber op2Reg = REG_NA;
815 emitter* emit = getEmitter();
// op1's register is only read here when op1 exists and is not an argument list.
818 if ((op1 != nullptr) && !op1->OperIsList())
820 op1Reg = op1->gtRegNum;
821 genConsumeOperands(node);
826 // All integer overloads are handled by table codegen
827 case NI_SSE2_CompareLessThan:
829 assert(op1 != nullptr);
830 assert(op2 != nullptr);
831 assert(baseType == TYP_DOUBLE);
832 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
833 op2Reg = op2->gtRegNum;
// NOTE(review): `ival` is assigned here; confirm it is declared earlier in this function.
834 ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
// Register-register compare with the table immediate.
836 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
841 case NI_SSE2_CompareEqualOrderedScalar:
842 case NI_SSE2_CompareEqualUnorderedScalar:
844 assert(baseType == TYP_DOUBLE);
845 op2Reg = op2->gtRegNum;
846 regNumber tmpReg = node->GetSingleTempReg();
847 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
849 // Ensure we aren't overwriting targetReg
850 assert(tmpReg != targetReg);
// Compare, then compute (parity-odd AND equal): setpo -> targetReg, sete -> tmpReg,
// `and` combines them, and setne turns the and's result into 0/1 in targetReg.
852 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
853 emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
854 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
855 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
856 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
857 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
861 case NI_SSE2_CompareGreaterThanOrderedScalar:
862 case NI_SSE2_CompareGreaterThanUnorderedScalar:
864 assert(baseType == TYP_DOUBLE);
865 op2Reg = op2->gtRegNum;
866 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
// Compare (op1, op2), then take the "above" condition as the result.
868 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
869 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
870 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
874 case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
875 case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
877 assert(baseType == TYP_DOUBLE);
878 op2Reg = op2->gtRegNum;
879 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
// Compare (op1, op2), then take the "above or equal" condition as the result.
881 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
882 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
883 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
887 case NI_SSE2_CompareLessThanOrderedScalar:
888 case NI_SSE2_CompareLessThanUnorderedScalar:
890 assert(baseType == TYP_DOUBLE);
891 op2Reg = op2->gtRegNum;
892 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
// "op1 < op2" is evaluated as "op2 > op1": the compare operands are swapped and seta is used.
894 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
895 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
896 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
900 case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
901 case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
903 assert(baseType == TYP_DOUBLE);
904 op2Reg = op2->gtRegNum;
905 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
// "op1 <= op2" is evaluated as "op2 >= op1": the compare operands are swapped and setae is used.
907 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
908 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
909 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
913 case NI_SSE2_CompareNotEqualOrderedScalar:
914 case NI_SSE2_CompareNotEqualUnorderedScalar:
916 assert(baseType == TYP_DOUBLE);
917 op2Reg = op2->gtRegNum;
918 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
919 regNumber tmpReg = node->GetSingleTempReg();
921 // Ensure we aren't overwriting targetReg
922 assert(tmpReg != targetReg);
// Compare, then compute (parity-even OR not-equal): setpe -> targetReg, setne -> tmpReg,
// `or` combines them, and setne turns the or's result into 0/1 in targetReg.
924 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
925 emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
926 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
927 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
928 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
929 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
933 case NI_SSE2_ConvertScalarToVector128Double:
934 case NI_SSE2_ConvertScalarToVector128Single:
// Binary form: op2 may be a register or a memory operand.
936 assert(baseType == TYP_INT || baseType == TYP_LONG || baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
937 assert(op1 != nullptr);
938 assert(op2 != nullptr);
939 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
940 genHWIntrinsic_R_R_RM(node, ins);
944 case NI_SSE2_ConvertScalarToVector128Int64:
945 case NI_SSE2_ConvertScalarToVector128UInt64:
// Unary form; the instruction is emitted with the size of the base type (see the TODO below).
947 assert(baseType == TYP_LONG || baseType == TYP_ULONG);
948 assert(op1 != nullptr);
949 assert(op2 == nullptr);
950 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
951 // TODO-XArch-CQ -> use of type size of TYP_SIMD16 leads to
952 // instruction register encoding errors for SSE legacy encoding
953 emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
957 case NI_SSE2_ConvertToDouble:
// Unary; an instruction is only emitted when source and target registers differ.
959 assert(op2 == nullptr);
960 if (op1Reg != targetReg)
962 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
963 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
968 case NI_SSE2_ConvertToInt32:
969 case NI_SSE2_ConvertToInt64:
970 case NI_SSE2_ConvertToUInt32:
971 case NI_SSE2_ConvertToUInt64:
973 assert(op2 == nullptr);
974 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT || baseType == TYP_INT || baseType == TYP_UINT ||
975 baseType == TYP_LONG || baseType == TYP_ULONG);
976 if (op1Reg != targetReg)
978 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
// Floating-point base types use the table instruction; the other base types use the hardcoded
// INS_mov_xmm2i below.
979 if (baseType == TYP_DOUBLE || baseType == TYP_FLOAT)
981 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
985 // TODO-XArch-Bug https://github.com/dotnet/coreclr/issues/16329
986 // using hardcoded instruction as workaround for inexact type conversions
// Note the operand order: op1Reg is passed before targetReg for INS_mov_xmm2i.
987 emit->emitIns_R_R(INS_mov_xmm2i, emitActualTypeSize(baseType), op1Reg, targetReg);
993 case NI_SSE2_LoadFence:
995 assert(baseType == TYP_VOID);
996 assert(op1 == nullptr);
997 assert(op2 == nullptr);
// No operands: a bare lfence instruction.
998 emit->emitIns(INS_lfence);
1002 case NI_SSE2_MemoryFence:
1004 assert(baseType == TYP_VOID);
1005 assert(op1 == nullptr);
1006 assert(op2 == nullptr);
// No operands: a bare mfence instruction.
1007 emit->emitIns(INS_mfence);
1011 case NI_SSE2_MoveMask:
1013 assert(op2 == nullptr);
1014 assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);
// The instruction is emitted with the size of TYP_INT.
1016 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1017 emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
1021 case NI_SSE2_SetZeroVector128:
// TYP_FLOAT is excluded here; genSSEIntrinsic has its own NI_SSE_SetZeroVector128 case for it.
1023 assert(baseType != TYP_FLOAT);
1024 assert(baseType >= TYP_BYTE && baseType <= TYP_DOUBLE);
1025 assert(op1 == nullptr);
1026 assert(op2 == nullptr);
// The table instruction is applied to the target register with itself.
1028 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1029 emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1038 genProduceReg(node);
1041 //------------------------------------------------------------------------
1042 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
//
// Arguments:
1045 //    node - The hardware intrinsic node
//
// Notes:
//    Handles the scalar rounding intrinsics and the ptest-based Test* intrinsics listed as cases
//    below. The Test* cases turn a condition code from INS_ptest into 0/1 in targetReg.
//    genProduceReg(node) is called at the end.
//
1047 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1049 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1050 GenTree* op1 = node->gtGetOp1();
1051 GenTree* op2 = node->gtGetOp2();
1052 GenTree* op3 = nullptr;
1053 GenTree* op4 = nullptr;
1054 regNumber targetReg = node->gtRegNum;
1055 var_types targetType = node->TypeGet();
1056 var_types baseType = node->gtSIMDBaseType;
1058 regNumber op1Reg = REG_NA;
1059 regNumber op2Reg = REG_NA;
1060 regNumber op3Reg = REG_NA;
1061 regNumber op4Reg = REG_NA;
1062 emitter* emit = getEmitter();
// op1's register is only read here when op1 exists and is not an argument list.
1064 if ((op1 != nullptr) && !op1->OperIsList())
1066 op1Reg = op1->gtRegNum;
1067 genConsumeOperands(node);
1070 switch (intrinsicID)
1072 case NI_SSE41_CeilingScalar:
1073 case NI_SSE41_FloorScalar:
1074 case NI_SSE41_RoundCurrentDirectionScalar:
1075 case NI_SSE41_RoundToNearestIntegerScalar:
1076 case NI_SSE41_RoundToNegativeInfinityScalar:
1077 case NI_SSE41_RoundToPositiveInfinityScalar:
1078 case NI_SSE41_RoundToZeroScalar:
1080 assert((baseType == TYP_FLOAT) || (baseType == TYP_DOUBLE));
1081 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
// Two emission forms follow: op1Reg passed as both sources together with the table immediate,
// or op2 taken as a register/memory operand.
// NOTE(review): presumably chosen by whether op2 is present -- confirm.
1085 int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
1086 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg, ival);
1090 genHWIntrinsic_R_R_RM_I(node, ins);
1095 case NI_SSE41_TestAllOnes:
1097 regNumber tmpReg = node->GetSingleTempReg();
1098 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
// tmpReg = pcmpeqd of tmpReg with itself; targetReg is cleared with xor before the ptest;
// then ptest(op1Reg, tmpReg) and setb writes the low byte of targetReg.
1099 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1100 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1101 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1102 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1106 case NI_SSE41_TestAllZeros:
1107 case NI_SSE41_TestZ:
1109 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
// Clear targetReg, ptest(op1, op2), then sete into the low byte of targetReg.
1110 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1111 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1112 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1116 case NI_SSE41_TestC:
1118 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
// Clear targetReg, ptest(op1, op2), then setb into the low byte of targetReg.
1119 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1120 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1121 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1125 case NI_SSE41_TestMixOnesZeros:
1126 case NI_SSE41_TestNotZAndNotC:
1128 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
// Clear targetReg, ptest(op1, op2), then seta into the low byte of targetReg.
1129 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1130 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1131 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1140 genProduceReg(node);
1143 //------------------------------------------------------------------------
1144 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
//
// Arguments:
1147 //    node - The hardware intrinsic node
//
// Notes:
//    Only NI_SSE42_Crc32 is handled in the cases below. Both operands are dereferenced
//    unconditionally, so the node must have op1 and op2. genProduceReg(node) is called at the end.
//
1149 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1151 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1152 GenTree* op1 = node->gtGetOp1();
1153 GenTree* op2 = node->gtGetOp2();
1154 regNumber targetReg = node->gtRegNum;
1155 assert(targetReg != REG_NA);
1156 var_types targetType = node->TypeGet();
1157 var_types baseType = node->gtSIMDBaseType;
1159 regNumber op1Reg = op1->gtRegNum;
1160 regNumber op2Reg = op2->gtRegNum;
1161 genConsumeOperands(node);
1163 switch (intrinsicID)
1165 case NI_SSE42_Crc32:
// Copy op1 into the target register first, unless it is already there; the crc32 below is
// then emitted with targetReg and op2Reg.
1166 if (op1Reg != targetReg)
1168 inst_RV_RV(INS_mov, targetReg, op1Reg, targetType, emitTypeSize(targetType));
// For byte/ushort data the instruction is sized by baseType; otherwise by targetType.
1171 if (baseType == TYP_UBYTE || baseType == TYP_USHORT) // baseType is the type of the second argument
1173 assert(targetType == TYP_INT);
1174 inst_RV_RV(INS_crc32, targetReg, op2Reg, baseType, emitTypeSize(baseType));
1178 assert(op1->TypeGet() == op2->TypeGet());
1179 assert(targetType == TYP_INT || targetType == TYP_LONG);
1180 inst_RV_RV(INS_crc32, targetReg, op2Reg, targetType, emitTypeSize(targetType));
1188 genProduceReg(node);
1191 //------------------------------------------------------------------------
1192 // genAVXIntrinsic: Generates the code for an AVX hardware intrinsic node
1195 // node - The hardware intrinsic node
1197 void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node)
1199 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1200 var_types baseType = node->gtSIMDBaseType;
1201 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1202 var_types targetType = node->TypeGet();
1203 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1204 GenTree* op1 = node->gtGetOp1();
1205 GenTree* op2 = node->gtGetOp2();
1206 regNumber targetReg = node->gtRegNum;
1207 emitter* emit = getEmitter();
1209 genConsumeOperands(node);
1211 switch (intrinsicID)
1213 case NI_AVX_SetZeroVector256:
1215 assert(op1 == nullptr);
1216 assert(op2 == nullptr);
1217 // SetZeroVector256 will generate pxor with integral base-typ, but pxor is a AVX2 instruction, so we
1218 // generate xorps on AVX machines.
1219 if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType))
1221 emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
1225 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1234 genProduceReg(node);
1237 //------------------------------------------------------------------------
1238 // genAVX2Intrinsic: Generates the code for an AVX2 hardware intrinsic node
1241 // node - The hardware intrinsic node
1243 void CodeGen::genAVX2Intrinsic(GenTreeHWIntrinsic* node)
1245 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1246 var_types baseType = node->gtSIMDBaseType;
1247 instruction ins = INS_invalid;
1249 genConsumeOperands(node);
1251 switch (intrinsicID)
1258 genProduceReg(node);
1261 //------------------------------------------------------------------------
1262 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
1265 // node - The hardware intrinsic node
1267 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
1269 NYI("Implement AES intrinsic code generation");
1272 //------------------------------------------------------------------------
1273 // genBMI1Intrinsic: Generates the code for a BMI1 hardware intrinsic node
1276 // node - The hardware intrinsic node
1278 void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
1280 NYI("Implement BMI1 intrinsic code generation");
1283 //------------------------------------------------------------------------
1284 // genBMI2Intrinsic: Generates the code for a BMI2 hardware intrinsic node
1287 // node - The hardware intrinsic node
1289 void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
1291 NYI("Implement BMI2 intrinsic code generation");
1294 //------------------------------------------------------------------------
1295 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
1298 // node - The hardware intrinsic node
1300 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
1302 NYI("Implement FMA intrinsic code generation");
1305 //------------------------------------------------------------------------
1306 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
1309 // node - The hardware intrinsic node
1311 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
1313 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1314 GenTree* op1 = node->gtGetOp1();
1315 regNumber targetReg = node->gtRegNum;
1316 assert(targetReg != REG_NA);
1317 var_types targetType = node->TypeGet();
1318 regNumber op1Reg = op1->gtRegNum;
1319 genConsumeOperands(node);
1321 assert(intrinsicID == NI_LZCNT_LeadingZeroCount);
1323 inst_RV_RV(INS_lzcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1325 genProduceReg(node);
1328 //------------------------------------------------------------------------
1329 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
1332 // node - The hardware intrinsic node
1334 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
1336 NYI("Implement PCLMULQDQ intrinsic code generation");
1339 //------------------------------------------------------------------------
1340 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
1343 // node - The hardware intrinsic node
1345 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
1347 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1348 GenTree* op1 = node->gtGetOp1();
1349 regNumber targetReg = node->gtRegNum;
1350 assert(targetReg != REG_NA);
1351 var_types targetType = node->TypeGet();
1352 regNumber op1Reg = op1->gtRegNum;
1353 genConsumeOperands(node);
1355 assert(intrinsicID == NI_POPCNT_PopCount);
1357 inst_RV_RV(INS_popcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1359 genProduceReg(node);
1362 #endif // FEATURE_HW_INTRINSICS