1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
8 XX Intel hardware intrinsic Code Generator XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
18 #ifdef FEATURE_HW_INTRINSICS
22 #include "sideeffects.h"
25 #include "gcinfoencoder.h"
27 //------------------------------------------------------------------------
28 // genIsTableDrivenHWIntrinsic:
// Decides whether an intrinsic can be emitted by the generic table-driven path.
31 // category - category of a HW intrinsic
// flags - flags of the HW intrinsic; only HW_Flag_MultiIns is examined here
34 // returns true if the category is not Special/Scalar/Helper and HW_Flag_MultiIns is not set
36 static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
38 // TODO - move more categories to the table-driven framework
39 // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen
40 const bool tableDrivenCategory =
41 category != HW_Category_Special && category != HW_Category_Scalar && category != HW_Category_Helper;
42 const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0;
// Both conditions must hold for the intrinsic to be table-driven.
43 return tableDrivenCategory && tableDrivenFlag;
46 //------------------------------------------------------------------------
47 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
// Intrinsics accepted by genIsTableDrivenHWIntrinsic are emitted directly from the
// instruction returned by Compiler::insOfHWIntrinsic; the ISA-specific generators
// (genSSEIntrinsic, genSSE2Intrinsic, ...) are selected by instruction set further below.
// NOTE(review): the braces and switch statements joining these parts are not visible in
// this excerpt, so the exact control flow should be confirmed against the full file.
50 // node - The hardware intrinsic node
52 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
54 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
55 InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID);
56 HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID);
57 HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID);
58 int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
59 int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID);
// Intrinsics flagged HW_Flag_NoCodeGen must never reach code generation.
61 assert((flags & HW_Flag_NoCodeGen) == 0);
63 if (genIsTableDrivenHWIntrinsic(category, flags))
65 GenTree* op1 = node->gtGetOp1();
66 GenTree* op2 = node->gtGetOp2();
67 regNumber targetReg = node->gtRegNum;
68 var_types targetType = node->TypeGet();
69 var_types baseType = node->gtSIMDBaseType;
71 regNumber op1Reg = REG_NA;
72 regNumber op2Reg = REG_NA;
73 emitter* emit = getEmitter();
76 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
77 assert(ins != INS_invalid);
78 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
79 assert(simdSize != 0);
// One-operand form: only op1Reg is read in this section.
// NOTE(review): presumably selected by numArgs; the selecting lines are not visible here.
84 genConsumeOperands(node);
85 op1Reg = op1->gtRegNum;
86 if (category == HW_Category_MemoryLoad)
88 emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
90 else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
// op1Reg is passed as both source operands.
92 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
// A table-supplied immediate (ival != -1) is passed along for floating-point base types.
94 else if ((ival != -1) && varTypeIsFloating(baseType))
96 emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
100 emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
// Two-operand form: op1Reg and op2Reg are both read in this section.
105 genConsumeOperands(node);
106 op1Reg = op1->gtRegNum;
107 op2Reg = op2->gtRegNum;
108 if (category == HW_Category_MemoryStore)
110 emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
112 else if ((ival != -1) && varTypeIsFloating(baseType))
114 genHWIntrinsic_R_R_RM_I(node, ins);
116 else if (category == HW_Category_MemoryLoad)
118 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
120 else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
// The lambda emits the instruction for one concrete immediate value i.
122 auto emitSwCase = [&](unsigned i) {
123 emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i);
// Constant immediate: emit the single instruction directly.
126 if (op2->IsCnsIntOrI())
128 ssize_t ival = op2->AsIntCon()->IconValue();
129 emitSwCase((unsigned)ival);
133 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
134 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
135 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
136 regNumber baseReg = node->ExtractTempReg();
137 regNumber offsReg = node->GetSingleTempReg();
138 genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase);
143 genHWIntrinsic_R_R_RM(node, ins);
// Three-operand form: the operands arrive as a three-element argument list hanging off op1.
148 assert(op1->OperIsList());
149 assert(op1->gtGetOp2()->OperIsList());
150 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
152 GenTreeArgList* argList = op1->AsArgList();
153 op1 = argList->Current();
155 op1Reg = op1->gtRegNum;
157 argList = argList->Rest();
158 op2 = argList->Current();
160 op2Reg = op2->gtRegNum;
162 argList = argList->Rest();
163 GenTree* op3 = argList->Current();
165 regNumber op3Reg = op3->gtRegNum;
167 if (Compiler::isImmHWIntrinsic(intrinsicID, op3))
// Same immediate handling as the two-operand form, with op3 as the immediate.
169 auto emitSwCase = [&](unsigned i) {
170 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i);
172 if (op3->IsCnsIntOrI())
174 ssize_t ival = op3->AsIntCon()->IconValue();
175 emitSwCase((unsigned)ival);
179 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
180 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
181 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
182 regNumber baseReg = node->ExtractTempReg();
183 regNumber offsReg = node->GetSingleTempReg();
184 genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
189 emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
// Dispatch to the ISA-specific generator.
// NOTE(review): presumably a switch on `isa`; the switch header is not visible here.
204 case InstructionSet_SSE:
205 genSSEIntrinsic(node);
207 case InstructionSet_SSE2:
208 genSSE2Intrinsic(node);
210 case InstructionSet_SSE41:
211 genSSE41Intrinsic(node);
213 case InstructionSet_SSE42:
214 genSSE42Intrinsic(node);
216 case InstructionSet_AVX:
217 genAVXIntrinsic(node);
219 case InstructionSet_AVX2:
220 genAVX2Intrinsic(node);
222 case InstructionSet_AES:
223 genAESIntrinsic(node);
225 case InstructionSet_BMI1:
226 genBMI1Intrinsic(node);
228 case InstructionSet_BMI2:
229 genBMI2Intrinsic(node);
231 case InstructionSet_FMA:
232 genFMAIntrinsic(node);
234 case InstructionSet_LZCNT:
235 genLZCNTIntrinsic(node);
237 case InstructionSet_PCLMULQDQ:
238 genPCLMULQDQIntrinsic(node);
240 case InstructionSet_POPCNT:
241 genPOPCNTIntrinsic(node);
249 //------------------------------------------------------------------------
250 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
251 // register/memory operand, and that returns a value in register
// op1 is always read from its register; op2 may be in a register, contained, or used from a
// spill temp, and the emitter form is chosen accordingly.
254 // node - The hardware intrinsic node
255 // ins - The instruction being generated
257 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
259 var_types targetType = node->TypeGet();
260 regNumber targetReg = node->gtRegNum;
261 GenTree* op1 = node->gtGetOp1();
262 GenTree* op2 = node->gtGetOp2();
263 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
264 emitter* emit = getEmitter();
266 // TODO-XArch-CQ: Commutative operations can have op1 be contained
267 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
269 regNumber op1Reg = op1->gtRegNum;
271 assert(targetReg != REG_NA);
272 assert(op1Reg != REG_NA);
// op2 is not in a register: pick the emitter form that matches where its value lives.
274 if (op2->isContained() || op2->isUsedFromSpillTemp())
276 assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
277 assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
// varNum/offset start as sentinels and are validated by the asserts before the _S emit.
279 TempDsc* tmpDsc = nullptr;
280 unsigned varNum = BAD_VAR_NUM;
281 unsigned offset = (unsigned)-1;
// Spilled operand: address it through its spill temp, then release the temp.
// NOTE(review): the assignment of `offset` for this path is not visible in this excerpt.
283 if (op2->isUsedFromSpillTemp())
285 assert(op2->IsRegOptional());
287 tmpDsc = getSpillTempDsc(op2);
288 varNum = tmpDsc->tdTempNum();
291 compiler->tmpRlsTemp(tmpDsc);
// Contained hardware intrinsic: the register of its own op1 is passed to the _AR emitter form
// (presumably as the base address register — confirm).
293 else if (op2->OperIsHWIntrinsic())
295 emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
// Contained indirection: dispatch on the kind of address node.
298 else if (op2->isIndir())
300 GenTreeIndir* memIndir = op2->AsIndir();
301 GenTree* memBase = memIndir->gtOp1;
303 switch (memBase->OperGet())
305 case GT_LCL_VAR_ADDR:
307 varNum = memBase->AsLclVarCommon()->GetLclNum();
310 // Ensure that all the GenTreeIndir values are set to their defaults.
311 assert(!memIndir->HasIndex());
312 assert(memIndir->Scale() == 1);
313 assert(memIndir->Offset() == 0);
318 case GT_CLS_VAR_ADDR:
320 emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
326 emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir);
// Remaining contained operands are read as a local field (AsLclFld) or a local variable (AsLclVar).
// NOTE(review): the case labels for this switch are not visible in this excerpt.
333 switch (op2->OperGet())
337 GenTreeLclFld* lclField = op2->AsLclFld();
339 varNum = lclField->GetLclNum();
340 offset = lclField->gtLclFld.gtLclOffs;
346 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
347 varNum = op2->AsLclVar()->GetLclNum();
358 // Ensure we got a good varNum and offset.
359 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
360 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
361 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
362 assert(offset != (unsigned)-1);
364 emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset);
// op2 is in a register: plain three-register form.
368 emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum);
372 //------------------------------------------------------------------------
373 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
374 // register/memory operand, an immediate operand, and that returns a value in register
// The immediate is not an operand of the node: it is looked up with Compiler::ivalOfHWIntrinsic.
// Operand handling mirrors genHWIntrinsic_R_R_RM, using the *_I emitter forms.
377 // node - The hardware intrinsic node
378 // ins - The instruction being generated
380 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
382 var_types targetType = node->TypeGet();
383 regNumber targetReg = node->gtRegNum;
384 GenTree* op1 = node->gtGetOp1();
385 GenTree* op2 = node->gtGetOp2();
386 emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
387 int ival = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId);
388 emitter* emit = getEmitter();
390 // TODO-XArch-CQ: Commutative operations can have op1 be contained
391 // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
393 regNumber op1Reg = op1->gtRegNum;
395 assert(targetReg != REG_NA);
396 assert(op1Reg != REG_NA);
// op2 is not in a register: pick the emitter form that matches where its value lives.
398 if (op2->isContained() || op2->isUsedFromSpillTemp())
400 assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
401 assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
// varNum/offset start as sentinels and are validated by the asserts before the _S_I emit.
403 TempDsc* tmpDsc = nullptr;
404 unsigned varNum = BAD_VAR_NUM;
405 unsigned offset = (unsigned)-1;
// Spilled operand: address it through its spill temp, then release the temp.
// NOTE(review): the assignment of `offset` for this path is not visible in this excerpt.
407 if (op2->isUsedFromSpillTemp())
409 assert(op2->IsRegOptional());
411 tmpDsc = getSpillTempDsc(op2);
412 varNum = tmpDsc->tdTempNum();
415 compiler->tmpRlsTemp(tmpDsc);
// Contained hardware intrinsic: the register of its own op1 is passed to the _AR_I emitter form.
417 else if (op2->OperIsHWIntrinsic())
419 emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
// Contained indirection: dispatch on the kind of address node.
422 else if (op2->isIndir())
424 GenTreeIndir* memIndir = op2->AsIndir();
425 GenTree* memBase = memIndir->gtOp1;
427 switch (memBase->OperGet())
429 case GT_LCL_VAR_ADDR:
431 varNum = memBase->AsLclVarCommon()->GetLclNum();
434 // Ensure that all the GenTreeIndir values are set to their defaults.
435 assert(!memIndir->HasIndex());
436 assert(memIndir->Scale() == 1);
437 assert(memIndir->Offset() == 0);
442 case GT_CLS_VAR_ADDR:
// NOTE(review): the continuation line of this call is not visible in this excerpt.
444 emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
451 emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
// Remaining contained operands are read as a local field (AsLclFld) or a local variable (AsLclVar).
// NOTE(review): the case labels for this switch are not visible in this excerpt.
458 switch (op2->OperGet())
462 GenTreeLclFld* lclField = op2->AsLclFld();
464 varNum = lclField->GetLclNum();
465 offset = lclField->gtLclFld.gtLclOffs;
471 assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
472 varNum = op2->AsLclVar()->GetLclNum();
483 // Ensure we got a good varNum and offset.
484 // We also need to check for `tmpDsc != nullptr` since spill temp numbers
485 // are negative and start with -1, which also happens to be BAD_VAR_NUM.
486 assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
487 assert(offset != (unsigned)-1);
489 emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
// op2 is in a register: three-register form plus the immediate.
493 emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, ival);
497 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
498 // with non-constant argument
501 // intrinsic - intrinsic ID
502 // nonConstImmReg - the register that contains the non-constant imm8 argument
503 // baseReg - a register for the start of the switch table
504 // offsReg - a register for the offset into the switch table
505 // emitSwCase - the lambda to generate a switch-case body for one immediate value
508 // generate the jump-table fallback for imm-intrinsics with non-constant argument.
510 // This function can be used for all imm-intrinsics (whether full-range or not),
511 // The compiler front-end (i.e. importer) is responsible to insert a range-check IR
512 // (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
514 template <typename HWIntrinsicSwitchCaseBody>
515 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsic,
516 regNumber nonConstImmReg,
// NOTE(review): the baseReg and offsReg parameter lines are not visible in this excerpt.
519 HWIntrinsicSwitchCaseBody emitSwCase)
521 assert(nonConstImmReg != REG_NA);
522 emitter* emit = getEmitter();
// Number of table entries: the intrinsic's immediate upper bound plus one. The assert below
// keeps it within the fixed-size jmpTable array.
524 const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1;
525 assert(maxByte <= 256);
526 BasicBlock* jmpTable[256];
528 unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
529 unsigned jmpTableOffs = 0;
531 // Emit the jump table
// One fresh temp label per possible immediate value, recorded in the data section.
532 for (unsigned i = 0; i < maxByte; i++)
534 jmpTable[i] = genCreateTempLabel();
535 emit->emitDataGenData(i, jmpTable[i]);
538 emit->emitDataGenEnd();
540 // Compute and jump to the appropriate offset in the switch table
// offsReg first receives the table address, then the 4-byte entry indexed by nonConstImmReg
// (scale 4); that entry is added to the address loaded into baseReg before the indirect jump.
541 emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
543 emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
544 emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
545 emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
546 emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
548 // Emit the switch table entries
550 BasicBlock* switchTableBeg = genCreateTempLabel();
551 BasicBlock* switchTableEnd = genCreateTempLabel();
553 genDefineTempLabel(switchTableBeg);
// Each entry defines its label and ends with a jump to the common end label.
// NOTE(review): the call that emits the case body (presumably emitSwCase(i)) is not visible here.
555 for (unsigned i = 0; i < maxByte; i++)
557 genDefineTempLabel(jmpTable[i]);
559 emit->emitIns_J(INS_jmp, switchTableEnd);
562 genDefineTempLabel(switchTableEnd);
565 //------------------------------------------------------------------------
566 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
// Handles the SSE intrinsics that need hand-written emission, one case per intrinsic ID.
// NOTE(review): the switch header, break statements and closing lines are not visible in this excerpt.
569 // node - The hardware intrinsic node
571 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
573 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
574 GenTree* op1 = node->gtGetOp1();
575 GenTree* op2 = node->gtGetOp2();
576 GenTree* op3 = nullptr;
577 GenTree* op4 = nullptr;
578 regNumber targetReg = node->gtRegNum;
579 var_types targetType = node->TypeGet();
580 var_types baseType = node->gtSIMDBaseType;
582 regNumber op1Reg = REG_NA;
583 regNumber op2Reg = REG_NA;
584 regNumber op3Reg = REG_NA;
585 regNumber op4Reg = REG_NA;
586 emitter* emit = getEmitter();
// Operands are consumed here only when op1 exists and is not an argument list.
588 if ((op1 != nullptr) && !op1->OperIsList())
590 op1Reg = op1->gtRegNum;
591 genConsumeOperands(node);
596 case NI_SSE_ConvertScalarToVector128Single:
598 assert(node->TypeGet() == TYP_SIMD16);
599 assert(node->gtSIMDBaseType == TYP_FLOAT);
600 assert(Compiler::ivalOfHWIntrinsic(intrinsicID) == -1);
602 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
603 genHWIntrinsic_R_R_RM(node, ins);
// Scalar equality: the compare instruction sets flags, then targetReg receives the parity-odd
// flag and tmpReg the equal flag; the two are ANDed and the result is materialized with
// setne and zero-extended.
// NOTE(review): presumably the parity check rejects the unordered (NaN) case — confirm
// against the compare instruction's flag semantics.
607 case NI_SSE_CompareEqualOrderedScalar:
608 case NI_SSE_CompareEqualUnorderedScalar:
610 assert(baseType == TYP_FLOAT);
611 op2Reg = op2->gtRegNum;
612 regNumber tmpReg = node->GetSingleTempReg();
614 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
615 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
616 emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
617 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
618 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
619 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
620 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Greater-than: compare (op1, op2), then seta and zero-extend.
624 case NI_SSE_CompareGreaterThanOrderedScalar:
625 case NI_SSE_CompareGreaterThanUnorderedScalar:
627 assert(baseType == TYP_FLOAT);
628 op2Reg = op2->gtRegNum;
630 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
631 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
632 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
633 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Greater-than-or-equal: compare (op1, op2), then setae and zero-extend.
637 case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
638 case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
640 assert(baseType == TYP_FLOAT);
641 op2Reg = op2->gtRegNum;
643 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
644 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
645 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
646 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Less-than: the operands are passed in swapped order (op2, op1) so the same seta check
// as the greater-than case can be used.
650 case NI_SSE_CompareLessThanOrderedScalar:
651 case NI_SSE_CompareLessThanUnorderedScalar:
653 assert(baseType == TYP_FLOAT);
654 op2Reg = op2->gtRegNum;
656 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
657 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
658 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
659 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Less-than-or-equal: swapped operands (op2, op1) with the setae check.
663 case NI_SSE_CompareLessThanOrEqualOrderedScalar:
664 case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
666 assert(baseType == TYP_FLOAT);
667 op2Reg = op2->gtRegNum;
669 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
670 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
671 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
672 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Scalar inequality: targetReg receives the parity-even flag and tmpReg the not-equal flag;
// the two are ORed and the result is materialized with setne and zero-extended.
676 case NI_SSE_CompareNotEqualOrderedScalar:
677 case NI_SSE_CompareNotEqualUnorderedScalar:
679 assert(baseType == TYP_FLOAT);
680 op2Reg = op2->gtRegNum;
682 regNumber tmpReg = node->GetSingleTempReg();
684 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
685 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
686 emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
687 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
688 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
689 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
690 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Nothing is emitted when the source register already is the target register.
694 case NI_SSE_ConvertToSingle:
695 case NI_SSE_StaticCast:
697 assert(op2 == nullptr);
698 if (op1Reg != targetReg)
700 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
701 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
706 case NI_SSE_MoveMask:
708 assert(baseType == TYP_FLOAT);
709 assert(op2 == nullptr);
711 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
712 emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
// Prefetch variants: op1Reg is passed as the address register with displacement 0.
716 case NI_SSE_Prefetch0:
717 case NI_SSE_Prefetch1:
718 case NI_SSE_Prefetch2:
719 case NI_SSE_PrefetchNonTemporal:
721 assert(baseType == TYP_UBYTE);
722 assert(op2 == nullptr);
724 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
725 emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
// NOTE(review): the condition choosing between the two emits below is not visible in this
// excerpt; presumably it distinguishes the one-operand and two-operand overloads.
729 case NI_SSE_ReciprocalScalar:
730 case NI_SSE_ReciprocalSqrtScalar:
731 case NI_SSE_SqrtScalar:
733 assert(baseType == TYP_FLOAT);
734 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
738 emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg);
742 genHWIntrinsic_R_R_RM(node, ins);
// targetReg is cleared with xorps before movss, so when the source shares the target register
// it is first copied to a temp register.
// NOTE(review): the line redirecting op1Reg to tmpReg is not visible in this excerpt.
747 case NI_SSE_SetScalarVector128:
749 assert(baseType == TYP_FLOAT);
750 assert(op2 == nullptr);
752 if (op1Reg == targetReg)
754 regNumber tmpReg = node->GetSingleTempReg();
755 emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
759 emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
760 emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
// Zero vector: xorps of targetReg with itself.
764 case NI_SSE_SetZeroVector128:
766 assert(baseType == TYP_FLOAT);
767 assert(op1 == nullptr);
768 assert(op2 == nullptr);
769 emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
// Store fence: takes no operands and emits a bare sfence.
773 case NI_SSE_StoreFence:
775 assert(baseType == TYP_VOID);
776 assert(op1 == nullptr);
777 assert(op2 == nullptr);
778 emit->emitIns(INS_sfence);
790 //------------------------------------------------------------------------
791 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
// Handles the SSE2 intrinsics that need hand-written emission, one case per intrinsic ID,
// and finishes with genProduceReg(node).
// NOTE(review): the switch header, break statements, the declaration of `ival` and the
// closing lines are not visible in this excerpt.
794 // node - The hardware intrinsic node
796 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
798 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
799 GenTree* op1 = node->gtGetOp1();
800 GenTree* op2 = node->gtGetOp2();
801 regNumber targetReg = node->gtRegNum;
802 var_types targetType = node->TypeGet();
803 var_types baseType = node->gtSIMDBaseType;
804 regNumber op1Reg = REG_NA;
805 regNumber op2Reg = REG_NA;
806 emitter* emit = getEmitter();
// Operands are consumed here only when op1 exists and is not an argument list.
809 if ((op1 != nullptr) && !op1->OperIsList())
811 op1Reg = op1->gtRegNum;
812 genConsumeOperands(node);
817 // All integer overloads are handled by table codegen
// Only the TYP_DOUBLE overload reaches this case; the table-supplied immediate is passed along.
818 case NI_SSE2_CompareLessThan:
820 assert(op1 != nullptr);
821 assert(op2 != nullptr);
822 assert(baseType == TYP_DOUBLE);
823 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
824 op2Reg = op2->gtRegNum;
825 ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
827 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
// Scalar equality: targetReg receives the parity-odd flag and tmpReg the equal flag; the two
// are ANDed and the result is materialized with setne and zero-extended.
// NOTE(review): presumably the parity check rejects the unordered (NaN) case — confirm.
832 case NI_SSE2_CompareEqualOrderedScalar:
833 case NI_SSE2_CompareEqualUnorderedScalar:
835 assert(baseType == TYP_DOUBLE);
836 op2Reg = op2->gtRegNum;
837 regNumber tmpReg = node->GetSingleTempReg();
838 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
840 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
841 emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
842 emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
843 emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
844 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
845 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Greater-than: compare (op1, op2), then seta and zero-extend.
849 case NI_SSE2_CompareGreaterThanOrderedScalar:
850 case NI_SSE2_CompareGreaterThanUnorderedScalar:
852 assert(baseType == TYP_DOUBLE);
853 op2Reg = op2->gtRegNum;
854 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
856 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
857 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
858 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Greater-than-or-equal: compare (op1, op2), then setae and zero-extend.
862 case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
863 case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
865 assert(baseType == TYP_DOUBLE);
866 op2Reg = op2->gtRegNum;
867 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
869 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
870 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
871 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Less-than: the operands are passed in swapped order (op2, op1) so the same seta check
// as the greater-than case can be used.
875 case NI_SSE2_CompareLessThanOrderedScalar:
876 case NI_SSE2_CompareLessThanUnorderedScalar:
878 assert(baseType == TYP_DOUBLE);
879 op2Reg = op2->gtRegNum;
880 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
882 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
883 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
884 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Less-than-or-equal: swapped operands (op2, op1) with the setae check.
888 case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
889 case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
891 assert(baseType == TYP_DOUBLE);
892 op2Reg = op2->gtRegNum;
893 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
895 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
896 emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
897 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Scalar inequality: targetReg receives the parity-even flag and tmpReg the not-equal flag;
// the two are ORed and the result is materialized with setne and zero-extended.
901 case NI_SSE2_CompareNotEqualOrderedScalar:
902 case NI_SSE2_CompareNotEqualUnorderedScalar:
904 assert(baseType == TYP_DOUBLE);
905 op2Reg = op2->gtRegNum;
906 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
907 regNumber tmpReg = node->GetSingleTempReg();
909 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
910 emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
911 emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
912 emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
913 emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
914 emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
// Two-operand conversions are delegated to the register / register-or-memory helper.
918 case NI_SSE2_ConvertScalarToVector128Double:
919 case NI_SSE2_ConvertScalarToVector128Single:
921 assert(baseType == TYP_INT || baseType == TYP_LONG || baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
922 assert(op1 != nullptr);
923 assert(op2 != nullptr);
924 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
925 genHWIntrinsic_R_R_RM(node, ins);
929 case NI_SSE2_ConvertScalarToVector128Int64:
930 case NI_SSE2_ConvertScalarToVector128UInt64:
932 assert(baseType == TYP_LONG || baseType == TYP_ULONG);
933 assert(op1 != nullptr);
934 assert(op2 == nullptr);
935 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
936 // TODO-XArch-CQ -> use of type size of TYP_SIMD16 leads to
937 // instruction register encoding errors for SSE legacy encoding
938 emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
// Nothing is emitted when the source register already is the target register.
942 case NI_SSE2_ConvertToDouble:
944 assert(op2 == nullptr);
945 if (op1Reg != targetReg)
947 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
948 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
953 case NI_SSE2_ConvertToInt32:
954 case NI_SSE2_ConvertToInt64:
955 case NI_SSE2_ConvertToUInt32:
956 case NI_SSE2_ConvertToUInt64:
958 assert(op2 == nullptr);
959 assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT || baseType == TYP_INT || baseType == TYP_UINT ||
960 baseType == TYP_LONG || baseType == TYP_ULONG);
961 if (op1Reg != targetReg)
963 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
964 if (baseType == TYP_DOUBLE || baseType == TYP_FLOAT)
966 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
970 // TODO-XArch-Bug https://github.com/dotnet/coreclr/issues/16329
971 // using hardcoded instruction as workaround for inexact type conversions
// NOTE(review): the register order here is (op1Reg, targetReg), the reverse of the other
// emitIns_R_R calls in this function — presumably required by INS_mov_xmm2i; confirm.
972 emit->emitIns_R_R(INS_mov_xmm2i, emitActualTypeSize(baseType), op1Reg, targetReg);
// Fences take no operands and emit a single bare instruction.
978 case NI_SSE2_LoadFence:
980 assert(baseType == TYP_VOID);
981 assert(op1 == nullptr);
982 assert(op2 == nullptr);
983 emit->emitIns(INS_lfence);
987 case NI_SSE2_MemoryFence:
989 assert(baseType == TYP_VOID);
990 assert(op1 == nullptr);
991 assert(op2 == nullptr);
992 emit->emitIns(INS_mfence);
996 case NI_SSE2_MoveMask:
998 assert(op2 == nullptr);
999 assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);
1001 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1002 emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
// Zero vector: the table-selected instruction is applied to targetReg with itself.
// TYP_FLOAT is excluded by the first assert.
1006 case NI_SSE2_SetZeroVector128:
1008 assert(baseType != TYP_FLOAT);
1009 assert(baseType >= TYP_BYTE && baseType <= TYP_DOUBLE);
1010 assert(op1 == nullptr);
1011 assert(op2 == nullptr);
1013 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1014 emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1023 genProduceReg(node);
1026 //------------------------------------------------------------------------
1027 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
// Handles the scalar rounding intrinsics and the ptest-based Test* intrinsics, then calls
// genProduceReg(node).
// NOTE(review): break statements, braces and any default case are not visible in this excerpt.
1030 // node - The hardware intrinsic node
1032 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1034 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1035 GenTree* op1 = node->gtGetOp1();
1036 GenTree* op2 = node->gtGetOp2();
1037 GenTree* op3 = nullptr;
1038 GenTree* op4 = nullptr;
1039 regNumber targetReg = node->gtRegNum;
1040 var_types targetType = node->TypeGet();
1041 var_types baseType = node->gtSIMDBaseType;
1043 regNumber op1Reg = REG_NA;
1044 regNumber op2Reg = REG_NA;
1045 regNumber op3Reg = REG_NA;
1046 regNumber op4Reg = REG_NA;
1047 emitter* emit = getEmitter();
// Operands are consumed here only when op1 exists and is not an argument list.
1049 if ((op1 != nullptr) && !op1->OperIsList())
1051 op1Reg = op1->gtRegNum;
1052 genConsumeOperands(node);
1055 switch (intrinsicID)
// Scalar rounding family: the rounding mode is the table-supplied immediate.
// NOTE(review): the condition choosing between the two emits below is not visible in this
// excerpt; presumably it distinguishes the one-operand and two-operand overloads.
1057 case NI_SSE41_CeilingScalar:
1058 case NI_SSE41_FloorScalar:
1059 case NI_SSE41_RoundCurrentDirectionScalar:
1060 case NI_SSE41_RoundToNearestIntegerScalar:
1061 case NI_SSE41_RoundToNegativeInfinityScalar:
1062 case NI_SSE41_RoundToPositiveInfinityScalar:
1063 case NI_SSE41_RoundToZeroScalar:
1065 assert((baseType == TYP_FLOAT) || (baseType == TYP_DOUBLE));
1066 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
1070 int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
1071 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg, ival);
1075 genHWIntrinsic_R_R_RM_I(node, ins);
// TestAllOnes: tmpReg is compared with itself via pcmpeqd (presumably producing an all-ones
// mask — confirm), op1 is ptest-ed against it, and setb captures the result.
// targetReg is zeroed first because setcc writes only one byte.
1080 case NI_SSE41_TestAllOnes:
1082 regNumber tmpReg = node->GetSingleTempReg();
1083 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1084 emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1085 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1086 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1087 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
// The remaining Test* cases share one shape: zero targetReg, ptest op1 against op2, then a
// setcc that differs per case (sete here).
1091 case NI_SSE41_TestAllZeros:
1092 case NI_SSE41_TestZ:
1094 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1095 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1096 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1097 emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
// TestC: same shape, result captured with setb.
1101 case NI_SSE41_TestC:
1103 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1104 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1105 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1106 emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
// TestMixOnesZeros / TestNotZAndNotC: same shape, result captured with seta.
1110 case NI_SSE41_TestMixOnesZeros:
1111 case NI_SSE41_TestNotZAndNotC:
1113 assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1114 emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1115 emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1116 emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1125 genProduceReg(node);
1128 //------------------------------------------------------------------------
1129 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
// Only NI_SSE42_Crc32 is visible in this excerpt: op1 is copied into the target register when
// needed and crc32 then combines op2 into it.
1132 // node - The hardware intrinsic node
1134 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1136 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1137 GenTree* op1 = node->gtGetOp1();
1138 GenTree* op2 = node->gtGetOp2();
1139 regNumber targetReg = node->gtRegNum;
1140 assert(targetReg != REG_NA);
1141 var_types targetType = node->TypeGet();
1142 var_types baseType = node->gtSIMDBaseType;
// Both operands are dereferenced unconditionally, so this function requires op1 and op2 to be non-null.
1144 regNumber op1Reg = op1->gtRegNum;
1145 regNumber op2Reg = op2->gtRegNum;
1146 genConsumeOperands(node);
1148 switch (intrinsicID)
1150 case NI_SSE42_Crc32:
// Move op1 into targetReg first; the crc32 below uses targetReg as its destination.
1151 if (op1Reg != targetReg)
1153 inst_RV_RV(INS_mov, targetReg, op1Reg, targetType, emitTypeSize(targetType));
// For byte/short data the instruction is sized by baseType; otherwise by targetType.
1156 if (baseType == TYP_UBYTE || baseType == TYP_USHORT) // baseType is the type of the second argument
1158 assert(targetType == TYP_INT);
1159 inst_RV_RV(INS_crc32, targetReg, op2Reg, baseType, emitTypeSize(baseType));
1163 assert(op1->TypeGet() == op2->TypeGet());
1164 assert(targetType == TYP_INT || targetType == TYP_LONG);
1165 inst_RV_RV(INS_crc32, targetReg, op2Reg, targetType, emitTypeSize(targetType));
1173 genProduceReg(node);
1176 //------------------------------------------------------------------------
1177 // genAVXIntrinsic: Generates the code for an AVX hardware intrinsic node
1180 // node - The hardware intrinsic node
1182 void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node)
1184 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1185 var_types baseType = node->gtSIMDBaseType;
1186 emitAttr attr = EA_ATTR(node->gtSIMDSize);
1187 var_types targetType = node->TypeGet();
1188 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1189 GenTree* op1 = node->gtGetOp1();
1190 GenTree* op2 = node->gtGetOp2();
1191 regNumber targetReg = node->gtRegNum;
1192 emitter* emit = getEmitter();
1194 genConsumeOperands(node);
1196 switch (intrinsicID)
1198 case NI_AVX_SetZeroVector256:
1200 assert(op1 == nullptr);
1201 assert(op2 == nullptr);
1202 // SetZeroVector256 will generate pxor with integral base-typ, but pxor is a AVX2 instruction, so we
1203 // generate xorps on AVX machines.
1204 if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType))
1206 emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
1210 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1219 genProduceReg(node);
1222 //------------------------------------------------------------------------
1223 // genAVX2Intrinsic: Generates the code for an AVX2 hardware intrinsic node
1226 // node - The hardware intrinsic node
1228 void CodeGen::genAVX2Intrinsic(GenTreeHWIntrinsic* node)
1230 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1231 var_types baseType = node->gtSIMDBaseType;
1232 instruction ins = INS_invalid;
1234 genConsumeOperands(node);
1236 switch (intrinsicID)
1243 genProduceReg(node);
1246 //------------------------------------------------------------------------
1247 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
1250 // node - The hardware intrinsic node
1252 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
1254 NYI("Implement AES intrinsic code generation");
1257 //------------------------------------------------------------------------
1258 // genBMI1Intrinsic: Generates the code for a BMI1 hardware intrinsic node
1261 // node - The hardware intrinsic node
1263 void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
1265 NYI("Implement BMI1 intrinsic code generation");
1268 //------------------------------------------------------------------------
1269 // genBMI2Intrinsic: Generates the code for a BMI2 hardware intrinsic node
1272 // node - The hardware intrinsic node
1274 void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
1276 NYI("Implement BMI2 intrinsic code generation");
1279 //------------------------------------------------------------------------
1280 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
1283 // node - The hardware intrinsic node
1285 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
1287 NYI("Implement FMA intrinsic code generation");
1290 //------------------------------------------------------------------------
1291 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
1294 // node - The hardware intrinsic node
1296 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
1298 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1299 GenTree* op1 = node->gtGetOp1();
1300 regNumber targetReg = node->gtRegNum;
1301 assert(targetReg != REG_NA);
1302 var_types targetType = node->TypeGet();
1303 regNumber op1Reg = op1->gtRegNum;
1304 genConsumeOperands(node);
1306 assert(intrinsicID == NI_LZCNT_LeadingZeroCount);
1308 inst_RV_RV(INS_lzcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1310 genProduceReg(node);
1313 //------------------------------------------------------------------------
1314 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
1317 // node - The hardware intrinsic node
1319 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
1321 NYI("Implement PCLMULQDQ intrinsic code generation");
1324 //------------------------------------------------------------------------
1325 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
1328 // node - The hardware intrinsic node
1330 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
1332 NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1333 GenTree* op1 = node->gtGetOp1();
1334 regNumber targetReg = node->gtRegNum;
1335 assert(targetReg != REG_NA);
1336 var_types targetType = node->TypeGet();
1337 regNumber op1Reg = op1->gtRegNum;
1338 genConsumeOperands(node);
1340 assert(intrinsicID == NI_POPCNT_PopCount);
1342 inst_RV_RV(INS_popcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1344 genProduceReg(node);
1347 #endif // FEATURE_HW_INTRINSICS