2c6a184b3a0eba45875052ffdc1c1524e5ac892e
[platform/upstream/coreclr.git] / src / jit / hwintrinsiccodegenxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX               Intel hardware intrinsic Code Generator                     XX
9 XX                                                                           XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12 */
13 #include "jitpch.h"
14 #ifdef _MSC_VER
15 #pragma hdrstop
16 #endif
17
18 #ifdef FEATURE_HW_INTRINSICS
19
20 #include "emit.h"
21 #include "codegen.h"
22 #include "sideeffects.h"
23 #include "lower.h"
24 #include "gcinfo.h"
25 #include "gcinfoencoder.h"
26
27 //------------------------------------------------------------------------
28 // genIsTableDrivenHWIntrinsic:
29 //
30 // Arguments:
31 //    category - category of a HW intrinsic
32 //
33 // Return Value:
34 //    returns true if this category can be table-driven in CodeGen
35 //
36 static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
37 {
38     // TODO - make more categories to the table-driven framework
39     // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen
40     const bool tableDrivenCategory =
41         category != HW_Category_Special && category != HW_Category_Scalar && category != HW_Category_Helper;
42     const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0;
43     return tableDrivenCategory && tableDrivenFlag;
44 }
45
46 //------------------------------------------------------------------------
47 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
48 //
49 // Arguments:
50 //    node - The hardware intrinsic node
51 //
52 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
53 {
54     NamedIntrinsic      intrinsicID = node->gtHWIntrinsicId;
55     InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
56     HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
57     HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
58     int                 ival        = Compiler::ivalOfHWIntrinsic(intrinsicID);
59     int                 numArgs     = Compiler::numArgsOfHWIntrinsic(intrinsicID);
60
61     assert((flags & HW_Flag_NoCodeGen) == 0);
62
63     if (genIsTableDrivenHWIntrinsic(category, flags))
64     {
65         GenTree*  op1        = node->gtGetOp1();
66         GenTree*  op2        = node->gtGetOp2();
67         regNumber targetReg  = node->gtRegNum;
68         var_types targetType = node->TypeGet();
69         var_types baseType   = node->gtSIMDBaseType;
70
71         regNumber op1Reg = REG_NA;
72         regNumber op2Reg = REG_NA;
73         emitter*  emit   = getEmitter();
74
75         assert(numArgs >= 0);
76         instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
77         assert(ins != INS_invalid);
78         emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
79         assert(simdSize != 0);
80
81         switch (numArgs)
82         {
83             case 1:
84                 genConsumeOperands(node);
85                 op1Reg = op1->gtRegNum;
86                 if (category == HW_Category_MemoryLoad)
87                 {
88                     emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
89                 }
90                 else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
91                 {
92                     emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
93                 }
94                 else if ((ival != -1) && varTypeIsFloating(baseType))
95                 {
96                     emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
97                 }
98                 else
99                 {
100                     emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
101                 }
102                 break;
103
104             case 2:
105                 genConsumeOperands(node);
106                 op1Reg = op1->gtRegNum;
107                 op2Reg = op2->gtRegNum;
108                 if (category == HW_Category_MemoryStore)
109                 {
110                     emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
111                 }
112                 else if ((ival != -1) && varTypeIsFloating(baseType))
113                 {
114                     genHWIntrinsic_R_R_RM_I(node, ins);
115                 }
116                 else if (category == HW_Category_MemoryLoad)
117                 {
118                     emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
119                 }
120                 else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
121                 {
122                     auto emitSwCase = [&](unsigned i) {
123                         emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i);
124                     };
125
126                     if (op2->IsCnsIntOrI())
127                     {
128                         ssize_t ival = op2->AsIntCon()->IconValue();
129                         emitSwCase((unsigned)ival);
130                     }
131                     else
132                     {
133                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
134                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
135                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
136                         regNumber baseReg = node->ExtractTempReg();
137                         regNumber offsReg = node->GetSingleTempReg();
138                         genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase);
139                     }
140                 }
141                 else
142                 {
143                     genHWIntrinsic_R_R_RM(node, ins);
144                 }
145                 break;
146             case 3:
147             {
148                 assert(op1->OperIsList());
149                 assert(op1->gtGetOp2()->OperIsList());
150                 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
151
152                 GenTreeArgList* argList = op1->AsArgList();
153                 op1                     = argList->Current();
154                 genConsumeRegs(op1);
155                 op1Reg = op1->gtRegNum;
156
157                 argList = argList->Rest();
158                 op2     = argList->Current();
159                 genConsumeRegs(op2);
160                 op2Reg = op2->gtRegNum;
161
162                 argList      = argList->Rest();
163                 GenTree* op3 = argList->Current();
164                 genConsumeRegs(op3);
165                 regNumber op3Reg = op3->gtRegNum;
166
167                 if (Compiler::isImmHWIntrinsic(intrinsicID, op3))
168                 {
169                     auto emitSwCase = [&](unsigned i) {
170                         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i);
171                     };
172                     if (op3->IsCnsIntOrI())
173                     {
174                         ssize_t ival = op3->AsIntCon()->IconValue();
175                         emitSwCase((unsigned)ival);
176                     }
177                     else
178                     {
179                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
180                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
181                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
182                         regNumber baseReg = node->ExtractTempReg();
183                         regNumber offsReg = node->GetSingleTempReg();
184                         genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
185                     }
186                 }
187                 else
188                 {
189                     emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
190                 }
191                 break;
192             }
193
194             default:
195                 unreached();
196                 break;
197         }
198         genProduceReg(node);
199         return;
200     }
201
202     switch (isa)
203     {
204         case InstructionSet_SSE:
205             genSSEIntrinsic(node);
206             break;
207         case InstructionSet_SSE2:
208             genSSE2Intrinsic(node);
209             break;
210         case InstructionSet_SSE41:
211             genSSE41Intrinsic(node);
212             break;
213         case InstructionSet_SSE42:
214             genSSE42Intrinsic(node);
215             break;
216         case InstructionSet_AVX:
217             genAVXIntrinsic(node);
218             break;
219         case InstructionSet_AVX2:
220             genAVX2Intrinsic(node);
221             break;
222         case InstructionSet_AES:
223             genAESIntrinsic(node);
224             break;
225         case InstructionSet_BMI1:
226             genBMI1Intrinsic(node);
227             break;
228         case InstructionSet_BMI2:
229             genBMI2Intrinsic(node);
230             break;
231         case InstructionSet_FMA:
232             genFMAIntrinsic(node);
233             break;
234         case InstructionSet_LZCNT:
235             genLZCNTIntrinsic(node);
236             break;
237         case InstructionSet_PCLMULQDQ:
238             genPCLMULQDQIntrinsic(node);
239             break;
240         case InstructionSet_POPCNT:
241             genPOPCNTIntrinsic(node);
242             break;
243         default:
244             unreached();
245             break;
246     }
247 }
248
249 //------------------------------------------------------------------------
250 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
251 //                        register/memory operand, and that returns a value in register
252 //
253 // Arguments:
254 //    node - The hardware intrinsic node
255 //    ins  - The instruction being generated
256 //
257 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
258 {
259     var_types targetType = node->TypeGet();
260     regNumber targetReg  = node->gtRegNum;
261     GenTree*  op1        = node->gtGetOp1();
262     GenTree*  op2        = node->gtGetOp2();
263     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
264     emitter*  emit       = getEmitter();
265
266     // TODO-XArch-CQ: Commutative operations can have op1 be contained
267     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
268
269     regNumber op1Reg = op1->gtRegNum;
270
271     assert(targetReg != REG_NA);
272     assert(op1Reg != REG_NA);
273
274     if (op2->isContained() || op2->isUsedFromSpillTemp())
275     {
276         assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
277         assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
278
279         TempDsc* tmpDsc = nullptr;
280         unsigned varNum = BAD_VAR_NUM;
281         unsigned offset = (unsigned)-1;
282
283         if (op2->isUsedFromSpillTemp())
284         {
285             assert(op2->IsRegOptional());
286
287             tmpDsc = getSpillTempDsc(op2);
288             varNum = tmpDsc->tdTempNum();
289             offset = 0;
290
291             compiler->tmpRlsTemp(tmpDsc);
292         }
293         else if (op2->OperIsHWIntrinsic())
294         {
295             emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
296             return;
297         }
298         else if (op2->isIndir())
299         {
300             GenTreeIndir* memIndir = op2->AsIndir();
301             GenTree*      memBase  = memIndir->gtOp1;
302
303             switch (memBase->OperGet())
304             {
305                 case GT_LCL_VAR_ADDR:
306                 {
307                     varNum = memBase->AsLclVarCommon()->GetLclNum();
308                     offset = 0;
309
310                     // Ensure that all the GenTreeIndir values are set to their defaults.
311                     assert(!memIndir->HasIndex());
312                     assert(memIndir->Scale() == 1);
313                     assert(memIndir->Offset() == 0);
314
315                     break;
316                 }
317
318                 case GT_CLS_VAR_ADDR:
319                 {
320                     emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
321                     return;
322                 }
323
324                 default:
325                 {
326                     emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir);
327                     return;
328                 }
329             }
330         }
331         else
332         {
333             switch (op2->OperGet())
334             {
335                 case GT_LCL_FLD:
336                 {
337                     GenTreeLclFld* lclField = op2->AsLclFld();
338
339                     varNum = lclField->GetLclNum();
340                     offset = lclField->gtLclFld.gtLclOffs;
341                     break;
342                 }
343
344                 case GT_LCL_VAR:
345                 {
346                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
347                     varNum = op2->AsLclVar()->GetLclNum();
348                     offset = 0;
349                     break;
350                 }
351
352                 default:
353                     unreached();
354                     break;
355             }
356         }
357
358         // Ensure we got a good varNum and offset.
359         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
360         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
361         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
362         assert(offset != (unsigned)-1);
363
364         emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset);
365     }
366     else
367     {
368         emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum);
369     }
370 }
371
372 //------------------------------------------------------------------------
373 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
374 //                        register/memory operand, an immediate operand, and that returns a value in register
375 //
376 // Arguments:
377 //    node - The hardware intrinsic node
378 //    ins  - The instruction being generated
379 //
380 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
381 {
382     var_types targetType = node->TypeGet();
383     regNumber targetReg  = node->gtRegNum;
384     GenTree*  op1        = node->gtGetOp1();
385     GenTree*  op2        = node->gtGetOp2();
386     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
387     int       ival       = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId);
388     emitter*  emit       = getEmitter();
389
390     // TODO-XArch-CQ: Commutative operations can have op1 be contained
391     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
392
393     regNumber op1Reg = op1->gtRegNum;
394
395     assert(targetReg != REG_NA);
396     assert(op1Reg != REG_NA);
397
398     if (op2->isContained() || op2->isUsedFromSpillTemp())
399     {
400         assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
401         assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
402
403         TempDsc* tmpDsc = nullptr;
404         unsigned varNum = BAD_VAR_NUM;
405         unsigned offset = (unsigned)-1;
406
407         if (op2->isUsedFromSpillTemp())
408         {
409             assert(op2->IsRegOptional());
410
411             tmpDsc = getSpillTempDsc(op2);
412             varNum = tmpDsc->tdTempNum();
413             offset = 0;
414
415             compiler->tmpRlsTemp(tmpDsc);
416         }
417         else if (op2->OperIsHWIntrinsic())
418         {
419             emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
420             return;
421         }
422         else if (op2->isIndir())
423         {
424             GenTreeIndir* memIndir = op2->AsIndir();
425             GenTree*      memBase  = memIndir->gtOp1;
426
427             switch (memBase->OperGet())
428             {
429                 case GT_LCL_VAR_ADDR:
430                 {
431                     varNum = memBase->AsLclVarCommon()->GetLclNum();
432                     offset = 0;
433
434                     // Ensure that all the GenTreeIndir values are set to their defaults.
435                     assert(!memIndir->HasIndex());
436                     assert(memIndir->Scale() == 1);
437                     assert(memIndir->Offset() == 0);
438
439                     break;
440                 }
441
442                 case GT_CLS_VAR_ADDR:
443                 {
444                     emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
445                                                ival);
446                     return;
447                 }
448
449                 default:
450                 {
451                     emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
452                     return;
453                 }
454             }
455         }
456         else
457         {
458             switch (op2->OperGet())
459             {
460                 case GT_LCL_FLD:
461                 {
462                     GenTreeLclFld* lclField = op2->AsLclFld();
463
464                     varNum = lclField->GetLclNum();
465                     offset = lclField->gtLclFld.gtLclOffs;
466                     break;
467                 }
468
469                 case GT_LCL_VAR:
470                 {
471                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
472                     varNum = op2->AsLclVar()->GetLclNum();
473                     offset = 0;
474                     break;
475                 }
476
477                 default:
478                     unreached();
479                     break;
480             }
481         }
482
483         // Ensure we got a good varNum and offset.
484         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
485         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
486         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
487         assert(offset != (unsigned)-1);
488
489         emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
490     }
491     else
492     {
493         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2->gtRegNum, ival);
494     }
495 }
496
497 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
498 //                       with non-constant argument
499 //
500 // Arguments:
501 //    intrinsic      - intrinsic ID
502 //    nonConstImmReg - the register contains non-constant imm8 argument
503 //    baseReg        - a register for the start of the switch table
504 //    offsReg        - a register for the offset into the switch table
505 //    emitSwCase     - the lambda to generate siwtch-case
506 //
507 // Return Value:
508 //    generate the jump-table fallback for imm-intrinsics with non-constant argument.
509 // Note:
510 //    This function can be used for all imm-intrinsics (whether full-range or not),
511 //    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
512 //    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
513 //
514 template <typename HWIntrinsicSwitchCaseBody>
515 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
516                                               regNumber                 nonConstImmReg,
517                                               regNumber                 baseReg,
518                                               regNumber                 offsReg,
519                                               HWIntrinsicSwitchCaseBody emitSwCase)
520 {
521     assert(nonConstImmReg != REG_NA);
522     emitter* emit = getEmitter();
523
524     const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1;
525     assert(maxByte <= 256);
526     BasicBlock* jmpTable[256];
527
528     unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
529     unsigned jmpTableOffs = 0;
530
531     // Emit the jump table
532     for (unsigned i = 0; i < maxByte; i++)
533     {
534         jmpTable[i] = genCreateTempLabel();
535         emit->emitDataGenData(i, jmpTable[i]);
536     }
537
538     emit->emitDataGenEnd();
539
540     // Compute and jump to the appropriate offset in the switch table
541     emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
542
543     emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
544     emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
545     emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
546     emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
547
548     // Emit the switch table entries
549
550     BasicBlock* switchTableBeg = genCreateTempLabel();
551     BasicBlock* switchTableEnd = genCreateTempLabel();
552
553     genDefineTempLabel(switchTableBeg);
554
555     for (unsigned i = 0; i < maxByte; i++)
556     {
557         genDefineTempLabel(jmpTable[i]);
558         emitSwCase(i);
559         emit->emitIns_J(INS_jmp, switchTableEnd);
560     }
561
562     genDefineTempLabel(switchTableEnd);
563 }
564
565 //------------------------------------------------------------------------
566 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
567 //
568 // Arguments:
569 //    node - The hardware intrinsic node
570 //
571 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
572 {
573     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
574     GenTree*       op1         = node->gtGetOp1();
575     GenTree*       op2         = node->gtGetOp2();
576     GenTree*       op3         = nullptr;
577     GenTree*       op4         = nullptr;
578     regNumber      targetReg   = node->gtRegNum;
579     var_types      targetType  = node->TypeGet();
580     var_types      baseType    = node->gtSIMDBaseType;
581
582     regNumber op1Reg = REG_NA;
583     regNumber op2Reg = REG_NA;
584     regNumber op3Reg = REG_NA;
585     regNumber op4Reg = REG_NA;
586     emitter*  emit   = getEmitter();
587
588     if ((op1 != nullptr) && !op1->OperIsList())
589     {
590         op1Reg = op1->gtRegNum;
591         genConsumeOperands(node);
592     }
593
594     switch (intrinsicID)
595     {
596         case NI_SSE_ConvertScalarToVector128Single:
597         {
598             assert(node->TypeGet() == TYP_SIMD16);
599             assert(node->gtSIMDBaseType == TYP_FLOAT);
600             assert(Compiler::ivalOfHWIntrinsic(intrinsicID) == -1);
601
602             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
603             genHWIntrinsic_R_R_RM(node, ins);
604             break;
605         }
606
607         case NI_SSE_CompareEqualOrderedScalar:
608         case NI_SSE_CompareEqualUnorderedScalar:
609         {
610             assert(baseType == TYP_FLOAT);
611             op2Reg           = op2->gtRegNum;
612             regNumber tmpReg = node->GetSingleTempReg();
613
614             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
615             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
616             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
617             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
618             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
619             emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
620             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
621             break;
622         }
623
624         case NI_SSE_CompareGreaterThanOrderedScalar:
625         case NI_SSE_CompareGreaterThanUnorderedScalar:
626         {
627             assert(baseType == TYP_FLOAT);
628             op2Reg = op2->gtRegNum;
629
630             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
631             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
632             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
633             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
634             break;
635         }
636
637         case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
638         case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
639         {
640             assert(baseType == TYP_FLOAT);
641             op2Reg = op2->gtRegNum;
642
643             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
644             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
645             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
646             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
647             break;
648         }
649
650         case NI_SSE_CompareLessThanOrderedScalar:
651         case NI_SSE_CompareLessThanUnorderedScalar:
652         {
653             assert(baseType == TYP_FLOAT);
654             op2Reg = op2->gtRegNum;
655
656             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
657             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
658             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
659             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
660             break;
661         }
662
663         case NI_SSE_CompareLessThanOrEqualOrderedScalar:
664         case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
665         {
666             assert(baseType == TYP_FLOAT);
667             op2Reg = op2->gtRegNum;
668
669             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
670             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
671             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
672             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
673             break;
674         }
675
676         case NI_SSE_CompareNotEqualOrderedScalar:
677         case NI_SSE_CompareNotEqualUnorderedScalar:
678         {
679             assert(baseType == TYP_FLOAT);
680             op2Reg = op2->gtRegNum;
681
682             regNumber tmpReg = node->GetSingleTempReg();
683
684             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
685             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
686             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
687             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
688             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
689             emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
690             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
691             break;
692         }
693
694         case NI_SSE_ConvertToSingle:
695         case NI_SSE_StaticCast:
696         {
697             assert(op2 == nullptr);
698             if (op1Reg != targetReg)
699             {
700                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
701                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
702             }
703             break;
704         }
705
706         case NI_SSE_MoveMask:
707         {
708             assert(baseType == TYP_FLOAT);
709             assert(op2 == nullptr);
710
711             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
712             emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
713             break;
714         }
715
716         case NI_SSE_Prefetch0:
717         case NI_SSE_Prefetch1:
718         case NI_SSE_Prefetch2:
719         case NI_SSE_PrefetchNonTemporal:
720         {
721             assert(baseType == TYP_UBYTE);
722             assert(op2 == nullptr);
723
724             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
725             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
726             break;
727         }
728
729         case NI_SSE_ReciprocalScalar:
730         case NI_SSE_ReciprocalSqrtScalar:
731         case NI_SSE_SqrtScalar:
732         {
733             assert(baseType == TYP_FLOAT);
734             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
735
736             if (op2 == nullptr)
737             {
738                 emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg);
739             }
740             else
741             {
742                 genHWIntrinsic_R_R_RM(node, ins);
743             }
744             break;
745         }
746
747         case NI_SSE_SetScalarVector128:
748         {
749             assert(baseType == TYP_FLOAT);
750             assert(op2 == nullptr);
751
752             if (op1Reg == targetReg)
753             {
754                 regNumber tmpReg = node->GetSingleTempReg();
755                 emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
756                 op1Reg = tmpReg;
757             }
758
759             emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
760             emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
761             break;
762         }
763
764         case NI_SSE_SetZeroVector128:
765         {
766             assert(baseType == TYP_FLOAT);
767             assert(op1 == nullptr);
768             assert(op2 == nullptr);
769             emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
770             break;
771         }
772
773         case NI_SSE_StoreFence:
774         {
775             assert(baseType == TYP_VOID);
776             assert(op1 == nullptr);
777             assert(op2 == nullptr);
778             emit->emitIns(INS_sfence);
779             break;
780         }
781
782         default:
783             unreached();
784             break;
785     }
786
787     genProduceReg(node);
788 }
789
790 //------------------------------------------------------------------------
791 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
792 //
793 // Arguments:
794 //    node - The hardware intrinsic node
795 //
796 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
797 {
798     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
799     GenTree*       op1         = node->gtGetOp1();
800     GenTree*       op2         = node->gtGetOp2();
801     regNumber      targetReg   = node->gtRegNum;
802     var_types      targetType  = node->TypeGet();
803     var_types      baseType    = node->gtSIMDBaseType;
804     regNumber      op1Reg      = REG_NA;
805     regNumber      op2Reg      = REG_NA;
806     emitter*       emit        = getEmitter();
807     int            ival        = -1;
808
809     if ((op1 != nullptr) && !op1->OperIsList())
810     {
811         op1Reg = op1->gtRegNum;
812         genConsumeOperands(node);
813     }
814
815     switch (intrinsicID)
816     {
817         // All integer overloads are handled by table codegen
818         case NI_SSE2_CompareLessThan:
819         {
820             assert(op1 != nullptr);
821             assert(op2 != nullptr);
822             assert(baseType == TYP_DOUBLE);
823             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
824             op2Reg          = op2->gtRegNum;
825             ival            = Compiler::ivalOfHWIntrinsic(intrinsicID);
826             assert(ival != -1);
827             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
828
829             break;
830         }
831
832         case NI_SSE2_CompareEqualOrderedScalar:
833         case NI_SSE2_CompareEqualUnorderedScalar:
834         {
835             assert(baseType == TYP_DOUBLE);
836             op2Reg             = op2->gtRegNum;
837             regNumber   tmpReg = node->GetSingleTempReg();
838             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
839
840             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
841             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
842             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
843             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
844             emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
845             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
846             break;
847         }
848
849         case NI_SSE2_CompareGreaterThanOrderedScalar:
850         case NI_SSE2_CompareGreaterThanUnorderedScalar:
851         {
852             assert(baseType == TYP_DOUBLE);
853             op2Reg          = op2->gtRegNum;
854             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
855
856             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
857             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
858             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
859             break;
860         }
861
862         case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
863         case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
864         {
865             assert(baseType == TYP_DOUBLE);
866             op2Reg          = op2->gtRegNum;
867             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
868
869             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
870             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
871             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
872             break;
873         }
874
875         case NI_SSE2_CompareLessThanOrderedScalar:
876         case NI_SSE2_CompareLessThanUnorderedScalar:
877         {
878             assert(baseType == TYP_DOUBLE);
879             op2Reg          = op2->gtRegNum;
880             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
881
882             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
883             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
884             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
885             break;
886         }
887
888         case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
889         case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
890         {
891             assert(baseType == TYP_DOUBLE);
892             op2Reg          = op2->gtRegNum;
893             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
894
895             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
896             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
897             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
898             break;
899         }
900
901         case NI_SSE2_CompareNotEqualOrderedScalar:
902         case NI_SSE2_CompareNotEqualUnorderedScalar:
903         {
904             assert(baseType == TYP_DOUBLE);
905             op2Reg             = op2->gtRegNum;
906             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
907             regNumber   tmpReg = node->GetSingleTempReg();
908
909             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
910             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
911             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
912             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
913             emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
914             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
915             break;
916         }
917
918         case NI_SSE2_ConvertScalarToVector128Double:
919         case NI_SSE2_ConvertScalarToVector128Single:
920         {
921             assert(baseType == TYP_INT || baseType == TYP_LONG || baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
922             assert(op1 != nullptr);
923             assert(op2 != nullptr);
924             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
925             genHWIntrinsic_R_R_RM(node, ins);
926             break;
927         }
928
929         case NI_SSE2_ConvertScalarToVector128Int64:
930         case NI_SSE2_ConvertScalarToVector128UInt64:
931         {
932             assert(baseType == TYP_LONG || baseType == TYP_ULONG);
933             assert(op1 != nullptr);
934             assert(op2 == nullptr);
935             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
936             // TODO-XArch-CQ -> use of type size of TYP_SIMD16 leads to
937             // instruction register encoding errors for SSE legacy encoding
938             emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
939             break;
940         }
941
942         case NI_SSE2_ConvertToDouble:
943         {
944             assert(op2 == nullptr);
945             if (op1Reg != targetReg)
946             {
947                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
948                 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
949             }
950             break;
951         }
952
953         case NI_SSE2_ConvertToInt32:
954         case NI_SSE2_ConvertToInt64:
955         case NI_SSE2_ConvertToUInt32:
956         case NI_SSE2_ConvertToUInt64:
957         {
958             assert(op2 == nullptr);
959             assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT || baseType == TYP_INT || baseType == TYP_UINT ||
960                    baseType == TYP_LONG || baseType == TYP_ULONG);
961             if (op1Reg != targetReg)
962             {
963                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
964                 if (baseType == TYP_DOUBLE || baseType == TYP_FLOAT)
965                 {
966                     emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
967                 }
968                 else
969                 {
970                     // TODO-XArch-Bug https://github.com/dotnet/coreclr/issues/16329
971                     // using hardcoded instruction as workaround for inexact type conversions
972                     emit->emitIns_R_R(INS_mov_xmm2i, emitActualTypeSize(baseType), op1Reg, targetReg);
973                 }
974             }
975             break;
976         }
977
978         case NI_SSE2_LoadFence:
979         {
980             assert(baseType == TYP_VOID);
981             assert(op1 == nullptr);
982             assert(op2 == nullptr);
983             emit->emitIns(INS_lfence);
984             break;
985         }
986
987         case NI_SSE2_MemoryFence:
988         {
989             assert(baseType == TYP_VOID);
990             assert(op1 == nullptr);
991             assert(op2 == nullptr);
992             emit->emitIns(INS_mfence);
993             break;
994         }
995
996         case NI_SSE2_MoveMask:
997         {
998             assert(op2 == nullptr);
999             assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);
1000
1001             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1002             emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
1003             break;
1004         }
1005
1006         case NI_SSE2_SetZeroVector128:
1007         {
1008             assert(baseType != TYP_FLOAT);
1009             assert(baseType >= TYP_BYTE && baseType <= TYP_DOUBLE);
1010             assert(op1 == nullptr);
1011             assert(op2 == nullptr);
1012
1013             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1014             emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1015             break;
1016         }
1017
1018         default:
1019             unreached();
1020             break;
1021     }
1022
1023     genProduceReg(node);
1024 }
1025
1026 //------------------------------------------------------------------------
1027 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1028 //
1029 // Arguments:
1030 //    node - The hardware intrinsic node
1031 //
1032 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1033 {
1034     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1035     GenTree*       op1         = node->gtGetOp1();
1036     GenTree*       op2         = node->gtGetOp2();
1037     GenTree*       op3         = nullptr;
1038     GenTree*       op4         = nullptr;
1039     regNumber      targetReg   = node->gtRegNum;
1040     var_types      targetType  = node->TypeGet();
1041     var_types      baseType    = node->gtSIMDBaseType;
1042
1043     regNumber op1Reg = REG_NA;
1044     regNumber op2Reg = REG_NA;
1045     regNumber op3Reg = REG_NA;
1046     regNumber op4Reg = REG_NA;
1047     emitter*  emit   = getEmitter();
1048
1049     if ((op1 != nullptr) && !op1->OperIsList())
1050     {
1051         op1Reg = op1->gtRegNum;
1052         genConsumeOperands(node);
1053     }
1054
1055     switch (intrinsicID)
1056     {
1057         case NI_SSE41_CeilingScalar:
1058         case NI_SSE41_FloorScalar:
1059         case NI_SSE41_RoundCurrentDirectionScalar:
1060         case NI_SSE41_RoundToNearestIntegerScalar:
1061         case NI_SSE41_RoundToNegativeInfinityScalar:
1062         case NI_SSE41_RoundToPositiveInfinityScalar:
1063         case NI_SSE41_RoundToZeroScalar:
1064         {
1065             assert((baseType == TYP_FLOAT) || (baseType == TYP_DOUBLE));
1066             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
1067
1068             if (op2 == nullptr)
1069             {
1070                 int ival = Compiler::ivalOfHWIntrinsic(intrinsicID);
1071                 emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op1Reg, ival);
1072             }
1073             else
1074             {
1075                 genHWIntrinsic_R_R_RM_I(node, ins);
1076             }
1077             break;
1078         }
1079
1080         case NI_SSE41_TestAllOnes:
1081         {
1082             regNumber tmpReg = node->GetSingleTempReg();
1083             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1084             emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1085             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1086             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1087             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1088             break;
1089         }
1090
1091         case NI_SSE41_TestAllZeros:
1092         case NI_SSE41_TestZ:
1093         {
1094             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1095             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1096             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1097             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1098             break;
1099         }
1100
1101         case NI_SSE41_TestC:
1102         {
1103             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1104             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1105             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1106             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1107             break;
1108         }
1109
1110         case NI_SSE41_TestMixOnesZeros:
1111         case NI_SSE41_TestNotZAndNotC:
1112         {
1113             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1114             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1115             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1116             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1117             break;
1118         }
1119
1120         default:
1121             unreached();
1122             break;
1123     }
1124
1125     genProduceReg(node);
1126 }
1127
1128 //------------------------------------------------------------------------
1129 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1130 //
1131 // Arguments:
1132 //    node - The hardware intrinsic node
1133 //
1134 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1135 {
1136     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1137     GenTree*       op1         = node->gtGetOp1();
1138     GenTree*       op2         = node->gtGetOp2();
1139     regNumber      targetReg   = node->gtRegNum;
1140     assert(targetReg != REG_NA);
1141     var_types targetType = node->TypeGet();
1142     var_types baseType   = node->gtSIMDBaseType;
1143
1144     regNumber op1Reg = op1->gtRegNum;
1145     regNumber op2Reg = op2->gtRegNum;
1146     genConsumeOperands(node);
1147
1148     switch (intrinsicID)
1149     {
1150         case NI_SSE42_Crc32:
1151             if (op1Reg != targetReg)
1152             {
1153                 inst_RV_RV(INS_mov, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1154             }
1155
1156             if (baseType == TYP_UBYTE || baseType == TYP_USHORT) // baseType is the type of the second argument
1157             {
1158                 assert(targetType == TYP_INT);
1159                 inst_RV_RV(INS_crc32, targetReg, op2Reg, baseType, emitTypeSize(baseType));
1160             }
1161             else
1162             {
1163                 assert(op1->TypeGet() == op2->TypeGet());
1164                 assert(targetType == TYP_INT || targetType == TYP_LONG);
1165                 inst_RV_RV(INS_crc32, targetReg, op2Reg, targetType, emitTypeSize(targetType));
1166             }
1167
1168             break;
1169         default:
1170             unreached();
1171             break;
1172     }
1173     genProduceReg(node);
1174 }
1175
1176 //------------------------------------------------------------------------
1177 // genAVXIntrinsic: Generates the code for an AVX hardware intrinsic node
1178 //
1179 // Arguments:
1180 //    node - The hardware intrinsic node
1181 //
1182 void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node)
1183 {
1184     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1185     var_types      baseType    = node->gtSIMDBaseType;
1186     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
1187     var_types      targetType  = node->TypeGet();
1188     instruction    ins         = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1189     GenTree*       op1         = node->gtGetOp1();
1190     GenTree*       op2         = node->gtGetOp2();
1191     regNumber      targetReg   = node->gtRegNum;
1192     emitter*       emit        = getEmitter();
1193
1194     genConsumeOperands(node);
1195
1196     switch (intrinsicID)
1197     {
1198         case NI_AVX_SetZeroVector256:
1199         {
1200             assert(op1 == nullptr);
1201             assert(op2 == nullptr);
1202             // SetZeroVector256 will generate pxor with integral base-typ, but pxor is a AVX2 instruction, so we
1203             // generate xorps on AVX machines.
1204             if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType))
1205             {
1206                 emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
1207             }
1208             else
1209             {
1210                 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1211             }
1212             break;
1213         }
1214         default:
1215             unreached();
1216             break;
1217     }
1218
1219     genProduceReg(node);
1220 }
1221
1222 //------------------------------------------------------------------------
1223 // genAVX2Intrinsic: Generates the code for an AVX2 hardware intrinsic node
1224 //
1225 // Arguments:
1226 //    node - The hardware intrinsic node
1227 //
1228 void CodeGen::genAVX2Intrinsic(GenTreeHWIntrinsic* node)
1229 {
1230     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1231     var_types      baseType    = node->gtSIMDBaseType;
1232     instruction    ins         = INS_invalid;
1233
1234     genConsumeOperands(node);
1235
1236     switch (intrinsicID)
1237     {
1238         default:
1239             unreached();
1240             break;
1241     }
1242
1243     genProduceReg(node);
1244 }
1245
1246 //------------------------------------------------------------------------
1247 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
1248 //
1249 // Arguments:
1250 //    node - The hardware intrinsic node
1251 //
1252 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
1253 {
1254     NYI("Implement AES intrinsic code generation");
1255 }
1256
1257 //------------------------------------------------------------------------
1258 // genBMI1Intrinsic: Generates the code for a BMI1 hardware intrinsic node
1259 //
1260 // Arguments:
1261 //    node - The hardware intrinsic node
1262 //
1263 void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
1264 {
1265     NYI("Implement BMI1 intrinsic code generation");
1266 }
1267
1268 //------------------------------------------------------------------------
1269 // genBMI2Intrinsic: Generates the code for a BMI2 hardware intrinsic node
1270 //
1271 // Arguments:
1272 //    node - The hardware intrinsic node
1273 //
1274 void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
1275 {
1276     NYI("Implement BMI2 intrinsic code generation");
1277 }
1278
1279 //------------------------------------------------------------------------
1280 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
1281 //
1282 // Arguments:
1283 //    node - The hardware intrinsic node
1284 //
1285 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
1286 {
1287     NYI("Implement FMA intrinsic code generation");
1288 }
1289
1290 //------------------------------------------------------------------------
1291 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
1292 //
1293 // Arguments:
1294 //    node - The hardware intrinsic node
1295 //
1296 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
1297 {
1298     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1299     GenTree*       op1         = node->gtGetOp1();
1300     regNumber      targetReg   = node->gtRegNum;
1301     assert(targetReg != REG_NA);
1302     var_types targetType = node->TypeGet();
1303     regNumber op1Reg     = op1->gtRegNum;
1304     genConsumeOperands(node);
1305
1306     assert(intrinsicID == NI_LZCNT_LeadingZeroCount);
1307
1308     inst_RV_RV(INS_lzcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1309
1310     genProduceReg(node);
1311 }
1312
1313 //------------------------------------------------------------------------
1314 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
1315 //
1316 // Arguments:
1317 //    node - The hardware intrinsic node
1318 //
1319 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
1320 {
1321     NYI("Implement PCLMULQDQ intrinsic code generation");
1322 }
1323
1324 //------------------------------------------------------------------------
1325 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
1326 //
1327 // Arguments:
1328 //    node - The hardware intrinsic node
1329 //
1330 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
1331 {
1332     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1333     GenTree*       op1         = node->gtGetOp1();
1334     regNumber      targetReg   = node->gtRegNum;
1335     assert(targetReg != REG_NA);
1336     var_types targetType = node->TypeGet();
1337     regNumber op1Reg     = op1->gtRegNum;
1338     genConsumeOperands(node);
1339
1340     assert(intrinsicID == NI_POPCNT_PopCount);
1341
1342     inst_RV_RV(INS_popcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1343
1344     genProduceReg(node);
1345 }
1346
1347 #endif // FEATURE_HW_INTRINSICS