Merge pull request #16832 from dotnetrt/StoreNonTemporal
[platform/upstream/coreclr.git] / src / jit / hwintrinsiccodegenxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX               Intel hardware intrinsic Code Generator                     XX
9 XX                                                                           XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12 */
13 #include "jitpch.h"
14 #ifdef _MSC_VER
15 #pragma hdrstop
16 #endif
17
18 #ifdef FEATURE_HW_INTRINSICS
19
20 #include "emit.h"
21 #include "codegen.h"
22 #include "sideeffects.h"
23 #include "lower.h"
24 #include "gcinfo.h"
25 #include "gcinfoencoder.h"
26
27 //------------------------------------------------------------------------
28 // genIsTableDrivenHWIntrinsic:
29 //
30 // Arguments:
31 //    category - category of a HW intrinsic
32 //
33 // Return Value:
34 //    returns true if this category can be table-driven in CodeGen
35 //
36 static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
37 {
38     // TODO - make more categories to the table-driven framework
39     // HW_Category_Helper and HW_Flag_MultiIns/HW_Flag_SpecialCodeGen usually need manual codegen
40     const bool tableDrivenCategory =
41         category != HW_Category_Special && category != HW_Category_Scalar && category != HW_Category_Helper;
42     const bool tableDrivenFlag = (flags & (HW_Flag_MultiIns | HW_Flag_SpecialCodeGen)) == 0;
43     return tableDrivenCategory && tableDrivenFlag;
44 }
45
46 //------------------------------------------------------------------------
47 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
48 //
49 // Arguments:
50 //    node - The hardware intrinsic node
51 //
52 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
53 {
54     NamedIntrinsic      intrinsicID = node->gtHWIntrinsicId;
55     InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
56     HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
57     HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
58     int                 ival        = Compiler::ivalOfHWIntrinsic(intrinsicID);
59     int                 numArgs     = Compiler::numArgsOfHWIntrinsic(node);
60
61     assert((flags & HW_Flag_NoCodeGen) == 0);
62
63     if (genIsTableDrivenHWIntrinsic(category, flags))
64     {
65         GenTree*  op1        = node->gtGetOp1();
66         GenTree*  op2        = node->gtGetOp2();
67         regNumber targetReg  = node->gtRegNum;
68         var_types targetType = node->TypeGet();
69         var_types baseType   = node->gtSIMDBaseType;
70
71         regNumber op1Reg = REG_NA;
72         regNumber op2Reg = REG_NA;
73         emitter*  emit   = getEmitter();
74
75         assert(numArgs >= 0);
76         instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
77         assert(ins != INS_invalid);
78         emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
79         assert(simdSize != 0);
80
81         switch (numArgs)
82         {
83             case 1:
84             {
85                 genConsumeOperands(node);
86                 op1Reg = op1->gtRegNum;
87                 if (category == HW_Category_MemoryLoad)
88                 {
89                     emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
90                 }
91                 else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
92                 {
93                     emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
94                 }
95                 else if ((ival != -1) && varTypeIsFloating(baseType))
96                 {
97                     emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
98                 }
99                 else
100                 {
101                     emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
102                 }
103                 break;
104             }
105
106             case 2:
107             {
108                 genConsumeOperands(node);
109
110                 op1Reg = op1->gtRegNum;
111                 op2Reg = op2->gtRegNum;
112
113                 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
114                 {
115                     // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
116                     //
117                     // For non-commutative intrinsics, we should have ensured that op2 was marked
118                     // delay free in order to prevent it from getting assigned the same register
119                     // as target. However, for commutative intrinsics, we can just swap the operands
120                     // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
121
122                     noway_assert(node->OperIsCommutative());
123                     op2Reg = op1Reg;
124                     op1Reg = targetReg;
125                 }
126
127                 if (category == HW_Category_MemoryStore)
128                 {
129                     emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
130                 }
131                 else if ((ival != -1) && varTypeIsFloating(baseType))
132                 {
133                     genHWIntrinsic_R_R_RM_I(node, ins);
134                 }
135                 else if (category == HW_Category_MemoryLoad)
136                 {
137                     emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
138                 }
139                 else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
140                 {
141                     if (intrinsicID == NI_SSE2_Extract)
142                     {
143                         // extract instructions return to GP-registers, so it needs int size as the emitsize
144                         simdSize = emitTypeSize(TYP_INT);
145                     }
146                     auto emitSwCase = [&](unsigned i) {
147                         emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i);
148                     };
149
150                     if (op2->IsCnsIntOrI())
151                     {
152                         ssize_t ival = op2->AsIntCon()->IconValue();
153                         emitSwCase((unsigned)ival);
154                     }
155                     else
156                     {
157                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
158                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
159                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
160                         regNumber baseReg = node->ExtractTempReg();
161                         regNumber offsReg = node->GetSingleTempReg();
162                         genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase);
163                     }
164                 }
165                 else
166                 {
167                     genHWIntrinsic_R_R_RM(node, ins);
168                 }
169                 break;
170             }
171
172             case 3:
173             {
174                 assert(op1->OperIsList());
175                 assert(op1->gtGetOp2()->OperIsList());
176                 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
177
178                 GenTreeArgList* argList = op1->AsArgList();
179                 op1                     = argList->Current();
180                 genConsumeRegs(op1);
181                 op1Reg = op1->gtRegNum;
182
183                 argList = argList->Rest();
184                 op2     = argList->Current();
185                 genConsumeRegs(op2);
186                 op2Reg = op2->gtRegNum;
187
188                 argList      = argList->Rest();
189                 GenTree* op3 = argList->Current();
190                 genConsumeRegs(op3);
191                 regNumber op3Reg = op3->gtRegNum;
192
193                 if (Compiler::isImmHWIntrinsic(intrinsicID, op3))
194                 {
195                     auto emitSwCase = [&](unsigned i) {
196                         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i);
197                     };
198                     if (op3->IsCnsIntOrI())
199                     {
200                         ssize_t ival = op3->AsIntCon()->IconValue();
201                         emitSwCase((unsigned)ival);
202                     }
203                     else
204                     {
205                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
206                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
207                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
208                         regNumber baseReg = node->ExtractTempReg();
209                         regNumber offsReg = node->GetSingleTempReg();
210                         genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
211                     }
212                 }
213                 else if (category == HW_Category_MemoryStore)
214                 {
215                     assert(intrinsicID == NI_SSE2_MaskMove);
216                     assert(targetReg == REG_NA);
217
218                     // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
219                     if (op3Reg != REG_EDI)
220                     {
221                         emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
222                     }
223                     emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg);
224                 }
225                 else
226                 {
227                     emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg);
228                 }
229                 break;
230             }
231
232             default:
233                 unreached();
234                 break;
235         }
236         genProduceReg(node);
237         return;
238     }
239
240     switch (isa)
241     {
242         case InstructionSet_SSE:
243             genSSEIntrinsic(node);
244             break;
245         case InstructionSet_SSE2:
246             genSSE2Intrinsic(node);
247             break;
248         case InstructionSet_SSE41:
249             genSSE41Intrinsic(node);
250             break;
251         case InstructionSet_SSE42:
252             genSSE42Intrinsic(node);
253             break;
254         case InstructionSet_AVX:
255             genAVXIntrinsic(node);
256             break;
257         case InstructionSet_AVX2:
258             genAVX2Intrinsic(node);
259             break;
260         case InstructionSet_AES:
261             genAESIntrinsic(node);
262             break;
263         case InstructionSet_BMI1:
264             genBMI1Intrinsic(node);
265             break;
266         case InstructionSet_BMI2:
267             genBMI2Intrinsic(node);
268             break;
269         case InstructionSet_FMA:
270             genFMAIntrinsic(node);
271             break;
272         case InstructionSet_LZCNT:
273             genLZCNTIntrinsic(node);
274             break;
275         case InstructionSet_PCLMULQDQ:
276             genPCLMULQDQIntrinsic(node);
277             break;
278         case InstructionSet_POPCNT:
279             genPOPCNTIntrinsic(node);
280             break;
281         default:
282             unreached();
283             break;
284     }
285 }
286
287 //------------------------------------------------------------------------
288 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
289 //                        register/memory operand, and that returns a value in register
290 //
291 // Arguments:
292 //    node - The hardware intrinsic node
293 //    ins  - The instruction being generated
294 //
295 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
296 {
297     var_types targetType = node->TypeGet();
298     regNumber targetReg  = node->gtRegNum;
299     GenTree*  op1        = node->gtGetOp1();
300     GenTree*  op2        = node->gtGetOp2();
301     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
302     emitter*  emit       = getEmitter();
303
304     // TODO-XArch-CQ: Commutative operations can have op1 be contained
305     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
306
307     regNumber op1Reg = op1->gtRegNum;
308
309     assert(targetReg != REG_NA);
310     assert(op1Reg != REG_NA);
311
312     if (op2->isContained() || op2->isUsedFromSpillTemp())
313     {
314         assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
315         assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
316
317         TempDsc* tmpDsc = nullptr;
318         unsigned varNum = BAD_VAR_NUM;
319         unsigned offset = (unsigned)-1;
320
321         if (op2->isUsedFromSpillTemp())
322         {
323             assert(op2->IsRegOptional());
324
325             tmpDsc = getSpillTempDsc(op2);
326             varNum = tmpDsc->tdTempNum();
327             offset = 0;
328
329             compiler->tmpRlsTemp(tmpDsc);
330         }
331         else if (op2->OperIsHWIntrinsic())
332         {
333             emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
334             return;
335         }
336         else if (op2->isIndir())
337         {
338             GenTreeIndir* memIndir = op2->AsIndir();
339             GenTree*      memBase  = memIndir->gtOp1;
340
341             switch (memBase->OperGet())
342             {
343                 case GT_LCL_VAR_ADDR:
344                 {
345                     varNum = memBase->AsLclVarCommon()->GetLclNum();
346                     offset = 0;
347
348                     // Ensure that all the GenTreeIndir values are set to their defaults.
349                     assert(!memIndir->HasIndex());
350                     assert(memIndir->Scale() == 1);
351                     assert(memIndir->Offset() == 0);
352
353                     break;
354                 }
355
356                 case GT_CLS_VAR_ADDR:
357                 {
358                     emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0);
359                     return;
360                 }
361
362                 default:
363                 {
364                     emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir);
365                     return;
366                 }
367             }
368         }
369         else
370         {
371             switch (op2->OperGet())
372             {
373                 case GT_LCL_FLD:
374                 {
375                     GenTreeLclFld* lclField = op2->AsLclFld();
376
377                     varNum = lclField->GetLclNum();
378                     offset = lclField->gtLclFld.gtLclOffs;
379                     break;
380                 }
381
382                 case GT_LCL_VAR:
383                 {
384                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
385                     varNum = op2->AsLclVar()->GetLclNum();
386                     offset = 0;
387                     break;
388                 }
389
390                 default:
391                     unreached();
392                     break;
393             }
394         }
395
396         // Ensure we got a good varNum and offset.
397         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
398         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
399         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
400         assert(offset != (unsigned)-1);
401
402         emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset);
403     }
404     else
405     {
406         regNumber op2Reg = op2->gtRegNum;
407
408         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
409         {
410             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
411             //
412             // For non-commutative intrinsics, we should have ensured that op2 was marked
413             // delay free in order to prevent it from getting assigned the same register
414             // as target. However, for commutative intrinsics, we can just swap the operands
415             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
416
417             noway_assert(node->OperIsCommutative());
418             op2Reg = op1Reg;
419             op1Reg = targetReg;
420         }
421
422         emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg);
423     }
424 }
425
426 //------------------------------------------------------------------------
427 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
428 //                        register/memory operand, an immediate operand, and that returns a value in register
429 //
430 // Arguments:
431 //    node - The hardware intrinsic node
432 //    ins  - The instruction being generated
433 //
434 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
435 {
436     var_types targetType = node->TypeGet();
437     regNumber targetReg  = node->gtRegNum;
438     GenTree*  op1        = node->gtGetOp1();
439     GenTree*  op2        = node->gtGetOp2();
440     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
441     int       ival       = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId);
442     emitter*  emit       = getEmitter();
443
444     // TODO-XArch-CQ: Commutative operations can have op1 be contained
445     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
446
447     regNumber op1Reg = op1->gtRegNum;
448
449     assert(targetReg != REG_NA);
450     assert(op1Reg != REG_NA);
451
452     if (op2->isContained() || op2->isUsedFromSpillTemp())
453     {
454         assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
455         assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
456
457         TempDsc* tmpDsc = nullptr;
458         unsigned varNum = BAD_VAR_NUM;
459         unsigned offset = (unsigned)-1;
460
461         if (op2->isUsedFromSpillTemp())
462         {
463             assert(op2->IsRegOptional());
464
465             tmpDsc = getSpillTempDsc(op2);
466             varNum = tmpDsc->tdTempNum();
467             offset = 0;
468
469             compiler->tmpRlsTemp(tmpDsc);
470         }
471         else if (op2->OperIsHWIntrinsic())
472         {
473             emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival);
474             return;
475         }
476         else if (op2->isIndir())
477         {
478             GenTreeIndir* memIndir = op2->AsIndir();
479             GenTree*      memBase  = memIndir->gtOp1;
480
481             switch (memBase->OperGet())
482             {
483                 case GT_LCL_VAR_ADDR:
484                 {
485                     varNum = memBase->AsLclVarCommon()->GetLclNum();
486                     offset = 0;
487
488                     // Ensure that all the GenTreeIndir values are set to their defaults.
489                     assert(!memIndir->HasIndex());
490                     assert(memIndir->Scale() == 1);
491                     assert(memIndir->Offset() == 0);
492
493                     break;
494                 }
495
496                 case GT_CLS_VAR_ADDR:
497                 {
498                     emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
499                                                ival);
500                     return;
501                 }
502
503                 default:
504                 {
505                     emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival);
506                     return;
507                 }
508             }
509         }
510         else
511         {
512             switch (op2->OperGet())
513             {
514                 case GT_LCL_FLD:
515                 {
516                     GenTreeLclFld* lclField = op2->AsLclFld();
517
518                     varNum = lclField->GetLclNum();
519                     offset = lclField->gtLclFld.gtLclOffs;
520                     break;
521                 }
522
523                 case GT_LCL_VAR:
524                 {
525                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
526                     varNum = op2->AsLclVar()->GetLclNum();
527                     offset = 0;
528                     break;
529                 }
530
531                 default:
532                     unreached();
533                     break;
534             }
535         }
536
537         // Ensure we got a good varNum and offset.
538         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
539         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
540         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
541         assert(offset != (unsigned)-1);
542
543         emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival);
544     }
545     else
546     {
547         regNumber op2Reg = op2->gtRegNum;
548
549         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
550         {
551             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
552             //
553             // For non-commutative intrinsics, we should have ensured that op2 was marked
554             // delay free in order to prevent it from getting assigned the same register
555             // as target. However, for commutative intrinsics, we can just swap the operands
556             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
557
558             noway_assert(node->OperIsCommutative());
559             op2Reg = op1Reg;
560             op1Reg = targetReg;
561         }
562
563         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
564     }
565 }
566
567 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
568 //                       with non-constant argument
569 //
570 // Arguments:
571 //    intrinsic      - intrinsic ID
572 //    nonConstImmReg - the register contains non-constant imm8 argument
573 //    baseReg        - a register for the start of the switch table
574 //    offsReg        - a register for the offset into the switch table
575 //    emitSwCase     - the lambda to generate siwtch-case
576 //
577 // Return Value:
578 //    generate the jump-table fallback for imm-intrinsics with non-constant argument.
579 // Note:
580 //    This function can be used for all imm-intrinsics (whether full-range or not),
581 //    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
582 //    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
583 //
584 template <typename HWIntrinsicSwitchCaseBody>
585 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
586                                               regNumber                 nonConstImmReg,
587                                               regNumber                 baseReg,
588                                               regNumber                 offsReg,
589                                               HWIntrinsicSwitchCaseBody emitSwCase)
590 {
591     assert(nonConstImmReg != REG_NA);
592     emitter* emit = getEmitter();
593
594     const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1;
595     assert(maxByte <= 256);
596     BasicBlock* jmpTable[256];
597
598     unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
599     unsigned jmpTableOffs = 0;
600
601     // Emit the jump table
602     for (unsigned i = 0; i < maxByte; i++)
603     {
604         jmpTable[i] = genCreateTempLabel();
605         emit->emitDataGenData(i, jmpTable[i]);
606     }
607
608     emit->emitDataGenEnd();
609
610     // Compute and jump to the appropriate offset in the switch table
611     emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
612
613     emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0);
614     emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
615     emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg);
616     emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
617
618     // Emit the switch table entries
619
620     BasicBlock* switchTableBeg = genCreateTempLabel();
621     BasicBlock* switchTableEnd = genCreateTempLabel();
622
623     genDefineTempLabel(switchTableBeg);
624
625     for (unsigned i = 0; i < maxByte; i++)
626     {
627         genDefineTempLabel(jmpTable[i]);
628         emitSwCase(i);
629         emit->emitIns_J(INS_jmp, switchTableEnd);
630     }
631
632     genDefineTempLabel(switchTableEnd);
633 }
634
635 //------------------------------------------------------------------------
636 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
637 //
638 // Arguments:
639 //    node - The hardware intrinsic node
640 //
641 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
642 {
643     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
644     GenTree*       op1         = node->gtGetOp1();
645     GenTree*       op2         = node->gtGetOp2();
646     GenTree*       op3         = nullptr;
647     GenTree*       op4         = nullptr;
648     regNumber      targetReg   = node->gtRegNum;
649     var_types      targetType  = node->TypeGet();
650     var_types      baseType    = node->gtSIMDBaseType;
651
652     regNumber op1Reg = REG_NA;
653     regNumber op2Reg = REG_NA;
654     regNumber op3Reg = REG_NA;
655     regNumber op4Reg = REG_NA;
656     emitter*  emit   = getEmitter();
657
658     if ((op1 != nullptr) && !op1->OperIsList())
659     {
660         op1Reg = op1->gtRegNum;
661         genConsumeOperands(node);
662     }
663
664     switch (intrinsicID)
665     {
666         case NI_SSE_CompareEqualOrderedScalar:
667         case NI_SSE_CompareEqualUnorderedScalar:
668         {
669             assert(baseType == TYP_FLOAT);
670             op2Reg             = op2->gtRegNum;
671             regNumber   tmpReg = node->GetSingleTempReg();
672             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
673
674             // Ensure we aren't overwriting targetReg
675             assert(tmpReg != targetReg);
676
677             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
678             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
679             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
680             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
681             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
682             break;
683         }
684
685         case NI_SSE_CompareGreaterThanOrderedScalar:
686         case NI_SSE_CompareGreaterThanUnorderedScalar:
687         {
688             assert(baseType == TYP_FLOAT);
689             op2Reg = op2->gtRegNum;
690
691             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
692             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
693             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
694             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
695             break;
696         }
697
698         case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
699         case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
700         {
701             assert(baseType == TYP_FLOAT);
702             op2Reg = op2->gtRegNum;
703
704             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
705             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
706             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
707             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
708             break;
709         }
710
711         case NI_SSE_CompareLessThanOrderedScalar:
712         case NI_SSE_CompareLessThanUnorderedScalar:
713         {
714             assert(baseType == TYP_FLOAT);
715             op2Reg = op2->gtRegNum;
716
717             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
718             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
719             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
720             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
721             break;
722         }
723
724         case NI_SSE_CompareLessThanOrEqualOrderedScalar:
725         case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
726         {
727             assert(baseType == TYP_FLOAT);
728             op2Reg = op2->gtRegNum;
729
730             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
731             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
732             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
733             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
734             break;
735         }
736
737         case NI_SSE_CompareNotEqualOrderedScalar:
738         case NI_SSE_CompareNotEqualUnorderedScalar:
739         {
740             assert(baseType == TYP_FLOAT);
741             op2Reg             = op2->gtRegNum;
742             regNumber   tmpReg = node->GetSingleTempReg();
743             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
744
745             // Ensure we aren't overwriting targetReg
746             assert(tmpReg != targetReg);
747
748             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
749             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
750             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
751             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
752             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
753             break;
754         }
755
756         case NI_SSE_ConvertToSingle:
757         case NI_SSE_StaticCast:
758         {
759             assert(op2 == nullptr);
760             if (op1Reg != targetReg)
761             {
762                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
763                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
764             }
765             break;
766         }
767
768         case NI_SSE_MoveMask:
769         {
770             assert(baseType == TYP_FLOAT);
771             assert(op2 == nullptr);
772
773             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
774             emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
775             break;
776         }
777
778         case NI_SSE_Prefetch0:
779         case NI_SSE_Prefetch1:
780         case NI_SSE_Prefetch2:
781         case NI_SSE_PrefetchNonTemporal:
782         {
783             assert(baseType == TYP_UBYTE);
784             assert(op2 == nullptr);
785
786             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
787             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
788             break;
789         }
790
791         case NI_SSE_SetScalarVector128:
792         {
793             assert(baseType == TYP_FLOAT);
794             assert(op2 == nullptr);
795
796             if (op1Reg == targetReg)
797             {
798                 regNumber tmpReg = node->GetSingleTempReg();
799
800                 // Ensure we aren't overwriting targetReg
801                 assert(tmpReg != targetReg);
802
803                 emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
804                 op1Reg = tmpReg;
805             }
806
807             emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
808             emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
809             break;
810         }
811
812         case NI_SSE_SetZeroVector128:
813         {
814             assert(baseType == TYP_FLOAT);
815             assert(op1 == nullptr);
816             assert(op2 == nullptr);
817             emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
818             break;
819         }
820
821         case NI_SSE_StoreFence:
822         {
823             assert(baseType == TYP_VOID);
824             assert(op1 == nullptr);
825             assert(op2 == nullptr);
826             emit->emitIns(INS_sfence);
827             break;
828         }
829
830         default:
831             unreached();
832             break;
833     }
834
835     genProduceReg(node);
836 }
837
838 //------------------------------------------------------------------------
839 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
840 //
841 // Arguments:
842 //    node - The hardware intrinsic node
843 //
844 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
845 {
846     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
847     GenTree*       op1         = node->gtGetOp1();
848     GenTree*       op2         = node->gtGetOp2();
849     regNumber      targetReg   = node->gtRegNum;
850     var_types      targetType  = node->TypeGet();
851     var_types      baseType    = node->gtSIMDBaseType;
852     regNumber      op1Reg      = REG_NA;
853     regNumber      op2Reg      = REG_NA;
854     emitter*       emit        = getEmitter();
855     int            ival        = -1;
856
857     if ((op1 != nullptr) && !op1->OperIsList())
858     {
859         op1Reg = op1->gtRegNum;
860         genConsumeOperands(node);
861     }
862
863     switch (intrinsicID)
864     {
865         // All integer overloads are handled by table codegen
866         case NI_SSE2_CompareLessThan:
867         {
868             assert(op1 != nullptr);
869             assert(op2 != nullptr);
870             assert(baseType == TYP_DOUBLE);
871             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
872             op2Reg          = op2->gtRegNum;
873             ival            = Compiler::ivalOfHWIntrinsic(intrinsicID);
874             assert(ival != -1);
875             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
876
877             break;
878         }
879
880         case NI_SSE2_CompareEqualOrderedScalar:
881         case NI_SSE2_CompareEqualUnorderedScalar:
882         {
883             assert(baseType == TYP_DOUBLE);
884             op2Reg             = op2->gtRegNum;
885             regNumber   tmpReg = node->GetSingleTempReg();
886             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
887
888             // Ensure we aren't overwriting targetReg
889             assert(tmpReg != targetReg);
890
891             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
892             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
893             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
894             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
895             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
896             break;
897         }
898
899         case NI_SSE2_CompareGreaterThanOrderedScalar:
900         case NI_SSE2_CompareGreaterThanUnorderedScalar:
901         {
902             assert(baseType == TYP_DOUBLE);
903             op2Reg          = op2->gtRegNum;
904             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
905
906             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
907             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
908             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
909             break;
910         }
911
912         case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
913         case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
914         {
915             assert(baseType == TYP_DOUBLE);
916             op2Reg          = op2->gtRegNum;
917             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
918
919             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
920             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
921             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
922             break;
923         }
924
925         case NI_SSE2_CompareLessThanOrderedScalar:
926         case NI_SSE2_CompareLessThanUnorderedScalar:
927         {
928             assert(baseType == TYP_DOUBLE);
929             op2Reg          = op2->gtRegNum;
930             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
931
932             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
933             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
934             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
935             break;
936         }
937
938         case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
939         case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
940         {
941             assert(baseType == TYP_DOUBLE);
942             op2Reg          = op2->gtRegNum;
943             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
944
945             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
946             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
947             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
948             break;
949         }
950
951         case NI_SSE2_CompareNotEqualOrderedScalar:
952         case NI_SSE2_CompareNotEqualUnorderedScalar:
953         {
954             assert(baseType == TYP_DOUBLE);
955             op2Reg             = op2->gtRegNum;
956             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
957             regNumber   tmpReg = node->GetSingleTempReg();
958
959             // Ensure we aren't overwriting targetReg
960             assert(tmpReg != targetReg);
961
962             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
963             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
964             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
965             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
966             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
967             break;
968         }
969
970         case NI_SSE2_ConvertScalarToVector128Double:
971         case NI_SSE2_ConvertScalarToVector128Single:
972         {
973             assert(baseType == TYP_INT || baseType == TYP_LONG || baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
974             assert(op1 != nullptr);
975             assert(op2 != nullptr);
976             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
977             genHWIntrinsic_R_R_RM(node, ins);
978             break;
979         }
980
981         case NI_SSE2_ConvertScalarToVector128Int64:
982         case NI_SSE2_ConvertScalarToVector128UInt64:
983         {
984             assert(baseType == TYP_LONG || baseType == TYP_ULONG);
985             assert(op1 != nullptr);
986             assert(op2 == nullptr);
987             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
988             emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
989             break;
990         }
991
992         case NI_SSE2_ConvertToDouble:
993         {
994             assert(op2 == nullptr);
995             if (op1Reg != targetReg)
996             {
997                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
998                 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
999             }
1000             break;
1001         }
1002
1003         case NI_SSE2_ConvertToInt32:
1004         case NI_SSE2_ConvertToInt64:
1005         case NI_SSE2_ConvertToUInt32:
1006         case NI_SSE2_ConvertToUInt64:
1007         {
1008             assert(op2 == nullptr);
1009             assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT || baseType == TYP_INT || baseType == TYP_UINT ||
1010                    baseType == TYP_LONG || baseType == TYP_ULONG);
1011             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1012             if (baseType == TYP_DOUBLE || baseType == TYP_FLOAT)
1013             {
1014                 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
1015             }
1016             else
1017             {
1018                 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1019             }
1020             break;
1021         }
1022
1023         case NI_SSE2_LoadFence:
1024         {
1025             assert(baseType == TYP_VOID);
1026             assert(op1 == nullptr);
1027             assert(op2 == nullptr);
1028             emit->emitIns(INS_lfence);
1029             break;
1030         }
1031
1032         case NI_SSE2_MemoryFence:
1033         {
1034             assert(baseType == TYP_VOID);
1035             assert(op1 == nullptr);
1036             assert(op2 == nullptr);
1037             emit->emitIns(INS_mfence);
1038             break;
1039         }
1040
1041         case NI_SSE2_MoveMask:
1042         {
1043             assert(op2 == nullptr);
1044             assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);
1045
1046             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1047             emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
1048             break;
1049         }
1050
1051         case NI_SSE2_SetScalarVector128:
1052         {
1053             assert(baseType == TYP_DOUBLE);
1054             assert(op2 == nullptr);
1055
1056             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
1057             if (op1Reg == targetReg)
1058             {
1059                 regNumber tmpReg = node->GetSingleTempReg();
1060
1061                 // Ensure we aren't overwriting targetReg
1062                 assert(tmpReg != targetReg);
1063
1064                 emit->emitIns_R_R(INS_movapd, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
1065                 op1Reg = tmpReg;
1066             }
1067
1068             emit->emitIns_SIMD_R_R_R(INS_xorpd, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1069             emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
1070             break;
1071         }
1072
1073         case NI_SSE2_SetZeroVector128:
1074         {
1075             assert(baseType != TYP_FLOAT);
1076             assert(baseType >= TYP_BYTE && baseType <= TYP_DOUBLE);
1077             assert(op1 == nullptr);
1078             assert(op2 == nullptr);
1079
1080             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1081             emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1082             break;
1083         }
1084
1085         case NI_SSE2_StoreNonTemporal:
1086         {
1087             assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1088             assert(op1 != nullptr);
1089             assert(op2 != nullptr);
1090
1091             op2Reg          = op2->gtRegNum;
1092             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1093             emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1094             break;
1095         }
1096
1097         default:
1098             unreached();
1099             break;
1100     }
1101
1102     genProduceReg(node);
1103 }
1104
1105 //------------------------------------------------------------------------
1106 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1107 //
1108 // Arguments:
1109 //    node - The hardware intrinsic node
1110 //
1111 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1112 {
1113     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1114     GenTree*       op1         = node->gtGetOp1();
1115     GenTree*       op2         = node->gtGetOp2();
1116     GenTree*       op3         = nullptr;
1117     GenTree*       op4         = nullptr;
1118     regNumber      targetReg   = node->gtRegNum;
1119     var_types      targetType  = node->TypeGet();
1120     var_types      baseType    = node->gtSIMDBaseType;
1121
1122     regNumber op1Reg = REG_NA;
1123     regNumber op2Reg = REG_NA;
1124     regNumber op3Reg = REG_NA;
1125     regNumber op4Reg = REG_NA;
1126     emitter*  emit   = getEmitter();
1127
1128     if ((op1 != nullptr) && !op1->OperIsList())
1129     {
1130         op1Reg = op1->gtRegNum;
1131         genConsumeOperands(node);
1132     }
1133
1134     switch (intrinsicID)
1135     {
1136         case NI_SSE41_TestAllOnes:
1137         {
1138             regNumber tmpReg = node->GetSingleTempReg();
1139             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1140             emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1141             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1142             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1143             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1144             break;
1145         }
1146
1147         case NI_SSE41_TestAllZeros:
1148         case NI_SSE41_TestZ:
1149         {
1150             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1151             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1152             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1153             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1154             break;
1155         }
1156
1157         case NI_SSE41_TestC:
1158         {
1159             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1160             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1161             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1162             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1163             break;
1164         }
1165
1166         case NI_SSE41_TestMixOnesZeros:
1167         case NI_SSE41_TestNotZAndNotC:
1168         {
1169             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1170             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1171             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1172             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1173             break;
1174         }
1175
1176         case NI_SSE41_Extract:
1177         {
1178             regNumber   tmpTargetReg = REG_NA;
1179             instruction ins          = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1180             if (baseType == TYP_FLOAT)
1181             {
1182                 tmpTargetReg = node->ExtractTempReg();
1183             }
1184             auto emitSwCase = [&](unsigned i) {
1185                 if (baseType == TYP_FLOAT)
1186                 {
1187                     // extract instructions return to GP-registers, so it needs int size as the emitsize
1188                     emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), op1Reg, tmpTargetReg, (int)i);
1189                     emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1190                 }
1191                 else
1192                 {
1193                     emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, (int)i);
1194                 }
1195             };
1196
1197             if (op2->IsCnsIntOrI())
1198             {
1199                 ssize_t ival = op2->AsIntCon()->IconValue();
1200                 emitSwCase((unsigned)ival);
1201             }
1202             else
1203             {
1204                 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1205                 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1206                 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1207                 regNumber baseReg = node->ExtractTempReg();
1208                 regNumber offsReg = node->GetSingleTempReg();
1209                 genHWIntrinsicJumpTableFallback(intrinsicID, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1210             }
1211             break;
1212         }
1213
1214         default:
1215             unreached();
1216             break;
1217     }
1218
1219     genProduceReg(node);
1220 }
1221
1222 //------------------------------------------------------------------------
1223 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1224 //
1225 // Arguments:
1226 //    node - The hardware intrinsic node
1227 //
1228 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1229 {
1230     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1231     GenTree*       op1         = node->gtGetOp1();
1232     GenTree*       op2         = node->gtGetOp2();
1233     regNumber      targetReg   = node->gtRegNum;
1234     assert(targetReg != REG_NA);
1235     var_types targetType = node->TypeGet();
1236     var_types baseType   = node->gtSIMDBaseType;
1237
1238     regNumber op1Reg = op1->gtRegNum;
1239     regNumber op2Reg = op2->gtRegNum;
1240     genConsumeOperands(node);
1241
1242     switch (intrinsicID)
1243     {
1244         case NI_SSE42_Crc32:
1245             if (op1Reg != targetReg)
1246             {
1247                 assert(op2Reg != targetReg);
1248                 inst_RV_RV(INS_mov, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1249             }
1250
1251             if (baseType == TYP_UBYTE || baseType == TYP_USHORT) // baseType is the type of the second argument
1252             {
1253                 assert(targetType == TYP_INT);
1254                 inst_RV_RV(INS_crc32, targetReg, op2Reg, baseType, emitTypeSize(baseType));
1255             }
1256             else
1257             {
1258                 assert(op1->TypeGet() == op2->TypeGet());
1259                 assert(targetType == TYP_INT || targetType == TYP_LONG);
1260                 inst_RV_RV(INS_crc32, targetReg, op2Reg, targetType, emitTypeSize(targetType));
1261             }
1262
1263             break;
1264         default:
1265             unreached();
1266             break;
1267     }
1268     genProduceReg(node);
1269 }
1270
1271 //------------------------------------------------------------------------
1272 // genAVXIntrinsic: Generates the code for an AVX hardware intrinsic node
1273 //
1274 // Arguments:
1275 //    node - The hardware intrinsic node
1276 //
1277 void CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node)
1278 {
1279     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1280     var_types      baseType    = node->gtSIMDBaseType;
1281     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
1282     var_types      targetType  = node->TypeGet();
1283     instruction    ins         = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1284     GenTree*       op1         = node->gtGetOp1();
1285     GenTree*       op2         = node->gtGetOp2();
1286     regNumber      targetReg   = node->gtRegNum;
1287     emitter*       emit        = getEmitter();
1288
1289     genConsumeOperands(node);
1290
1291     switch (intrinsicID)
1292     {
1293         case NI_AVX_SetZeroVector256:
1294         {
1295             assert(op1 == nullptr);
1296             assert(op2 == nullptr);
1297             // SetZeroVector256 will generate pxor with integral base-typ, but pxor is a AVX2 instruction, so we
1298             // generate xorps on AVX machines.
1299             if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType))
1300             {
1301                 emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
1302             }
1303             else
1304             {
1305                 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1306             }
1307             break;
1308         }
1309
1310         case NI_AVX_ExtendToVector256:
1311         {
1312             // ExtendToVector256 has zero-extend semantics in order to ensure it is deterministic
1313             // We always emit a move to the target register, even when op1Reg == targetReg, in order
1314             // to ensure that Bits MAXVL-1:128 are zeroed.
1315
1316             assert(op2 == nullptr);
1317             regNumber op1Reg = op1->gtRegNum;
1318             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
1319             break;
1320         }
1321
1322         case NI_AVX_GetLowerHalf:
1323         case NI_AVX_StaticCast:
1324         {
1325             assert(op2 == nullptr);
1326             regNumber op1Reg = op1->gtRegNum;
1327
1328             if (op1Reg != targetReg)
1329             {
1330                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
1331                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
1332             }
1333             break;
1334         }
1335
1336         case NI_AVX_TestC:
1337         {
1338             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1339             emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1340             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1341             break;
1342         }
1343
1344         case NI_AVX_TestNotZAndNotC:
1345         {
1346             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1347             emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1348             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1349             break;
1350         }
1351
1352         case NI_AVX_TestZ:
1353         {
1354             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1355             emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1356             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1357             break;
1358         }
1359
1360         default:
1361             unreached();
1362             break;
1363     }
1364
1365     genProduceReg(node);
1366 }
1367
1368 //------------------------------------------------------------------------
1369 // genAVX2Intrinsic: Generates the code for an AVX2 hardware intrinsic node
1370 //
1371 // Arguments:
1372 //    node - The hardware intrinsic node
1373 //
1374 void CodeGen::genAVX2Intrinsic(GenTreeHWIntrinsic* node)
1375 {
1376     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1377     var_types      baseType    = node->gtSIMDBaseType;
1378     instruction    ins         = INS_invalid;
1379
1380     genConsumeOperands(node);
1381
1382     switch (intrinsicID)
1383     {
1384         default:
1385             unreached();
1386             break;
1387     }
1388
1389     genProduceReg(node);
1390 }
1391
1392 //------------------------------------------------------------------------
1393 // genAESIntrinsic: Generates the code for an AES hardware intrinsic node
1394 //
1395 // Arguments:
1396 //    node - The hardware intrinsic node
1397 //
1398 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
1399 {
1400     NYI("Implement AES intrinsic code generation");
1401 }
1402
1403 //------------------------------------------------------------------------
1404 // genBMI1Intrinsic: Generates the code for a BMI1 hardware intrinsic node
1405 //
1406 // Arguments:
1407 //    node - The hardware intrinsic node
1408 //
1409 void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
1410 {
1411     NYI("Implement BMI1 intrinsic code generation");
1412 }
1413
1414 //------------------------------------------------------------------------
1415 // genBMI2Intrinsic: Generates the code for a BMI2 hardware intrinsic node
1416 //
1417 // Arguments:
1418 //    node - The hardware intrinsic node
1419 //
1420 void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
1421 {
1422     NYI("Implement BMI2 intrinsic code generation");
1423 }
1424
1425 //------------------------------------------------------------------------
1426 // genFMAIntrinsic: Generates the code for an FMA hardware intrinsic node
1427 //
1428 // Arguments:
1429 //    node - The hardware intrinsic node
1430 //
1431 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
1432 {
1433     NYI("Implement FMA intrinsic code generation");
1434 }
1435
1436 //------------------------------------------------------------------------
1437 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
1438 //
1439 // Arguments:
1440 //    node - The hardware intrinsic node
1441 //
1442 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
1443 {
1444     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1445     GenTree*       op1         = node->gtGetOp1();
1446     regNumber      targetReg   = node->gtRegNum;
1447     assert(targetReg != REG_NA);
1448     var_types targetType = node->TypeGet();
1449     regNumber op1Reg     = op1->gtRegNum;
1450     genConsumeOperands(node);
1451
1452     assert(intrinsicID == NI_LZCNT_LeadingZeroCount);
1453
1454     inst_RV_RV(INS_lzcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1455
1456     genProduceReg(node);
1457 }
1458
1459 //------------------------------------------------------------------------
1460 // genPCLMULQDQIntrinsic: Generates the code for a PCLMULQDQ hardware intrinsic node
1461 //
1462 // Arguments:
1463 //    node - The hardware intrinsic node
1464 //
1465 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
1466 {
1467     NYI("Implement PCLMULQDQ intrinsic code generation");
1468 }
1469
1470 //------------------------------------------------------------------------
1471 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
1472 //
1473 // Arguments:
1474 //    node - The hardware intrinsic node
1475 //
1476 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
1477 {
1478     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1479     GenTree*       op1         = node->gtGetOp1();
1480     regNumber      targetReg   = node->gtRegNum;
1481     assert(targetReg != REG_NA);
1482     var_types targetType = node->TypeGet();
1483     regNumber op1Reg     = op1->gtRegNum;
1484     genConsumeOperands(node);
1485
1486     assert(intrinsicID == NI_POPCNT_PopCount);
1487
1488     inst_RV_RV(INS_popcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1489
1490     genProduceReg(node);
1491 }
1492
1493 #endif // FEATURE_HW_INTRINSICS