Fix reading Time zone rules using Julian days (#17672)
[platform/upstream/coreclr.git] / src / jit / hwintrinsiccodegenxarch.cpp
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
7 XX                                                                           XX
8 XX               Intel hardware intrinsic Code Generator                     XX
9 XX                                                                           XX
10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12 */
13 #include "jitpch.h"
14 #ifdef _MSC_VER
15 #pragma hdrstop
16 #endif
17
18 #ifdef FEATURE_HW_INTRINSICS
19
20 #include "emit.h"
21 #include "codegen.h"
22 #include "sideeffects.h"
23 #include "lower.h"
24 #include "gcinfo.h"
25 #include "gcinfoencoder.h"
26
27 //------------------------------------------------------------------------
28 // genIsTableDrivenHWIntrinsic: Checks whether an intrinsic can use the table-driven codegen path
29 //
30 // Arguments:
31 //    category - category of a HW intrinsic
32 //    flags    - flags of a HW intrinsic
33 // Return Value:
34 //    true when neither the category nor the flags force manual codegen
35 //
36 static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags)
37 {
38     // TODO - move more categories over to the table-driven framework
39     if ((flags & (HW_Flag_MultiIns | HW_Flag_SpecialCodeGen)) != 0)
40     {
41         return false; // multi-instruction / special-codegen intrinsics usually need manual codegen
42     }
43     return !(category == HW_Category_Special || category == HW_Category_Scalar || category == HW_Category_Helper);
44 }
45
46 //------------------------------------------------------------------------
47 // genHWIntrinsic: Generates the code for a given hardware intrinsic node.
48 //    Table-driven intrinsics are emitted inline (switching on the argument count); the rest dispatch on ISA.
49 // Arguments:
50 //    node - The hardware intrinsic node
51 //
52 void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
53 {
54     NamedIntrinsic      intrinsicID = node->gtHWIntrinsicId;
55     InstructionSet      isa         = Compiler::isaOfHWIntrinsic(intrinsicID);
56     HWIntrinsicCategory category    = Compiler::categoryOfHWIntrinsic(intrinsicID);
57     HWIntrinsicFlag     flags       = Compiler::flagsOfHWIntrinsic(intrinsicID);
58     int                 ival        = Compiler::ivalOfHWIntrinsic(intrinsicID); // compared against -1 below before use as an immediate
59     int                 numArgs     = Compiler::numArgsOfHWIntrinsic(node);
60
61     assert((flags & HW_Flag_NoCodeGen) == 0); // intrinsics flagged NoCodeGen are not expected here
62
63     if (genIsTableDrivenHWIntrinsic(category, flags))
64     {
65         GenTree*  op1        = node->gtGetOp1();
66         GenTree*  op2        = node->gtGetOp2();
67         regNumber targetReg  = node->gtRegNum;
68         var_types targetType = node->TypeGet(); // NOTE(review): not referenced anywhere else in this function
69         var_types baseType   = node->gtSIMDBaseType;
70
71         regNumber op1Reg = REG_NA;
72         regNumber op2Reg = REG_NA;
73         emitter*  emit   = getEmitter();
74
75         assert(numArgs >= 0);
76         instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType); // instruction looked up by intrinsic and base type
77         assert(ins != INS_invalid);
78         emitAttr simdSize = EA_ATTR(node->gtSIMDSize);
79         assert(simdSize != 0);
80
81         switch (numArgs) // only 1, 2 and 3 arguments are handled; anything else hits unreached()
82         {
83             case 1:
84             {
85                 genConsumeOperands(node);
86                 op1Reg = op1->gtRegNum;
87                 if (category == HW_Category_MemoryLoad)
88                 {
89                     emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0); // presumably loads from [op1Reg + 0] - confirm against emitIns_R_AR
90                 }
91                 else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0)
92                 {
93                     emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg); // op1Reg is passed as both source operands
94                 }
95                 else if ((ival != -1) && varTypeIsFloating(baseType))
96                 {
97                     emit->emitIns_R_R_I(ins, simdSize, targetReg, op1Reg, ival); // the table-supplied ival is passed as the immediate
98                 }
99                 else
100                 {
101                     emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg);
102                 }
103                 break;
104             }
105
106             case 2:
107             {
108                 genConsumeOperands(node);
109
110                 op1Reg = op1->gtRegNum;
111                 op2Reg = op2->gtRegNum;
112
113                 if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
114                 {
115                     // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
116                     //
117                     // For non-commutative intrinsics, we should have ensured that op2 was marked
118                     // delay free in order to prevent it from getting assigned the same register
119                     // as target. However, for commutative intrinsics, we can just swap the operands
120                     // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
121
122                     noway_assert(node->OperIsCommutative());
123                     op2Reg = op1Reg;
124                     op1Reg = targetReg;
125                 }
126
127                 if (category == HW_Category_MemoryStore)
128                 {
129                     emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0); // presumably stores op2Reg to [op1Reg + 0] - confirm against emitIns_AR_R
130                 }
131                 else if ((ival != -1) && varTypeIsFloating(baseType))
132                 {
133                     genHWIntrinsic_R_R_RM_I(node, ins); // the helper re-reads the operands and ival from the node
134                 }
135                 else if (category == HW_Category_MemoryLoad)
136                 {
137                     if (intrinsicID == NI_AVX_MaskLoad)
138                     {
139                         emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg); // operand registers are passed in the opposite order for AVX MaskLoad
140                     }
141                     else
142                     {
143                         emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
144                     }
145                 }
146                 else if (Compiler::isImmHWIntrinsic(intrinsicID, op2))
147                 {
148                     if (intrinsicID == NI_SSE2_Extract)
149                     {
150                         // extract instructions return to GP-registers, so it needs int size as the emitsize
151                         simdSize = emitTypeSize(TYP_INT);
152                     }
153                     auto emitSwCase = [&](unsigned i) { // emits the instruction with immediate i; captures locals by reference
154                         emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, (int)i);
155                     };
156
157                     if (op2->IsCnsIntOrI())
158                     {
159                         ssize_t ival = op2->AsIntCon()->IconValue(); // note: shadows the outer table-supplied ival
160                         emitSwCase((unsigned)ival);
161                     }
162                     else
163                     {
164                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
165                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
166                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
167                         regNumber baseReg = node->ExtractTempReg();
168                         regNumber offsReg = node->GetSingleTempReg();
169                         genHWIntrinsicJumpTableFallback(intrinsicID, op2Reg, baseReg, offsReg, emitSwCase); // emits one emitSwCase(i) body per table entry
170                     }
171                 }
172                 else
173                 {
174                     genHWIntrinsic_R_R_RM(node, ins); // the helper re-reads the operands from the node
175                 }
176                 break;
177             }
178
179             case 3:
180             {
181                 assert(op1->OperIsList());
182                 assert(op1->gtGetOp2()->OperIsList());
183                 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
184
185                 GenTreeArgList* argList = op1->AsArgList(); // the three arguments are walked from the list held in op1
186                 op1                     = argList->Current();
187                 genConsumeRegs(op1);
188                 op1Reg = op1->gtRegNum;
189
190                 argList = argList->Rest();
191                 op2     = argList->Current();
192                 genConsumeRegs(op2);
193                 op2Reg = op2->gtRegNum;
194
195                 argList      = argList->Rest();
196                 GenTree* op3 = argList->Current();
197                 genConsumeRegs(op3);
198                 regNumber op3Reg = op3->gtRegNum;
199
200                 if (Compiler::isImmHWIntrinsic(intrinsicID, op3))
201                 {
202                     auto emitSwCase = [&](unsigned i) { // emits the instruction with immediate i; captures locals by reference
203                         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, (int)i);
204                     };
205                     if (op3->IsCnsIntOrI())
206                     {
207                         ssize_t ival = op3->AsIntCon()->IconValue(); // note: shadows the outer table-supplied ival
208                         emitSwCase((unsigned)ival);
209                     }
210                     else
211                     {
212                         // We emit a fallback case for the scenario when the imm-op is not a constant. This should
213                         // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
214                         // can also occur if the consumer calls it directly and just doesn't pass a constant value.
215                         regNumber baseReg = node->ExtractTempReg();
216                         regNumber offsReg = node->GetSingleTempReg();
217                         genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase); // emits one emitSwCase(i) body per table entry
218                     }
219                 }
220                 else if (category == HW_Category_MemoryStore)
221                 {
222                     assert(intrinsicID == NI_SSE2_MaskMove);
223                     assert(targetReg == REG_NA);
224
225                     // SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI
226                     if (op3Reg != REG_EDI)
227                     {
228                         emit->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_EDI, op3Reg);
229                     }
230                     emit->emitIns_R_R(ins, simdSize, op1Reg, op2Reg); // op3 is not passed here; it is already in EDI at this point
231                 }
232                 else
233                 {
234                     emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg); // all three operands are passed as registers
235                 }
236                 break;
237             }
238
239             default:
240                 unreached();
241                 break;
242         }
243         genProduceReg(node);
244         return;
245     }
246
247     switch (isa) // not table-driven: hand the node to the per-ISA generator
248     {
249         case InstructionSet_SSE:
250             genSSEIntrinsic(node);
251             break;
252         case InstructionSet_SSE2:
253             genSSE2Intrinsic(node);
254             break;
255         case InstructionSet_SSE41:
256             genSSE41Intrinsic(node);
257             break;
258         case InstructionSet_SSE42:
259             genSSE42Intrinsic(node);
260             break;
261         case InstructionSet_AVX:
262         case InstructionSet_AVX2:
263             genAvxOrAvx2Intrinsic(node); // AVX and AVX2 share one generator
264             break;
265         case InstructionSet_AES:
266             genAESIntrinsic(node);
267             break;
268         case InstructionSet_BMI1:
269             genBMI1Intrinsic(node);
270             break;
271         case InstructionSet_BMI2:
272             genBMI2Intrinsic(node);
273             break;
274         case InstructionSet_FMA:
275             genFMAIntrinsic(node);
276             break;
277         case InstructionSet_LZCNT:
278             genLZCNTIntrinsic(node);
279             break;
280         case InstructionSet_PCLMULQDQ:
281             genPCLMULQDQIntrinsic(node);
282             break;
283         case InstructionSet_POPCNT:
284             genPOPCNTIntrinsic(node);
285             break;
286         default:
287             unreached();
288             break;
289     }
290 }
291
292 //------------------------------------------------------------------------
293 // genHWIntrinsic_R_R_RM: Generates the code for a hardware intrinsic node that takes a register operand, a
294 //                        register/memory operand, and that returns a value in register
295 //    When op2 is contained or used from a spill temp, the emitter form is chosen from op2's node kind.
296 // Arguments:
297 //    node - The hardware intrinsic node
298 //    ins  - The instruction being generated
299 //
300 void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins)
301 {
302     var_types targetType = node->TypeGet(); // NOTE(review): not referenced anywhere else in this function
303     regNumber targetReg  = node->gtRegNum;
304     GenTree*  op1        = node->gtGetOp1();
305     GenTree*  op2        = node->gtGetOp2();
306     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
307     emitter*  emit       = getEmitter();
308
309     // TODO-XArch-CQ: Commutative operations can have op1 be contained
310     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
311
312     regNumber op1Reg = op1->gtRegNum; // op1 is always read from its register in this function
313
314     assert(targetReg != REG_NA);
315     assert(op1Reg != REG_NA);
316
317     if (op2->isContained() || op2->isUsedFromSpillTemp())
318     {
319         assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
320         assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
321
322         TempDsc* tmpDsc = nullptr;
323         unsigned varNum = BAD_VAR_NUM;   // sentinel, checked by the assert before the final emit
324         unsigned offset = (unsigned)-1;  // sentinel, checked by the assert before the final emit
325
326         if (op2->isUsedFromSpillTemp())
327         {
328             assert(op2->IsRegOptional());
329
330             tmpDsc = getSpillTempDsc(op2);
331             varNum = tmpDsc->tdTempNum();
332             offset = 0;
333
334             compiler->tmpRlsTemp(tmpDsc); // the temp is released here, before the instruction that reads it is emitted
335         }
336         else if (op2->OperIsHWIntrinsic())
337         {
338             emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum); // passes the register of op2's own first operand
339             return;
340         }
341         else if (op2->isIndir())
342         {
343             GenTreeIndir* memIndir = op2->AsIndir();
344             GenTree*      memBase  = memIndir->gtOp1;
345
346             switch (memBase->OperGet()) // pick the form from the kind of address node under the indirection
347             {
348                 case GT_LCL_VAR_ADDR:
349                 {
350                     varNum = memBase->AsLclVarCommon()->GetLclNum();
351                     offset = 0;
352
353                     // Ensure that all the GenTreeIndir values are set to their defaults.
354                     assert(!memIndir->HasIndex());
355                     assert(memIndir->Scale() == 1);
356                     assert(memIndir->Offset() == 0);
357
358                     break; // falls through to the varNum/offset emit at the end of this branch
359                 }
360
361                 case GT_CLS_VAR_ADDR:
362                 {
363                     emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0); // class-variable handle form; varNum/offset are not used
364                     return;
365                 }
366
367                 default:
368                 {
369                     emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir); // any other address: the whole indirection is handed to the emitter
370                     return;
371                 }
372             }
373         }
374         else
375         {
376             switch (op2->OperGet()) // only local fields and local variables are expected here
377             {
378                 case GT_LCL_FLD:
379                 {
380                     GenTreeLclFld* lclField = op2->AsLclFld();
381
382                     varNum = lclField->GetLclNum();
383                     offset = lclField->gtLclFld.gtLclOffs;
384                     break;
385                 }
386
387                 case GT_LCL_VAR:
388                 {
389                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
390                     varNum = op2->AsLclVar()->GetLclNum();
391                     offset = 0;
392                     break;
393                 }
394
395                 default:
396                     unreached();
397                     break;
398             }
399         }
400
401         // Ensure we got a good varNum and offset.
402         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
403         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
404         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
405         assert(offset != (unsigned)-1);
406
407         emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset); // op2 is identified by varNum and offset
408     }
409     else
410     {
411         regNumber op2Reg = op2->gtRegNum; // op2 is in a register: plain register-register-register form
412
413         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
414         {
415             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
416             //
417             // For non-commutative intrinsics, we should have ensured that op2 was marked
418             // delay free in order to prevent it from getting assigned the same register
419             // as target. However, for commutative intrinsics, we can just swap the operands
420             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
421
422             noway_assert(node->OperIsCommutative());
423             op2Reg = op1Reg;
424             op1Reg = targetReg;
425         }
426
427         emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg);
428     }
429 }
430
431 //------------------------------------------------------------------------
432 // genHWIntrinsic_R_R_RM_I: Generates the code for a hardware intrinsic node that takes a register operand, a
433 //                        register/memory operand, an immediate operand, and that returns a value in register
434 //    The immediate is not an operand of the node: it is looked up with ivalOfHWIntrinsic for the node's intrinsic.
435 // Arguments:
436 //    node - The hardware intrinsic node
437 //    ins  - The instruction being generated
438 //
439 void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins)
440 {
441     var_types targetType = node->TypeGet(); // NOTE(review): not referenced anywhere else in this function
442     regNumber targetReg  = node->gtRegNum;
443     GenTree*  op1        = node->gtGetOp1();
444     GenTree*  op2        = node->gtGetOp2();
445     emitAttr  simdSize   = EA_ATTR(node->gtSIMDSize);
446     int       ival       = Compiler::ivalOfHWIntrinsic(node->gtHWIntrinsicId); // passed as the immediate to every emit call below
447     emitter*  emit       = getEmitter();
448
449     // TODO-XArch-CQ: Commutative operations can have op1 be contained
450     // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
451
452     regNumber op1Reg = op1->gtRegNum; // op1 is always read from its register in this function
453
454     assert(targetReg != REG_NA);
455     assert(op1Reg != REG_NA);
456
457     if (op2->isContained() || op2->isUsedFromSpillTemp())
458     {
459         assert((Compiler::flagsOfHWIntrinsic(node->gtHWIntrinsicId) & HW_Flag_NoContainment) == 0);
460         assert(compiler->m_pLowering->IsContainableHWIntrinsicOp(node, op2) || op2->IsRegOptional());
461
462         TempDsc* tmpDsc = nullptr;
463         unsigned varNum = BAD_VAR_NUM;   // sentinel, checked by the assert before the final emit
464         unsigned offset = (unsigned)-1;  // sentinel, checked by the assert before the final emit
465
466         if (op2->isUsedFromSpillTemp())
467         {
468             assert(op2->IsRegOptional());
469
470             tmpDsc = getSpillTempDsc(op2);
471             varNum = tmpDsc->tdTempNum();
472             offset = 0;
473
474             compiler->tmpRlsTemp(tmpDsc); // the temp is released here, before the instruction that reads it is emitted
475         }
476         else if (op2->OperIsHWIntrinsic())
477         {
478             emit->emitIns_SIMD_R_R_AR_I(ins, simdSize, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum, ival); // passes the register of op2's own first operand
479             return;
480         }
481         else if (op2->isIndir())
482         {
483             GenTreeIndir* memIndir = op2->AsIndir();
484             GenTree*      memBase  = memIndir->gtOp1;
485
486             switch (memBase->OperGet()) // pick the form from the kind of address node under the indirection
487             {
488                 case GT_LCL_VAR_ADDR:
489                 {
490                     varNum = memBase->AsLclVarCommon()->GetLclNum();
491                     offset = 0;
492
493                     // Ensure that all the GenTreeIndir values are set to their defaults.
494                     assert(!memIndir->HasIndex());
495                     assert(memIndir->Scale() == 1);
496                     assert(memIndir->Offset() == 0);
497
498                     break; // falls through to the varNum/offset emit at the end of this branch
499                 }
500
501                 case GT_CLS_VAR_ADDR:
502                 {
503                     emit->emitIns_SIMD_R_R_C_I(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0,
504                                                ival);
505                     return; // class-variable handle form; varNum/offset are not used
506                 }
507
508                 default:
509                 {
510                     emit->emitIns_SIMD_R_R_A_I(ins, simdSize, targetReg, op1Reg, memIndir, ival); // any other address: the whole indirection is handed to the emitter
511                     return;
512                 }
513             }
514         }
515         else
516         {
517             switch (op2->OperGet()) // only local fields and local variables are expected here
518             {
519                 case GT_LCL_FLD:
520                 {
521                     GenTreeLclFld* lclField = op2->AsLclFld();
522
523                     varNum = lclField->GetLclNum();
524                     offset = lclField->gtLclFld.gtLclOffs;
525                     break;
526                 }
527
528                 case GT_LCL_VAR:
529                 {
530                     assert(op2->IsRegOptional() || !compiler->lvaTable[op2->gtLclVar.gtLclNum].lvIsRegCandidate());
531                     varNum = op2->AsLclVar()->GetLclNum();
532                     offset = 0;
533                     break;
534                 }
535
536                 default:
537                     unreached();
538                     break;
539             }
540         }
541
542         // Ensure we got a good varNum and offset.
543         // We also need to check for `tmpDsc != nullptr` since spill temp numbers
544         // are negative and start with -1, which also happens to be BAD_VAR_NUM.
545         assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
546         assert(offset != (unsigned)-1);
547
548         emit->emitIns_SIMD_R_R_S_I(ins, simdSize, targetReg, op1Reg, varNum, offset, ival); // op2 is identified by varNum and offset
549     }
550     else
551     {
552         regNumber op2Reg = op2->gtRegNum; // op2 is in a register: register-register-register-immediate form
553
554         if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
555         {
556             // We have "reg2 = reg1 op reg2" where "reg1 != reg2" on a RMW intrinsic.
557             //
558             // For non-commutative intrinsics, we should have ensured that op2 was marked
559             // delay free in order to prevent it from getting assigned the same register
560             // as target. However, for commutative intrinsics, we can just swap the operands
561             // in order to have "reg2 = reg2 op reg1" which will end up producing the right code.
562
563             noway_assert(node->OperIsCommutative());
564             op2Reg = op1Reg;
565             op1Reg = targetReg;
566         }
567
568         emit->emitIns_SIMD_R_R_R_I(ins, simdSize, targetReg, op1Reg, op2Reg, ival);
569     }
570 }
571
572 // genHWIntrinsicJumpTableFallback : generate the jump-table fallback for imm-intrinsics
573 //                       with non-constant argument
574 //
575 // Arguments:
576 //    intrinsic      - intrinsic ID
577 //    nonConstImmReg - the register that contains the non-constant imm8 argument
578 //    baseReg        - a register for the start of the switch table
579 //    offsReg        - a register for the offset into the switch table
580 //    emitSwCase     - the lambda that generates the body of one switch-case; called once per table entry
581 //
582 // Return Value:
583 //    None. The jump table and one case body per possible immediate value are emitted through the emitter.
584 // Note:
585 //    This function can be used for all imm-intrinsics (whether full-range or not),
586 //    The compiler front-end (i.e. importer) is responsible to insert a range-check IR
587 //    (GT_HW_INTRINSIC_CHK) for imm8 argument, so this function does not need to do range-check.
588 //
589 template <typename HWIntrinsicSwitchCaseBody>
590 void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsic,
591                                               regNumber                 nonConstImmReg,
592                                               regNumber                 baseReg,
593                                               regNumber                 offsReg,
594                                               HWIntrinsicSwitchCaseBody emitSwCase)
595 {
596     assert(nonConstImmReg != REG_NA);
597     emitter* emit = getEmitter();
598
599     const unsigned maxByte = (unsigned)Compiler::immUpperBoundOfHWIntrinsic(intrinsic) + 1; // number of table entries: upper bound + 1
600     assert(maxByte <= 256);
601     BasicBlock* jmpTable[256]; // sized for the largest maxByte the assert above allows
602
603     unsigned jmpTableBase = emit->emitBBTableDataGenBeg(maxByte, true);
604     unsigned jmpTableOffs = 0; // NOTE(review): never read anywhere else in this function
605
606     // Emit the jump table
607     for (unsigned i = 0; i < maxByte; i++)
608     {
609         jmpTable[i] = genCreateTempLabel(); // one label per possible immediate value; defined in the second loop below
610         emit->emitDataGenData(i, jmpTable[i]);
611     }
612
613     emit->emitDataGenEnd();
614
615     // Compute and jump to the appropriate offset in the switch table
616     emit->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), offsReg, compiler->eeFindJitDataOffs(jmpTableBase), 0);
617
618     emit->emitIns_R_ARX(INS_mov, EA_4BYTE, offsReg, offsReg, nonConstImmReg, 4, 0); // presumably loads the 4-byte entry at [offsReg + nonConstImmReg*4] - confirm
619     emit->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, baseReg);
620     emit->emitIns_R_R(INS_add, EA_PTRSIZE, offsReg, baseReg); // presumably entries are relative to fgFirstBB, hence the add - confirm
621     emit->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), offsReg);
622
623     // Emit the switch table entries
624
625     BasicBlock* switchTableBeg = genCreateTempLabel(); // defined below but not used as a jump target in this function
626     BasicBlock* switchTableEnd = genCreateTempLabel(); // every case body jumps here when done
627
628     genDefineTempLabel(switchTableBeg);
629
630     for (unsigned i = 0; i < maxByte; i++)
631     {
632         genDefineTempLabel(jmpTable[i]);
633         emitSwCase(i); // caller-supplied body emits the instruction with immediate i
634         emit->emitIns_J(INS_jmp, switchTableEnd);
635     }
636
637     genDefineTempLabel(switchTableEnd);
638 }
639
640 //------------------------------------------------------------------------
641 // genSSEIntrinsic: Generates the code for an SSE hardware intrinsic node
642 //
643 // Arguments:
644 //    node - The hardware intrinsic node
645 //
646 void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
647 {
648     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
649     GenTree*       op1         = node->gtGetOp1();
650     GenTree*       op2         = node->gtGetOp2();
651     GenTree*       op3         = nullptr;
652     GenTree*       op4         = nullptr;
653     regNumber      targetReg   = node->gtRegNum;
654     var_types      targetType  = node->TypeGet();
655     var_types      baseType    = node->gtSIMDBaseType;
656
657     regNumber op1Reg = REG_NA;
658     regNumber op2Reg = REG_NA;
659     regNumber op3Reg = REG_NA;
660     regNumber op4Reg = REG_NA;
661     emitter*  emit   = getEmitter();
662
663     if ((op1 != nullptr) && !op1->OperIsList())
664     {
665         op1Reg = op1->gtRegNum;
666         genConsumeOperands(node);
667     }
668
669     switch (intrinsicID)
670     {
671         case NI_SSE_CompareEqualOrderedScalar:
672         case NI_SSE_CompareEqualUnorderedScalar:
673         {
674             assert(baseType == TYP_FLOAT);
675             op2Reg             = op2->gtRegNum;
676             regNumber   tmpReg = node->GetSingleTempReg();
677             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
678
679             // Ensure we aren't overwriting targetReg
680             assert(tmpReg != targetReg);
681
682             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
683             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
684             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
685             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
686             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
687             break;
688         }
689
690         case NI_SSE_CompareGreaterThanOrderedScalar:
691         case NI_SSE_CompareGreaterThanUnorderedScalar:
692         {
693             assert(baseType == TYP_FLOAT);
694             op2Reg = op2->gtRegNum;
695
696             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
697             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
698             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
699             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
700             break;
701         }
702
703         case NI_SSE_CompareGreaterThanOrEqualOrderedScalar:
704         case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar:
705         {
706             assert(baseType == TYP_FLOAT);
707             op2Reg = op2->gtRegNum;
708
709             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
710             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
711             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
712             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
713             break;
714         }
715
716         case NI_SSE_CompareLessThanOrderedScalar:
717         case NI_SSE_CompareLessThanUnorderedScalar:
718         {
719             assert(baseType == TYP_FLOAT);
720             op2Reg = op2->gtRegNum;
721
722             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
723             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
724             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
725             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
726             break;
727         }
728
729         case NI_SSE_CompareLessThanOrEqualOrderedScalar:
730         case NI_SSE_CompareLessThanOrEqualUnorderedScalar:
731         {
732             assert(baseType == TYP_FLOAT);
733             op2Reg = op2->gtRegNum;
734
735             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
736             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
737             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
738             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
739             break;
740         }
741
742         case NI_SSE_CompareNotEqualOrderedScalar:
743         case NI_SSE_CompareNotEqualUnorderedScalar:
744         {
745             assert(baseType == TYP_FLOAT);
746             op2Reg             = op2->gtRegNum;
747             regNumber   tmpReg = node->GetSingleTempReg();
748             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
749
750             // Ensure we aren't overwriting targetReg
751             assert(tmpReg != targetReg);
752
753             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
754             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
755             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
756             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
757             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
758             break;
759         }
760
761         case NI_SSE_ConvertToSingle:
762         case NI_SSE_StaticCast:
763         {
764             assert(op2 == nullptr);
765             if (op1Reg != targetReg)
766             {
767                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
768                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
769             }
770             break;
771         }
772
773         case NI_SSE_MoveMask:
774         {
775             assert(baseType == TYP_FLOAT);
776             assert(op2 == nullptr);
777
778             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
779             emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
780             break;
781         }
782
783         case NI_SSE_Prefetch0:
784         case NI_SSE_Prefetch1:
785         case NI_SSE_Prefetch2:
786         case NI_SSE_PrefetchNonTemporal:
787         {
788             assert(baseType == TYP_UBYTE);
789             assert(op2 == nullptr);
790
791             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
792             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
793             break;
794         }
795
796         case NI_SSE_SetScalarVector128:
797         {
798             assert(baseType == TYP_FLOAT);
799             assert(op2 == nullptr);
800
801             if (op1Reg == targetReg)
802             {
803                 regNumber tmpReg = node->GetSingleTempReg();
804
805                 // Ensure we aren't overwriting targetReg
806                 assert(tmpReg != targetReg);
807
808                 emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
809                 op1Reg = tmpReg;
810             }
811
812             emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
813             emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
814             break;
815         }
816
817         case NI_SSE_SetZeroVector128:
818         {
819             assert(baseType == TYP_FLOAT);
820             assert(op1 == nullptr);
821             assert(op2 == nullptr);
822             emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
823             break;
824         }
825
826         case NI_SSE_StoreFence:
827         {
828             assert(baseType == TYP_VOID);
829             assert(op1 == nullptr);
830             assert(op2 == nullptr);
831             emit->emitIns(INS_sfence);
832             break;
833         }
834
835         default:
836             unreached();
837             break;
838     }
839
840     genProduceReg(node);
841 }
842
843 //------------------------------------------------------------------------
844 // genSSE2Intrinsic: Generates the code for an SSE2 hardware intrinsic node
845 //
846 // Arguments:
847 //    node - The hardware intrinsic node
848 //
849 void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
850 {
851     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
852     GenTree*       op1         = node->gtGetOp1();
853     GenTree*       op2         = node->gtGetOp2();
854     regNumber      targetReg   = node->gtRegNum;
855     var_types      targetType  = node->TypeGet();
856     var_types      baseType    = node->gtSIMDBaseType;
857     regNumber      op1Reg      = REG_NA;
858     regNumber      op2Reg      = REG_NA;
859     emitter*       emit        = getEmitter();
860     int            ival        = -1;
861
862     if ((op1 != nullptr) && !op1->OperIsList())
863     {
864         op1Reg = op1->gtRegNum;
865         genConsumeOperands(node);
866     }
867
868     switch (intrinsicID)
869     {
870         // All integer overloads are handled by table codegen
871         case NI_SSE2_CompareLessThan:
872         {
873             assert(op1 != nullptr);
874             assert(op2 != nullptr);
875             assert(baseType == TYP_DOUBLE);
876             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
877             op2Reg          = op2->gtRegNum;
878             ival            = Compiler::ivalOfHWIntrinsic(intrinsicID);
879             assert(ival != -1);
880             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
881
882             break;
883         }
884
885         case NI_SSE2_CompareEqualOrderedScalar:
886         case NI_SSE2_CompareEqualUnorderedScalar:
887         {
888             assert(baseType == TYP_DOUBLE);
889             op2Reg             = op2->gtRegNum;
890             regNumber   tmpReg = node->GetSingleTempReg();
891             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
892
893             // Ensure we aren't overwriting targetReg
894             assert(tmpReg != targetReg);
895
896             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
897             emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
898             emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
899             emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
900             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
901             break;
902         }
903
904         case NI_SSE2_CompareGreaterThanOrderedScalar:
905         case NI_SSE2_CompareGreaterThanUnorderedScalar:
906         {
907             assert(baseType == TYP_DOUBLE);
908             op2Reg          = op2->gtRegNum;
909             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
910
911             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
912             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
913             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
914             break;
915         }
916
917         case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
918         case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
919         {
920             assert(baseType == TYP_DOUBLE);
921             op2Reg          = op2->gtRegNum;
922             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
923
924             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
925             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
926             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
927             break;
928         }
929
930         case NI_SSE2_CompareLessThanOrderedScalar:
931         case NI_SSE2_CompareLessThanUnorderedScalar:
932         {
933             assert(baseType == TYP_DOUBLE);
934             op2Reg          = op2->gtRegNum;
935             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
936
937             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
938             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
939             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
940             break;
941         }
942
943         case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
944         case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
945         {
946             assert(baseType == TYP_DOUBLE);
947             op2Reg          = op2->gtRegNum;
948             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
949
950             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
951             emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
952             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
953             break;
954         }
955
956         case NI_SSE2_CompareNotEqualOrderedScalar:
957         case NI_SSE2_CompareNotEqualUnorderedScalar:
958         {
959             assert(baseType == TYP_DOUBLE);
960             op2Reg             = op2->gtRegNum;
961             instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
962             regNumber   tmpReg = node->GetSingleTempReg();
963
964             // Ensure we aren't overwriting targetReg
965             assert(tmpReg != targetReg);
966
967             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
968             emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
969             emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
970             emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
971             emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, tmpReg);
972             break;
973         }
974
975         case NI_SSE2_ConvertScalarToVector128Double:
976         case NI_SSE2_ConvertScalarToVector128Single:
977         {
978             assert(baseType == TYP_INT || baseType == TYP_LONG || baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
979             assert(op1 != nullptr);
980             assert(op2 != nullptr);
981             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
982             genHWIntrinsic_R_R_RM(node, ins);
983             break;
984         }
985
986         case NI_SSE2_ConvertScalarToVector128Int64:
987         case NI_SSE2_ConvertScalarToVector128UInt64:
988         {
989             assert(baseType == TYP_LONG || baseType == TYP_ULONG);
990             assert(op1 != nullptr);
991             assert(op2 == nullptr);
992             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
993             emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
994             break;
995         }
996
997         case NI_SSE2_ConvertToDouble:
998         {
999             assert(op2 == nullptr);
1000             if (op1Reg != targetReg)
1001             {
1002                 instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1003                 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
1004             }
1005             break;
1006         }
1007
1008         case NI_SSE2_ConvertToInt32:
1009         case NI_SSE2_ConvertToInt64:
1010         case NI_SSE2_ConvertToUInt32:
1011         case NI_SSE2_ConvertToUInt64:
1012         {
1013             assert(op2 == nullptr);
1014             assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT || baseType == TYP_INT || baseType == TYP_UINT ||
1015                    baseType == TYP_LONG || baseType == TYP_ULONG);
1016             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1017             if (baseType == TYP_DOUBLE || baseType == TYP_FLOAT)
1018             {
1019                 emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
1020             }
1021             else
1022             {
1023                 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
1024             }
1025             break;
1026         }
1027
1028         case NI_SSE2_LoadFence:
1029         {
1030             assert(baseType == TYP_VOID);
1031             assert(op1 == nullptr);
1032             assert(op2 == nullptr);
1033             emit->emitIns(INS_lfence);
1034             break;
1035         }
1036
1037         case NI_SSE2_MemoryFence:
1038         {
1039             assert(baseType == TYP_VOID);
1040             assert(op1 == nullptr);
1041             assert(op2 == nullptr);
1042             emit->emitIns(INS_mfence);
1043             break;
1044         }
1045
1046         case NI_SSE2_MoveMask:
1047         {
1048             assert(op2 == nullptr);
1049             assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);
1050
1051             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1052             emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
1053             break;
1054         }
1055
1056         case NI_SSE2_SetScalarVector128:
1057         {
1058             assert(baseType == TYP_DOUBLE);
1059             assert(op2 == nullptr);
1060
1061             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType);
1062             if (op1Reg == targetReg)
1063             {
1064                 regNumber tmpReg = node->GetSingleTempReg();
1065
1066                 // Ensure we aren't overwriting targetReg
1067                 assert(tmpReg != targetReg);
1068
1069                 emit->emitIns_R_R(INS_movapd, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg);
1070                 op1Reg = tmpReg;
1071             }
1072
1073             emit->emitIns_SIMD_R_R_R(INS_xorpd, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1074             emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg);
1075             break;
1076         }
1077
1078         case NI_SSE2_SetZeroVector128:
1079         {
1080             assert(baseType != TYP_FLOAT);
1081             assert(baseType >= TYP_BYTE && baseType <= TYP_DOUBLE);
1082             assert(op1 == nullptr);
1083             assert(op2 == nullptr);
1084
1085             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1086             emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
1087             break;
1088         }
1089
1090         case NI_SSE2_StoreNonTemporal:
1091         {
1092             assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
1093             assert(op1 != nullptr);
1094             assert(op2 != nullptr);
1095
1096             op2Reg          = op2->gtRegNum;
1097             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1098             emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
1099             break;
1100         }
1101
1102         default:
1103             unreached();
1104             break;
1105     }
1106
1107     genProduceReg(node);
1108 }
1109
1110 //------------------------------------------------------------------------
1111 // genSSE41Intrinsic: Generates the code for an SSE4.1 hardware intrinsic node
1112 //
1113 // Arguments:
1114 //    node - The hardware intrinsic node
1115 //
1116 void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
1117 {
1118     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1119     GenTree*       op1         = node->gtGetOp1();
1120     GenTree*       op2         = node->gtGetOp2();
1121     GenTree*       op3         = nullptr;
1122     GenTree*       op4         = nullptr;
1123     regNumber      targetReg   = node->gtRegNum;
1124     var_types      targetType  = node->TypeGet();
1125     var_types      baseType    = node->gtSIMDBaseType;
1126
1127     regNumber op1Reg = REG_NA;
1128     regNumber op2Reg = REG_NA;
1129     regNumber op3Reg = REG_NA;
1130     regNumber op4Reg = REG_NA;
1131     emitter*  emit   = getEmitter();
1132
1133     if ((op1 != nullptr) && !op1->OperIsList())
1134     {
1135         op1Reg = op1->gtRegNum;
1136         genConsumeOperands(node);
1137     }
1138
1139     switch (intrinsicID)
1140     {
1141         case NI_SSE41_TestAllOnes:
1142         {
1143             regNumber tmpReg = node->GetSingleTempReg();
1144             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1145             emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
1146             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1147             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, tmpReg);
1148             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1149             break;
1150         }
1151
1152         case NI_SSE41_TestAllZeros:
1153         case NI_SSE41_TestZ:
1154         {
1155             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1156             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1157             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1158             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1159             break;
1160         }
1161
1162         case NI_SSE41_TestC:
1163         {
1164             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1165             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1166             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1167             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1168             break;
1169         }
1170
1171         case NI_SSE41_TestMixOnesZeros:
1172         case NI_SSE41_TestNotZAndNotC:
1173         {
1174             assert(Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType) == INS_ptest);
1175             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1176             emit->emitIns_R_R(INS_ptest, emitTypeSize(TYP_SIMD16), op1Reg, op2->gtRegNum);
1177             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1178             break;
1179         }
1180
1181         case NI_SSE41_Extract:
1182         {
1183             regNumber   tmpTargetReg = REG_NA;
1184             instruction ins          = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1185             if (baseType == TYP_FLOAT)
1186             {
1187                 tmpTargetReg = node->ExtractTempReg();
1188             }
1189             auto emitSwCase = [&](unsigned i) {
1190                 if (baseType == TYP_FLOAT)
1191                 {
1192                     // extract instructions return to GP-registers, so it needs int size as the emitsize
1193                     emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, (int)i);
1194                     emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
1195                 }
1196                 else
1197                 {
1198                     emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, (int)i);
1199                 }
1200             };
1201
1202             if (op2->IsCnsIntOrI())
1203             {
1204                 ssize_t ival = op2->AsIntCon()->IconValue();
1205                 emitSwCase((unsigned)ival);
1206             }
1207             else
1208             {
1209                 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1210                 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1211                 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1212                 regNumber baseReg = node->ExtractTempReg();
1213                 regNumber offsReg = node->GetSingleTempReg();
1214                 genHWIntrinsicJumpTableFallback(intrinsicID, op2->gtRegNum, baseReg, offsReg, emitSwCase);
1215             }
1216             break;
1217         }
1218
1219         default:
1220             unreached();
1221             break;
1222     }
1223
1224     genProduceReg(node);
1225 }
1226
1227 //------------------------------------------------------------------------
1228 // genSSE42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
1229 //
1230 // Arguments:
1231 //    node - The hardware intrinsic node
1232 //
1233 void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
1234 {
1235     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1236     GenTree*       op1         = node->gtGetOp1();
1237     GenTree*       op2         = node->gtGetOp2();
1238     regNumber      targetReg   = node->gtRegNum;
1239     assert(targetReg != REG_NA);
1240     var_types targetType = node->TypeGet();
1241     var_types baseType   = node->gtSIMDBaseType;
1242
1243     regNumber op1Reg = op1->gtRegNum;
1244     regNumber op2Reg = op2->gtRegNum;
1245     genConsumeOperands(node);
1246
1247     switch (intrinsicID)
1248     {
1249         case NI_SSE42_Crc32:
1250             if (op1Reg != targetReg)
1251             {
1252                 assert(op2Reg != targetReg);
1253                 inst_RV_RV(INS_mov, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1254             }
1255
1256             if (baseType == TYP_UBYTE || baseType == TYP_USHORT) // baseType is the type of the second argument
1257             {
1258                 assert(targetType == TYP_INT);
1259                 inst_RV_RV(INS_crc32, targetReg, op2Reg, baseType, emitTypeSize(baseType));
1260             }
1261             else
1262             {
1263                 assert(op1->TypeGet() == op2->TypeGet());
1264                 assert(targetType == TYP_INT || targetType == TYP_LONG);
1265                 inst_RV_RV(INS_crc32, targetReg, op2Reg, targetType, emitTypeSize(targetType));
1266             }
1267
1268             break;
1269         default:
1270             unreached();
1271             break;
1272     }
1273     genProduceReg(node);
1274 }
1275
1276 //------------------------------------------------------------------------
1277 // genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node
1278 //
1279 // Arguments:
1280 //    node - The hardware intrinsic node
1281 //
1282 void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
1283 {
1284     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1285     var_types      baseType    = node->gtSIMDBaseType;
1286     emitAttr       attr        = EA_ATTR(node->gtSIMDSize);
1287     var_types      targetType  = node->TypeGet();
1288     instruction    ins         = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
1289     int            numArgs     = Compiler::numArgsOfHWIntrinsic(node);
1290     GenTree*       op1         = node->gtGetOp1();
1291     GenTree*       op2         = node->gtGetOp2();
1292     regNumber      op1Reg      = REG_NA;
1293     regNumber      op2Reg      = REG_NA;
1294     regNumber      targetReg   = node->gtRegNum;
1295     emitter*       emit        = getEmitter();
1296
1297     if ((op1 != nullptr) && !op1->OperIsList())
1298     {
1299         genConsumeOperands(node);
1300     }
1301
1302     switch (intrinsicID)
1303     {
1304         case NI_AVX_SetZeroVector256:
1305         {
1306             assert(op1 == nullptr);
1307             assert(op2 == nullptr);
1308             // SetZeroVector256 will generate pxor with integral base-typ, but pxor is a AVX2 instruction, so we
1309             // generate xorps on AVX machines.
1310             if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType))
1311             {
1312                 emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
1313             }
1314             else
1315             {
1316                 emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg);
1317             }
1318             break;
1319         }
1320
1321         case NI_AVX_SetAllVector256:
1322         {
1323             assert(op1 != nullptr);
1324             assert(op2 == nullptr);
1325             op1Reg = op1->gtRegNum;
1326             if (varTypeIsIntegral(baseType))
1327             {
1328                 // If the argument is a integer, it needs to be moved into a XMM register
1329                 regNumber tmpXMM = node->ExtractTempReg();
1330                 emit->emitIns_R_R(INS_mov_i2xmm, emitActualTypeSize(baseType), tmpXMM, op1Reg);
1331                 op1Reg = tmpXMM;
1332             }
1333
1334             if (compiler->compSupports(InstructionSet_AVX2))
1335             {
1336                 // generate broadcast instructions if AVX2 is available
1337                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
1338             }
1339             else
1340             {
1341                 // duplicate the scalar argument to XMM register
1342                 switch (baseType)
1343                 {
1344                     case TYP_FLOAT:
1345                         emit->emitIns_SIMD_R_R_I(INS_vpermilps, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1346                         break;
1347                     case TYP_DOUBLE:
1348                         emit->emitIns_R_R(INS_movddup, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg);
1349                         break;
1350                     case TYP_BYTE:
1351                     case TYP_UBYTE:
1352                     {
1353                         regNumber tmpZeroReg = node->GetSingleTempReg();
1354                         emit->emitIns_R_R(INS_pxor, emitTypeSize(TYP_SIMD16), tmpZeroReg, tmpZeroReg);
1355                         emit->emitIns_SIMD_R_R_R(INS_pshufb, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, tmpZeroReg);
1356                         break;
1357                     }
1358                     case TYP_SHORT:
1359                     case TYP_USHORT:
1360                         emit->emitIns_SIMD_R_R_I(INS_pshuflw, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1361                         emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 80);
1362                         break;
1363                     case TYP_INT:
1364                     case TYP_UINT:
1365                         emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1366                         break;
1367                     case TYP_LONG:
1368                     case TYP_ULONG:
1369                         emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 68);
1370                         break;
1371
1372                     default:
1373                         unreached();
1374                         break;
1375                 }
1376                 // duplicate the XMM register to YMM register
1377                 emit->emitIns_SIMD_R_R_R_I(INS_vinsertf128, emitTypeSize(TYP_SIMD32), targetReg, op1Reg, op1Reg, 1);
1378             }
1379             break;
1380         }
1381
1382         case NI_AVX_ExtendToVector256:
1383         {
1384             // ExtendToVector256 has zero-extend semantics in order to ensure it is deterministic
1385             // We always emit a move to the target register, even when op1Reg == targetReg, in order
1386             // to ensure that Bits MAXVL-1:128 are zeroed.
1387
1388             assert(op2 == nullptr);
1389             regNumber op1Reg = op1->gtRegNum;
1390             emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg);
1391             break;
1392         }
1393
1394         case NI_AVX_GetLowerHalf:
1395         case NI_AVX_StaticCast:
1396         {
1397             assert(op2 == nullptr);
1398             regNumber op1Reg = op1->gtRegNum;
1399
1400             if (op1Reg != targetReg)
1401             {
1402                 emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
1403             }
1404             break;
1405         }
1406
1407         case NI_AVX_TestC:
1408         {
1409             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1410             emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1411             emit->emitIns_R(INS_setb, EA_1BYTE, targetReg);
1412             break;
1413         }
1414
1415         case NI_AVX_TestNotZAndNotC:
1416         {
1417             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1418             emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1419             emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
1420             break;
1421         }
1422
1423         case NI_AVX_TestZ:
1424         {
1425             emit->emitIns_R_R(INS_xor, EA_4BYTE, targetReg, targetReg);
1426             emit->emitIns_R_R(ins, attr, op1->gtRegNum, op2->gtRegNum);
1427             emit->emitIns_R(INS_sete, EA_1BYTE, targetReg);
1428             break;
1429         }
1430
1431         case NI_AVX_ExtractVector128:
1432         case NI_AVX_InsertVector128:
1433         case NI_AVX2_ExtractVector128:
1434         case NI_AVX2_InsertVector128:
1435         {
1436             GenTree* lastOp = nullptr;
1437             if (numArgs == 2)
1438             {
1439                 assert(intrinsicID == NI_AVX_ExtractVector128 || NI_AVX_ExtractVector128);
1440                 op1Reg = op1->gtRegNum;
1441                 op2Reg = op2->gtRegNum;
1442                 lastOp = op2;
1443             }
1444             else
1445             {
1446                 assert(numArgs == 3);
1447                 assert(op1->OperIsList());
1448                 assert(op1->gtGetOp2()->OperIsList());
1449                 assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
1450
1451                 GenTreeArgList* argList = op1->AsArgList();
1452                 op1                     = argList->Current();
1453                 genConsumeRegs(op1);
1454                 op1Reg = op1->gtRegNum;
1455
1456                 argList = argList->Rest();
1457                 op2     = argList->Current();
1458                 genConsumeRegs(op2);
1459                 op2Reg = op2->gtRegNum;
1460
1461                 argList = argList->Rest();
1462                 lastOp  = argList->Current();
1463                 genConsumeRegs(lastOp);
1464             }
1465
1466             regNumber op3Reg = lastOp->gtRegNum;
1467
1468             auto emitSwCase = [&](unsigned i) {
1469                 // TODO-XARCH-Bug the emitter cannot work with imm8 >= 128,
1470                 // so clear the 8th bit that is not used by the instructions
1471                 i &= 0x7FU;
1472                 if (numArgs == 3)
1473                 {
1474                     if (intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128)
1475                     {
1476                         emit->emitIns_AR_R_I(ins, attr, op1Reg, 0, op2Reg, (int)i);
1477                     }
1478                     else if (op2->TypeGet() == TYP_I_IMPL)
1479                     {
1480                         emit->emitIns_SIMD_R_R_AR_I(ins, attr, targetReg, op1Reg, op2Reg, (int)i);
1481                     }
1482                     else
1483                     {
1484                         assert(op2->TypeGet() == TYP_SIMD16);
1485                         emit->emitIns_SIMD_R_R_R_I(ins, attr, targetReg, op1Reg, op2Reg, (int)i);
1486                     }
1487                 }
1488                 else
1489                 {
1490                     assert(numArgs == 2);
1491                     assert(intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128);
1492                     emit->emitIns_SIMD_R_R_I(ins, attr, targetReg, op1Reg, (int)i);
1493                 }
1494             };
1495
1496             if (lastOp->IsCnsIntOrI())
1497             {
1498                 ssize_t ival = lastOp->AsIntCon()->IconValue();
1499                 emitSwCase((unsigned)ival);
1500             }
1501             else
1502             {
1503                 // We emit a fallback case for the scenario when the imm-op is not a constant. This should
1504                 // normally happen when the intrinsic is called indirectly, such as via Reflection. However, it
1505                 // can also occur if the consumer calls it directly and just doesn't pass a constant value.
1506                 regNumber baseReg = node->ExtractTempReg();
1507                 regNumber offsReg = node->GetSingleTempReg();
1508                 genHWIntrinsicJumpTableFallback(intrinsicID, op3Reg, baseReg, offsReg, emitSwCase);
1509             }
1510             break;
1511         }
1512
1513         default:
1514             unreached();
1515             break;
1516     }
1517
1518     genProduceReg(node);
1519 }
1520
1521 //------------------------------------------------------------------------
1522 // genAESIntrinsic: Entry point for generating the code of an AES hardware intrinsic node
1523 //
1524 // Arguments:
1525 //    node - The hardware intrinsic node (not examined by the current body)
1526 // Notes: not implemented yet - the body only invokes NYI() with a reminder message
1527 void CodeGen::genAESIntrinsic(GenTreeHWIntrinsic* node)
1528 {
1529     NYI("Implement AES intrinsic code generation");
1530 }
1531
1532 //------------------------------------------------------------------------
1533 // genBMI1Intrinsic: Entry point for generating the code of a BMI1 hardware intrinsic node
1534 //
1535 // Arguments:
1536 //    node - The hardware intrinsic node (not examined by the current body)
1537 // Notes: not implemented yet - the body only invokes NYI() with a reminder message
1538 void CodeGen::genBMI1Intrinsic(GenTreeHWIntrinsic* node)
1539 {
1540     NYI("Implement BMI1 intrinsic code generation");
1541 }
1542
1543 //------------------------------------------------------------------------
1544 // genBMI2Intrinsic: Placeholder for generating the code for a BMI2 hardware intrinsic node
1545 //
1546 // Arguments:
1547 //    node - The hardware intrinsic node (not referenced by the current body)
1548 // Notes: not implemented yet - the body consists solely of the NYI statement.
1549 void CodeGen::genBMI2Intrinsic(GenTreeHWIntrinsic* node)
1550 {
1551     NYI("Implement BMI2 intrinsic code generation");
1552 }
1553
1554 //------------------------------------------------------------------------
1555 // genFMAIntrinsic: Placeholder for generating the code for an FMA hardware intrinsic node
1556 //
1557 // Arguments:
1558 //    node - The hardware intrinsic node (not referenced by the current body)
1559 // Notes: not implemented yet - the body consists solely of the NYI statement.
1560 void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
1561 {
1562     NYI("Implement FMA intrinsic code generation");
1563 }
1564
1565 //------------------------------------------------------------------------
1566 // genLZCNTIntrinsic: Generates the code for a LZCNT hardware intrinsic node
1567 //
1568 // Arguments:
1569 //    node - The hardware intrinsic node
1570 //
1571 void CodeGen::genLZCNTIntrinsic(GenTreeHWIntrinsic* node)
1572 {
1573     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1574     GenTree*       op1         = node->gtGetOp1();
1575     regNumber      targetReg   = node->gtRegNum;
1576     assert(targetReg != REG_NA);
1577     var_types targetType = node->TypeGet();
1578     regNumber op1Reg     = op1->gtRegNum;
1579     genConsumeOperands(node);
1580
1581     assert(intrinsicID == NI_LZCNT_LeadingZeroCount);
1582
1583     inst_RV_RV(INS_lzcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1584
1585     genProduceReg(node);
1586 }
1587
1588 //------------------------------------------------------------------------
1589 // genPCLMULQDQIntrinsic: Placeholder for generating the code for a PCLMULQDQ hardware intrinsic node
1590 //
1591 // Arguments:
1592 //    node - The hardware intrinsic node (not referenced by the current body)
1593 // Notes: not implemented yet - the body consists solely of the NYI statement.
1594 void CodeGen::genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node)
1595 {
1596     NYI("Implement PCLMULQDQ intrinsic code generation");
1597 }
1598
1599 //------------------------------------------------------------------------
1600 // genPOPCNTIntrinsic: Generates the code for a POPCNT hardware intrinsic node
1601 //
1602 // Arguments:
1603 //    node - The hardware intrinsic node
1604 //
1605 void CodeGen::genPOPCNTIntrinsic(GenTreeHWIntrinsic* node)
1606 {
1607     NamedIntrinsic intrinsicID = node->gtHWIntrinsicId;
1608     GenTree*       op1         = node->gtGetOp1();
1609     regNumber      targetReg   = node->gtRegNum;
1610     assert(targetReg != REG_NA);
1611     var_types targetType = node->TypeGet();
1612     regNumber op1Reg     = op1->gtRegNum;
1613     genConsumeOperands(node);
1614
1615     assert(intrinsicID == NI_POPCNT_PopCount);
1616
1617     inst_RV_RV(INS_popcnt, targetReg, op1Reg, targetType, emitTypeSize(targetType));
1618
1619     genProduceReg(node);
1620 }
1621
1622 #endif // FEATURE_HW_INTRINSICS