src/jit/simdcodegenxarch.cpp

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   6 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   7 XX                                                                           XX
   8 XX                        Amd64 SIMD Code Generator                          XX
   9 XX                                                                           XX
  10 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  11 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  12 */
  13 #include "jitpch.h"
  14 #ifdef _MSC_VER
  15 #pragma hdrstop
  16 #endif
  17
  18 #ifndef LEGACY_BACKEND // This file is ONLY used for the RyuJIT backend that uses the linear scan register allocator.
  19
  20 #ifdef _TARGET_AMD64_
  21 #include "emit.h"
  22 #include "codegen.h"
  23 #include "lower.h"
  24 #include "gcinfo.h"
  25 #include "gcinfoencoder.h"
  26
  27 #ifdef FEATURE_SIMD
  28
  29 // Instruction immediates
  30
  31 // Insertps:
  32 // - bits 6 and 7 of the immediate indicate which source item to select (0..3)
  33 // - bits 4 and 5 of the immediate indicate which target item to insert into (0..3)
  34 // - bits 0 to 3 of the immediate indicate which target item to zero
  35 #define INSERTPS_SOURCE_SELECT(i)       (i<<6)
  36 #define INSERTPS_TARGET_SELECT(i)       (i<<4)
  37 #define INSERTPS_ZERO(i)                (1<<i)
  38
  39 // getOpForSIMDIntrinsic: return the opcode for the given SIMD Intrinsic
  40 //
  41 // Arguments:
  42 //   intrinsicId    -   SIMD intrinsic Id
  43 //   baseType       -   Base type of the SIMD vector
  44 //   ival           -   Out param. Any immediate byte operand that needs to be passed to SSE2 opcode.
     //                      Only written (and asserted to be non-null) for the float/double forms of
     //                      Equal, LessThan and LessThanOrEqual; left untouched in every other case.
  45 //
  46 //
  47 // Return Value:
  48 //   Instruction (op) to be used, and ival is set if instruction requires an immediate operand.
  49 //
     // Notes:
     //   An intrinsic/base type pair that matches no branch below leaves the result as INS_invalid and
     //   trips the noway_assert at the end of the function; some cases call unreached() explicitly instead.
     //
  50 instruction
  51 CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId,
  52                                var_types baseType,
  53                                unsigned *ival /*=nullptr*/)
  54 {
  55     // Minimal required instruction set is SSE2.
  56     assert(compiler->canUseSSE2());
  57
  58     instruction result = INS_invalid;
  59     switch(intrinsicId)
  60     {
  61         case SIMDIntrinsicInit:
  62             if (compiler->canUseAVX())
  63             {
  64                 // AVX supports broadcast instructions to populate YMM reg with a single float/double value from memory.
  65                 // AVX2 supports broadcast instructions to populate YMM reg with a single value from memory or mm reg.
  66                 // If we decide to use AVX2 only, we can remove this assert.
  67                 if ((compiler->opts.eeFlags & CORJIT_FLG_USE_AVX2) == 0)
  68                 {
  69                     assert(baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
  70                 }
                     // Pick the broadcast instruction by element type; signed and unsigned types of the
                     // same width share one instruction.
  71                 switch (baseType)
  72                 {
  73                 case TYP_FLOAT:     result = INS_vbroadcastss; break;
  74                 case TYP_DOUBLE:    result = INS_vbroadcastsd; break;
  75                 case TYP_ULONG:     __fallthrough;
  76                 case TYP_LONG:      result = INS_vpbroadcastq; break;
  77                 case TYP_UINT:      __fallthrough;
  78                 case TYP_INT:       result = INS_vpbroadcastd; break;
  79                 case TYP_CHAR:      __fallthrough;
  80                 case TYP_SHORT:     result = INS_vpbroadcastw; break;
  81                 case TYP_UBYTE:     __fallthrough;
  82                 case TYP_BYTE:      result = INS_vpbroadcastb; break;
  83                 default:            unreached();
  84                 }
  85                 break;
  86             }
  87             // For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
  88             __fallthrough;
  89         case SIMDIntrinsicShuffleSSE2:
                 // Small integer base types have no branch here, so they leave result as INS_invalid.
  90             if (baseType == TYP_FLOAT)
  91             {
  92                 result = INS_shufps;
  93             }
  94             else if (baseType == TYP_DOUBLE)
  95             {
  96                 result = INS_shufpd;
  97             }
  98             else if (baseType == TYP_INT || baseType == TYP_UINT)
  99             {
 100                 result = INS_pshufd;
 101             }
 102             else if (baseType == TYP_LONG || baseType == TYP_ULONG)
 103             {
 104                 // We don't have a separate SSE2 instruction and will
 105                 // use the instruction meant for doubles since it is
 106                 // of the same size as a long.
 107                 result = INS_shufpd;
 108             }
 109             break;
 110
 111         case SIMDIntrinsicSqrt:
                 // Square root is only defined here for floating point vectors.
 112             if (baseType == TYP_FLOAT)
 113             {
 114                 result = INS_sqrtps;
 115             }
 116             else if (baseType == TYP_DOUBLE)
 117             {
 118                 result = INS_sqrtpd;
 119             }
 120             else
 121             {
 122                 unreached();
 123             }
 124             break;
 125
 126         case SIMDIntrinsicAdd:
 127             if (baseType == TYP_FLOAT)
 128             {
 129                 result = INS_addps;
 130             }
 131             else if (baseType == TYP_DOUBLE)
 132             {
 133                 result = INS_addpd;
 134             }
 135             else if (baseType == TYP_INT || baseType == TYP_UINT)
 136             {
 137                 result = INS_paddd;
 138             }
 139             else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
 140             {
 141                 result = INS_paddw;
 142             }
 143             else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
 144             {
 145                 result = INS_paddb;
 146             }
 147             else if (baseType == TYP_LONG || baseType == TYP_ULONG)
 148             {
 149                 result = INS_paddq;
 150             }
 151             break;
 152
 153         case SIMDIntrinsicSub:
 154             if (baseType == TYP_FLOAT)
 155             {
 156                 result = INS_subps;
 157             }
 158             else if (baseType == TYP_DOUBLE)
 159             {
 160                 result = INS_subpd;
 161             }
 162             else if (baseType == TYP_INT || baseType == TYP_UINT)
 163             {
 164                 result = INS_psubd;
 165             }
 166             else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
 167             {
 168                 result = INS_psubw;
 169             }
 170             else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
 171             {
 172                 result = INS_psubb;
 173             }
 174             else if (baseType == TYP_LONG || baseType == TYP_ULONG)
 175             {
 176                 result = INS_psubq;
 177             }
 178             break;
 179
 180         case SIMDIntrinsicMul:
 181             if (baseType == TYP_FLOAT)
 182             {
 183                 result = INS_mulps;
 184             }
 185             else if (baseType == TYP_DOUBLE)
 186             {
 187                 result = INS_mulpd;
 188             }
 189             else if (baseType == TYP_SHORT)
 190             {
 191                 result = INS_pmullw;
 192             }
 193             else if (compiler->canUseAVX())
 194             {
                     // pmulld is only selected when AVX is available. TYP_INT without AVX matches no
                     // branch and leaves result as INS_invalid.
 195                 if (baseType == TYP_INT)
 196                 {
 197                     result = INS_pmulld;
 198                 }
 199             }
 200             break;
 201
 202         case SIMDIntrinsicDiv:
                 // Division is only defined here for floating point vectors.
 203             if (baseType == TYP_FLOAT)
 204             {
 205                 result = INS_divps;
 206             }
 207             else if (baseType == TYP_DOUBLE)
 208             {
 209                 result = INS_divpd;
 210             }
 211             else
 212             {
 213                 unreached();
 214             }
 215             break;
 216
 217         case SIMDIntrinsicMin:
                 // Only float, double, unsigned byte and signed short are handled; anything else is unreached().
 218             if (baseType == TYP_FLOAT)
 219             {
 220                 result = INS_minps;
 221             }
 222             else if (baseType == TYP_DOUBLE)
 223             {
 224                 result = INS_minpd;
 225             }
 226             else if (baseType == TYP_UBYTE)
 227             {
 228                 result = INS_pminub;
 229             }
 230             else if (baseType == TYP_SHORT)
 231             {
 232                 result = INS_pminsw;
 233             }
 234             else
 235             {
 236                 unreached();
 237             }
 238             break;
 239
 240         case SIMDIntrinsicMax:
                 // Same set of supported base types as SIMDIntrinsicMin.
 241             if (baseType == TYP_FLOAT)
 242             {
 243                 result = INS_maxps;
 244             }
 245             else if (baseType == TYP_DOUBLE)
 246             {
 247                 result = INS_maxpd;
 248             }
 249             else if (baseType == TYP_UBYTE)
 250             {
 251                 result = INS_pmaxub;
 252             }
 253             else if (baseType == TYP_SHORT)
 254             {
 255                 result = INS_pmaxsw;
 256             }
 257             else
 258             {
 259                 unreached();
 260             }
 261             break;
 262
 263         case SIMDIntrinsicEqual:
                 // The floating point compares share one instruction (cmpps/cmppd) and encode the
                 // relation in the immediate returned through ival; 0 is used for "equal".
 264             if (baseType == TYP_FLOAT)
 265             {
 266                 result = INS_cmpps;
 267                 assert(ival != nullptr);
 268                 *ival = 0;
 269             }
 270             else if (baseType == TYP_DOUBLE)
 271             {
 272                 result = INS_cmppd;
 273                 assert(ival != nullptr);
 274                 *ival = 0;
 275             }
 276             else if (baseType == TYP_INT || baseType == TYP_UINT)
 277             {
 278                 result = INS_pcmpeqd;
 279             }
 280             else if (baseType == TYP_CHAR || baseType == TYP_SHORT)
 281             {
 282                 result = INS_pcmpeqw;
 283             }
 284             else if (baseType == TYP_UBYTE || baseType == TYP_BYTE)
 285             {
 286                 result = INS_pcmpeqb;
 287             }
 288             else if (compiler->canUseAVX() && (baseType == TYP_ULONG || baseType == TYP_LONG))
 289             {
 290                 result = INS_pcmpeqq;
 291             }
 292             break;
 293
 294         case SIMDIntrinsicLessThan:
 295             // Packed integers use > with swapped operands
 296             assert(baseType != TYP_INT);
 297
                 // Immediate 1 is used for "less than" with cmpps/cmppd.
 298             if (baseType == TYP_FLOAT)
 299             {
 300                 result = INS_cmpps;
 301                 assert(ival != nullptr);
 302                 *ival = 1;
 303             }
 304             else if (baseType == TYP_DOUBLE)
 305             {
 306                 result = INS_cmppd;
 307                 assert(ival != nullptr);
 308                 *ival = 1;
 309             }
 310             break;
 311
 312         case SIMDIntrinsicLessThanOrEqual:
 313             // Packed integers use (a==b) || ( b > a) in place of a <= b.
 314             assert(baseType != TYP_INT);
 315
                 // Immediate 2 is used for "less than or equal" with cmpps/cmppd.
 316             if (baseType == TYP_FLOAT)
 317             {
 318                 result = INS_cmpps;
 319                 assert(ival != nullptr);
 320                 *ival = 2;
 321             }
 322             else if (baseType == TYP_DOUBLE)
 323             {
 324                 result = INS_cmppd;
 325                 assert(ival != nullptr);
 326                 *ival = 2;
 327             }
 328             break;
 329
 330         case SIMDIntrinsicGreaterThan:
 331             // Packed float/double use < with swapped operands
 332             assert(!varTypeIsFloating(baseType));
 333
 334             // SSE2 supports only signed >
 335             if (baseType == TYP_INT)
 336             {
 337                 result = INS_pcmpgtd;
 338             }
 339             else if (baseType == TYP_SHORT)
 340             {
 341                 result = INS_pcmpgtw;
 342             }
 343             else if (baseType == TYP_BYTE)
 344             {
 345                 result = INS_pcmpgtb;
 346             }
 347             else if (compiler->canUseAVX() && (baseType == TYP_LONG))
 348             {
 349                 result = INS_pcmpgtq;
 350             }
 351             break;
 352
 353         case SIMDIntrinsicBitwiseAnd:
 354             if (baseType == TYP_FLOAT)
 355             {
 356                 result = INS_andps;
 357             }
 358             else if (baseType == TYP_DOUBLE)
 359             {
 360                 result = INS_andpd;
 361             }
 362             else if (varTypeIsIntegral(baseType))
 363             {
 364                 result = INS_pand;
 365             }
 366             break;
 367
 368         case SIMDIntrinsicBitwiseAndNot:
 369             if (baseType == TYP_FLOAT)
 370             {
 371                 result = INS_andnps;
 372             }
 373             else if (baseType == TYP_DOUBLE)
 374             {
 375                 result = INS_andnpd;
 376             }
                 // NOTE(review): this TYP_INT branch selects the same instruction as the
                 // varTypeIsIntegral branch that follows it, so it looks redundant - presumably
                 // varTypeIsIntegral(TYP_INT) is true; confirm before removing.
 377             else if (baseType == TYP_INT)
 378             {
 379                 result = INS_pandn;
 380             }
 381             else if (varTypeIsIntegral(baseType))
 382             {
 383                 result = INS_pandn;
 384             }
 385             break;
 386
 387         case SIMDIntrinsicBitwiseOr:
 388             if (baseType == TYP_FLOAT)
 389             {
 390                 result = INS_orps;
 391             }
 392             else if (baseType == TYP_DOUBLE)
 393             {
 394                 result = INS_orpd;
 395             }
 396             else if (varTypeIsIntegral(baseType))
 397             {
 398                 result = INS_por;
 399             }
 400             break;
 401
 402         case SIMDIntrinsicBitwiseXor:
 403             if (baseType == TYP_FLOAT)
 404             {
 405                 result = INS_xorps;
 406             }
 407             else if (baseType == TYP_DOUBLE)
 408             {
 409                 result = INS_xorpd;
 410             }
 411             else if (varTypeIsIntegral(baseType))
 412             {
 413                 result = INS_pxor;
 414             }
 415             break;
 416
 417         case SIMDIntrinsicCast:
                 // A cast is emitted as a plain register-to-register move, whatever the base type.
 418             result = INS_movaps;
 419             break;
 420
 421         case SIMDIntrinsicShiftLeftInternal:
 422             // base type doesn't matter since the entire vector is shifted left
 423             result = INS_pslldq;
 424             break;
 425
 426         case SIMDIntrinsicShiftRightInternal:
 427             // base type doesn't matter since the entire vector is shifted right
 428             result = INS_psrldq;
 429             break;
 430
 431         case SIMDIntrinsicUpperSave:
 432             result = INS_vextractf128;
 433             break;
 434
 435         case SIMDIntrinsicUpperRestore:
 436             result = INS_insertps;
 437             break;
 438
 439         default:
 440             assert(!"Unsupported SIMD intrinsic");
 441             unreached();
 442     }
 443
         // Catches every intrinsic/base type pair that fell through a case without choosing an instruction.
 444     noway_assert(result != INS_invalid);
 445     return result;
 446 }
 447
 448 // genSIMDScalarMove: Generate code to move a value of type "type" from src mm reg
 449 // to target mm reg, zeroing out the upper bits if and only if specified.
 450 //
 451 // Arguments:
 452 //    type             the type of value to be moved
 453 //    targetReg        the target reg
 454 //    srcReg           the src reg
 455 //    moveType         action to be performed on target upper bits
     //                       SMT_PreserveUpper                  - only the scalar is written; the rest of
     //                                                            targetReg is kept. No code if the regs match.
     //                       SMT_ZeroInitUpper                  - the scalar is written and the remaining
     //                                                            elements of targetReg are zeroed.
     //                       SMT_ZeroInitUpper_SrcHasUpperZeros - the caller promises the upper bits of srcReg
     //                                                            are already zero, so a plain copy is enough.
     //                                                            No code if the regs match.
 456 //
 457 // Return Value:
 458 //    None
 459 //
 460 // Notes:
 461 //    This is currently only supported for floating point types.
     //    NOTE(review): both SMT_ZeroInitUpper sequences (the insertps immediate and the 12-byte
     //    shift pair) keep exactly the low 4 bytes, which matches TYP_FLOAT; the assert below also
     //    admits TYP_DOUBLE - confirm no caller passes TYP_DOUBLE with SMT_ZeroInitUpper.
 462 //
 463 void
 464 CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
 465 {
         // The full SIMD vector type is what supplies the operand size for the register moves below.
 466     var_types targetType = compiler->getSIMDVectorType();
 467     assert(varTypeIsFloating(type));
 468 #ifdef FEATURE_AVX_SUPPORT
 469     if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
 470     {
 471         switch (moveType)
 472         {
 473         case SMT_PreserveUpper:
 474             if (srcReg != targetReg)
 475             {
 476                 instruction ins = ins_Store(type);
 477                 if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
 478                 {
 479                     // In general, when we use a three-operands move instruction, we want to merge the src with itself.
 480                     // This is an exception in that we actually want the "merge" behavior, so we must specify it with
 481                     // all 3 operands.
 482                     inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
 483                 }
 484                 else
 485                 {
 486                     inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
 487                 }
 488             }
 489             break;
 490
 491         case SMT_ZeroInitUpper:
 492             {
 493                 // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
 494                 // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
 495                 // to zero all but the lower bits.
                     // The immediate evaluates to 0x0E: no source-select bits are set (source item 0),
                     // target item 0 is the insertion slot, and target items 1..3 are zeroed.
 496                 unsigned int insertpsImm = (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
 497                 inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
 498                 break;
 499             }
 500
 501         case SMT_ZeroInitUpper_SrcHasUpperZeros:
 502             if (srcReg != targetReg)
 503             {
 504                 instruction ins = ins_Copy(type);
                     // A three-operand move would merge with the target instead of copying the whole source.
 505                 assert(!getEmitter()->IsThreeOperandMoveAVXInstruction(ins));
 506                 inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
 507             }
 508             break;
 509
 510         default:
 511             unreached();
 512         }
 513     }
 514     else
 515 #endif // FEATURE_AVX_SUPPORT
 516     {
 517         // SSE
 518
 519         switch (moveType)
 520         {
 521         case SMT_PreserveUpper:
 522             if (srcReg != targetReg)
 523             {
 524                 inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
 525             }
 526             break;
 527
 528         case SMT_ZeroInitUpper:
 529             if (srcReg == targetReg)
 530             {
 531                 // There is no guarantee that upper bits of op1Reg are zero.
 532                 // We achieve this by using left logical shift 12-bytes and right logical shift 12 bytes.
                     // Shifting the 16-byte register left and then right by 12 bytes leaves only the
                     // lowest 4 bytes in place and zeroes the other 12.
 533                 instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, type);
 534                 getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
 535                 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, type);
 536                 getEmitter()->emitIns_R_I(ins, EA_16BYTE, srcReg, 12);
 537             }
 538             else
 539             {
                     // Clear the whole target first, then move the scalar in with the same instruction
                     // that SMT_PreserveUpper uses, so the freshly zeroed upper bits are kept.
 540                 genSIMDZero(targetType, TYP_FLOAT, targetReg);
 541                 inst_RV_RV(ins_Store(type), targetReg, srcReg);
 542             }
 543             break;
 544
 545         case SMT_ZeroInitUpper_SrcHasUpperZeros:
 546             if (srcReg != targetReg)
 547             {
 548                 inst_RV_RV(ins_Copy(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
 549             }
 550             break;
 551
 552         default:
 553             unreached();
 554         }
 555     }
 556 }
 557
     //------------------------------------------------------------------------
     // genSIMDZero: Generate code to set every bit of a SIMD register to zero.
     //
     // Arguments:
     //    targetType - type passed to inst_RV_RV; it also determines the operand size
     //                 (via emitActualTypeSize)
     //    baseType   - element type; only used to choose which xor instruction
     //                 getOpForSIMDIntrinsic returns (xorps, xorpd or pxor)
     //    targetReg  - the register to clear; it is used as both operands of the xor
     //
     // Return Value:
     //    None.
     //
 558 void
 559 CodeGen::genSIMDZero(var_types targetType, var_types baseType, regNumber targetReg)
 560 {
 561     // pxor reg, reg
 562     instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseXor, baseType);
 563     inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
 564 }
 565
 566 //------------------------------------------------------------------------
 567 // genSIMDIntrinsicInit: Generate code for SIMD Intrinsic Initialize.
 568 //
 569 // Arguments:
 570 //    simdNode - The GT_SIMD node
 571 //
 572 // Return Value:
 573 //    None.
 574 //
     // Notes:
     //    Three code paths are used, depending on the operand and the instruction set:
     //    1. op1 is contained: zero and integral all-ones constants are materialized without
     //       reading op1; under FEATURE_AVX_SUPPORT any other contained operand is broadcast
     //       from memory.
     //    2. AVX with a 16- or 32-byte vector: op1 is consumed into a register and broadcast.
     //    3. Otherwise (SSE, or a vector smaller than a register): the scalar is placed in
     //       element 0 of targetReg and replicated with the SSE2 shuffle instruction.
     //
 575 void
 576 CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
 577 {
 578     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInit);
 579
 580     GenTree* op1 = simdNode->gtGetOp1();
 581     var_types baseType = simdNode->gtSIMDBaseType;
 582     regNumber targetReg = simdNode->gtRegNum;
 583     assert(targetReg != REG_NA);
 584     var_types targetType = simdNode->TypeGet();
 585     InstructionSet iset = compiler->getSIMDInstructionSet();
 586     unsigned size = simdNode->gtSIMDSize;
 587
 588     // Should never see small int base type vectors except for zero initialization.
 589     noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));
 590
 591     instruction ins = INS_invalid;
 592     if (op1->isContained())
 593     {
 594         if (op1->IsIntegralConst(0) || op1->IsFPZero())
 595         {
 596             genSIMDZero(targetType, baseType, targetReg);
 597         }
 598         else if (varTypeIsIntegral(baseType) && op1->IsIntegralConst(-1))
 599         {
 600             // case of initializing elements of vector with all 1's
 601             // generate pcmpeqd reg, reg
                 // TYP_INT is passed regardless of baseType: comparing a register with itself sets
                 // every bit, and that bit pattern is the same for any integral element width.
 602             ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
 603             inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
 604         }
 605 #ifdef FEATURE_AVX_SUPPORT
 606         else
 607         {
                 // Any other contained operand is broadcast straight from memory, which is only
                 // expected when targeting AVX.
 608             assert(iset == InstructionSet_AVX);
 609             ins = getOpForSIMDIntrinsic(SIMDIntrinsicInit, baseType);
 610             if (op1->IsCnsFltOrDbl())
 611             {
 612                 getEmitter()->emitInsBinary(ins, emitTypeSize(targetType), simdNode, op1);
 613             }
 614             else if (op1->OperIsLocalAddr())
 615             {
                     // Broadcast from the local's stack slot; only a GT_LCL_FLD_ADDR carries a field offset.
 616                 unsigned offset = (op1->OperGet() == GT_LCL_FLD_ADDR) ? op1->gtLclFld.gtLclOffs : 0;
 617                 getEmitter()->emitIns_R_S(ins, emitTypeSize(targetType), targetReg, op1->gtLclVarCommon.gtLclNum, offset);
 618             }
 619             else
 620             {
 621                 unreached();
 622             }
 623         }
 624 #endif // FEATURE_AVX_SUPPORT
 625     }
 626     else if (iset == InstructionSet_AVX && ((size == 32) || (size == 16)))
 627     {
 628         regNumber srcReg = genConsumeReg(op1);
 629         if (baseType == TYP_INT || baseType == TYP_UINT ||
 630             baseType == TYP_LONG || baseType == TYP_ULONG)
 631         {
                 // Move the integer value into targetReg with the int-to-float register copy first,
                 // so that the broadcast below reads its source from targetReg.
 632             ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
 633             assert(ins != INS_invalid);
 634             inst_RV_RV(ins, targetReg, srcReg, baseType, emitTypeSize(baseType));
 635             srcReg = targetReg;
 636         }
 637
             // Broadcast the scalar to every element of targetReg.
 638         ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
 639         getEmitter()->emitIns_R_R(ins, emitActualTypeSize(targetType), targetReg, srcReg);
 640     }
 641     else
 642     {
 643         // If we reach here, op1 is not contained and we are using SSE or it is a SubRegisterSIMDType.
 644         // In either case we are going to use the SSE2 shuffle instruction.
 645
 646         regNumber op1Reg = genConsumeReg(op1);
             // Stays 0 for Vector<T>, which makes the shuffle select element 0 for every result element.
 647         unsigned shuffleControl = 0;
 648
 649         if (compiler->isSubRegisterSIMDType(simdNode))
 650         {
 651              assert(baseType == TYP_FLOAT);
 652
 653             // We cannot assume that upper bits of op1Reg or targetReg be zero.
 654             // Therefore we need to explicitly zero out upper bits.  This is
 655             // essential for the shuffle operation performed below.
 656             //
 657             // If op1 is a float/double constant, we would have loaded it from
 658             // data section using movss/sd.  Similarly if op1 is a memory op we
 659             // would have loaded it using movss/sd.  Movss/sd when loading a xmm reg
 660             // from memory would zero-out upper bits. In these cases we can
 661             // avoid explicitly zero'ing out targetReg if targetReg and op1Reg are the same or do it more efficiently
 662             // if they are not the same.
 663             SIMDScalarMoveType moveType = op1->IsCnsFltOrDbl() || op1->isMemoryOp()
 664                 ? SMT_ZeroInitUpper_SrcHasUpperZeros
 665                 : SMT_ZeroInitUpper;
 666
 667             genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, moveType);
 668
                 // targetReg now holds { value, 0, 0, 0 }. Reading the shufps immediate as four 2-bit
                 // element selectors (lowest bits first): 0x50 selects elements 0,0,1,1 and yields
                 // { v, v, 0, 0 } for an 8-byte vector; 0x40 selects 0,0,0,1 and yields { v, v, v, 0 }
                 // for a 12-byte vector.
 669             if (size == 8)
 670             {
 671                 shuffleControl = 0x50;
 672             }
 673             else if (size == 12)
 674             {
 675                 shuffleControl = 0x40;
 676             }
 677             else
 678             {
 679                 noway_assert(!"Unexpected size for SIMD type");
 680             }
 681         }
 682         else // Vector<T>
 683         {
                 // Get the scalar into element 0 of targetReg; nothing to emit if it is already there.
 684             if (op1Reg != targetReg)
 685             {
 686                 if (varTypeIsFloating(baseType))
 687                 {
 688                     ins = ins_Copy(targetType);
 689                 }
 690                 else if (baseType == TYP_INT || baseType == TYP_UINT ||
 691                          baseType == TYP_LONG || baseType == TYP_ULONG)
 692                 {
 693                     ins = ins_CopyIntToFloat(baseType, TYP_FLOAT);
 694                 }
 695
 696                 assert(ins != INS_invalid);
 697                 inst_RV_RV(ins, targetReg, op1Reg, baseType, emitTypeSize(baseType));
 698             }
 699         }
 700
             // Shuffle targetReg with itself to replicate element 0 according to shuffleControl.
 701         ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
 702         getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, shuffleControl);
 703     }
 704
 705     genProduceReg(simdNode);
 706 }
 707
 708 //-------------------------------------------------------------------------------------------
 709 // genSIMDIntrinsicInitN: Generate code for SIMD Intrinsic Initialize for the form that takes
 710 //                        a number of arguments equal to the length of the Vector.
 711 //
 712 // Arguments:
 713 //    simdNode - The GT_SIMD node
 714 //
 715 // Return Value:
 716 //    None.
 717 //
     // Notes:
     //    The vector is assembled in the single internal register reserved on the node
     //    (gtRsvdRegs) and only copied to the target register once all operands are in place.
     //
 718 void
 719 CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
 720 {
 721     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN);
 722
 723     // Right now this intrinsic is supported only on TYP_FLOAT vectors
 724     var_types baseType = simdNode->gtSIMDBaseType;
 725     noway_assert(baseType == TYP_FLOAT);
 726
 727     regNumber targetReg = simdNode->gtRegNum;
 728     assert(targetReg != REG_NA);
 729
 730     var_types targetType = simdNode->TypeGet();
 731
 732     // Note that we cannot use targetReg before consumed all source operands. Therefore,
 733     // Need an internal register to stitch together all the values into a single vector
 734     // in an XMM reg.
 735     assert(simdNode->gtRsvdRegs != RBM_NONE);
 736     assert(genCountBits(simdNode->gtRsvdRegs) == 1);
 737     regNumber vectorReg = genRegNumFromMask(simdNode->gtRsvdRegs);
 738
 739     // Zero out vectorReg if we are constructing a vector whose size is not equal to targetType vector size.
 740     // For example in case of Vector4f we don't need to zero when using SSE2.
 741     if (compiler->isSubRegisterSIMDType(simdNode))
 742     {
 743         genSIMDZero(targetType, baseType, vectorReg);
 744     }
 745
 746     unsigned int baseTypeSize = genTypeSize(baseType);
 747     instruction insLeftShift = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
 748
 749     // We will first consume the list items in execution (left to right) order,
 750     // and record the registers.
         // NOTE(review): initCount is not checked against SIMD_INTRINSIC_MAX_PARAM_COUNT before
         // operandRegs is indexed - presumably the list length is bounded where the node is
         // created; confirm, or assert it here.
 751     regNumber operandRegs[SIMD_INTRINSIC_MAX_PARAM_COUNT];
 752     unsigned initCount = 0;
 753     for (GenTree* list = simdNode->gtGetOp1(); list != nullptr; list = list->gtGetOp2())
 754     {
 755         assert(list->OperGet() == GT_LIST);
 756         GenTree* listItem = list->gtGetOp1();
 757         assert(listItem->TypeGet() == baseType);
 758         assert(!listItem->isContained());
 759         regNumber operandReg = genConsumeReg(listItem);
 760         operandRegs[initCount] = operandReg;
 761         initCount++;
 762     }
 763
         // Running total of the bytes placed into vectorReg so far.
 764     unsigned int offset = 0;
 765     for (unsigned i = 0; i < initCount; i++)
 766     {
 767         // We will now construct the vector from the list items in reverse order.
 768         // This allows us to efficiently stitch together a vector as follows:
 769         // vectorReg = (vectorReg << baseTypeSize)   (skipped for the first item placed)
 770         // VectorReg[0] = listItemReg
 771         // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
 772         // bits of vectorReg are not modified.
 773
 774         regNumber operandReg = operandRegs[initCount - i - 1];
 775         if (offset != 0)
 776         {
                 // Make room in element 0 by shifting the whole register up by one element.
 777             getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
 778         }
 779         genSIMDScalarMove(baseType, vectorReg, operandReg, SMT_PreserveUpper);
 780
 781         offset += baseTypeSize;
 782     }
 783
         // The operands supplied must add up to exactly the size of the vector being built.
 784     noway_assert(offset == simdNode->gtSIMDSize);
 785
 786     // Load the initialized value.
 787     if (targetReg != vectorReg)
 788     {
 789         inst_RV_RV(ins_Copy(targetType), targetReg, vectorReg, targetType, emitActualTypeSize(targetType));
 790     }
 791     genProduceReg(simdNode);
 792 }
 793
 794 //----------------------------------------------------------------------------------
 795 // genSIMDIntrinsicUnOp: Generate code for SIMD Intrinsic unary operations like sqrt.
 796 //
 797 // Arguments:
 798 //    simdNode - The GT_SIMD node
 799 //
 800 // Return Value:
 801 //    None.
 802 //
     // Notes:
     //    Only SIMDIntrinsicSqrt and SIMDIntrinsicCast are accepted. A cast whose operand is
     //    already in the target register emits no instruction at all.
     //
 803 void
 804 CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode)
 805 {
 806     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSqrt || simdNode->gtSIMDIntrinsicID == SIMDIntrinsicCast);
 807
 808     GenTree* op1 = simdNode->gtGetOp1();
 809     var_types baseType = simdNode->gtSIMDBaseType;
 810     regNumber targetReg = simdNode->gtRegNum;
 811     assert(targetReg != REG_NA);
 812     var_types targetType = simdNode->TypeGet();
 813
 814     regNumber op1Reg = genConsumeReg(op1);
 815     instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
         // Sqrt always needs its instruction; a cast is just a register move, so it is skipped
         // when source and target are the same register.
 816     if (simdNode->gtSIMDIntrinsicID != SIMDIntrinsicCast || targetReg != op1Reg)
 817     {
 818         inst_RV_RV(ins, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
 819     }
 820     genProduceReg(simdNode);
 821 }
 822
 823 //--------------------------------------------------------------------------------
 823 // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations
 824 // add, sub, mul, div, bit-wise And, AndNot, Or and Xor, min and max.
 826 //
 827 // Arguments:
 828 //    simdNode - The GT_SIMD node
 829 //
 830 // Return Value:
 831 //    None.
 832 //
 833 void
 834 CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
 835 {
 836     assert( simdNode->gtSIMDIntrinsicID == SIMDIntrinsicAdd           ||
 837             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicSub           ||
 838             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul           ||
 839             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv           ||
 840             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAnd    ||
 841             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseAndNot ||
 842             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseOr     ||
 843             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicBitwiseXor    ||
 844             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMin           ||
 845             simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMax
 846            );
 847
 848     GenTree* op1 = simdNode->gtGetOp1();
 849     GenTree* op2 = simdNode->gtGetOp2();
 850     var_types baseType = simdNode->gtSIMDBaseType;
 851     regNumber targetReg = simdNode->gtRegNum;
 852     assert(targetReg != REG_NA);
 853     var_types targetType = simdNode->TypeGet();
 854     InstructionSet iset = compiler->getSIMDInstructionSet();
 855
 856     genConsumeOperands(simdNode);
 857     regNumber op1Reg = op1->gtRegNum;
 858     regNumber op2Reg = op2->gtRegNum;
 859     regNumber otherReg = op2Reg;
 860
 861     // Vector<Int>.Mul:
 862     // SSE2 doesn't have an instruction to perform this operation directly
 863     // whereas SSE4.1 does (pmulld).  This is special cased and computed
 864     // as follows.
 865     if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicMul &&
 866         baseType == TYP_INT &&
 867         iset == InstructionSet_SSE2)
 868     {
 869         // We need a temporary register that is NOT the same as the target,
 870         // and we MAY need another.
 871         assert(simdNode->gtRsvdRegs != RBM_NONE);
 872         assert(genCountBits(simdNode->gtRsvdRegs) == 2);
 873
 874         regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
 875         regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
 876         tmpRegsMask &= ~tmpReg1Mask;
 877         regNumber tmpReg = genRegNumFromMask(tmpReg1Mask);
 878         regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
 879         // The register allocator guarantees the following conditions:
 880         // - the only registers that may be the same among op1Reg, op2Reg, tmpReg
 881         //   and tmpReg2 are op1Reg and op2Reg.
 882         // Let's be extra-careful and assert that now.
 883         assert((op1Reg != tmpReg) && (op1Reg != tmpReg2) &&
 884                (op2Reg != tmpReg) && (op2Reg != tmpReg2) &&
 885                (tmpReg != tmpReg2));
 886
 887         // We will start by setting things up so that:
 888         //    - We have op1 in op1Reg and targetReg, and they are different registers.
 889         //    - We have op2 in op2Reg and tmpReg
 890         //    - Either we will leave the input registers (the original op1Reg and op2Reg) unmodified,
 891         //      OR they are the targetReg that will be produced.
 892         //      (Note that in the code we generate below op1Reg and op2Reg are never written.)
 893         // We will copy things as necessary to ensure that this is the case.
 894         // Note that we can swap op1 and op2, since multiplication is commutative.
 895         // We will not modify the values in op1Reg and op2Reg.
 896         // (Though note that if either op1 or op2 is the same as targetReg, we will make
 897         // a copy and use that copy as the input register.  In that case we WILL modify
 898         // the original value in the register, but will wind up with the result in targetReg
 899         // in the end, as expected.)
 900
 901         // First, we need a tmpReg that is NOT the same as targetReg.
 902         // Note that if we have another reg that is the same as targetReg,
 903         // we can use tmpReg2 for that case, as we will not have hit this case.
 904         if (tmpReg == targetReg)
 905         {
 906             tmpReg = tmpReg2;
 907         }
 908
 909         if (op2Reg == targetReg)
 910         {
 911             // We will swap the operands.
 912             // Since the code below only deals with registers, this now becomes the case where
 913             // op1Reg == targetReg.
 914             op2Reg = op1Reg;
 915             op1Reg = targetReg;
 916         }
 917         if (op1Reg == targetReg)
 918         {
 919             // Copy op1, and make tmpReg2 the new op1Reg.
 920             // Note that those regs can't be the same, as we asserted above.
 921             // Also, we know that tmpReg2 hasn't been used, because we couldn't have hit
 922             // the "tmpReg == targetReg" case.
 923             inst_RV_RV(INS_movaps, tmpReg2, op1Reg, targetType, emitActualTypeSize(targetType));
 924             op1Reg = tmpReg2;
 925             inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
 926             // However, we have one more case to worry about: what if op2Reg is also targetReg
 927             // (i.e. we have the same operand as op1 and op2)?
 928             // In that case we will set op2Reg to the same register as op1Reg.
 929             if (op2Reg == targetReg)
 930             {
 931                 op2Reg = tmpReg2;
 932             }
 933         }
 934         else
 935         {
 936             // Copy op1 to targetReg and op2 to tmpReg.
 937             inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
 938             inst_RV_RV(INS_movaps, tmpReg, op2Reg, targetType, emitActualTypeSize(targetType));
 939         }
 940         // Let's assert that things are as we expect.
 941         //    - We have op1 in op1Reg and targetReg, and they are different registers.
 942         assert(op1Reg != targetReg);
 943         //    - We have op2 in op2Reg and tmpReg, and they are different registers.
 944         assert(op2Reg != tmpReg);
 945         //    - Either we are going to leave op1's reg unmodified, or it is the targetReg.
 946         assert((op1->gtRegNum == op1Reg) || (op1->gtRegNum == op2Reg) || (op1->gtRegNum == targetReg));
 947         //    - Similarly, we are going to leave op2's reg unmodified, or it is the targetReg.
 948         assert((op2->gtRegNum == op1Reg) || (op2->gtRegNum == op2Reg) || (op2->gtRegNum == targetReg));
 949
 950         // Now we can generate the code.
 951
 952         // targetReg = op1 >> 4-bytes (op1 is already in targetReg)
 953         getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), targetReg, 4);
 954
 955         // tmpReg  = op2 >> 4-bytes (op2 is already in tmpReg)
 956         getEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(targetType), tmpReg, 4);
 957
 958         // tmp = unsigned double word multiply of targetReg and tmpReg. Essentially
 959         // tmpReg[63:0] = op1[1] * op2[1]
 960         // tmpReg[127:64] = op1[3] * op2[3]
 961         inst_RV_RV(INS_pmuludq, tmpReg, targetReg, targetType, emitActualTypeSize(targetType));
 962
 963         // Extract first and third double word results from tmpReg
 964         // tmpReg = shuffle(0,0,2,0) of tmpReg
 965         getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), tmpReg, tmpReg, 0x08);
 966
 967         // targetReg[63:0] = op1[0] * op2[0]
 968         // targetReg[127:64] = op1[2] * op2[2]
 969         inst_RV_RV(INS_movaps, targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
 970         inst_RV_RV(INS_pmuludq, targetReg, op2Reg, targetType, emitActualTypeSize(targetType));
 971
 972         // Extract first and third double word results from targetReg
 973         // targetReg = shuffle(0,0,2,0) of targetReg
 974         getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(targetType), targetReg, targetReg, 0x08);
 975
 976         // pack the results into a single vector
 977         inst_RV_RV(INS_punpckldq, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
 978     }
 979     else
 980     {
 981         instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
 982
 983         //Currently AVX doesn't support integer.
 984         //if the ins is INS_cvtsi2ss or INS_cvtsi2sd, we won't use AVX.
 985         if (op1Reg != targetReg   &&
 986             compiler->canUseAVX() &&
 987             !(ins == INS_cvtsi2ss || ins == INS_cvtsi2sd) &&
 988             getEmitter()->IsThreeOperandAVXInstruction(ins))
 989         {
 990             inst_RV_RV_RV(ins, targetReg, op1Reg, op2Reg, emitActualTypeSize(targetType));
 991         }
 992         else
 993         {
 994             if (op2Reg == targetReg)
 995             {
 996                 otherReg = op1Reg;
 997             }
 998             else if (op1Reg != targetReg)
 999             {
1000                 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1001             }
1002
1003             inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
1004         }
1005     }
1006
1007     // Vector2/3 div: since the top-most elements will be zero, we end up
1008     // perfoming 0/0 which is a NAN. Therefore, post division we need to set the
1009     // top-most elements to zero. This is achieved by left logical shift followed
1010     // by right logical shift of targetReg.
1011     if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDiv && (simdNode->gtSIMDSize < 16))
1012     {
1013         // These are 16 byte operations, so we subtract from 16 bytes, not the vector register length.
1014         unsigned shiftCount = 16 - simdNode->gtSIMDSize;
1015         assert(shiftCount != 0);
1016         instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
1017         getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
1018         ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1019         getEmitter()->emitIns_R_I(ins, EA_16BYTE, targetReg, shiftCount);
1020     }
1021
1022     genProduceReg(simdNode);
1023 }
1024
1025 //--------------------------------------------------------------------------------
1026 // genSIMDIntrinsicRelOp: Generate code for a SIMD Intrinsic relational operater
1027 // <, <=, >, >= and ==
1028 //
1029 // Arguments:
1030 //    simdNode - The GT_SIMD node
1031 //
1032 // Return Value:
1033 //    None.
1034 //
1035 void
1036 CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
1037 {
1038     GenTree* op1 = simdNode->gtGetOp1();
1039     GenTree* op2 = simdNode->gtGetOp2();
1040     var_types baseType = simdNode->gtSIMDBaseType;
1041     regNumber targetReg = simdNode->gtRegNum;
1042     assert(targetReg != REG_NA);
1043     var_types targetType = simdNode->TypeGet();
1044     InstructionSet iset = compiler->getSIMDInstructionSet();
1045
1046     genConsumeOperands(simdNode);
1047     regNumber op1Reg = op1->gtRegNum;
1048     regNumber op2Reg = op2->gtRegNum;
1049     regNumber otherReg = op2Reg;
1050
1051     switch(simdNode->gtSIMDIntrinsicID)
1052     {
1053     case SIMDIntrinsicEqual:
1054     case SIMDIntrinsicGreaterThan:
1055         {
1056             // SSE2: vector<(u)long> relation op should be implemented in terms of TYP_INT comparison operations
1057             assert(((iset == InstructionSet_AVX) || (baseType != TYP_LONG)) &&
1058                     (baseType != TYP_ULONG));
1059
1060             // Greater-than: Floating point vectors use "<" with swapped operands
1061             if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGreaterThan)
1062             {
1063                 assert(!varTypeIsFloating(baseType));
1064             }
1065
1066             unsigned ival = 0;
1067             instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
1068
1069             // targetReg = op1reg > op2reg
1070             // Therefore, we can optimize if op1Reg == targetReg
1071             otherReg = op2Reg;
1072             if (op1Reg != targetReg)
1073             {
1074                 if (op2Reg == targetReg)
1075                 {
1076                     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicEqual);
1077                     otherReg = op1Reg;
1078                 }
1079                 else
1080                 {
1081                     inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1082                 }
1083             }
1084
1085             if (varTypeIsFloating(baseType))
1086             {
1087                 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, otherReg, ival);
1088             }
1089             else
1090             {
1091                 inst_RV_RV(ins, targetReg, otherReg, targetType, emitActualTypeSize(targetType));
1092             }
1093         }
1094         break;
1095
1096     case SIMDIntrinsicLessThan:
1097     case SIMDIntrinsicLessThanOrEqual:
1098         {
1099             // Int vectors use ">" and ">=" with swapped operands
1100             assert(varTypeIsFloating(baseType));
1101
1102             // Get the instruction opcode for compare operation
1103             unsigned ival;
1104             instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType, &ival);
1105
1106             // targetReg = op1reg RelOp op2reg
1107             // Thefore, we can optimize if op1Reg == targetReg
1108             if (op1Reg != targetReg)
1109             {
1110                 inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1111             }
1112
1113             getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, op2Reg, ival);
1114         }
1115         break;
1116
1117     // (In)Equality that produces bool result instead of a bit vector
1118     case SIMDIntrinsicOpEquality:
1119     case SIMDIntrinsicOpInEquality:
1120         {
1121             assert(genIsValidIntReg(targetReg));
1122
1123             // We need two additional XMM register as scratch
1124             assert(simdNode->gtRsvdRegs != RBM_NONE);
1125             assert(genCountBits(simdNode->gtRsvdRegs) == 2);
1126
1127             regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
1128             regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1129             tmpRegsMask &= ~tmpReg1Mask;
1130             regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1131             regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1132             var_types simdType = op1->TypeGet();
1133             // TODO-1stClassStructs: Temporary to minimize asmDiffs
1134             if (simdType == TYP_DOUBLE)
1135                 simdType = TYP_SIMD8;
1136
1137             // Here we should consider TYP_SIMD12 operands as if they were TYP_SIMD16
1138             // since both the operands will be in XMM registers.
1139             if (simdType == TYP_SIMD12)
1140             {
1141                 simdType = TYP_SIMD16;
1142             }
1143
1144             // tmpReg1 = (op1Reg == op2Reg)
1145             // Call this value of tmpReg1 as 'compResult' for further reference below.
1146             regNumber otherReg = op2Reg;
1147             if (tmpReg1 != op2Reg)
1148             {
1149                 if (tmpReg1 != op1Reg)
1150                 {
1151                     inst_RV_RV(ins_Copy(simdType), tmpReg1, op1Reg, simdType, emitActualTypeSize(simdType));
1152                 }
1153             }
1154             else
1155             {
1156                 otherReg = op1Reg;
1157             }
1158
1159             // For all integer types we can use TYP_INT comparison.
1160             unsigned ival = 0;
1161             instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, varTypeIsFloating(baseType) ? baseType : TYP_INT, &ival);
1162
1163             if (varTypeIsFloating(baseType))
1164             {
1165                 getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(simdType), tmpReg1, otherReg, ival);
1166             }
1167             else
1168             {
1169                 inst_RV_RV(ins, tmpReg1, otherReg, simdType, emitActualTypeSize(simdType));
1170             }
1171
1172             // If we have 32 bytes, start by anding the two 16-byte halves to get a 16-byte result.
1173             if (compiler->canUseAVX() && (simdType == TYP_SIMD32))
1174             {
1175                 // Reduce tmpReg1 from 256-bits to 128-bits bitwise-Anding the lower and uppper 128-bits
1176                 //
1177                 // Generated code sequence
1178                 // - vextractf128 tmpReg2, tmpReg1, 0x01
1179                 //       tmpReg2[128..255] <- 0
1180                 //       tmpReg2[0..127]   <- tmpReg1[128..255]
1181                 // - vandps tmpReg1, tempReg2
1182                 //       This will zero-out upper portion of tmpReg1 and
1183                 //       lower portion of tmpReg1 is and of upper and lower 128-bit comparison result.
1184                 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg2, tmpReg1, 0x01);
1185                 inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
1186             }
1187             // Next, if we have more than 8 bytes, and the two 8-byte halves to get a 8-byte result.
1188             if (simdType != TYP_SIMD8)
1189             {
1190                 // tmpReg2 = Shuffle(tmpReg1, (1,0,3,2))
1191                 // Note: vpshufd is a 128-bit only instruction. Therefore, explicitly pass EA_16BYTE
1192                 getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x4E);
1193
1194                 // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
1195                 //
1196                 // Note that what we have computed is as follows at this point:
1197                 // tmpReg1[0] = compResult[0] & compResult[2]
1198                 // tmpReg1[1] = compResult[1] & compResult[3]
1199                 inst_RV_RV(INS_andps, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType));
1200             }
1201             // At this point, we have either reduced the result to 8 bytes: tmpReg1[0] and tmpReg1[1],
1202             // OR we have a Vector2 (TYP_SIMD8) in tmpReg1, which has only those two fields.
1203
1204             // tmpReg2 = Shuffle(tmpReg1, (0,0,0,1))
1205             // tmpReg2[0] = compResult[1] & compResult[3]
1206             getEmitter()->emitIns_R_R_I(INS_pshufd, EA_16BYTE, tmpReg2, tmpReg1, 0x1);
1207
1208             // tmpReg1 = BitwiseAnd(tmpReg1, tmpReg2)
1209             // That is tmpReg1[0] = compResult[0] & compResult[1] & compResult[2] & compResult[3]
1210             inst_RV_RV(INS_pand, tmpReg1, tmpReg2, simdType, emitActualTypeSize(simdType)); // ??? INS_andps??
1211
1212             // targetReg = lower 32-bits of tmpReg1 = compResult[0] & compResult[1] & compResult[2] & compResult[3]
1213             // (Note that for mov_xmm2i, the int register is always in the reg2 position.
1214             inst_RV_RV(INS_mov_xmm2i, tmpReg1, targetReg, TYP_INT);
1215
1216             // Since we need to compute a bool result, targetReg needs to be set to 1 on true and zero on false.
1217             // Equality:
1218             //   cmp targetReg, 0xFFFFFFFF
1219             //   sete targetReg
1220             //   movzx targetReg, targetReg
1221             //
1222             // InEquality:
1223             //   cmp targetReg, 0xFFFFFFFF
1224             //   setne targetReg
1225             //   movzx targetReg, targetReg
1226             //
1227             getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, targetReg, 0xFFFFFFFF);
1228             inst_RV((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicOpEquality) ? INS_sete : INS_setne, targetReg, TYP_INT, EA_1BYTE);
1229             assert(simdNode->TypeGet() == TYP_INT);
1230             // Set the higher bytes to 0
1231             inst_RV_RV(ins_Move_Extend(TYP_UBYTE, true), targetReg, targetReg, TYP_UBYTE, emitTypeSize(TYP_UBYTE));
1232         }
1233         break;
1234
1235     default:
1236         noway_assert(!"Unimplemented SIMD relational operation.");
1237         unreached();
1238     }
1239
1240     genProduceReg(simdNode);
1241 }
1242
1243
1244 //--------------------------------------------------------------------------------
1245 // genSIMDIntrinsicDotProduct: Generate code for SIMD Intrinsic Dot Product.
1246 //
1247 // Arguments:
1248 //    simdNode - The GT_SIMD node
1249 //
1250 // Return Value:
1251 //    None.
1252 //
1253 void
1254 CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
1255 {
1256     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicDotProduct);
1257
1258     GenTree* op1 = simdNode->gtGetOp1();
1259     GenTree* op2 = simdNode->gtGetOp2();
1260     var_types baseType = simdNode->gtSIMDBaseType;
1261     var_types simdType = op1->TypeGet();
1262     // TODO-1stClassStructs: Temporary to minimize asmDiffs
1263     if (simdType == TYP_DOUBLE)
1264         simdType = TYP_SIMD8;
1265     var_types simdEvalType = (simdType == TYP_SIMD12) ? TYP_SIMD16 : simdType;
1266     regNumber targetReg = simdNode->gtRegNum;
1267     assert(targetReg != REG_NA);
1268
1269     // DotProduct is only supported on floating point types.
1270     var_types targetType = simdNode->TypeGet();
1271     assert(targetType == baseType);
1272     assert(varTypeIsFloating(baseType));
1273
1274     genConsumeOperands(simdNode);
1275     regNumber op1Reg = op1->gtRegNum;
1276     regNumber op2Reg = op2->gtRegNum;
1277
1278     regNumber tmpReg = REG_NA;
1279     // For SSE, or AVX with 32-byte vectors, we need an additional Xmm register as scratch.
1280     // However, it must be distinct from targetReg, so we request two from the register allocator.
1281     // Note that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
1282     if ((compiler->getSIMDInstructionSet() == InstructionSet_SSE2) || (simdEvalType == TYP_SIMD32))
1283     {
1284         assert(simdNode->gtRsvdRegs != RBM_NONE);
1285         assert(genCountBits(simdNode->gtRsvdRegs) == 2);
1286
1287         regMaskTP tmpRegsMask = simdNode->gtRsvdRegs;
1288         regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1289         tmpRegsMask &= ~tmpReg1Mask;
1290         regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1291         regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1292
1293         // Choose any register different from targetReg as tmpReg
1294         if (tmpReg1 != targetReg)
1295         {
1296             tmpReg = tmpReg1;
1297         }
1298         else
1299         {
1300             assert(targetReg != tmpReg2);
1301             tmpReg = tmpReg2;
1302         }
1303         assert(tmpReg != REG_NA);
1304         assert(tmpReg != targetReg);
1305     }
1306
1307     if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
1308     {
1309         // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg
1310         if (op1Reg == targetReg)
1311         {
1312             // Best case
1313             // nothing to do, we have registers in the right place
1314         }
1315         else if (op2Reg == targetReg)
1316         {
1317             op2Reg = op1Reg;
1318         }
1319         else
1320         {
1321             inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
1322         }
1323
1324         // DotProduct(v1, v2)
1325         // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg
1326         if (baseType == TYP_FLOAT)
1327         {
1328             // v0 = v1 * v2
1329             // tmp = v0                                       // v0  = (3, 2, 1, 0) - each element is given by its position
1330             // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1))      // tmp = (2, 3, 0, 1)
1331             // v0 = v0 + tmp                                  // v0  = (3+2, 2+3, 1+0, 0+1)
1332             // tmp = v0
1333             // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3))      // tmp = (0+1, 1+0, 2+3, 3+2)
1334             // v0 = v0 + tmp                                  // v0  = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
1335             //                                                // Essentially horizontal addtion of all elements.
1336             //                                                // We could achieve the same using SSEv3 instruction HADDPS.
1337             //
1338             inst_RV_RV(INS_mulps, targetReg, op2Reg);
1339             inst_RV_RV(INS_movaps, tmpReg, targetReg);
1340             inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0xb1);
1341             inst_RV_RV(INS_addps, targetReg, tmpReg);
1342             inst_RV_RV(INS_movaps, tmpReg, targetReg);
1343             inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg, tmpReg, 0x1b);
1344             inst_RV_RV(INS_addps, targetReg, tmpReg);
1345         }
1346         else if (baseType == TYP_DOUBLE)
1347         {
1348             // v0 = v1 * v2
1349             // tmp = v0                                       // v0  = (1, 0) - each element is given by its position
1350             // tmp = shuffle(tmp, tmp, Shuffle(0,1))          // tmp = (0, 1)
1351             // v0 = v0 + tmp                                  // v0  = (1+0, 0+1)
1352             inst_RV_RV(INS_mulpd, targetReg, op2Reg);
1353             inst_RV_RV(INS_movaps, tmpReg, targetReg);
1354             inst_RV_RV_IV(INS_shufpd, EA_16BYTE, tmpReg, tmpReg, 0x01);
1355             inst_RV_RV(INS_addpd, targetReg, tmpReg);
1356         }
1357         else
1358         {
1359             unreached();
1360         }
1361     }
1362     else
1363     {
1364         // We avoid reg move if either op1Reg == targetReg or op2Reg == targetReg.
1365         // Note that this is a duplicate of the code above for SSE, but in the AVX case we can eventually
1366         // use the 3-op form, so that we can avoid these copies.
1367         // TODO-CQ: Add inst_RV_RV_RV_IV().
1368         if (op1Reg == targetReg)
1369         {
1370             // Best case
1371             // nothing to do, we have registers in the right place
1372         }
1373         else if (op2Reg == targetReg)
1374         {
1375             op2Reg = op1Reg;
1376         }
1377         else
1378         {
1379             inst_RV_RV(ins_Copy(simdType), targetReg, op1Reg, simdEvalType, emitActualTypeSize(simdType));
1380         }
1381
1382         emitAttr emitSize = emitActualTypeSize(simdEvalType);
1383         if (baseType == TYP_FLOAT)
1384         {
1385             // dpps computes the dot product of the upper & lower halves of the 32-byte register.
1386             // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
1387             inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
1388             // If this is TYP_SIMD32, we need to combine the lower & upper results.
1389             if (simdEvalType == TYP_SIMD32)
1390             {
1391                 getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
1392                 inst_RV_RV(INS_addps, targetReg, tmpReg, targetType, emitTypeSize(targetType));
1393             }
1394         }
1395         else if (baseType == TYP_DOUBLE)
1396         {
1397             // On AVX, we have no 16-byte vectors of double.  Note that, if we did, we could use
1398             // dppd directly.
1399             assert(simdType == TYP_SIMD32);
1400
1401             // targetReg = targetReg * op2Reg
1402             // targetReg = vhaddpd(targetReg, targetReg) ; horizontal sum of lower & upper halves
1403             // tmpReg    = vextractf128(targetReg, 1)    ; Moves the upper sum into tempReg
1404             // targetReg = targetReg + tmpReg
1405             inst_RV_RV(INS_mulpd, targetReg, op2Reg, simdEvalType, emitActualTypeSize(simdType));
1406             inst_RV_RV(INS_haddpd, targetReg, targetReg, simdEvalType, emitActualTypeSize(simdType));
1407             getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tmpReg, targetReg, 0x01);
1408             inst_RV_RV(INS_addpd, targetReg, tmpReg, targetType, emitTypeSize(targetType));
1409         }
1410         else
1411         {
1412             unreached();
1413         }
1414     }
1415
1416     genProduceReg(simdNode);
1417 }
1418
1419 //------------------------------------------------------------------------------------
1420 // genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
1421 //
1422 // Arguments:
1423 //    simdNode - The GT_SIMD node
1424 //
1425 // Return Value:
1426 //    None.
1427 //
1428 void
1429 CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
1430 {
1431     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
1432
1433     GenTree* op1 = simdNode->gtGetOp1();
1434     GenTree* op2 = simdNode->gtGetOp2();
1435     var_types simdType = op1->TypeGet();
1436     assert(varTypeIsSIMD(simdType));
1437
1438     // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
1439     // since it is in XMM register.
1440     if (simdType == TYP_SIMD12)
1441     {
1442         simdType = TYP_SIMD16;
1443     }
1444
1445     var_types baseType = simdNode->gtSIMDBaseType;
1446     regNumber targetReg = simdNode->gtRegNum;
1447     assert(targetReg != REG_NA);
1448     var_types targetType = simdNode->TypeGet();
1449     assert(targetType == genActualType(baseType));
1450
1451     // GetItem has 2 operands:
1452     // - the source of SIMD type (op1)
1453     // - the index of the value to be returned.
1454     genConsumeOperands(simdNode);
1455     regNumber srcReg = op1->gtRegNum;
1456
1457     // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
1458     // For the non-constant case, we will use the SIMD temp location to store the vector, and
1459     // the load the desired element.
1460     // The range check will already have been performed, so at this point we know we have an index
1461     // within the bounds of the vector.
1462     if (!op2->IsCnsIntOrI())
1463     {
1464         unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
1465         noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
1466         bool isEBPbased;
1467         unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
1468         regNumber indexReg = op2->gtRegNum;
1469
1470         // Store the vector to the temp location.
1471         getEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
1472                                   emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);
1473
1474         // Now, load the desired element.
1475         getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false),   // Load
1476                                     emitTypeSize(baseType),             // Of the vector baseType
1477                                     targetReg,                          // To targetReg
1478                                     (isEBPbased) ? REG_EBP : REG_ESP,   // Stack-based
1479                                     indexReg,                           // Indexed
1480                                     genTypeSize(baseType),              // by the size of the baseType
1481                                     offs);
1482         genProduceReg(simdNode);
1483         return;
1484     }
1485
1486     noway_assert(op2->isContained());
1487     unsigned int index = (unsigned int) op2->gtIntCon.gtIconVal;
1488     unsigned int byteShiftCnt = index * genTypeSize(baseType);
1489
1490     // In general we shouldn't have an index greater than or equal to the length of the vector.
1491     // However, if we have an out-of-range access, under minOpts it will not be optimized
1492     // away. The code will throw before we reach this point, but we still need to generate
1493     // code. In that case, we will simply mask off the upper bits.
1494     if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength())
1495     {
1496         byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1);
1497         index = byteShiftCnt / genTypeSize(baseType);
1498     }
1499
1500     regNumber tmpReg = REG_NA;
1501     if (simdNode->gtRsvdRegs != RBM_NONE)
1502     {
1503         assert(genCountBits(simdNode->gtRsvdRegs) == 1);
1504         tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
1505     }
1506     else
1507     {
1508         assert((byteShiftCnt == 0) ||
1509                varTypeIsFloating(baseType) ||
1510                (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16)));
1511     }
1512
1513     if (byteShiftCnt >= 16)
1514     {
1515         assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
1516         byteShiftCnt -= 16;
1517         regNumber newSrcReg;
1518         if (varTypeIsFloating(baseType))
1519         {
1520             newSrcReg = targetReg;
1521         }
1522         else
1523         {
1524             // Integer types
1525             assert(tmpReg != REG_NA);
1526             newSrcReg = tmpReg;
1527         }
1528         getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01);
1529
1530         srcReg = newSrcReg;
1531     }
1532
1533     // Generate the following sequence:
1534     // 1) baseType is floating point
1535     //   movaps    targetReg, srcReg
1536     //   psrldq    targetReg, byteShiftCnt  <-- not generated if accessing zero'th element
1537     //
1538     // 2) baseType is not floating point
1539     //   movaps    tmpReg, srcReg           <-- not generated if accessing zero'th element
1540     //                                          OR if tmpReg == srcReg
1541     //   psrldq    tmpReg, byteShiftCnt     <-- not generated if accessing zero'th element
1542     //   mov_xmm2i targetReg, tmpReg
1543     if (varTypeIsFloating(baseType))
1544     {
1545         if (targetReg != srcReg)
1546         {
1547             inst_RV_RV(ins_Copy(simdType), targetReg, srcReg, simdType, emitActualTypeSize(simdType));
1548         }
1549
1550         if (byteShiftCnt != 0)
1551         {
1552             instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1553             getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
1554         }
1555     }
1556     else
1557     {
1558         if (varTypeIsSmallInt(baseType))
1559         {
1560             // Note that pextrw extracts 16-bit value by index and zero extends it to 32-bits.
1561             // In case of vector<short> we also need to sign extend the 16-bit value in targetReg
1562             // Vector<byte> - index/2 will give the index of the 16-bit value to extract. Shift right
1563             // by 8-bits if index is odd.  In case of Vector<sbyte> also sign extend targetReg.
1564
1565             unsigned baseSize = genTypeSize(baseType);
1566             if (baseSize == 1)
1567             {
1568                 index /= 2;
1569             }
1570             // We actually want index % 8 for the AVX case (for SSE it will never be > 8).
1571             // Note that this doesn't matter functionally, because the instruction uses just the
1572             // low 3 bits of index, but it's better to use the right value.
1573             if (index > 8)
1574             {
1575                 assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
1576                 index -= 8;
1577             }
1578
1579             getEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);
1580
1581             bool ZeroOrSignExtnReqd = true;
1582             if (baseSize == 1)
1583             {
1584                 if ((op2->gtIntCon.gtIconVal % 2) == 1)
1585                 {
1586                     // Right shift extracted word by 8-bits if index is odd if we are extracting a byte sized element.
1587                     inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);
1588
1589                     // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE
1590                     ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
1591                 }
1592                 // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
1593             }
1594             else
1595             {
1596                 // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT
1597                 assert(baseSize == 2);
1598                 ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
1599             }
1600
1601             if (ZeroOrSignExtnReqd)
1602             {
1603                 // Zero/sign extend the byte/short to 32-bits
1604                 inst_RV_RV(ins_Move_Extend(baseType, false), targetReg, targetReg, baseType, emitTypeSize(baseType));
1605             }
1606         }
1607         else
1608         {
1609             // We need a temp xmm register if the baseType is not floating point and
1610             // accessing non-zero'th element.
1611             instruction ins;
1612
1613             if (byteShiftCnt != 0)
1614             {
1615                 assert(tmpReg != REG_NA);
1616
1617                 if (tmpReg != srcReg)
1618                 {
1619                     inst_RV_RV(ins_Copy(simdType), tmpReg, srcReg, simdType, emitActualTypeSize(simdType));
1620                 }
1621
1622                 ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType);
1623                 getEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
1624             }
1625             else
1626             {
1627                 tmpReg = srcReg;
1628             }
1629
1630             assert(tmpReg != REG_NA);
1631             ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
1632             // (Note that for mov_xmm2i, the int register is always in the reg2 position.
1633             inst_RV_RV(ins, tmpReg, targetReg, baseType);
1634         }
1635     }
1636
1637     genProduceReg(simdNode);
1638 }
1639
1640 //------------------------------------------------------------------------------------
1641 // genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
1642 //
1643 // Arguments:
1644 //    simdNode - The GT_SIMD node
1645 //
1646 // Return Value:
1647 //    None.
1648 //
1649 // TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
1650 //
1651 void
1652 CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
1653 {
1654     // Determine index based on intrinsic ID
1655     int index = -1;
1656     switch(simdNode->gtSIMDIntrinsicID)
1657     {
1658     case SIMDIntrinsicSetX:
1659         index = 0;
1660         break;
1661     case SIMDIntrinsicSetY:
1662         index = 1;
1663         break;
1664     case SIMDIntrinsicSetZ:
1665         index = 2;
1666         break;
1667     case SIMDIntrinsicSetW:
1668         index = 3;
1669         break;
1670
1671     default:
1672         unreached();
1673     }
1674     assert(index != -1);
1675
1676     // op1 is the SIMD vector
1677     // op2 is the value to be set
1678     GenTree* op1 = simdNode->gtGetOp1();
1679     GenTree* op2 = simdNode->gtGetOp2();
1680
1681     var_types baseType = simdNode->gtSIMDBaseType;
1682     regNumber targetReg = simdNode->gtRegNum;
1683     assert(targetReg != REG_NA);
1684     var_types targetType = simdNode->TypeGet();
1685     assert(varTypeIsSIMD(targetType));
1686
1687     // the following assert must hold.
1688     // supported only on vector2f/3f/4f right now
1689     noway_assert(baseType == TYP_FLOAT);
1690     assert(op2->TypeGet() == baseType);
1691     assert(simdNode->gtSIMDSize >= ((index + 1) * genTypeSize(baseType)));
1692
1693     genConsumeOperands(simdNode);
1694     regNumber op1Reg = op1->gtRegNum;
1695     regNumber op2Reg = op2->gtRegNum;
1696
1697     // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
1698     if (targetReg != op1Reg)
1699     {
1700         inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1701     }
1702
1703     // Right now this intrinsic is supported only for float base type vectors.
1704     // If in future need to support on other base type vectors, the below
1705     // logic needs modification.
1706     noway_assert(baseType == TYP_FLOAT);
1707
1708     if (compiler->getSIMDInstructionSet() == InstructionSet_SSE2)
1709     {
1710         // We need one additional int register as scratch
1711         assert(simdNode->gtRsvdRegs != RBM_NONE);
1712         assert(genCountBits(simdNode->gtRsvdRegs) == 1);
1713         regNumber tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
1714         assert(genIsValidIntReg(tmpReg));
1715
1716         // Move the value from xmm reg to an int reg
1717         instruction ins = ins_CopyFloatToInt(TYP_FLOAT, TYP_INT);
1718         // (Note that for mov_xmm2i, the int register is always in the reg2 position.
1719         inst_RV_RV(ins, op2Reg, tmpReg, baseType);
1720
1721         // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
1722         // since every float has two 16-bit words.
1723         getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index);
1724
1725         // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
1726         inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
1727         getEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2*index+1);
1728     }
1729     else
1730     {
1731         unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0)|INSERTPS_TARGET_SELECT(index));
1732         inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, insertpsImm);
1733     }
1734
1735     genProduceReg(simdNode);
1736 }
1737
1738 //------------------------------------------------------------------------
1739 // genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
1740 //
1741 // Arguments:
1742 //    simdNode - The GT_SIMD node
1743 //
1744 // Return Value:
1745 //    None.
1746 //
1747 void
1748 CodeGen::genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode)
1749 {
1750     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicShuffleSSE2);
1751     noway_assert(compiler->getSIMDInstructionSet() == InstructionSet_SSE2);
1752
1753     GenTree* op1 = simdNode->gtGetOp1();
1754     GenTree* op2 = simdNode->gtGetOp2();
1755     assert(op2->isContained());
1756     assert(op2->IsCnsIntOrI());
1757     int shuffleControl = (int) op2->AsIntConCommon()->IconValue();
1758     var_types baseType = simdNode->gtSIMDBaseType;
1759     var_types targetType = simdNode->TypeGet();
1760     regNumber targetReg = simdNode->gtRegNum;
1761     assert(targetReg != REG_NA);
1762
1763     regNumber op1Reg = genConsumeReg(op1);
1764     if (targetReg != op1Reg)
1765     {
1766         inst_RV_RV(ins_Copy(targetType), targetReg, op1Reg, targetType, emitActualTypeSize(targetType));
1767     }
1768
1769     instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType);
1770     getEmitter()->emitIns_R_R_I(ins, emitTypeSize(baseType), targetReg, targetReg, shuffleControl);
1771     genProduceReg(simdNode);
1772 }
1773
1774 //-----------------------------------------------------------------------------
1775 // genStoreIndTypeSIMD12: store indirect a TYP_SIMD12 (i.e. Vector3) to memory.
1776 // Since Vector3 is not a hardware supported write size, it is performed
1777 // as two writes: 8 byte followed by 4-byte.
1778 //
1779 // Arguments:
1780 //    treeNode - tree node that is attempting to store indirect
1781 //
1782 //
1783 // Return Value:
1784 //    None.
1785 //
1786 void
1787 CodeGen::genStoreIndTypeSIMD12(GenTree* treeNode)
1788 {
1789     assert(treeNode->OperGet() == GT_STOREIND);
1790
1791     GenTree* addr = treeNode->gtOp.gtOp1;
1792     GenTree* data = treeNode->gtOp.gtOp2;
1793
1794     // addr and data should not be contained.
1795     assert(!data->isContained());
1796     assert(!addr->isContained());
1797
1798 #ifdef DEBUG
1799     // Should not require a write barrier
1800     GCInfo::WriteBarrierForm writeBarrierForm = gcInfo.gcIsWriteBarrierCandidate(treeNode, data);
1801     assert(writeBarrierForm == GCInfo::WBF_NoBarrier);
1802 #endif
1803
1804     // Need an addtional Xmm register to extract upper 4 bytes from data.
1805     assert(treeNode->gtRsvdRegs != RBM_NONE);
1806     assert(genCountBits(treeNode->gtRsvdRegs) == 1);
1807     regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
1808
1809     genConsumeOperands(treeNode->AsOp());
1810
1811     // 8-byte write
1812     getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, data->gtRegNum, addr->gtRegNum, 0);
1813
1814     // Extract upper 4-bytes from data
1815     getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, data->gtRegNum, 0x02);
1816
1817     // 4-byte write
1818     getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, addr->gtRegNum, 8);
1819 }
1820
1821 //-----------------------------------------------------------------------------
1822 // genLoadIndTypeSIMD12: load indirect a TYP_SIMD12 (i.e. Vector3) value.
1823 // Since Vector3 is not a hardware supported write size, it is performed
1824 // as two loads: 8 byte followed by 4-byte.
1825 //
1826 // Arguments:
1827 //    treeNode - tree node of GT_IND
1828 //
1829 //
1830 // Return Value:
1831 //    None.
1832 //
1833 void
1834 CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
1835 {
1836     assert(treeNode->OperGet() == GT_IND);
1837
1838     regNumber targetReg = treeNode->gtRegNum;
1839     GenTreePtr op1 = treeNode->gtOp.gtOp1;
1840     assert(!op1->isContained());
1841     regNumber operandReg = genConsumeReg(op1);
1842
1843     // Need an addtional Xmm register to read upper 4 bytes, which is different from targetReg
1844     assert(treeNode->gtRsvdRegs != RBM_NONE);
1845     assert(genCountBits(treeNode->gtRsvdRegs) == 2);
1846
1847     regNumber tmpReg = REG_NA;
1848     regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
1849     regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1850     tmpRegsMask &= ~tmpReg1Mask;
1851     regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1852     regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1853
1854     // Choose any register different from targetReg as tmpReg
1855     if (tmpReg1 != targetReg)
1856     {
1857         tmpReg = tmpReg1;
1858     }
1859     else
1860     {
1861         assert(targetReg != tmpReg2);
1862         tmpReg = tmpReg2;
1863     }
1864     assert(tmpReg != REG_NA);
1865     assert(tmpReg != targetReg);
1866
1867     // Load upper 4 bytes in tmpReg
1868     getEmitter()->emitIns_R_AR(ins_Load(TYP_FLOAT), EA_4BYTE, tmpReg, operandReg, 8);
1869
1870     // Load lower 8 bytes in targetReg
1871     getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
1872
1873     // combine upper 4 bytes and lower 8 bytes in targetReg
1874     getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
1875
1876     genProduceReg(treeNode);
1877 }
1878
1879 //-----------------------------------------------------------------------------
1880 // genStoreLclFldTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
1881 // Since Vector3 is not a hardware supported write size, it is performed
1882 // as two stores: 8 byte followed by 4-byte.
1883 //
1884 // Arguments:
1885 //    treeNode - tree node that is attempting to store TYP_SIMD12 field
1886 //
1887 // Return Value:
1888 //    None.
1889 //
1890 void
1891 CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
1892 {
1893     assert(treeNode->OperGet() == GT_STORE_LCL_FLD);
1894
1895     unsigned offs = treeNode->gtLclFld.gtLclOffs;
1896     unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
1897     assert(varNum < compiler->lvaCount);
1898
1899     GenTreePtr op1 = treeNode->gtOp.gtOp1;
1900     assert(!op1->isContained());
1901     regNumber operandReg = genConsumeReg(op1);
1902
1903     // Need an addtional Xmm register to extract upper 4 bytes from data.
1904     assert(treeNode->gtRsvdRegs != RBM_NONE);
1905     assert(genCountBits(treeNode->gtRsvdRegs) == 1);
1906     regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
1907
1908     // store lower 8 bytes
1909     getEmitter()->emitIns_S_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, varNum, offs);
1910
1911     // Extract upper 4-bytes from operandReg
1912     getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
1913
1914     // Store upper 4 bytes
1915     getEmitter()->emitIns_S_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, varNum, offs+8);
1916 }
1917
1918 //-----------------------------------------------------------------------------
1919 // genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
1920 // Since Vector3 is not a hardware supported write size, it is performed
1921 // as two reads: 8 byte followed by 4-byte.
1922 //
1923 // Arguments:
1924 //    treeNode - tree node that is attempting to load TYP_SIMD12 field
1925 //
1926 // Return Value:
1927 //    None.
1928 //
1929 void
1930 CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
1931 {
1932     assert(treeNode->OperGet() == GT_LCL_FLD);
1933
1934     regNumber targetReg  = treeNode->gtRegNum;
1935     unsigned offs = treeNode->gtLclFld.gtLclOffs;
1936     unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
1937     assert(varNum < compiler->lvaCount);
1938
1939     // Need an addtional Xmm register to read upper 4 bytes
1940     assert(treeNode->gtRsvdRegs != RBM_NONE);
1941     assert(genCountBits(treeNode->gtRsvdRegs) == 2);
1942
1943     regNumber tmpReg = REG_NA;
1944     regMaskTP tmpRegsMask = treeNode->gtRsvdRegs;
1945     regMaskTP tmpReg1Mask = genFindLowestBit(tmpRegsMask);
1946     tmpRegsMask &= ~tmpReg1Mask;
1947     regNumber tmpReg1 = genRegNumFromMask(tmpReg1Mask);
1948     regNumber tmpReg2 = genRegNumFromMask(tmpRegsMask);
1949
1950     // Choose any register different from targetReg as tmpReg
1951     if (tmpReg1 != targetReg)
1952     {
1953         tmpReg = tmpReg1;
1954     }
1955     else
1956     {
1957         assert(targetReg != tmpReg2);
1958         tmpReg = tmpReg2;
1959     }
1960     assert(tmpReg != REG_NA);
1961     assert(tmpReg != targetReg);
1962
1963     // Read upper 4 bytes to tmpReg
1964     getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_FLOAT, false), EA_4BYTE, tmpReg, varNum, offs+8);
1965
1966     // Read lower 8 bytes to targetReg
1967     getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
1968
1969     // combine upper 4 bytes and lower 8 bytes in targetReg
1970     getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
1971
1972     genProduceReg(treeNode);
1973 }
1974
1975 //-----------------------------------------------------------------------------
1976 // genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
1977 //                            the given register, if any, or to memory.
1978 //
1979 // Arguments:
1980 //    simdNode - The GT_SIMD node
1981 //
1982 // Return Value:
1983 //    None.
1984 //
1985 // Notes:
1986 //    The upper half of all AVX registers is volatile, even the callee-save registers.
1987 //    When a 32-byte SIMD value is live across a call, the register allocator will use this intrinsic
1988 //    to cause the upper half to be saved.  It will first attempt to find another, unused, callee-save
1989 //    register.  If such a register cannot be found, it will save it to an available caller-save register.
1990 //    In that case, this node will be marked GTF_SPILL, which will cause genProduceReg to save the 16 byte
1991 //    value to the stack.  (Note that if there are no caller-save registers available, the entire 32 byte
1992 //    value will be spilled to the stack.)
1993 //
1994 void
1995 CodeGen::genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode)
1996 {
1997     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperSave);
1998
1999     GenTree* op1 = simdNode->gtGetOp1();
2000     assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
2001     regNumber targetReg = simdNode->gtRegNum;
2002     regNumber op1Reg = genConsumeReg(op1);
2003     assert(op1Reg != REG_NA);
2004     assert(targetReg != REG_NA);
2005     getEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, targetReg, op1Reg, 0x01);
2006
2007     genProduceReg(simdNode);
2008 }
2009
2010 //-----------------------------------------------------------------------------
2011 // genSIMDIntrinsicUpperRestore: Restore the upper half of a TYP_SIMD32 vector to
2012 //                               the given register, if any, or to memory.
2013 //
2014 // Arguments:
2015 //    simdNode - The GT_SIMD node
2016 //
2017 // Return Value:
2018 //    None.
2019 //
2020 // Notes:
2021 //    For consistency with genSIMDIntrinsicUpperSave, and to ensure that lclVar nodes always
2022 //    have their home register, this node has its targetReg on the lclVar child, and its source
2023 //    on the simdNode.
2024 //    Regarding spill, please see the note above on genSIMDIntrinsicUpperSave.  If we have spilled
2025 //    an upper-half to a caller save register, this node will be marked GTF_SPILLED.  However, unlike
2026 //    most spill scenarios, the saved tree will be different from the restored tree, but the spill
2027 //    restore logic, which is triggered by the call to genConsumeReg, requires us to provide the
2028 //    spilled tree (saveNode) in order to perform the reload.  We can easily find that tree,
2029 //    as it is in the spill descriptor for the register from which it was saved.
2030 //
2031 void
2032 CodeGen::genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode)
2033 {
2034     assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicUpperRestore);
2035
2036     GenTree* op1 = simdNode->gtGetOp1();
2037     assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32);
2038     regNumber srcReg = simdNode->gtRegNum;
2039     regNumber lclVarReg = genConsumeReg(op1);
2040     unsigned varNum = op1->AsLclVarCommon()->gtLclNum;
2041     assert(lclVarReg != REG_NA);
2042     assert(srcReg != REG_NA);
2043     if (simdNode->gtFlags & GTF_SPILLED)
2044     {
2045         GenTree* saveNode = regSet.rsSpillDesc[srcReg]->spillTree;
2046         noway_assert(saveNode != nullptr && (saveNode->gtRegNum == srcReg));
2047         genConsumeReg(saveNode);
2048     }
2049     getEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, srcReg, 0x01);
2050 }
2051
2052 //------------------------------------------------------------------------
2053 // genSIMDIntrinsic: Generate code for a SIMD Intrinsic.  This is the main
2054 // routine which in turn calls apropriate genSIMDIntrinsicXXX() routine.
2055 //
2056 // Arguments:
2057 //    simdNode - The GT_SIMD node
2058 //
2059 // Return Value:
2060 //    None.
2061 //
2062 // Notes:
2063 //    Currently, we only recognize SIMDVector<float> and SIMDVector<int>, and
2064 //    a limited set of methods.
2065 //
2066 void
2067 CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
2068 {
2069     // NYI for unsupported base types
2070     if (simdNode->gtSIMDBaseType != TYP_INT &&
2071         simdNode->gtSIMDBaseType != TYP_LONG &&
2072         simdNode->gtSIMDBaseType != TYP_FLOAT &&
2073         simdNode->gtSIMDBaseType != TYP_DOUBLE &&
2074         simdNode->gtSIMDBaseType != TYP_CHAR &&
2075         simdNode->gtSIMDBaseType != TYP_UBYTE  &&
2076         simdNode->gtSIMDBaseType != TYP_SHORT &&
2077         simdNode->gtSIMDBaseType != TYP_BYTE &&
2078         simdNode->gtSIMDBaseType != TYP_UINT &&
2079         simdNode->gtSIMDBaseType != TYP_ULONG
2080         )
2081     {
2082         noway_assert(!"SIMD intrinsic with unsupported base type.");
2083     }
2084
2085     switch(simdNode->gtSIMDIntrinsicID)
2086     {
2087     case SIMDIntrinsicInit:
2088         genSIMDIntrinsicInit(simdNode);
2089         break;
2090
2091     case SIMDIntrinsicInitN:
2092         genSIMDIntrinsicInitN(simdNode);
2093         break;
2094
2095     case SIMDIntrinsicSqrt:
2096     case SIMDIntrinsicCast:
2097         genSIMDIntrinsicUnOp(simdNode);
2098         break;
2099
2100     case SIMDIntrinsicAdd:
2101     case SIMDIntrinsicSub:
2102     case SIMDIntrinsicMul:
2103     case SIMDIntrinsicDiv:
2104     case SIMDIntrinsicBitwiseAnd:
2105     case SIMDIntrinsicBitwiseAndNot:
2106     case SIMDIntrinsicBitwiseOr:
2107     case SIMDIntrinsicBitwiseXor:
2108     case SIMDIntrinsicMin:
2109     case SIMDIntrinsicMax:
2110         genSIMDIntrinsicBinOp(simdNode);
2111         break;
2112
2113     case SIMDIntrinsicOpEquality:
2114     case SIMDIntrinsicOpInEquality:
2115     case SIMDIntrinsicEqual:
2116     case SIMDIntrinsicLessThan:
2117     case SIMDIntrinsicGreaterThan:
2118     case SIMDIntrinsicLessThanOrEqual:
2119     case SIMDIntrinsicGreaterThanOrEqual:
2120         genSIMDIntrinsicRelOp(simdNode);
2121         break;
2122
2123     case SIMDIntrinsicDotProduct:
2124         genSIMDIntrinsicDotProduct(simdNode);
2125         break;
2126
2127     case SIMDIntrinsicGetItem:
2128         genSIMDIntrinsicGetItem(simdNode);
2129         break;
2130
2131     case SIMDIntrinsicShuffleSSE2:
2132         genSIMDIntrinsicShuffleSSE2(simdNode);
2133         break;
2134
2135     case SIMDIntrinsicSetX:
2136     case SIMDIntrinsicSetY:
2137     case SIMDIntrinsicSetZ:
2138     case SIMDIntrinsicSetW:
2139         genSIMDIntrinsicSetItem(simdNode);
2140         break;
2141
2142     case SIMDIntrinsicUpperSave:
2143         genSIMDIntrinsicUpperSave(simdNode);
2144         break;
2145     case SIMDIntrinsicUpperRestore:
2146         genSIMDIntrinsicUpperRestore(simdNode);
2147         break;
2148
2149     default:
2150         noway_assert(!"Unimplemented SIMD intrinsic.");
2151         unreached();
2152     }
2153 }
2154
2155 #endif // FEATURE_SIMD
2156 #endif //_TARGET_AMD64_
2157 #endif // !LEGACY_BACKEND