Merge pull request #16832 from dotnetrt/StoreNonTemporal
author Carol Eidt <carol.eidt@microsoft.com>
Wed, 14 Mar 2018 21:56:50 +0000 (14:56 -0700)
committer GitHub <noreply@github.com>
Wed, 14 Mar 2018 21:56:50 +0000 (14:56 -0700)
 Implement SSE2 StoreNonTemporal HW intrinsic - complete SSE2 ISA

1  2 
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp

@@@ -985,6 -985,8 +985,6 @@@ void CodeGen::genSSE2Intrinsic(GenTreeH
              assert(op1 != nullptr);
              assert(op2 == nullptr);
              instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
 -            // TODO-XArch-CQ -> use of type size of TYP_SIMD16 leads to
 -            // instruction register encoding errors for SSE legacy encoding
              emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
              break;
          }
              break;
          }
  
+         case NI_SSE2_StoreNonTemporal:
+         {
+             assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
+             assert(op1 != nullptr);
+             assert(op2 != nullptr);
+             op2Reg          = op2->gtRegNum;
+             instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+             emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
+             break;
+         }
          default:
              unreached();
              break;
@@@ -100,6 -100,7 +100,6 @@@ HARDWARE_INTRINSIC(SSE_Reciprocal
  HARDWARE_INTRINSIC(SSE_ReciprocalScalar,                             "ReciprocalScalar",                                 SSE,        -1,           16,          -1,           {INS_invalid,   INS_invalid,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rcpss,     INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
  HARDWARE_INTRINSIC(SSE_ReciprocalSqrt,                               "ReciprocalSqrt",                                   SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtps,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar,                         "ReciprocalSqrtScalar",                             SSE,        -1,           16,          -1,           {INS_invalid,   INS_invalid,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_rsqrtss,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 -HARDWARE_INTRINSIC(SSE_SetAllVector128,                              "SetAllVector128",                                  SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Helper,                            HW_Flag_NoCodeGen|HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE_SetScalarVector128,                           "SetScalarVector128",                               SSE,        -1,           16,           1,           {INS_invalid,   INS_invalid,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movss,     INS_invalid},           HW_Category_Helper,                            HW_Flag_MultiIns|HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE_SetZeroVector128,                             "SetZeroVector128",                                 SSE,        -1,           16,           0,           {INS_invalid,   INS_invalid,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_xorps,     INS_invalid},           HW_Category_Helper,                            HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE_Shuffle,                                      "Shuffle",                                          SSE,        -1,           16,           3,           {INS_invalid,   INS_invalid,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_shufps,    INS_invalid},           HW_Category_IMM,                               HW_Flag_FullRangeIMM)
@@@ -183,7 -184,7 +183,7 @@@ HARDWARE_INTRINSIC(SSE2_ConvertScalarTo
  HARDWARE_INTRINSIC(SSE2_ConvertToVector128Single,                    "ConvertToVector128Single",                         SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtdq2ps,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtpd2ps},          HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Single,              "ConvertScalarToVector128Single",                   SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtsd2ss,  INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_SpecialCodeGen)
  HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128UInt32,              "ConvertScalarToVector128UInt32",                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_i2xmm, INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoRMWSemantics)
 -HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128UInt64,              "ConvertScalarToVector128UInt64",                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_i2xmm, INS_invalid,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_64BitOnly|HW_Flag_NoRMWSemantics)
 +HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128UInt64,              "ConvertScalarToVector128UInt64",                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_i2xmm, INS_invalid,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_64BitOnly|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE2_Divide,                                      "Divide",                                           SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divpd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
  HARDWARE_INTRINSIC(SSE2_DivideScalar,                                "DivideScalar",                                     SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
  HARDWARE_INTRINSIC(SSE2_Extract,                                     "Extract",                                          SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_pextrw,    INS_pextrw,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IMM,                               HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
@@@ -228,6 -229,7 +228,7 @@@ HARDWARE_INTRINSIC(SSE2_StoreAligned
  HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                          SSE2,       -1,           16,          2,            {INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_invalid,   INS_movntpd},           HW_Category_MemoryStore,                       HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE2_StoreHigh,                                   "StoreHigh",                                        SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhpd},            HW_Category_MemoryStore,                       HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE2_StoreLow,                                    "StoreLow",                                         SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movq,      INS_movq,      INS_invalid,   INS_movlpd},            HW_Category_MemoryStore,                       HW_Flag_NoRMWSemantics)
+ HARDWARE_INTRINSIC(SSE2_StoreNonTemporal,                            "StoreNonTemporal",                                 SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movnti,    INS_movnti,    INS_movnti,    INS_movnti,    INS_invalid,   INS_invalid},           HW_Category_Scalar,                            HW_Flag_NoRMWSemantics|HW_Flag_SecondArgMaybe64Bit)
  HARDWARE_INTRINSIC(SSE2_StoreScalar,                                 "StoreScalar",                                      SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movsdsse2},         HW_Category_MemoryStore,                       HW_Flag_NoRMWSemantics)
  HARDWARE_INTRINSIC(SSE2_Subtract,                                    "Subtract",                                         SSE2,       -1,           16,          2,            {INS_psubb,     INS_psubb,     INS_psubw,     INS_psubw,     INS_psubd,     INS_psubd,     INS_psubq,     INS_psubq,     INS_invalid,   INS_subpd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
  HARDWARE_INTRINSIC(SSE2_SubtractSaturate,                            "SubtractSaturate",                                 SSE2,       -1,           16,          2,            {INS_psubsb,    INS_psubusb,   INS_psubsw,    INS_psubusw,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
@@@ -124,7 -124,7 +124,7 @@@ InstructionSet Compiler::lookupHWIntrin
  //    isa        -- instruction set of the intrinsic.
  //
  // Return Value:
 -//    Id for the hardware intrinsic.
 +//    Id for the hardware intrinsic
  //
  // TODO-Throughput: replace sequential search by binary search
  NamedIntrinsic Compiler::lookupHWIntrinsic(const char* methodName, InstructionSet isa)
              if (isa == hwIntrinsicInfoArray[i].isa && strcmp(methodName, hwIntrinsicInfoArray[i].intrinsicName) == 0)
              {
                  result = hwIntrinsicInfoArray[i].intrinsicID;
 +                break;
              }
          }
      }
@@@ -546,7 -545,6 +546,6 @@@ bool Compiler::isFullyImplmentedISAClas
  {
      switch (isa)
      {
-         case InstructionSet_SSE2:
          case InstructionSet_SSE42:
          case InstructionSet_AVX:
          case InstructionSet_AVX2:
              return false;
  
          case InstructionSet_SSE:
+         case InstructionSet_SSE2:
          case InstructionSet_SSE3:
          case InstructionSet_SSSE3:
          case InstructionSet_SSE41:
@@@ -930,6 -929,14 +930,6 @@@ GenTree* Compiler::impSSEIntrinsic(Name
              break;
          }
  
 -        case NI_SSE_SetAllVector128:
 -            assert(sig->numArgs == 1);
 -            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
 -            op1     = impPopStack().val;
 -            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtCloneExpr(op1), gtNewIconNode(0), NI_SSE_Shuffle,
 -                                               TYP_FLOAT, simdSize);
 -            break;
 -
          case NI_SSE_StoreFence:
              assert(sig->numArgs == 0);
              assert(JITtype2varType(sig->retType) == TYP_VOID);
@@@ -1004,6 -1011,16 +1004,16 @@@ GenTree* Compiler::impSSE2Intrinsic(Nam
              retNode  = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
              break;
          }
+         case NI_SSE2_StoreNonTemporal:
+         {
+             assert(sig->numArgs == 2);
+             assert(JITtype2varType(sig->retType) == TYP_VOID);
+             op2     = impPopStack().val;
+             op1     = impPopStack().val;
+             retNode = gtNewSimdHWIntrinsicNode(TYP_VOID, op1, op2, NI_SSE2_StoreNonTemporal, op2->TypeGet(), 0);
+             break;
+         }
  
          default:
              JITDUMP("Not implemented hardware intrinsic");