Optimize Vector64 and Vector128.Create methods (#36267)
author    Tanner Gooding <tagoo@outlook.com>
          Wed, 13 May 2020 23:51:20 +0000 (16:51 -0700)
committer GitHub <noreply@github.com>
          Wed, 13 May 2020 23:51:20 +0000 (16:51 -0700)
* Updating the Vector64 and Vector128 Create methods to be marked [Intrinsic] on ARM64

* Updating the JIT to emit constants for Vector64 and Vector128.Create

* Fixing lookupNamedIntrinsic and impIntrinsic to throw PNSE for unsupported mustExpand intrinsics

* Fixing impIntrinsic to directly use gtNewMustThrowException

* Move gtNewMustThrowException to not depend on FEATURE_HW_INTRINSICS

* Applying formatting patch

* Add basic support for GT_CLS_VAR_ADDR to the ARM64 JIT

* Update lookupNamedIntrinsic to handle System.Runtime.Intrinsics for unsupported platforms

* Fixing INS_ldr in emitIns_R_C to use isValidVectorLSDatasize

* Elaborate on why we specially recognize the HWIntrinsics even on platforms that don't support them
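
A minimal C# sketch of the user-visible effect (class and method names here are illustrative, not from the commit): when every argument to Create is a constant, the ARM64 JIT can now emit the vector as a single data-section constant and load it with one instruction, instead of building it element by element at run time.

    using System.Runtime.Intrinsics;

    static class CreateExamples
    {
        // All-constant arguments: now emitted as a single 16-byte constant
        // in the data section and loaded with one instruction.
        public static Vector128<int> AllConstant() => Vector128.Create(1, 2, 3, 4);

        // A single repeated value still lowers to a broadcast (dup/fmov).
        public static Vector64<float> Broadcast(float x) => Vector64.Create(x);
    }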

16 files changed:
src/coreclr/src/jit/compiler.h
src/coreclr/src/jit/emitarm64.cpp
src/coreclr/src/jit/gentree.cpp
src/coreclr/src/jit/hwintrinsic.cpp
src/coreclr/src/jit/hwintrinsicarm64.cpp
src/coreclr/src/jit/hwintrinsiccodegenarm64.cpp
src/coreclr/src/jit/hwintrinsiclistarm64.h
src/coreclr/src/jit/hwintrinsiclistxarch.h
src/coreclr/src/jit/importer.cpp
src/coreclr/src/jit/lower.h
src/coreclr/src/jit/lowerarmarch.cpp
src/coreclr/src/jit/lowerxarch.cpp
src/coreclr/src/jit/lsraarmarch.cpp
src/coreclr/src/jit/namedintrinsiclist.h
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs
src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs

diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h
index 917dbbb..a8e6220 100644
@@ -2657,7 +2657,6 @@ public:
                                                    NamedIntrinsic hwIntrinsicID);
     GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(
         var_types type, GenTree* op1, GenTree* op2, GenTree* op3, NamedIntrinsic hwIntrinsicID);
-    GenTree* gtNewMustThrowException(unsigned helper, var_types type, CORINFO_CLASS_HANDLE clsHnd);
     CORINFO_CLASS_HANDLE gtGetStructHandleForHWSIMD(var_types simdType, var_types simdBaseType);
     var_types getBaseTypeFromArgIfNeeded(NamedIntrinsic       intrinsic,
                                          CORINFO_CLASS_HANDLE clsHnd,
@@ -2665,6 +2664,8 @@ public:
                                          var_types            baseType);
 #endif // FEATURE_HW_INTRINSICS
 
+    GenTree* gtNewMustThrowException(unsigned helper, var_types type, CORINFO_CLASS_HANDLE clsHnd);
+
     GenTreeLclFld* gtNewLclFldNode(unsigned lnum, var_types type, unsigned offset);
     GenTree* gtNewInlineCandidateReturnExpr(GenTree* inlineCandidate, var_types type);
 
@@ -3713,6 +3714,10 @@ protected:
                               CorInfoIntrinsics     intrinsicID,
                               bool                  tailCall);
     NamedIntrinsic lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method);
+    GenTree* impUnsupportedNamedIntrinsic(unsigned              helper,
+                                          CORINFO_METHOD_HANDLE method,
+                                          CORINFO_SIG_INFO*     sig,
+                                          bool                  mustExpand);
 
 #ifdef FEATURE_HW_INTRINSICS
     GenTree* impHWIntrinsic(NamedIntrinsic        intrinsic,
@@ -3720,10 +3725,6 @@ protected:
                             CORINFO_METHOD_HANDLE method,
                             CORINFO_SIG_INFO*     sig,
                             bool                  mustExpand);
-    GenTree* impUnsupportedHWIntrinsic(unsigned              helper,
-                                       CORINFO_METHOD_HANDLE method,
-                                       CORINFO_SIG_INFO*     sig,
-                                       bool                  mustExpand);
     GenTree* impSimdAsHWIntrinsic(NamedIntrinsic        intrinsic,
                                   CORINFO_CLASS_HANDLE  clsHnd,
                                   CORINFO_METHOD_HANDLE method,
diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp
index 81bca41..5780ffb 100644
@@ -7548,7 +7548,7 @@ void emitter::emitIns_R_C(
             fmt = IF_LARGELDC;
             if (isVectorRegister(reg))
             {
-                assert(isValidScalarDatasize(size));
+                assert(isValidVectorLSDatasize(size));
                 // For vector (float/double) register, we should have an integer address reg to
                 // compute long address which consists of page address and page offset.
                 // For integer constant, this is not needed since the dest reg can be used to
@@ -7561,6 +7561,7 @@ void emitter::emitIns_R_C(
                 assert(isValidGeneralDatasize(size));
             }
             break;
+
         default:
             unreached();
     }
@@ -12718,7 +12719,7 @@ void emitter::emitInsLoadStoreOp(instruction ins, emitAttr attr, regNumber dataR
 
     if (addr->isContained())
     {
-        assert(addr->OperGet() == GT_LCL_VAR_ADDR || addr->OperGet() == GT_LEA);
+        assert(addr->OperGet() == GT_CLS_VAR_ADDR || addr->OperGet() == GT_LCL_VAR_ADDR || addr->OperGet() == GT_LEA);
 
         int   offset = 0;
         DWORD lsl    = 0;
@@ -12795,7 +12796,13 @@ void emitter::emitInsLoadStoreOp(instruction ins, emitAttr attr, regNumber dataR
         }
         else // no Index register
         {
-            if (emitIns_valid_imm_for_ldst_offset(offset, emitTypeSize(indir->TypeGet())))
+            if (addr->OperGet() == GT_CLS_VAR_ADDR)
+            {
+                // Get a temp integer register to compute long address.
+                regNumber addrReg = indir->GetSingleTempReg();
+                emitIns_R_C(ins, attr, dataReg, addrReg, addr->AsClsVar()->gtClsVarHnd, 0);
+            }
+            else if (emitIns_valid_imm_for_ldst_offset(offset, emitTypeSize(indir->TypeGet())))
             {
                 // Then load/store dataReg from/to [memBase + offset]
                 emitIns_R_R_I(ins, attr, dataReg, memBase->GetRegNum(), offset);
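
As a rough illustration of the new GT_CLS_VAR_ADDR path (the assembly below is approximate, not taken from the commit), an all-constant Create can now compile down to a page-address computation in the reserved temp integer register followed by a single vector load:

    using System.Runtime.Intrinsics;

    static class ConstantLoad
    {
        // On ARM64 this is expected to become something like:
        //   adrp x0, <page of the data-section constant>  ; temp int register
        //   ldr  q0, [x0, <page offset>]                  ; 128-bit load
        public static Vector128<int> Load() => Vector128.Create(1, 2, 3, 4);
    }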
diff --git a/src/coreclr/src/jit/gentree.cpp b/src/coreclr/src/jit/gentree.cpp
index 8e341ce..afeb708 100644
@@ -18659,40 +18659,6 @@ GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(
         GenTreeHWIntrinsic(type, gtNewArgList(op1, op2, op3), hwIntrinsicID, TYP_UNKNOWN, 0);
 }
 
-//---------------------------------------------------------------------------------------
-// gtNewMustThrowException:
-//    create a throw node (calling into JIT helper) that must be thrown.
-//    The result would be a comma node: COMMA(jithelperthrow(void), x) where x's type should be specified.
-//
-// Arguments
-//    helper      -  JIT helper ID
-//    type        -  return type of the node
-//
-// Return Value
-//    pointer to the throw node
-//
-GenTree* Compiler::gtNewMustThrowException(unsigned helper, var_types type, CORINFO_CLASS_HANDLE clsHnd)
-{
-    GenTreeCall* node = gtNewHelperCallNode(helper, TYP_VOID);
-    node->gtCallMoreFlags |= GTF_CALL_M_DOES_NOT_RETURN;
-    if (type != TYP_VOID)
-    {
-        unsigned dummyTemp = lvaGrabTemp(true DEBUGARG("dummy temp of must thrown exception"));
-        if (type == TYP_STRUCT)
-        {
-            lvaSetStruct(dummyTemp, clsHnd, false);
-            type = lvaTable[dummyTemp].lvType; // struct type is normalized
-        }
-        else
-        {
-            lvaTable[dummyTemp].lvType = type;
-        }
-        GenTree* dummyNode = gtNewLclvNode(dummyTemp, type);
-        return gtNewOperNode(GT_COMMA, type, node, dummyNode);
-    }
-    return node;
-}
-
 // Returns true for the HW Intrinsic instructions that have MemoryLoad semantics, false otherwise
 bool GenTreeHWIntrinsic::OperIsMemoryLoad() const
 {
@@ -18783,6 +18749,40 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoadOrStore() const
 #endif // FEATURE_HW_INTRINSICS
 
 //---------------------------------------------------------------------------------------
+// gtNewMustThrowException:
+//    create a throw node (calling into JIT helper) that must be thrown.
+//    The result would be a comma node: COMMA(jithelperthrow(void), x) where x's type should be specified.
+//
+// Arguments
+//    helper      -  JIT helper ID
+//    type        -  return type of the node
+//
+// Return Value
+//    pointer to the throw node
+//
+GenTree* Compiler::gtNewMustThrowException(unsigned helper, var_types type, CORINFO_CLASS_HANDLE clsHnd)
+{
+    GenTreeCall* node = gtNewHelperCallNode(helper, TYP_VOID);
+    node->gtCallMoreFlags |= GTF_CALL_M_DOES_NOT_RETURN;
+    if (type != TYP_VOID)
+    {
+        unsigned dummyTemp = lvaGrabTemp(true DEBUGARG("dummy temp of must thrown exception"));
+        if (type == TYP_STRUCT)
+        {
+            lvaSetStruct(dummyTemp, clsHnd, false);
+            type = lvaTable[dummyTemp].lvType; // struct type is normalized
+        }
+        else
+        {
+            lvaTable[dummyTemp].lvType = type;
+        }
+        GenTree* dummyNode = gtNewLclvNode(dummyTemp, type);
+        return gtNewOperNode(GT_COMMA, type, node, dummyNode);
+    }
+    return node;
+}
+
+//---------------------------------------------------------------------------------------
 // InitializeStructReturnType:
 //    Initialize the Return Type Descriptor for a method that returns a struct type
 //
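
A hedged C# analogy (names are illustrative) of the tree shape gtNewMustThrowException builds: the helper call never returns, and the dummy local exists only to give the COMMA expression the required result type.

    using System;
    using System.Runtime.Intrinsics;

    static class MustThrowShape
    {
        public static Vector128<int> MustThrow()
        {
            throw new PlatformNotSupportedException(); // the non-returning helper call
    #pragma warning disable CS0162 // unreachable, mirroring the dummy node
            return default;                            // dummy of the required type
    #pragma warning restore CS0162
        }
    }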
diff --git a/src/coreclr/src/jit/hwintrinsic.cpp b/src/coreclr/src/jit/hwintrinsic.cpp
index 8f33c48..e0e8350 100644
@@ -89,50 +89,6 @@ var_types Compiler::getBaseTypeFromArgIfNeeded(NamedIntrinsic       intrinsic,
     return baseType;
 }
 
-//------------------------------------------------------------------------
-// impUnsupportedHWIntrinsic: returns a node for an unsupported HWIntrinsic
-//
-// Arguments:
-//    helper     - JIT helper ID for the exception to be thrown
-//    method     - method handle of the intrinsic function.
-//    sig        - signature of the intrinsic call
-//    mustExpand - true if the intrinsic must return a GenTree*; otherwise, false
-//
-// Return Value:
-//    a gtNewMustThrowException if mustExpand is true; otherwise, nullptr
-//
-GenTree* Compiler::impUnsupportedHWIntrinsic(unsigned              helper,
-                                             CORINFO_METHOD_HANDLE method,
-                                             CORINFO_SIG_INFO*     sig,
-                                             bool                  mustExpand)
-{
-    // We've hit some error case and may need to return a node for the given error.
-    //
-    // When `mustExpand=false`, we are attempting to inline the intrinsic directly into another method. In this
-    // scenario, we need to return `nullptr` so that a GT_CALL to the intrinsic is emitted instead. This is to
-    // ensure that everything continues to behave correctly when optimizations are enabled (e.g. things like the
-    // inliner may expect the node we return to have a certain signature, and the `MustThrowException` node won't
-    // match that).
-    //
-    // When `mustExpand=true`, we are in a GT_CALL to the intrinsic and are attempting to JIT it. This will generally
-    // be in response to an indirect call (e.g. done via reflection) or in response to an earlier attempt returning
-    // `nullptr` (under `mustExpand=false`). In that scenario, we are safe to return the `MustThrowException` node.
-
-    if (mustExpand)
-    {
-        for (unsigned i = 0; i < sig->numArgs; i++)
-        {
-            impPopStack();
-        }
-
-        return gtNewMustThrowException(helper, JITtype2varType(sig->retType), sig->retTypeClass);
-    }
-    else
-    {
-        return nullptr;
-    }
-}
-
 CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, var_types simdBaseType)
 {
     if (simdType == TYP_SIMD16)
diff --git a/src/coreclr/src/jit/hwintrinsicarm64.cpp b/src/coreclr/src/jit/hwintrinsicarm64.cpp
index 45efb8a..7127b69 100644
@@ -357,6 +357,47 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
             assert(retNode->gtType == getSIMDTypeForSize(getSIMDTypeSizeInBytes(sig->retTypeSigClass)));
             break;
         }
+
+        case NI_Vector64_Create:
+        case NI_Vector128_Create:
+        {
+            // We shouldn't handle this as an intrinsic if the
+            // respective ISAs have been disabled by the user.
+
+            if (!compExactlyDependsOn(InstructionSet_AdvSimd))
+            {
+                break;
+            }
+
+            if (sig->numArgs == 1)
+            {
+                op1     = impPopStack().val;
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+            }
+            else if (sig->numArgs == 2)
+            {
+                op2     = impPopStack().val;
+                op1     = impPopStack().val;
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
+            }
+            else
+            {
+                assert(sig->numArgs >= 3);
+
+                GenTreeArgList* tmp = nullptr;
+
+                for (unsigned i = 0; i < sig->numArgs; i++)
+                {
+                    tmp        = gtNewArgList(impPopStack().val);
+                    tmp->gtOp2 = op1;
+                    op1        = tmp;
+                }
+
+                retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+            }
+            break;
+        }
+
         case NI_Vector64_get_Count:
         case NI_Vector128_get_Count:
         {
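
The three importer shapes above correspond to these C# call forms (a hedged sketch; AdvSimd support is assumed so the calls are treated as intrinsic):

    using System.Runtime.Intrinsics;

    static class CreateShapes
    {
        // sig->numArgs == 1: broadcast form, imported as a unary node.
        public static Vector64<short> One() => Vector64.Create((short)7);

        // sig->numArgs == 2: binary form, imported with op1 and op2.
        public static Vector128<long> Two() => Vector128.Create(1L, 2L);

        // sig->numArgs >= 3: arguments are popped into a GT_LIST.
        public static Vector128<int> Many() => Vector128.Create(1, 2, 3, 4);
    }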
diff --git a/src/coreclr/src/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/src/jit/hwintrinsiccodegenarm64.cpp
index 71ac68f..7aca222 100644
@@ -583,10 +583,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 GetEmitter()->emitIns_R_I(ins, emitSize, targetReg, 0, INS_OPTS_4S);
                 break;
 
-            case NI_Vector64_Create:
-            case NI_Vector128_Create:
             case NI_AdvSimd_DuplicateToVector64:
             case NI_AdvSimd_DuplicateToVector128:
+            case NI_AdvSimd_Arm64_DuplicateToVector64:
             case NI_AdvSimd_Arm64_DuplicateToVector128:
             {
                 if (varTypeIsFloating(intrin.baseType))
@@ -596,6 +595,11 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                         const double dataValue = intrin.op1->AsDblCon()->gtDconVal;
                         GetEmitter()->emitIns_R_F(INS_fmov, emitSize, targetReg, dataValue, opt);
                     }
+                    else if (intrin.id == NI_AdvSimd_Arm64_DuplicateToVector64)
+                    {
+                        assert(intrin.baseType == TYP_DOUBLE);
+                        GetEmitter()->emitIns_R_R(ins, emitSize, targetReg, op1Reg, opt);
+                    }
                     else
                     {
                         GetEmitter()->emitIns_R_R_I(ins, emitSize, targetReg, op1Reg, 0, opt);
diff --git a/src/coreclr/src/jit/hwintrinsiclistarm64.h b/src/coreclr/src/jit/hwintrinsiclistarm64.h
index f798a17..0ca6f8d 100644
@@ -23,7 +23,7 @@ HARDWARE_INTRINSIC(Vector64,        AsSByte,
 HARDWARE_INTRINSIC(Vector64,        AsSingle,                                   8,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector64,        AsUInt16,                                   8,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector64,        AsUInt32,                                   8,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(Vector64,        Create,                                     8,           1,     {INS_dup,               INS_dup,            INS_dup,            INS_dup,            INS_dup,            INS_dup,            INS_mov,            INS_mov,            INS_dup,            INS_dup},               HW_Category_SimpleSIMD,             HW_Flag_SupportsContainment|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Vector64,        Create,                                     8,          -1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_mov,            INS_mov,            INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector64,        CreateScalarUnsafe,                         8,           1,     {INS_ins,               INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_invalid,        INS_invalid,        INS_fmov,           INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_SupportsContainment|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(Vector64,        get_AllBitsSet,                             8,           0,     {INS_mvni,              INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni},              HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(Vector64,        get_Count,                                  8,           0,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
@@ -45,7 +45,7 @@ HARDWARE_INTRINSIC(Vector128,       AsSingle,                                  1
 HARDWARE_INTRINSIC(Vector128,       AsUInt16,                                  16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector128,       AsUInt32,                                  16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector128,       AsUInt64,                                  16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(Vector128,       Create,                                    16,           1,     {INS_dup,               INS_dup,            INS_dup,            INS_dup,            INS_dup,            INS_dup,            INS_dup,            INS_dup,            INS_dup,            INS_dup},               HW_Category_SimpleSIMD,             HW_Flag_SupportsContainment|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Vector128,       Create,                                    16,          -1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128,       CreateScalarUnsafe,                        16,           1,     {INS_ins,               INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_ins,            INS_fmov,           INS_fmov},              HW_Category_SimpleSIMD,             HW_Flag_SupportsContainment|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(Vector128,       get_AllBitsSet,                            16,           0,     {INS_mvni,              INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni,           INS_mvni},              HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(Vector128,       get_Count,                                 16,           0,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
@@ -197,6 +197,7 @@ HARDWARE_INTRINSIC(AdvSimd_Arm64,   CompareTest,                               1
 HARDWARE_INTRINSIC(AdvSimd_Arm64,   CompareTestScalar,                          8,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmtst,          INS_cmtst,          INS_invalid,        INS_cmtst},             HW_Category_SIMDScalar,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AdvSimd_Arm64,   Divide,                                    -1,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_fdiv,           INS_fdiv},              HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AdvSimd_Arm64,   DuplicateSelectedScalarToVector128,        16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_dup,            INS_dup,            INS_invalid,        INS_dup},               HW_Category_IMM,                    HW_Flag_SupportsContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(AdvSimd_Arm64,   DuplicateToVector64,                       16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_mov,            INS_mov,            INS_invalid,        INS_fmov},              HW_Category_SimpleSIMD,             HW_Flag_SupportsContainment|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AdvSimd_Arm64,   DuplicateToVector128,                      16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_dup,            INS_dup,            INS_invalid,        INS_dup},               HW_Category_SimpleSIMD,             HW_Flag_SupportsContainment|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(AdvSimd_Arm64,   FusedMultiplyAdd,                          16,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_fmla},              HW_Category_SimpleSIMD,             HW_Flag_HasRMWSemantics)
 HARDWARE_INTRINSIC(AdvSimd_Arm64,   FusedMultiplySubtract,                     16,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_fmls},              HW_Category_SimpleSIMD,             HW_Flag_HasRMWSemantics)
diff --git a/src/coreclr/src/jit/hwintrinsiclistxarch.h b/src/coreclr/src/jit/hwintrinsiclistxarch.h
index db2bacb..c041364 100644
@@ -43,7 +43,7 @@ HARDWARE_INTRINSIC(Vector128,       AsVector2,
 HARDWARE_INTRINSIC(Vector128,       AsVector3,                                  16,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector128,       AsVector4,                                  16,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector128,       AsVector128,                                16,             1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(Vector128,       Create,                                     16,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(Vector128,       Create,                                     16,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128,       CreateScalarUnsafe,                         16,             1,      {INS_mov_i2xmm,         INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_movss,              INS_movsdsse2},         HW_Category_SIMDScalar,             HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 // The instruction generated for float/double depends on which ISAs are supported
 HARDWARE_INTRINSIC(Vector128,       get_AllBitsSet,                             16,             0,      {INS_pcmpeqd,           INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_cmpps,              INS_cmppd},             HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
@@ -77,7 +77,7 @@ HARDWARE_INTRINSIC(Vector256,       AsVector256,
 HARDWARE_INTRINSIC(Vector256,       get_AllBitsSet,                             32,             0,      {INS_pcmpeqd,           INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_pcmpeqd,            INS_cmpps,              INS_cmppd},             HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector256,       get_Count,                                  32,             0,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector256,       get_Zero,                                   32,             0,      {INS_xorps,             INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps,              INS_xorps},             HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(Vector256,       Create,                                     32,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(Vector256,       Create,                                     32,            -1,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector256,       CreateScalarUnsafe,                         32,             1,      {INS_mov_i2xmm,         INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_mov_i2xmm,          INS_movss,              INS_movsdsse2},         HW_Category_SIMDScalar,             HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(Vector256,       GetElement,                                 32,             2,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(Vector256,       WithElement,                                32,             3,      {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_Helper,                 HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp
index 52501b3..014e42a 100644
@@ -3478,29 +3478,34 @@ GenTree* Compiler::impIntrinsic(GenTree*                newobjThis,
         {
             ni = lookupNamedIntrinsic(method);
 
-#ifdef FEATURE_HW_INTRINSICS
+            // We specially support the following on all platforms to allow for dead
+            // code optimization and to more generally support recursive intrinsics.
+
             if (ni == NI_IsSupported_True)
             {
+                assert(sig->numArgs == 0);
                 return gtNewIconNode(true);
             }
 
             if (ni == NI_IsSupported_False)
             {
+                assert(sig->numArgs == 0);
                 return gtNewIconNode(false);
             }
 
             if (ni == NI_Throw_PlatformNotSupportedException)
             {
-                return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
+                return impUnsupportedNamedIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
             }
 
+#ifdef FEATURE_HW_INTRINSICS
             if ((ni > NI_HW_INTRINSIC_START) && (ni < NI_HW_INTRINSIC_END))
             {
                 GenTree* hwintrinsic = impHWIntrinsic(ni, clsHnd, method, sig, mustExpand);
 
                 if (mustExpand && (hwintrinsic == nullptr))
                 {
-                    return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_NOT_IMPLEMENTED, method, sig, mustExpand);
+                    return impUnsupportedNamedIntrinsic(CORINFO_HELP_THROW_NOT_IMPLEMENTED, method, sig, mustExpand);
                 }
 
                 return hwintrinsic;
@@ -4272,7 +4277,8 @@ GenTree* Compiler::impIntrinsic(GenTree*                newobjThis,
 
     if (mustExpand && (retNode == nullptr))
     {
-        NO_WAY("JIT must expand the intrinsic!");
+        assert(!"Unhandled must expand intrinsic, throwing PlatformNotSupportedException");
+        return impUnsupportedNamedIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
     }
 
     // Optionally report if this intrinsic is special
@@ -4489,8 +4495,30 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
 
         result = SimdAsHWIntrinsicInfo::lookupId(&sig, className, methodName, enclosingClassName, sizeOfVectorT);
     }
+#endif // FEATURE_HW_INTRINSICS
     else if (strncmp(namespaceName, "System.Runtime.Intrinsics", 25) == 0)
     {
+        // We go down this path even when FEATURE_HW_INTRINSICS isn't enabled
+        // so we can specially handle IsSupported and recursive calls.
+
+        // This is required to appropriately handle the intrinsics on platforms
+        // which don't support them. On such a platform, methods like Vector64.Create
+        // will be seen as `Intrinsic` and `mustExpand` because they have a recursive
+        // code path. When such a path is hit, we expect the importer to handle it,
+        // and we fire an assert if it didn't; previous versions of the JIT would
+        // fail fast here, but this was changed to throw a PNSE instead. We still
+        // assert, as most intrinsics should have been recognized and handled.
+
+        // In order to avoid the assert, we specially handle the IsSupported checks
+        // (to better allow dead-code optimizations) and we explicitly throw a PNSE
+        // as we know that is the desired behavior for the HWIntrinsics when not
+        // supported. For cases like Vector64.Create, this is fine because the call
+        // will be behind a relevant IsSupported check and will never be hit; the
+        // software fallback will be executed instead.
+
+        CLANG_FORMAT_COMMENT_ANCHOR;
+
+#ifdef FEATURE_HW_INTRINSICS
         namespaceName += 25;
         const char* platformNamespaceName;
 
@@ -4509,21 +4537,40 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
 
             result = HWIntrinsicInfo::lookupId(this, &sig, className, methodName, enclosingClassName);
         }
-        else if (strcmp(methodName, "get_IsSupported") == 0)
-        {
-            return NI_IsSupported_False;
-        }
-        else
+#endif // FEATURE_HW_INTRINSICS
+
+        if (result == NI_Illegal)
         {
-            return gtIsRecursiveCall(method) ? NI_Throw_PlatformNotSupportedException : NI_Illegal;
+            if (strcmp(methodName, "get_IsSupported") == 0)
+            {
+                // This allows the relevant code paths to be dropped as dead code even
+                // on platforms where FEATURE_HW_INTRINSICS is not supported.
+
+                result = NI_IsSupported_False;
+            }
+            else if (gtIsRecursiveCall(method))
+            {
+                // For the framework itself, any recursive intrinsics will either be
+                // only supported on a single platform or will be guarded by a relevant
+                // IsSupported check so the throw PNSE will be valid or dropped.
+
+                result = NI_Throw_PlatformNotSupportedException;
+            }
         }
     }
-#endif // FEATURE_HW_INTRINSICS
 
     if (result == NI_Illegal)
     {
         JITDUMP("Not recognized\n");
     }
+    else if (result == NI_IsSupported_False)
+    {
+        JITDUMP("Unsupported - return false");
+    }
+    else if (result == NI_Throw_PlatformNotSupportedException)
+    {
+        JITDUMP("Unsupported - throw PlatformNotSupportedException");
+    }
     else
     {
         JITDUMP("Recognized\n");
@@ -4531,6 +4578,50 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
     return result;
 }
 
+//------------------------------------------------------------------------
+// impUnsupportedNamedIntrinsic: Throws an exception for an unsupported named intrinsic
+//
+// Arguments:
+//    helper     - JIT helper ID for the exception to be thrown
+//    method     - method handle of the intrinsic function.
+//    sig        - signature of the intrinsic call
+//    mustExpand - true if the intrinsic must return a GenTree*; otherwise, false
+//
+// Return Value:
+//    a gtNewMustThrowException if mustExpand is true; otherwise, nullptr
+//
+GenTree* Compiler::impUnsupportedNamedIntrinsic(unsigned              helper,
+                                                CORINFO_METHOD_HANDLE method,
+                                                CORINFO_SIG_INFO*     sig,
+                                                bool                  mustExpand)
+{
+    // We've hit some error case and may need to return a node for the given error.
+    //
+    // When `mustExpand=false`, we are attempting to inline the intrinsic directly into another method. In this
+    // scenario, we need to return `nullptr` so that a GT_CALL to the intrinsic is emitted instead. This is to
+    // ensure that everything continues to behave correctly when optimizations are enabled (e.g. things like the
+    // inliner may expect the node we return to have a certain signature, and the `MustThrowException` node won't
+    // match that).
+    //
+    // When `mustExpand=true`, we are in a GT_CALL to the intrinsic and are attempting to JIT it. This will generally
+    // be in response to an indirect call (e.g. done via reflection) or in response to an earlier attempt returning
+    // `nullptr` (under `mustExpand=false`). In that scenario, we are safe to return the `MustThrowException` node.
+
+    if (mustExpand)
+    {
+        for (unsigned i = 0; i < sig->numArgs; i++)
+        {
+            impPopStack();
+        }
+
+        return gtNewMustThrowException(helper, JITtype2varType(sig->retType), sig->retTypeClass);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
 /*****************************************************************************/
 
 GenTree* Compiler::impArrayAccessIntrinsic(
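
To make the dead-code behavior concrete, a hedged C# sketch: on a target without AdvSimd, get_IsSupported is mapped to NI_IsSupported_False and folds to a constant false, so the guarded branch is dropped entirely; a recursive managed fallback hit under mustExpand maps to NI_Throw_PlatformNotSupportedException instead.

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;

    static class IsSupportedFolding
    {
        public static int FirstElement(Vector128<int> v)
        {
            if (AdvSimd.IsSupported)
            {
                // Dead code on unsupported targets; dropped by the JIT.
                return AdvSimd.Extract(v, 0);
            }

            // Software fallback used when the check folds to false.
            return v.GetElement(0);
        }
    }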
diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h
index 2fa2691..ae19cd2 100644
@@ -317,6 +317,139 @@ private:
     void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
     void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
     void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
+
+    union VectorConstant {
+        int8_t   i8[32];
+        uint8_t  u8[32];
+        int16_t  i16[16];
+        uint16_t u16[16];
+        int32_t  i32[8];
+        uint32_t u32[8];
+        int64_t  i64[4];
+        uint64_t u64[4];
+        float    f32[8];
+        double   f64[4];
+    };
+
+    //----------------------------------------------------------------------------------------------
+    // ProcessArgForHWIntrinsicCreate: Processes an argument for the Lowering::LowerHWIntrinsicCreate method
+    //
+    //  Arguments:
+    //     arg      - The argument to process
+    //     argIdx   - The index of the argument being processed
+    //     vecCns   - The vector constant being constructed
+    //     baseType - The base type of the vector constant
+    //
+    //  Returns:
+    //     true if arg was a constant; otherwise, false
+    static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, VectorConstant& vecCns, var_types baseType)
+    {
+        switch (baseType)
+        {
+            case TYP_BYTE:
+            case TYP_UBYTE:
+            {
+                if (arg->IsCnsIntOrI())
+                {
+                    vecCns.i8[argIdx] = static_cast<int8_t>(arg->AsIntCon()->gtIconVal);
+                    return true;
+                }
+                else
+                {
+                    // We expect the VectorConstant to have been already zeroed
+                    assert(vecCns.i8[argIdx] == 0);
+                }
+                break;
+            }
+
+            case TYP_SHORT:
+            case TYP_USHORT:
+            {
+                if (arg->IsCnsIntOrI())
+                {
+                    vecCns.i16[argIdx] = static_cast<int16_t>(arg->AsIntCon()->gtIconVal);
+                    return true;
+                }
+                else
+                {
+                    // We expect the VectorConstant to have been already zeroed
+                    assert(vecCns.i16[argIdx] == 0);
+                }
+                break;
+            }
+
+            case TYP_INT:
+            case TYP_UINT:
+            {
+                if (arg->IsCnsIntOrI())
+                {
+                    vecCns.i32[argIdx] = static_cast<int32_t>(arg->AsIntCon()->gtIconVal);
+                    return true;
+                }
+                else
+                {
+                    // We expect the VectorConstant to have been already zeroed
+                    assert(vecCns.i32[argIdx] == 0);
+                }
+                break;
+            }
+
+            case TYP_LONG:
+            case TYP_ULONG:
+            {
+                if (arg->OperIs(GT_CNS_LNG))
+                {
+                    vecCns.i64[argIdx] = static_cast<int64_t>(arg->AsLngCon()->gtLconVal);
+                    return true;
+                }
+                else
+                {
+                    // We expect the VectorConstant to have been already zeroed
+                    assert(vecCns.i64[argIdx] == 0);
+                }
+                break;
+            }
+
+            case TYP_FLOAT:
+            {
+                if (arg->IsCnsFltOrDbl())
+                {
+                    vecCns.f32[argIdx] = static_cast<float>(arg->AsDblCon()->gtDconVal);
+                    return true;
+                }
+                else
+                {
+                    // We expect the VectorConstant to have been already zeroed
+                    // We check against the i32, rather than f32, to account for -0.0
+                    assert(vecCns.i32[argIdx] == 0);
+                }
+                break;
+            }
+
+            case TYP_DOUBLE:
+            {
+                if (arg->IsCnsFltOrDbl())
+                {
+                    vecCns.f64[argIdx] = static_cast<double>(arg->AsDblCon()->gtDconVal);
+                    return true;
+                }
+                else
+                {
+                    // We expect the VectorConstant to have been already zeroed
+                    // We check against the i64, rather than f64, to account for -0.0
+                    assert(vecCns.i64[argIdx] == 0);
+                }
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+
+        return false;
+    }
 #endif // FEATURE_HW_INTRINSICS
 
     // Utility functions
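
A rough C# analogue (illustrative only, not part of the commit) of how the VectorConstant buffer above is filled for TYP_INT elements: the buffer starts zeroed, matching the asserts in HandleArgForHWIntrinsicCreate, and each constant argument is written at its element offset.

    using System;

    static class VectorConstantSketch
    {
        static byte[] Build(ReadOnlySpan<int> elements)
        {
            var buffer = new byte[16]; // zero-initialized, like vecCns = {}
            for (int i = 0; i < elements.Length; i++)
            {
                // Write element i at byte offset i * sizeof(int), as vecCns.i32[i] does.
                BitConverter.TryWriteBytes(buffer.AsSpan(i * sizeof(int)), elements[i]);
            }
            return buffer;
        }
    }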
diff --git a/src/coreclr/src/jit/lowerarmarch.cpp b/src/coreclr/src/jit/lowerarmarch.cpp
index 7b7706e..d5272ab 100644
@@ -535,8 +535,255 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
         node->gtType = TYP_SIMD16;
     }
 
+    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+
+    switch (intrinsicId)
+    {
+        case NI_Vector64_Create:
+        case NI_Vector128_Create:
+        {
+            // We don't directly support the Vector64.Create or Vector128.Create methods in codegen
+            // and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate so we expect
+            // that the node is modified to either not be a HWIntrinsic node or that it is no longer
+            // the same intrinsic as when it came in.
+
+            LowerHWIntrinsicCreate(node);
+            assert(!node->OperIsHWIntrinsic() || (node->gtHWIntrinsicId != intrinsicId));
+            LowerNode(node);
+            return;
+        }
+
+        default:
+            break;
+    }
+
     ContainCheckHWIntrinsic(node);
 }
+
+//----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicCreate: Lowers a Vector64 or Vector128 Create call
+//
+//  Arguments:
+//     node - The hardware intrinsic node.
+//
+void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
+{
+    NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+    var_types      simdType    = node->gtType;
+    var_types      baseType    = node->gtSIMDBaseType;
+    unsigned       simdSize    = node->gtSIMDSize;
+    VectorConstant vecCns      = {};
+
+    if ((simdSize == 8) && (simdType == TYP_DOUBLE))
+    {
+        simdType = TYP_SIMD8;
+    }
+
+    assert(varTypeIsSIMD(simdType));
+    assert(varTypeIsArithmetic(baseType));
+    assert(simdSize != 0);
+
+    GenTreeArgList* argList = nullptr;
+    GenTree*        op1     = node->gtGetOp1();
+    GenTree*        op2     = node->gtGetOp2();
+
+    // Spare GenTrees to be used for the lowering logic below
+    // Defined upfront to avoid naming conflicts, etc...
+    GenTree* idx  = nullptr;
+    GenTree* tmp1 = nullptr;
+    GenTree* tmp2 = nullptr;
+    GenTree* tmp3 = nullptr;
+
+    assert(op1 != nullptr);
+
+    unsigned argCnt    = 0;
+    unsigned cnsArgCnt = 0;
+
+    if (op1->OperIsList())
+    {
+        assert(op2 == nullptr);
+
+        for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest())
+        {
+            if (HandleArgForHWIntrinsicCreate(argList->Current(), argCnt, vecCns, baseType))
+            {
+                cnsArgCnt += 1;
+            }
+            argCnt += 1;
+        }
+    }
+    else
+    {
+        if (HandleArgForHWIntrinsicCreate(op1, argCnt, vecCns, baseType))
+        {
+            cnsArgCnt += 1;
+        }
+        argCnt += 1;
+
+        if (op2 != nullptr)
+        {
+            if (HandleArgForHWIntrinsicCreate(op2, argCnt, vecCns, baseType))
+            {
+                cnsArgCnt += 1;
+            }
+            argCnt += 1;
+        }
+        else if (cnsArgCnt == 1)
+        {
+            // These intrinsics are meant to set the same value to every element
+            // so we'll just specially handle it here and copy it into the remaining
+            // indices.
+
+            for (unsigned i = 1; i < simdSize / genTypeSize(baseType); i++)
+            {
+                HandleArgForHWIntrinsicCreate(op1, i, vecCns, baseType);
+            }
+        }
+    }
+    assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(baseType))));
+
+    if (argCnt == cnsArgCnt)
+    {
+        if (op1->OperIsList())
+        {
+            for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest())
+            {
+                BlockRange().Remove(argList->Current());
+            }
+        }
+        else
+        {
+            BlockRange().Remove(op1);
+
+            if (op2 != nullptr)
+            {
+                BlockRange().Remove(op2);
+            }
+        }
+
+        CORINFO_FIELD_HANDLE hnd = comp->GetEmitter()->emitAnyConst(&vecCns, simdSize, emitDataAlignment::Required);
+        GenTree* clsVarAddr      = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr);
+        BlockRange().InsertBefore(node, clsVarAddr);
+
+        node->ChangeOper(GT_IND);
+        node->gtOp1 = clsVarAddr;
+
+        // TODO-ARM64-CQ: We should be able to modify at least the paths that use Insert to trivially support partial
+        // vector constants. With this, we can create a constant if say 50% of the inputs are also constant and just
+        // insert the non-constant values which should still allow some gains.
+
+        return;
+    }
+    else if (argCnt == 1)
+    {
+        // We have the following (where simd is simd8 or simd16):
+        //          /--*  op1  T
+        //   node = *  HWINTRINSIC   simd   T Create
+
+        // We will be constructing the following parts:
+        //           /--*  op1  T
+        //   node  = *  HWINTRINSIC   simd   T DuplicateToVector
+
+        // This is roughly the following managed code:
+        //   return AdvSimd.Arm64.DuplicateToVector(op1);
+
+        if (varTypeIsLong(baseType) || (baseType == TYP_DOUBLE))
+        {
+            node->gtHWIntrinsicId =
+                (simdType == TYP_SIMD8) ? NI_AdvSimd_Arm64_DuplicateToVector64 : NI_AdvSimd_Arm64_DuplicateToVector128;
+        }
+        else
+        {
+            node->gtHWIntrinsicId =
+                (simdType == TYP_SIMD8) ? NI_AdvSimd_DuplicateToVector64 : NI_AdvSimd_DuplicateToVector128;
+        }
+        return;
+    }
+
+    // We have the following (where simd is simd8 or simd16):
+    //          /--*  op1 T
+    //          +--*  ... T
+    //          +--*  opN T
+    //   node = *  HWINTRINSIC   simd   T Create
+
+    if (op1->OperIsList())
+    {
+        argList = op1->AsArgList();
+        op1     = argList->Current();
+        argList = argList->Rest();
+    }
+
+    // We will be constructing the following parts:
+    //          /--*  op1  T
+    //   tmp1 = *  HWINTRINSIC   simd8  T CreateScalarUnsafe
+    //   ...
+
+    // This is roughly the following managed code:
+    //   var tmp1 = Vector64.CreateScalarUnsafe(op1);
+    //   ...
+
+    NamedIntrinsic createScalarUnsafe =
+        (simdType == TYP_SIMD8) ? NI_Vector64_CreateScalarUnsafe : NI_Vector128_CreateScalarUnsafe;
+
+    tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, createScalarUnsafe, baseType, simdSize);
+    BlockRange().InsertAfter(op1, tmp1);
+    LowerNode(tmp1);
+
+    unsigned N   = 0;
+    GenTree* opN = nullptr;
+
+    for (N = 1; N < argCnt - 1; N++)
+    {
+        // We will be constructing the following parts:
+        //   ...
+        //   idx  =    CNS_INT       int    N
+        //          /--*  tmp1 simd
+        //          +--*  idx  int
+        //          +--*  opN  T
+        //   tmp1 = *  HWINTRINSIC   simd   T Insert
+        //   ...
+
+        // This is roughly the following managed code:
+        //   ...
+        //   tmp1 = AdvSimd.Insert(tmp1, N, opN);
+        //   ...
+
+        opN = argList->Current();
+
+        idx = comp->gtNewIconNode(N, TYP_INT);
+        BlockRange().InsertBefore(opN, idx);
+
+        tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, idx, opN, NI_AdvSimd_Insert, baseType, simdSize);
+        BlockRange().InsertAfter(opN, tmp1);
+        LowerNode(tmp1);
+
+        argList = argList->Rest();
+    }
+
+    assert(N == (argCnt - 1));
+
+    // We will be constructing the following parts:
+    //   idx  =    CNS_INT       int    N
+    //          /--*  tmp1 simd
+    //          +--*  idx  int
+    //          +--*  opN  T
+    //   node = *  HWINTRINSIC   simd   T Insert
+
+    // This is roughly the following managed code:
+    //   ...
+    //   tmp1 = AdvSimd.Insert(tmp1, N, opN);
+    //   ...
+
+    opN = (argCnt == 2) ? op2 : argList->Current();
+
+    idx = comp->gtNewIconNode(N, TYP_INT);
+    BlockRange().InsertBefore(opN, idx);
+
+    node->gtOp1 = comp->gtNewArgList(tmp1, idx, opN);
+    node->gtOp2 = nullptr;
+
+    node->gtHWIntrinsicId = NI_AdvSimd_Insert;
+}
 #endif // FEATURE_HW_INTRINSICS
 
 //------------------------------------------------------------------------
@@ -610,10 +857,12 @@ void Lowering::ContainCheckIndir(GenTreeIndir* indirNode)
     }
 #endif // FEATURE_SIMD
 
-    GenTree* addr          = indirNode->Addr();
-    bool     makeContained = true;
+    GenTree* addr = indirNode->Addr();
+
     if ((addr->OperGet() == GT_LEA) && IsSafeToContainMem(indirNode, addr))
     {
+        bool makeContained = true;
+
 #ifdef TARGET_ARM
         // ARM floating-point load/store doesn't support a form similar to integer
         // ldr Rdst, [Rbase + Roffset] with offset in a register. The only supported
@@ -637,12 +886,23 @@ void Lowering::ContainCheckIndir(GenTreeIndir* indirNode)
                 }
             }
         }
-#endif
+#endif // TARGET_ARM
+
         if (makeContained)
         {
             MakeSrcContained(indirNode, addr);
         }
     }
+#ifdef TARGET_ARM64
+    else if (addr->OperGet() == GT_CLS_VAR_ADDR)
+    {
+        // These nodes go into an addr mode:
+        // - GT_CLS_VAR_ADDR turns into a constant.
+
+        // make this contained, it turns into a constant that goes into an addr mode
+        MakeSrcContained(indirNode, addr);
+    }
+#endif // TARGET_ARM64
 }
 
 //------------------------------------------------------------------------
@@ -933,12 +1193,11 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             }
             break;
 
-        case NI_Vector64_Create:
-        case NI_Vector128_Create:
         case NI_Vector64_CreateScalarUnsafe:
         case NI_Vector128_CreateScalarUnsafe:
         case NI_AdvSimd_DuplicateToVector64:
         case NI_AdvSimd_DuplicateToVector128:
+        case NI_AdvSimd_Arm64_DuplicateToVector64:
         case NI_AdvSimd_Arm64_DuplicateToVector128:
             if (intrin.op1->IsCnsIntOrI())
             {
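
Pulling the per-step "roughly the following managed code" comments together, the non-constant path for a four-element Vector128<int> Create behaves like this hedged sketch (assuming AdvSimd support):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;

    static class NonConstantLowering
    {
        public static Vector128<int> Create4(int e0, int e1, int e2, int e3)
        {
            // CreateScalarUnsafe seeds element 0; the upper elements are undefined.
            Vector128<int> tmp1 = Vector128.CreateScalarUnsafe(e0);

            // Each remaining element is inserted in turn; the final Insert
            // replaces the original Create node itself.
            tmp1 = AdvSimd.Insert(tmp1, 1, e1);
            tmp1 = AdvSimd.Insert(tmp1, 2, e2);
            return AdvSimd.Insert(tmp1, 3, e3);
        }
    }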
diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp
index 14672d1..f2a66d1 100644
@@ -1100,141 +1100,8 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
     ContainCheckHWIntrinsic(node);
 }
 
-union VectorConstant {
-    int8_t   i8[32];
-    uint8_t  u8[32];
-    int16_t  i16[16];
-    uint16_t u16[16];
-    int32_t  i32[8];
-    uint32_t u32[8];
-    int64_t  i64[4];
-    uint64_t u64[4];
-    float    f32[8];
-    double   f64[4];
-};
-
-//----------------------------------------------------------------------------------------------
-// ProcessArgForHWIntrinsicCreate: Processes an argument for the Lowering::LowerHWIntrinsicCreate method
-//
-//  Arguments:
-//     arg      - The argument to process
-//     argIdx   - The index of the argument being processed
-//     vecCns   - The vector constant being constructed
-//     baseType - The base type of the vector constant
-//
-//  Returns:
-//     true if arg was a constant; otherwise, false
-static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, VectorConstant& vecCns, var_types baseType)
-{
-    switch (baseType)
-    {
-        case TYP_BYTE:
-        case TYP_UBYTE:
-        {
-            if (arg->IsCnsIntOrI())
-            {
-                vecCns.i8[argIdx] = static_cast<int8_t>(arg->AsIntCon()->gtIconVal);
-                return true;
-            }
-            else
-            {
-                // We expect the VectorConstant to have been already zeroed
-                assert(vecCns.i8[argIdx] == 0);
-            }
-            break;
-        }
-
-        case TYP_SHORT:
-        case TYP_USHORT:
-        {
-            if (arg->IsCnsIntOrI())
-            {
-                vecCns.i16[argIdx] = static_cast<int16_t>(arg->AsIntCon()->gtIconVal);
-                return true;
-            }
-            else
-            {
-                // We expect the VectorConstant to have been already zeroed
-                assert(vecCns.i16[argIdx] == 0);
-            }
-            break;
-        }
-
-        case TYP_INT:
-        case TYP_UINT:
-        {
-            if (arg->IsCnsIntOrI())
-            {
-                vecCns.i32[argIdx] = static_cast<int32_t>(arg->AsIntCon()->gtIconVal);
-                return true;
-            }
-            else
-            {
-                // We expect the VectorConstant to have been already zeroed
-                assert(vecCns.i32[argIdx] == 0);
-            }
-            break;
-        }
-
-        case TYP_LONG:
-        case TYP_ULONG:
-        {
-            if (arg->OperIs(GT_CNS_LNG))
-            {
-                vecCns.i64[argIdx] = static_cast<int64_t>(arg->AsLngCon()->gtLconVal);
-                return true;
-            }
-            else
-            {
-                // We expect the VectorConstant to have been already zeroed
-                assert(vecCns.i64[argIdx] == 0);
-            }
-            break;
-        }
-
-        case TYP_FLOAT:
-        {
-            if (arg->IsCnsFltOrDbl())
-            {
-                vecCns.f32[argIdx] = static_cast<float>(arg->AsDblCon()->gtDconVal);
-                return true;
-            }
-            else
-            {
-                // We expect the VectorConstant to have been already zeroed
-                // We check against the i32, rather than f32, to account for -0.0
-                assert(vecCns.i32[argIdx] == 0);
-            }
-            break;
-        }
-
-        case TYP_DOUBLE:
-        {
-            if (arg->IsCnsFltOrDbl())
-            {
-                vecCns.f64[argIdx] = static_cast<double>(arg->AsDblCon()->gtDconVal);
-                return true;
-            }
-            else
-            {
-                // We expect the VectorConstant to have been already zeroed
-                // We check against the i64, rather than f64, to account for -0.0
-                assert(vecCns.i64[argIdx] == 0);
-            }
-            break;
-        }
-
-        default:
-        {
-            unreached();
-        }
-    }
-
-    return false;
-}
-
 //----------------------------------------------------------------------------------------------
-// Lowering::LowerHWIntrinsicCreate: Lowers a Vector64, Vector128, or Vector256 Create call
+// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 Create call
 //
 //  Arguments:
 //     node - The hardware intrinsic node.
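
The VectorConstant union and its argument handler are removed from the xarch lowering here; given the lower.h changes in this commit, they presumably move to shared code so the new ARM64 lowering can reuse them. One detail worth calling out is that the float and double cases assert against the integer view of the buffer: -0.0 compares equal to +0.0 but has a nonzero bit pattern, so only the integer view can prove a slot is still zeroed. A small C# demonstration of that fact:

    using System;

    static class NegativeZeroDemo
    {
        static void Main()
        {
            float negZero = -0.0f;
            Console.WriteLine(negZero == 0.0f);                          // True: equality cannot see the sign bit
            Console.WriteLine(BitConverter.SingleToInt32Bits(negZero));  // -2147483648, i.e. 0x80000000
        }
    }
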
index fab94aa..af75578 100644 (file)
@@ -74,23 +74,32 @@ int LinearScan::BuildIndir(GenTreeIndir* indirTree)
 
     if (addr->isContained())
     {
-        assert(addr->OperGet() == GT_LEA);
-        GenTreeAddrMode* lea = addr->AsAddrMode();
-        index                = lea->Index();
-        cns                  = lea->Offset();
-
-        // On ARM we may need a single internal register
-        // (when both conditions are true then we still only need a single internal register)
-        if ((index != nullptr) && (cns != 0))
+        if (addr->OperGet() == GT_LEA)
         {
-            // ARM does not support both Index and offset so we need an internal register
-            buildInternalIntRegisterDefForNode(indirTree);
+            GenTreeAddrMode* lea = addr->AsAddrMode();
+            index                = lea->Index();
+            cns                  = lea->Offset();
+
+            // On ARM we may need a single internal register
+            // (even when both of the conditions below are true, one internal register suffices)
+            if ((index != nullptr) && (cns != 0))
+            {
+                // ARM does not support both an index and an offset, so we need an internal register
+                buildInternalIntRegisterDefForNode(indirTree);
+            }
+            else if (!emitter::emitIns_valid_imm_for_ldst_offset(cns, emitTypeSize(indirTree)))
+            {
+                // This offset can't be contained in the ldr/str instruction, so we need an internal register
+                buildInternalIntRegisterDefForNode(indirTree);
+            }
         }
-        else if (!emitter::emitIns_valid_imm_for_ldst_offset(cns, emitTypeSize(indirTree)))
+#ifdef TARGET_ARM64
+        else if (addr->OperGet() == GT_CLS_VAR_ADDR)
         {
-            // This offset can't be contained in the ldr/str instruction, so we need an internal register
+            // Reserve an int register to load the constant from memory (IF_LARGELDC)
             buildInternalIntRegisterDefForNode(indirTree);
         }
+#endif // TARGET_ARM64
     }
 
 #ifdef FEATURE_SIMD
index d105eab..457d434 100644 (file)
@@ -22,11 +22,14 @@ enum NamedIntrinsic : unsigned short
     NI_System_Type_get_IsValueType,
     NI_System_Type_IsAssignableFrom,
 
-#ifdef FEATURE_HW_INTRINSICS
+    // These are used by the hardware intrinsics, but are defined more generally
+    // to allow dead code elimination and to handle the recursion case even on
+    // platforms where FEATURE_HW_INTRINSICS is not defined
+
     NI_IsSupported_True,
     NI_IsSupported_False,
     NI_Throw_PlatformNotSupportedException,
 
+#ifdef FEATURE_HW_INTRINSICS
     NI_HW_INTRINSIC_START,
 #if defined(TARGET_XARCH)
 #define HARDWARE_INTRINSIC(isa, name, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag)           \
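
Hoisting these three IDs out of the FEATURE_HW_INTRINSICS block means every platform can fold the IsSupported guards and recognize the recursive-call pattern the Create methods now use. A hedged C# sketch of that pattern (HypotheticalCreate and its fallback are stand-ins, not BCL members):

    using System.Runtime.CompilerServices;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;
    using System.Runtime.Intrinsics.X86;

    static class RecursionSketch
    {
        static unsafe Vector128<int> HypotheticalCreate(int value)
        {
            if (Sse2.IsSupported || AdvSimd.IsSupported)
            {
                // The JIT treats this recursive call as the intrinsic itself and
                // expands it in place, so it never actually recurses at runtime.
                return HypotheticalCreate(value);
            }

            // When both guards fold to NI_IsSupported_False, this branch is all
            // that survives; a mustExpand call on an unsupported platform is
            // instead rewritten to throw PlatformNotSupportedException.
            return SoftwareFallback(value);

            static Vector128<int> SoftwareFallback(int value)
            {
                int* pResult = stackalloc int[4] { value, value, value, value };
                return Unsafe.AsRef<Vector128<int>>(pResult);
            }
        }
    }
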
index db15743..5caec48 100644 (file)
@@ -388,7 +388,7 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector128<long> Create(long value)
         {
-            if (Sse2.X64.IsSupported || AdvSimd.IsSupported)
+            if (Sse2.X64.IsSupported || AdvSimd.Arm64.IsSupported)
             {
                 return Create(value);
             }
@@ -546,7 +546,7 @@ namespace System.Runtime.Intrinsics
         [CLSCompliant(false)]
         public static unsafe Vector128<ulong> Create(ulong value)
         {
-            if (Sse2.X64.IsSupported || AdvSimd.IsSupported)
+            if (Sse2.X64.IsSupported || AdvSimd.Arm64.IsSupported)
             {
                 return Create(value);
             }
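
Note the asymmetry: only the long and ulong splats move their ARM guard to AdvSimd.Arm64.IsSupported, since the 64-bit DuplicateToVector128 overloads are exposed on the Arm64 nested class rather than on AdvSimd itself. A short usage sketch (the equivalence is assumed, based on which overloads exist):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;

    static class LongSplatSketch
    {
        static void Demo()
        {
            Vector128<long> viaCreate = Vector128.Create(42L);
            // ARM64-only: the 64-bit duplicate lives on AdvSimd.Arm64.
            Vector128<long> viaDup = AdvSimd.Arm64.DuplicateToVector128(42L);
        }
    }
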
@@ -587,32 +587,10 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector128<byte> Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7, byte e8, byte e9, byte e10, byte e11, byte e12, byte e13, byte e14, byte e15)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<byte> result = CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                result = AdvSimd.Insert(result, 3, e3);
-                result = AdvSimd.Insert(result, 4, e4);
-                result = AdvSimd.Insert(result, 5, e5);
-                result = AdvSimd.Insert(result, 6, e6);
-                result = AdvSimd.Insert(result, 7, e7);
-                result = AdvSimd.Insert(result, 8, e8);
-                result = AdvSimd.Insert(result, 9, e9);
-                result = AdvSimd.Insert(result, 10, e10);
-                result = AdvSimd.Insert(result, 11, e11);
-                result = AdvSimd.Insert(result, 12, e12);
-                result = AdvSimd.Insert(result, 13, e13);
-                result = AdvSimd.Insert(result, 14, e14);
-                return AdvSimd.Insert(result, 15, e15);
-            }
-#endif
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
 
@@ -650,18 +628,10 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector128<double> Create(double e0, double e1)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<double> result = CreateScalarUnsafe(e0);
-                return AdvSimd.Insert(result, 1, e1);
-            }
-#endif
 
             return SoftwareFallback(e0, e1);
 
@@ -691,24 +661,10 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector128<short> Create(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<short> result = CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                result = AdvSimd.Insert(result, 3, e3);
-                result = AdvSimd.Insert(result, 4, e4);
-                result = AdvSimd.Insert(result, 5, e5);
-                result = AdvSimd.Insert(result, 6, e6);
-                return AdvSimd.Insert(result, 7, e7);
-            }
-#endif
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
 
@@ -740,20 +696,10 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector128<int> Create(int e0, int e1, int e2, int e3)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1, e2, e3);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<int> result = CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                return AdvSimd.Insert(result, 3, e3);
-            }
-#endif
 
             return SoftwareFallback(e0, e1, e2, e3);
 
@@ -779,18 +725,10 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector128<long> Create(long e0, long e1)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.X64.IsSupported)
+            if (Sse2.X64.IsSupported || AdvSimd.Arm64.IsSupported)
             {
                 return Create(e0, e1);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<long> result = CreateScalarUnsafe(e0);
-                return AdvSimd.Insert(result, 1, e1);
-            }
-#endif
 
             return SoftwareFallback(e0, e1);
 
@@ -829,32 +767,10 @@ namespace System.Runtime.Intrinsics
         [CLSCompliant(false)]
         public static unsafe Vector128<sbyte> Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7, sbyte e8, sbyte e9, sbyte e10, sbyte e11, sbyte e12, sbyte e13, sbyte e14, sbyte e15)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<sbyte> result = CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                result = AdvSimd.Insert(result, 3, e3);
-                result = AdvSimd.Insert(result, 4, e4);
-                result = AdvSimd.Insert(result, 5, e5);
-                result = AdvSimd.Insert(result, 6, e6);
-                result = AdvSimd.Insert(result, 7, e7);
-                result = AdvSimd.Insert(result, 8, e8);
-                result = AdvSimd.Insert(result, 9, e9);
-                result = AdvSimd.Insert(result, 10, e10);
-                result = AdvSimd.Insert(result, 11, e11);
-                result = AdvSimd.Insert(result, 12, e12);
-                result = AdvSimd.Insert(result, 13, e13);
-                result = AdvSimd.Insert(result, 14, e14);
-                return AdvSimd.Insert(result, 15, e15);
-            }
-#endif
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
 
@@ -894,20 +810,10 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector128<float> Create(float e0, float e1, float e2, float e3)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse.IsSupported)
+            if (Sse.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1, e2, e3);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<float> result = CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                return AdvSimd.Insert(result, 3, e3);
-            }
-#endif
 
             return SoftwareFallback(e0, e1, e2, e3);
 
@@ -940,24 +846,10 @@ namespace System.Runtime.Intrinsics
         [CLSCompliant(false)]
         public static unsafe Vector128<ushort> Create(ushort e0, ushort e1, ushort e2, ushort e3, ushort e4, ushort e5, ushort e6, ushort e7)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<ushort> result = CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                result = AdvSimd.Insert(result, 3, e3);
-                result = AdvSimd.Insert(result, 4, e4);
-                result = AdvSimd.Insert(result, 5, e5);
-                result = AdvSimd.Insert(result, 6, e6);
-                return AdvSimd.Insert(result, 7, e7);
-            }
-#endif
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
 
@@ -990,20 +882,10 @@ namespace System.Runtime.Intrinsics
         [CLSCompliant(false)]
         public static unsafe Vector128<uint> Create(uint e0, uint e1, uint e2, uint e3)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.IsSupported)
+            if (Sse2.IsSupported || AdvSimd.IsSupported)
             {
                 return Create(e0, e1, e2, e3);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<uint> result = CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                return AdvSimd.Insert(result, 3, e3);
-            }
-#endif
 
             return SoftwareFallback(e0, e1, e2, e3);
 
@@ -1030,18 +912,10 @@ namespace System.Runtime.Intrinsics
         [CLSCompliant(false)]
         public static unsafe Vector128<ulong> Create(ulong e0, ulong e1)
         {
-#if !TARGET_ARM && !TARGET_ARM64
-            if (Sse2.X64.IsSupported)
+            if (Sse2.X64.IsSupported || AdvSimd.Arm64.IsSupported)
             {
                 return Create(e0, e1);
             }
-#else
-            if (AdvSimd.IsSupported)
-            {
-                Vector128<ulong> result = CreateScalarUnsafe(e0);
-                return AdvSimd.Insert(result, 1, e1);
-            }
-#endif
 
             return SoftwareFallback(e0, e1);
 
index 56cf62d..f376ebf 100644 (file)
@@ -160,19 +160,29 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector64<byte> Create(byte value)
         {
-            byte* pResult = stackalloc byte[8]
-            {
-                value,
-                value,
-                value,
-                value,
-                value,
-                value,
-                value,
-                value,
-            };
+            if (AdvSimd.IsSupported)
+            {
+                return Create(value);
+            }
 
-            return Unsafe.AsRef<Vector64<byte>>(pResult);
+            return SoftwareFallback(value);
+
+            static Vector64<byte> SoftwareFallback(byte value)
+            {
+                byte* pResult = stackalloc byte[8]
+                {
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                };
+
+                return Unsafe.AsRef<Vector64<byte>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Double}" /> instance with all elements initialized to the specified value.</summary>
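
Vector64 gets the same restructuring as Vector128: the splat overloads gain a guarded recursive call, and the original stackalloc code is demoted to a local SoftwareFallback function that only runs when AdvSimd is unavailable. On ARM64 the recursive call is assumed to expand to the AdvSimd duplicate:

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;

    static class Vector64SplatSketch
    {
        static void Demo()
        {
            Vector64<byte> viaCreate = Vector64.Create((byte)0x80);
            // Expected to produce the same 8-byte splat on ARM64.
            Vector64<byte> viaDup = AdvSimd.DuplicateToVector64((byte)0x80);
        }
    }
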
@@ -181,7 +191,17 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector64<double> Create(double value)
         {
-            return Unsafe.As<double, Vector64<double>>(ref value);
+            if (AdvSimd.IsSupported)
+            {
+                return Create(value);
+            }
+
+            return SoftwareFallback(value);
+
+            static Vector64<double> SoftwareFallback(double value)
+            {
+                return Unsafe.As<double, Vector64<double>>(ref value);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Int16}" /> instance with all elements initialized to the specified value.</summary>
@@ -191,15 +211,25 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector64<short> Create(short value)
         {
-            short* pResult = stackalloc short[4]
+            if (AdvSimd.IsSupported)
             {
-                value,
-                value,
-                value,
-                value,
-            };
+                return Create(value);
+            }
 
-            return Unsafe.AsRef<Vector64<short>>(pResult);
+            return SoftwareFallback(value);
+
+            static Vector64<short> SoftwareFallback(short value)
+            {
+                short* pResult = stackalloc short[4]
+                {
+                    value,
+                    value,
+                    value,
+                    value,
+                };
+
+                return Unsafe.AsRef<Vector64<short>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Int32}" /> instance with all elements initialized to the specified value.</summary>
@@ -209,13 +239,23 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector64<int> Create(int value)
         {
-            int* pResult = stackalloc int[2]
+            if (AdvSimd.IsSupported)
             {
-                value,
-                value,
-            };
+                return Create(value);
+            }
 
-            return Unsafe.AsRef<Vector64<int>>(pResult);
+            return SoftwareFallback(value);
+
+            static Vector64<int> SoftwareFallback(int value)
+            {
+                int* pResult = stackalloc int[2]
+                {
+                    value,
+                    value,
+                };
+
+                return Unsafe.AsRef<Vector64<int>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Int64}" /> instance with all elements initialized to the specified value.</summary>
@@ -224,30 +264,50 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector64<long> Create(long value)
         {
-            return Unsafe.As<long, Vector64<long>>(ref value);
+            if (AdvSimd.Arm64.IsSupported)
+            {
+                return Create(value);
+            }
+
+            return SoftwareFallback(value);
+
+            static Vector64<long> SoftwareFallback(long value)
+            {
+                return Unsafe.As<long, Vector64<long>>(ref value);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{SByte}" /> instance with all elements initialized to the specified value.</summary>
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_set1_pi8</remarks>
         /// <returns>A new <see cref="Vector64{SByte}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [CLSCompliant(false)]
         [Intrinsic]
+        [CLSCompliant(false)]
         public static unsafe Vector64<sbyte> Create(sbyte value)
         {
-            sbyte* pResult = stackalloc sbyte[8]
+            if (AdvSimd.IsSupported)
             {
-                value,
-                value,
-                value,
-                value,
-                value,
-                value,
-                value,
-                value,
-            };
+                return Create(value);
+            }
 
-            return Unsafe.AsRef<Vector64<sbyte>>(pResult);
+            return SoftwareFallback(value);
+
+            static Vector64<sbyte> SoftwareFallback(sbyte value)
+            {
+                sbyte* pResult = stackalloc sbyte[8]
+                {
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                    value,
+                };
+
+                return Unsafe.AsRef<Vector64<sbyte>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Single}" /> instance with all elements initialized to the specified value.</summary>
@@ -256,59 +316,99 @@ namespace System.Runtime.Intrinsics
         [Intrinsic]
         public static unsafe Vector64<float> Create(float value)
         {
-            float* pResult = stackalloc float[2]
+            if (AdvSimd.IsSupported)
             {
-                value,
-                value,
-            };
+                return Create(value);
+            }
 
-            return Unsafe.AsRef<Vector64<float>>(pResult);
+            return SoftwareFallback(value);
+
+            static Vector64<float> SoftwareFallback(float value)
+            {
+                float* pResult = stackalloc float[2]
+                {
+                    value,
+                    value,
+                };
+
+                return Unsafe.AsRef<Vector64<float>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{UInt16}" /> instance with all elements initialized to the specified value.</summary>
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_set1_pi16</remarks>
         /// <returns>A new <see cref="Vector64{UInt16}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [CLSCompliant(false)]
         [Intrinsic]
+        [CLSCompliant(false)]
         public static unsafe Vector64<ushort> Create(ushort value)
         {
-            ushort* pResult = stackalloc ushort[4]
+            if (AdvSimd.IsSupported)
             {
-                value,
-                value,
-                value,
-                value,
-            };
+                return Create(value);
+            }
 
-            return Unsafe.AsRef<Vector64<ushort>>(pResult);
+            return SoftwareFallback(value);
+
+            static Vector64<ushort> SoftwareFallback(ushort value)
+            {
+                ushort* pResult = stackalloc ushort[4]
+                {
+                    value,
+                    value,
+                    value,
+                    value,
+                };
+
+                return Unsafe.AsRef<Vector64<ushort>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{UInt32}" /> instance with all elements initialized to the specified value.</summary>
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_set1_pi32</remarks>
         /// <returns>A new <see cref="Vector64{UInt32}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [CLSCompliant(false)]
         [Intrinsic]
+        [CLSCompliant(false)]
         public static unsafe Vector64<uint> Create(uint value)
         {
-            uint* pResult = stackalloc uint[2]
+            if (AdvSimd.IsSupported)
             {
-                value,
-                value,
-            };
+                return Create(value);
+            }
 
-            return Unsafe.AsRef<Vector64<uint>>(pResult);
+            return SoftwareFallback(value);
+
+            static Vector64<uint> SoftwareFallback(uint value)
+            {
+                uint* pResult = stackalloc uint[2]
+                {
+                    value,
+                    value,
+                };
+
+                return Unsafe.AsRef<Vector64<uint>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{UInt64}" /> instance with all elements initialized to the specified value.</summary>
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <returns>A new <see cref="Vector64{UInt64}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [CLSCompliant(false)]
         [Intrinsic]
+        [CLSCompliant(false)]
         public static unsafe Vector64<ulong> Create(ulong value)
         {
-            return Unsafe.As<ulong, Vector64<ulong>>(ref value);
+            if (AdvSimd.Arm64.IsSupported)
+            {
+                return Create(value);
+            }
+
+            return SoftwareFallback(value);
+
+            static Vector64<ulong> SoftwareFallback(ulong value)
+            {
+                return Unsafe.As<ulong, Vector64<ulong>>(ref value);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Byte}" /> instance with each element initialized to the corresponding specified value.</summary>
@@ -322,33 +422,32 @@ namespace System.Runtime.Intrinsics
         /// <param name="e7">The value that element 7 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_setr_pi8</remarks>
         /// <returns>A new <see cref="Vector64{Byte}" /> with each element initialized to the corresponding specified value.</returns>
+        [Intrinsic]
         public static unsafe Vector64<byte> Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7)
         {
             if (AdvSimd.IsSupported)
             {
-                Vector64<byte> result = Vector64.CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                result = AdvSimd.Insert(result, 3, e3);
-                result = AdvSimd.Insert(result, 4, e4);
-                result = AdvSimd.Insert(result, 5, e5);
-                result = AdvSimd.Insert(result, 6, e6);
-                return AdvSimd.Insert(result, 7, e7);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
 
-            byte* pResult = stackalloc byte[8]
+            return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
+
+            static Vector64<byte> SoftwareFallback(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7)
             {
-                e0,
-                e1,
-                e2,
-                e3,
-                e4,
-                e5,
-                e6,
-                e7,
-            };
+                byte* pResult = stackalloc byte[8]
+                {
+                    e0,
+                    e1,
+                    e2,
+                    e3,
+                    e4,
+                    e5,
+                    e6,
+                    e7,
+                };
 
-            return Unsafe.AsRef<Vector64<byte>>(pResult);
+                return Unsafe.AsRef<Vector64<byte>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Int16}" /> instance with each element initialized to the corresponding specified value.</summary>
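
The multi-element Vector64 overloads are newly marked [Intrinsic]: previously they always executed the hand-written managed AdvSimd.Insert chain, whereas now the JIT expands the guarded recursive call during lowering and, as with Vector128, can fold all-constant argument lists. A sketch exercising the non-constant path:

    using System.Runtime.Intrinsics;

    static class Vector64PackSketch
    {
        static Vector64<short> Pack(short a, short b, short c, short d)
        {
            // Variable arguments: the JIT now chooses the insert sequence in
            // lowering instead of running the old managed Insert chain.
            return Vector64.Create(a, b, c, d);
        }
    }
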
@@ -358,25 +457,28 @@ namespace System.Runtime.Intrinsics
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_setr_pi16</remarks>
         /// <returns>A new <see cref="Vector64{Int16}" /> with each element initialized to the corresponding specified value.</returns>
+        [Intrinsic]
         public static unsafe Vector64<short> Create(short e0, short e1, short e2, short e3)
         {
             if (AdvSimd.IsSupported)
             {
-                Vector64<short> result = Vector64.CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                return AdvSimd.Insert(result, 3, e3);
+                return Create(e0, e1, e2, e3);
             }
 
-            short* pResult = stackalloc short[4]
+            return SoftwareFallback(e0, e1, e2, e3);
+
+            static Vector64<short> SoftwareFallback(short e0, short e1, short e2, short e3)
             {
-                e0,
-                e1,
-                e2,
-                e3,
-            };
+                short* pResult = stackalloc short[4]
+                {
+                    e0,
+                    e1,
+                    e2,
+                    e3,
+                };
 
-            return Unsafe.AsRef<Vector64<short>>(pResult);
+                return Unsafe.AsRef<Vector64<short>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Int32}" /> instance with each element initialized to the corresponding specified value.</summary>
@@ -384,21 +486,26 @@ namespace System.Runtime.Intrinsics
         /// <param name="e1">The value that element 1 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_setr_pi32</remarks>
         /// <returns>A new <see cref="Vector64{Int32}" /> with each element initialized to the corresponding specified value.</returns>
+        [Intrinsic]
         public static unsafe Vector64<int> Create(int e0, int e1)
         {
             if (AdvSimd.IsSupported)
             {
-                Vector64<int> result = Vector64.CreateScalarUnsafe(e0);
-                return AdvSimd.Insert(result, 1, e1);
+                return Create(e0, e1);
             }
 
-            int* pResult = stackalloc int[2]
+            return SoftwareFallback(e0, e1);
+
+            static Vector64<int> SoftwareFallback(int e0, int e1)
             {
-                e0,
-                e1,
-            };
+                int* pResult = stackalloc int[2]
+                {
+                    e0,
+                    e1,
+                };
 
-            return Unsafe.AsRef<Vector64<int>>(pResult);
+                return Unsafe.AsRef<Vector64<int>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{SByte}" /> instance with each element initialized to the corresponding specified value.</summary>
@@ -412,55 +519,59 @@ namespace System.Runtime.Intrinsics
         /// <param name="e7">The value that element 7 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_setr_pi8</remarks>
         /// <returns>A new <see cref="Vector64{SByte}" /> with each element initialized to the corresponding specified value.</returns>
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector64<sbyte> Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7)
         {
             if (AdvSimd.IsSupported)
             {
-                Vector64<sbyte> result = Vector64.CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                result = AdvSimd.Insert(result, 3, e3);
-                result = AdvSimd.Insert(result, 4, e4);
-                result = AdvSimd.Insert(result, 5, e5);
-                result = AdvSimd.Insert(result, 6, e6);
-                return AdvSimd.Insert(result, 7, e7);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
 
-            sbyte* pResult = stackalloc sbyte[8]
+            return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
+
+            static Vector64<sbyte> SoftwareFallback(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7)
             {
-                e0,
-                e1,
-                e2,
-                e3,
-                e4,
-                e5,
-                e6,
-                e7,
-            };
+                sbyte* pResult = stackalloc sbyte[8]
+                {
+                    e0,
+                    e1,
+                    e2,
+                    e3,
+                    e4,
+                    e5,
+                    e6,
+                    e7,
+                };
 
-            return Unsafe.AsRef<Vector64<sbyte>>(pResult);
+                return Unsafe.AsRef<Vector64<sbyte>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Single}" /> instance with each element initialized to the corresponding specified value.</summary>
         /// <param name="e0">The value that element 0 will be initialized to.</param>
         /// <param name="e1">The value that element 1 will be initialized to.</param>
         /// <returns>A new <see cref="Vector64{Single}" /> with each element initialized to the corresponding specified value.</returns>
+        [Intrinsic]
         public static unsafe Vector64<float> Create(float e0, float e1)
         {
             if (AdvSimd.IsSupported)
             {
-                Vector64<float> result = Vector64.CreateScalarUnsafe(e0);
-                return AdvSimd.Insert(result, 1, e1);
+                return Create(e0, e1);
             }
 
-            float* pResult = stackalloc float[2]
+            return SoftwareFallback(e0, e1);
+
+            static Vector64<float> SoftwareFallback(float e0, float e1)
             {
-                e0,
-                e1,
-            };
+                float* pResult = stackalloc float[2]
+                {
+                    e0,
+                    e1,
+                };
 
-            return Unsafe.AsRef<Vector64<float>>(pResult);
+                return Unsafe.AsRef<Vector64<float>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{UInt16}" /> instance with each element initialized to the corresponding specified value.</summary>
@@ -470,26 +581,29 @@ namespace System.Runtime.Intrinsics
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_setr_pi16</remarks>
         /// <returns>A new <see cref="Vector64{UInt16}" /> with each element initialized to the corresponding specified value.</returns>
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector64<ushort> Create(ushort e0, ushort e1, ushort e2, ushort e3)
         {
             if (AdvSimd.IsSupported)
             {
-                Vector64<ushort> result = Vector64.CreateScalarUnsafe(e0);
-                result = AdvSimd.Insert(result, 1, e1);
-                result = AdvSimd.Insert(result, 2, e2);
-                return AdvSimd.Insert(result, 3, e3);
+                return Create(e0, e1, e2, e3);
             }
 
-            ushort* pResult = stackalloc ushort[4]
+            return SoftwareFallback(e0, e1, e2, e3);
+
+            static Vector64<ushort> SoftwareFallback(ushort e0, ushort e1, ushort e2, ushort e3)
             {
-                e0,
-                e1,
-                e2,
-                e3,
-            };
+                ushort* pResult = stackalloc ushort[4]
+                {
+                    e0,
+                    e1,
+                    e2,
+                    e3,
+                };
 
-            return Unsafe.AsRef<Vector64<ushort>>(pResult);
+                return Unsafe.AsRef<Vector64<ushort>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{UInt32}" /> instance with each element initialized to the corresponding specified value.</summary>
@@ -497,22 +611,27 @@ namespace System.Runtime.Intrinsics
         /// <param name="e1">The value that element 1 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m64 _mm_setr_pi32</remarks>
         /// <returns>A new <see cref="Vector64{UInt32}" /> with each element initialized to the corresponding specified value.</returns>
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector64<uint> Create(uint e0, uint e1)
         {
             if (AdvSimd.IsSupported)
             {
-                Vector64<uint> result = Vector64.CreateScalarUnsafe(e0);
-                return AdvSimd.Insert(result, 1, e1);
+                return Create(e0, e1);
             }
 
-            uint* pResult = stackalloc uint[2]
+            return SoftwareFallback(e0, e1);
+
+            static Vector64<uint> SoftwareFallback(uint e0, uint e1)
             {
-                e0,
-                e1,
-            };
+                uint* pResult = stackalloc uint[2]
+                {
+                    e0,
+                    e1,
+                };
 
-            return Unsafe.AsRef<Vector64<uint>>(pResult);
+                return Unsafe.AsRef<Vector64<uint>>(pResult);
+            }
         }
 
         /// <summary>Creates a new <see cref="Vector64{Byte}" /> instance with the first element initialized to the specified value and the remaining elements initialized to zero.</summary>