From afc39ca80a16287a7f4d67a19364a1a9e0b504f2 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 11 Dec 2018 13:11:00 -0800 Subject: [PATCH] Moving the remaining Vector128/256 helper intrinsics to be implemented using other intrinsics (dotnet/coreclr#21451) * Updating Vector256.Create(V128, V128) to be implemented using other intrinsics * Updating Vector128.Create(T, ...) and Vector256.Create(T, ...) to be implemented using other intrinsics * Renaming CreateSoftware to SoftwareFallback and adding clarifying comments to the helper method code * Fixing Vector128.Create(int, int, int, int) to properly consume the elements for the Sse2 codepath Commit migrated from https://github.com/dotnet/coreclr/commit/aaa00748a9029dc23a3b5455e81a3133d91afe56 --- .../src/System/Runtime/Intrinsics/Vector128.cs | 500 ++++++++++++--- .../src/System/Runtime/Intrinsics/Vector256.cs | 688 ++++++++++++++------- .../src/System/Runtime/Intrinsics/Vector256_1.cs | 9 + 3 files changed, 876 insertions(+), 321 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index b877bbe..a57f999 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -43,6 +43,9 @@ namespace System.Runtime.Intrinsics if (Sse2.IsSupported) { + // We first unpack as bytes to duplicate value into the lower 2 bytes, then we treat it as a ushort and unpack again to duplicate those + // bits into the lower 2 words, we can finally treat it as a uint and shuffle the lower dword to duplicate value across the entire result + Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
> result = Sse2.UnpackLow(result.AsUInt16(), result.AsUInt16()).AsByte(); // < v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > @@ -89,8 +92,11 @@ namespace System.Runtime.Intrinsics return Sse3.MoveAndDuplicate(result); // < v, v > } - if (Sse.IsSupported) + if (Sse2.IsSupported) { + // Treating the value as a set of singles and emitting MoveLowToHigh is more efficient than dealing with the elements directly as double. + // However, we still need to check if Sse2 is supported since CreateScalarUnsafe needs it for movsd when value is not already in a register + Vector128 result = CreateScalarUnsafe(value); // < v, ? > return Sse.MoveLowToHigh(result.AsSingle(), result.AsSingle()).AsDouble(); // < v, v > } @@ -123,6 +129,9 @@ namespace System.Runtime.Intrinsics if (Sse2.IsSupported) { + // We first unpack as ushort to duplicate value into the lower 2 words, then we can treat it as a uint and shuffle the lower dword to + // duplicate value across the entire result + Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ? > return Sse2.Shuffle(result.AsInt32(), 0x00).AsInt16(); // < v, v, v, v, v, v, v, v > @@ -237,6 +246,9 @@ namespace System.Runtime.Intrinsics if (Sse2.IsSupported) { + // We first unpack as bytes to duplicate value into the lower 2 bytes, then we treat it as a ushort and unpack again to duplicate those + // bits into the lower 2 words, we can finally treat it as a uint and shuffle the lower dword to duplicate value across the entire result + Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > result = Sse2.UnpackLow(result.AsInt16(), result.AsInt16()).AsSByte(); // < v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
> @@ -326,6 +338,9 @@ namespace System.Runtime.Intrinsics if (Sse2.IsSupported) { + // We first unpack as ushort to duplicate value into the lower 2 words, then we can treat it as a uint and shuffle the lower dword to + // duplicate value across the entire result + Vector128 result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ? > return Sse2.Shuffle(result.AsUInt32(), 0x00).AsUInt16(); // < v, v, v, v, v, v, v, v > @@ -439,44 +454,117 @@ namespace System.Runtime.Intrinsics /// The value that element 14 will be initialized to. /// The value that element 15 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector128 Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7, byte e8, byte e9, byte e10, byte e11, byte e12, byte e13, byte e14, byte e15) { - var pResult = stackalloc byte[16] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - e8, - e9, - e10, - e11, - e12, - e13, - e14, - e15, - }; + if (Sse41.IsSupported) + { + Vector128 result = CreateScalarUnsafe(e0); // < 0, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e1, 1); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ??, ??, ??, ??, ??, ??, ??, ??, ?? 
> + result = Sse41.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e8, 8); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e9, 9); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e10, 10); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e11, 11); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ??, ??, ??, ?? > + result = Sse41.Insert(result, e12, 12); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ??, ??, ?? > + result = Sse41.Insert(result, e13, 13); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ??, ?? > + result = Sse41.Insert(result, e14, 14); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ?? > + return Sse41.Insert(result, e15, 15); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + } - return Unsafe.AsRef>(pResult); + if (Sse2.IsSupported) + { + // We deal with the elements in order, unpacking the ordered pairs of bytes into vectors. We then treat those vectors as ushort and + // unpack them again, then again treating those results as uint, and a final time treating them as ulong. This efficiently gets all + // bytes ordered into the result. + + Vector128 lo16, hi16; + Vector128 lo32, hi32; + Vector128 lo64, hi64; + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsUInt16(); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsUInt16(); // < 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + lo32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e4), CreateScalarUnsafe(e5)).AsUInt16(); // < 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? 
> + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e6), CreateScalarUnsafe(e7)).AsUInt16(); // < 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo64 = Sse2.UnpackLow(lo32, hi32).AsUInt64(); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e8), CreateScalarUnsafe(e9)).AsUInt16(); // < 8, 9, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e10), CreateScalarUnsafe(e11)).AsUInt16(); // < 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + lo32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 8, 9, 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e12), CreateScalarUnsafe(e13)).AsUInt16(); // < 12, 13, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e14), CreateScalarUnsafe(e15)).AsUInt16(); // < 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + hi64 = Sse2.UnpackLow(lo32, hi32).AsUInt64(); // < 8, 9, 10, 11, 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ?? 
> + + return Sse2.UnpackLow(lo64, hi64).AsByte(); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + } + + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + + Vector128 SoftwareFallback(byte i0, byte i1, byte i2, byte i3, byte i4, byte i5, byte i6, byte i7, byte i8, byte i9, byte i10, byte i11, byte i12, byte i13, byte i14, byte i15) + { + var pResult = stackalloc byte[16] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + i8, + i9, + i10, + i11, + i12, + i13, + i14, + i15, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. /// The value that element 0 will be initialized to. /// The value that element 1 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector128 Create(double e0, double e1) { - var pResult = stackalloc double[2] + if (Sse2.IsSupported) { - e0, - e1, - }; + // Treating the value as a set of singles and emitting MoveLowToHigh is more efficient than dealing with the elements directly as double. + // However, we still need to check if Sse2 is supported since CreateScalarUnsafe needs it for movsd when value is not already in a register - return Unsafe.AsRef>(pResult); + return Sse.MoveLowToHigh(CreateScalarUnsafe(e0).AsSingle(), CreateScalarUnsafe(e1).AsSingle()).AsDouble(); + } + + return SoftwareFallback(e0, e1); + + Vector128 SoftwareFallback(double i0, double i1) + { + var pResult = stackalloc double[2] + { + i0, + i1, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -489,21 +577,39 @@ namespace System.Runtime.Intrinsics /// The value that element 6 will be initialized to. /// The value that element 7 will be initialized to. 
/// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector128 Create(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7) { - var pResult = stackalloc short[8] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - }; + if (Sse2.IsSupported) + { + Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ?, ?, ?, ?, ? > + result = Sse2.Insert(result, e1, 1); // < 0, 1, ?, ?, ?, ?, ?, ? > + result = Sse2.Insert(result, e2, 2); // < 0, 1, 2, ?, ?, ?, ?, ? > + result = Sse2.Insert(result, e3, 3); // < 0, 1, 2, 3, ?, ?, ?, ? > + result = Sse2.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ?, ?, ? > + result = Sse2.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ?, ? > + result = Sse2.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ? > + return Sse2.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7 > + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); + + Vector128 SoftwareFallback(short i0, short i1, short i2, short i3, short i4, short i5, short i6, short i7) + { + var pResult = stackalloc short[8] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -512,32 +618,74 @@ namespace System.Runtime.Intrinsics /// The value that element 2 will be initialized to. /// The value that element 3 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector128 Create(int e0, int e1, int e2, int e3) { - var pResult = stackalloc int[4] + if (Sse41.IsSupported) { - e0, - e1, - e2, - e3, - }; + Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ? > + result = Sse41.Insert(result, e1, 1); // < 0, 1, ?, ? > + result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ? 
> + return Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3 > + } - return Unsafe.AsRef>(pResult); + if (Sse2.IsSupported) + { + // We deal with the elements in order, unpacking the ordered pairs of int into vectors. We then treat those vectors as ulong and + // unpack them again. This efficiently gets all ints ordered into the result. + + Vector128 lo64, hi64; + lo64 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsInt64(); // < 0, 1, ?, ? > + hi64 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsInt64(); // < 2, 3, ?, ? > + return Sse2.UnpackLow(lo64, hi64).AsInt32(); // < 0, 1, 2, 3 > + } + + return SoftwareFallback(e0, e1, e2, e3); + + Vector128 SoftwareFallback(int i0, int i1, int i2, int i3) + { + var pResult = stackalloc int[4] + { + i0, + i1, + i2, + i3, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. /// The value that element 0 will be initialized to. /// The value that element 1 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector128 Create(long e0, long e1) { - var pResult = stackalloc long[2] + if (Sse41.X64.IsSupported) { - e0, - e1, - }; + Vector128 result = CreateScalarUnsafe(e0); // < 0, ? > + return Sse41.X64.Insert(result, e1, 1); // < 0, 1 > + } - return Unsafe.AsRef>(pResult); + if (Sse2.X64.IsSupported) + { + return Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)); // < 0, 1 > + } + + return SoftwareFallback(e0, e1); + + Vector128 SoftwareFallback(long i0, long i1) + { + var pResult = stackalloc long[2] + { + i0, + i1, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -558,30 +706,89 @@ namespace System.Runtime.Intrinsics /// The value that element 14 will be initialized to. 
/// The value that element 15 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector128 Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7, sbyte e8, sbyte e9, sbyte e10, sbyte e11, sbyte e12, sbyte e13, sbyte e14, sbyte e15) { - var pResult = stackalloc sbyte[16] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - e8, - e9, - e10, - e11, - e12, - e13, - e14, - e15, - }; + if (Sse41.IsSupported) + { + Vector128 result = CreateScalarUnsafe(e0); // < 0, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e1, 1); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e8, 8); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, ??, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e9, 9); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ??, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e10, 10); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ??, ??, ??, ??, ?? > + result = Sse41.Insert(result, e11, 11); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ??, ??, ??, ?? > + result = Sse41.Insert(result, e12, 12); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ??, ??, ?? 
> + result = Sse41.Insert(result, e13, 13); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ??, ?? > + result = Sse41.Insert(result, e14, 14); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ?? > + return Sse41.Insert(result, e15, 15); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + } - return Unsafe.AsRef>(pResult); + if (Sse2.IsSupported) + { + // We deal with the elements in order, unpacking the ordered pairs of bytes into vectors. We then treat those vectors as ushort and + // unpack them again, then again treating those results as uint, and a final time treating them as ulong. This efficiently gets all + // bytes ordered into the result. + + Vector128 lo16, hi16; + Vector128 lo32, hi32; + Vector128 lo64, hi64; + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsInt16(); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsInt16(); // < 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + lo32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e4), CreateScalarUnsafe(e5)).AsInt16(); // < 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e6), CreateScalarUnsafe(e7)).AsInt16(); // < 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo64 = Sse2.UnpackLow(lo32, hi32).AsInt64(); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e8), CreateScalarUnsafe(e9)).AsInt16(); // < 8, 9, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e10), CreateScalarUnsafe(e11)).AsInt16(); // < 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? 
> + lo32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 8, 9, 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e12), CreateScalarUnsafe(e13)).AsInt16(); // < 12, 13, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e14), CreateScalarUnsafe(e15)).AsInt16(); // < 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + hi32 = Sse2.UnpackLow(lo16, hi16).AsInt32(); // < 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > + + hi64 = Sse2.UnpackLow(lo32, hi32).AsInt64(); // < 8, 9, 10, 11, 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ?? > + + return Sse2.UnpackLow(lo64, hi64).AsSByte(); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + } + + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + + Vector128 SoftwareFallback(sbyte i0, sbyte i1, sbyte i2, sbyte i3, sbyte i4, sbyte i5, sbyte i6, sbyte i7, sbyte i8, sbyte i9, sbyte i10, sbyte i11, sbyte i12, sbyte i13, sbyte i14, sbyte i15) + { + var pResult = stackalloc sbyte[16] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + i8, + i9, + i10, + i11, + i12, + i13, + i14, + i15, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -590,17 +797,39 @@ namespace System.Runtime.Intrinsics /// The value that element 2 will be initialized to. /// The value that element 3 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector128 Create(float e0, float e1, float e2, float e3) { - var pResult = stackalloc float[4] + if (Sse41.IsSupported) { - e0, - e1, - e2, - e3, - }; + Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ? > + result = Sse41.Insert(result, CreateScalarUnsafe(e1), 0x10); // < 0, 1, ?, ? 
> + result = Sse41.Insert(result, CreateScalarUnsafe(e2), 0x20); // < 0, 1, 2, ? > + return Sse41.Insert(result, CreateScalarUnsafe(e3), 0x30); // < 0, 1, 2, 3 > + } - return Unsafe.AsRef>(pResult); + if (Sse.IsSupported) + { + Vector128 lo64, hi64; + lo64 = Sse.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)); // < 0, 1, ?, ? > + hi64 = Sse.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)); // < 2, 3, ?, ? > + return Sse.MoveLowToHigh(lo64, hi64); // < 0, 1, 2, 3 > + } + + return SoftwareFallback(e0, e1, e2, e3); + + Vector128 SoftwareFallback(float i0, float i1, float i2, float i3) + { + var pResult = stackalloc float[4] + { + i0, + i1, + i2, + i3, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -613,22 +842,40 @@ namespace System.Runtime.Intrinsics /// The value that element 6 will be initialized to. /// The value that element 7 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector128 Create(ushort e0, ushort e1, ushort e2, ushort e3, ushort e4, ushort e5, ushort e6, ushort e7) { - var pResult = stackalloc ushort[8] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - }; + if (Sse2.IsSupported) + { + Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ?, ?, ?, ?, ? > + result = Sse2.Insert(result, e1, 1); // < 0, 1, ?, ?, ?, ?, ?, ? > + result = Sse2.Insert(result, e2, 2); // < 0, 1, 2, ?, ?, ?, ?, ? > + result = Sse2.Insert(result, e3, 3); // < 0, 1, 2, 3, ?, ?, ?, ? > + result = Sse2.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ?, ?, ? > + result = Sse2.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ?, ? > + result = Sse2.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ? 
> + return Sse2.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7 > + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); + + Vector128 SoftwareFallback(ushort i0, ushort i1, ushort i2, ushort i3, ushort i4, ushort i5, ushort i6, ushort i7) + { + var pResult = stackalloc ushort[8] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -637,34 +884,76 @@ namespace System.Runtime.Intrinsics /// The value that element 2 will be initialized to. /// The value that element 3 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector128 Create(uint e0, uint e1, uint e2, uint e3) { - var pResult = stackalloc uint[4] + if (Sse41.IsSupported) { - e0, - e1, - e2, - e3, - }; + Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ? > + result = Sse41.Insert(result, e1, 1); // < 0, 1, ?, ? > + result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ? > + return Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3 > + } - return Unsafe.AsRef>(pResult); + if (Sse2.IsSupported) + { + // We deal with the elements in order, unpacking the ordered pairs of int into vectors. We then treat those vectors as ulong and + // unpack them again. This efficiently gets all ints ordered into the result. + + Vector128 lo64, hi64; + lo64 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsUInt64(); // < 0, 1, ?, ? > + hi64 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsUInt64(); // < 2, 3, ?, ? 
> + return Sse2.UnpackLow(lo64, hi64).AsUInt32(); // < 0, 1, 2, 3 > + } + + return SoftwareFallback(e0, e1, e2, e3); + + Vector128 SoftwareFallback(uint i0, uint i1, uint i2, uint i3) + { + var pResult = stackalloc uint[4] + { + i0, + i1, + i2, + i3, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. /// The value that element 0 will be initialized to. /// The value that element 1 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector128 Create(ulong e0, ulong e1) { - var pResult = stackalloc ulong[2] + if (Sse41.X64.IsSupported) { - e0, - e1, - }; + Vector128 result = CreateScalarUnsafe(e0); // < 0, ? > + return Sse41.X64.Insert(result, e1, 1); // < 0, 1 > + } - return Unsafe.AsRef>(pResult); + if (Sse2.X64.IsSupported) + { + return Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)); // < 0, 1 > + } + + return SoftwareFallback(e0, e1); + + Vector128 SoftwareFallback(ulong i0, ulong i1) + { + var pResult = stackalloc ulong[2] + { + i0, + i1, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance from two instances. @@ -829,6 +1118,8 @@ namespace System.Runtime.Intrinsics { if (Sse2.IsSupported) { + // ConvertScalarToVector128 only deals with 32/64-bit inputs and we need to ensure all upper-bits are zeroed, so we call + // the UInt32 overload to ensure zero extension. We can then just treat the result as byte and return. return Sse2.ConvertScalarToVector128UInt32(value).AsByte(); } @@ -871,6 +1162,8 @@ namespace System.Runtime.Intrinsics { if (Sse2.IsSupported) { + // ConvertScalarToVector128 only deals with 32/64-bit inputs and we need to ensure all upper-bits are zeroed, so we cast + // to ushort and call the UInt32 overload to ensure zero extension. 
We can then just treat the result as short and return. return Sse2.ConvertScalarToVector128UInt32((ushort)(value)).AsInt16(); } @@ -934,7 +1227,8 @@ namespace System.Runtime.Intrinsics { if (Sse2.IsSupported) { - // Convert to byte so that we zero-extend, rather than sign-extend + // ConvertScalarToVector128 only deals with 32/64-bit inputs and we need to ensure all upper-bits are zeroed, so we cast + // to byte and call the UInt32 overload to ensure zero extension. We can then just treat the result as sbyte and return. return Sse2.ConvertScalarToVector128UInt32((byte)(value)).AsSByte(); } @@ -978,6 +1272,8 @@ namespace System.Runtime.Intrinsics { if (Sse2.IsSupported) { + // ConvertScalarToVector128 only deals with 32/64-bit inputs and we need to ensure all upper-bits are zeroed, so we call + // the UInt32 overload to ensure zero extension. We can then just treat the result as ushort and return. return Sse2.ConvertScalarToVector128UInt32(value).AsUInt16(); } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 34066a5..f1f7078 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -497,45 +497,58 @@ namespace System.Runtime.Intrinsics /// The value that element 30 will be initialized to. /// The value that element 31 will be initialized to. /// A new with each element initialized to corresponding specified value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7, byte e8, byte e9, byte e10, byte e11, byte e12, byte e13, byte e14, byte e15, byte e16, byte e17, byte e18, byte e19, byte e20, byte e21, byte e22, byte e23, byte e24, byte e25, byte e26, byte e27, byte e28, byte e29, byte e30, byte e31) { - var pResult = stackalloc byte[32] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - e8, - e9, - e10, - e11, - e12, - e13, - e14, - e15, - e16, - e17, - e18, - e19, - e20, - e21, - e22, - e23, - e24, - e25, - e26, - e27, - e28, - e29, - e30, - e31, - }; + if (Avx.IsSupported) + { + Vector128 lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + Vector128 hi128 = Vector128.Create(e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31); + + Vector256 SoftwareFallback(byte i0, byte i1, byte i2, byte i3, byte i4, byte i5, byte i6, byte i7, byte i8, byte i9, byte i10, byte i11, byte i12, byte i13, byte i14, byte i15, byte i16, byte i17, byte i18, byte i19, byte i20, byte i21, byte i22, byte i23, byte i24, byte i25, byte i26, byte i27, byte i28, byte i29, byte i30, byte i31) + { + var pResult = stackalloc byte[32] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + i8, + i9, + i10, + i11, + i12, + i13, + i14, + i15, + i16, + i17, + i18, + i19, + i20, + i21, + i22, + i23, + i24, + i25, + i26, + i27, + i28, + i29, + i30, + i31, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. 
@@ -544,17 +557,30 @@ namespace System.Runtime.Intrinsics /// The value that element 2 will be initialized to. /// The value that element 3 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(double e0, double e1, double e2, double e3) { - var pResult = stackalloc double[4] + if (Avx.IsSupported) { - e0, - e1, - e2, - e3, - }; + Vector128 lo128 = Vector128.Create(e0, e1); + Vector128 hi128 = Vector128.Create(e2, e3); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3); + + Vector256 SoftwareFallback(double i0, double i1, double i2, double i3) + { + var pResult = stackalloc double[4] + { + i0, + i1, + i2, + i3, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -575,29 +601,42 @@ namespace System.Runtime.Intrinsics /// The value that element 14 will be initialized to. /// The value that element 15 will be initialized to. /// A new with each element initialized to corresponding specified value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15) { - var pResult = stackalloc short[16] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - e8, - e9, - e10, - e11, - e12, - e13, - e14, - e15, - }; + if (Avx.IsSupported) + { + Vector128 lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7); + Vector128 hi128 = Vector128.Create(e8, e9, e10, e11, e12, e13, e14, e15); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + + Vector256 SoftwareFallback(short i0, short i1, short i2, short i3, short i4, short i5, short i6, short i7, short i8, short i9, short i10, short i11, short i12, short i13, short i14, short i15) + { + var pResult = stackalloc short[16] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + i8, + i9, + i10, + i11, + i12, + i13, + i14, + i15, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -610,21 +649,34 @@ namespace System.Runtime.Intrinsics /// The value that element 6 will be initialized to. /// The value that element 7 will be initialized to. /// A new with each element initialized to corresponding specified value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(int e0, int e1, int e2, int e3, int e4, int e5, int e6, int e7) { - var pResult = stackalloc int[8] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - }; + if (Avx.IsSupported) + { + Vector128 lo128 = Vector128.Create(e0, e1, e2, e3); + Vector128 hi128 = Vector128.Create(e4, e5, e6, e7); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); + + Vector256 SoftwareFallback(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7) + { + var pResult = stackalloc int[8] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -633,17 +685,30 @@ namespace System.Runtime.Intrinsics /// The value that element 2 will be initialized to. /// The value that element 3 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(long e0, long e1, long e2, long e3) { - var pResult = stackalloc long[4] + if (Sse2.X64.IsSupported && Avx.IsSupported) { - e0, - e1, - e2, - e3, - }; + Vector128 lo128 = Vector128.Create(e0, e1); + Vector128 hi128 = Vector128.Create(e2, e3); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3); + + Vector256 SoftwareFallback(long i0, long i1, long i2, long i3) + { + var pResult = stackalloc long[4] + { + i0, + i1, + i2, + i3, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -680,46 +745,59 @@ namespace System.Runtime.Intrinsics /// The value that element 30 will be initialized to. /// The value that element 31 will be initialized to. 
/// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7, sbyte e8, sbyte e9, sbyte e10, sbyte e11, sbyte e12, sbyte e13, sbyte e14, sbyte e15, sbyte e16, sbyte e17, sbyte e18, sbyte e19, sbyte e20, sbyte e21, sbyte e22, sbyte e23, sbyte e24, sbyte e25, sbyte e26, sbyte e27, sbyte e28, sbyte e29, sbyte e30, sbyte e31) { - var pResult = stackalloc sbyte[32] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - e8, - e9, - e10, - e11, - e12, - e13, - e14, - e15, - e16, - e17, - e18, - e19, - e20, - e21, - e22, - e23, - e24, - e25, - e26, - e27, - e28, - e29, - e30, - e31, - }; + if (Avx.IsSupported) + { + Vector128 lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + Vector128 hi128 = Vector128.Create(e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31); + + Vector256 SoftwareFallback(sbyte i0, sbyte i1, sbyte i2, sbyte i3, sbyte i4, sbyte i5, sbyte i6, sbyte i7, sbyte i8, sbyte i9, sbyte i10, sbyte i11, sbyte i12, sbyte i13, sbyte i14, sbyte i15, sbyte i16, sbyte i17, sbyte i18, sbyte i19, sbyte i20, sbyte i21, sbyte i22, sbyte i23, sbyte i24, sbyte i25, sbyte i26, sbyte i27, sbyte i28, sbyte i29, sbyte i30, sbyte i31) + { + var pResult = stackalloc sbyte[32] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + i8, + i9, + i10, + i11, + i12, + i13, + i14, + i15, + i16, + i17, + i18, + i19, + i20, + i21, + i22, + i23, + i24, + i25, + i26, + i27, + i28, + i29, + i30, + i31, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with 
each element initialized to the corresponding specified value. @@ -732,21 +810,34 @@ namespace System.Runtime.Intrinsics /// The value that element 6 will be initialized to. /// The value that element 7 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(float e0, float e1, float e2, float e3, float e4, float e5, float e6, float e7) { - var pResult = stackalloc float[8] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - }; + if (Avx.IsSupported) + { + Vector128 lo128 = Vector128.Create(e0, e1, e2, e3); + Vector128 hi128 = Vector128.Create(e4, e5, e6, e7); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); + + Vector256 SoftwareFallback(float i0, float i1, float i2, float i3, float i4, float i5, float i6, float i7) + { + var pResult = stackalloc float[8] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -767,30 +858,43 @@ namespace System.Runtime.Intrinsics /// The value that element 14 will be initialized to. /// The value that element 15 will be initialized to. /// A new with each element initialized to corresponding specified value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(ushort e0, ushort e1, ushort e2, ushort e3, ushort e4, ushort e5, ushort e6, ushort e7, ushort e8, ushort e9, ushort e10, ushort e11, ushort e12, ushort e13, ushort e14, ushort e15) { - var pResult = stackalloc ushort[16] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - e8, - e9, - e10, - e11, - e12, - e13, - e14, - e15, - }; + if (Avx.IsSupported) + { + Vector128 lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7); + Vector128 hi128 = Vector128.Create(e8, e9, e10, e11, e12, e13, e14, e15); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); + + Vector256 SoftwareFallback(ushort i0, ushort i1, ushort i2, ushort i3, ushort i4, ushort i5, ushort i6, ushort i7, ushort i8, ushort i9, ushort i10, ushort i11, ushort i12, ushort i13, ushort i14, ushort i15) + { + var pResult = stackalloc ushort[16] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + i8, + i9, + i10, + i11, + i12, + i13, + i14, + i15, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -803,22 +907,35 @@ namespace System.Runtime.Intrinsics /// The value that element 6 will be initialized to. /// The value that element 7 will be initialized to. /// A new with each element initialized to corresponding specified value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(uint e0, uint e1, uint e2, uint e3, uint e4, uint e5, uint e6, uint e7) { - var pResult = stackalloc uint[8] - { - e0, - e1, - e2, - e3, - e4, - e5, - e6, - e7, - }; + if (Avx.IsSupported) + { + Vector128 lo128 = Vector128.Create(e0, e1, e2, e3); + Vector128 hi128 = Vector128.Create(e4, e5, e6, e7); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); + + Vector256 SoftwareFallback(uint i0, uint i1, uint i2, uint i3, uint i4, uint i5, uint i6, uint i7) + { + var pResult = stackalloc uint[8] + { + i0, + i1, + i2, + i3, + i4, + i5, + i6, + i7, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance with each element initialized to the corresponding specified value. @@ -827,172 +944,305 @@ namespace System.Runtime.Intrinsics /// The value that element 2 will be initialized to. /// The value that element 3 will be initialized to. /// A new with each element initialized to corresponding specified value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(ulong e0, ulong e1, ulong e2, ulong e3) { - var pResult = stackalloc ulong[4] + if (Sse2.X64.IsSupported && Avx.IsSupported) { - e0, - e1, - e2, - e3, - }; + Vector128 lo128 = Vector128.Create(e0, e1); + Vector128 hi128 = Vector128.Create(e2, e3); + return Create(lo128, hi128); + } - return Unsafe.AsRef>(pResult); + return SoftwareFallback(e0, e1, e2, e3); + + Vector256 SoftwareFallback(ulong i0, ulong i1, ulong i2, ulong i3) + { + var pResult = stackalloc ulong[4] + { + i0, + i1, + i2, + i3, + }; + + return Unsafe.AsRef>(pResult); + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } + + return SoftwareFallback(lower, upper); + + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; - return result256; + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } + + return SoftwareFallback(lower, upper); - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; + + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; - return result256; + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } + + return SoftwareFallback(lower, upper); + + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; - return result256; + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + return SoftwareFallback(lower, upper); - return result256; + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; + + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; + + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + return SoftwareFallback(lower, upper); - return result256; + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; + + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; + + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + return SoftwareFallback(lower, upper); + + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; - return result256; + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; + + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } + + return SoftwareFallback(lower, upper); + + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; - return result256; + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } + + return SoftwareFallback(lower, upper); - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; - return result256; + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; + + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + return SoftwareFallback(lower, upper); - return result256; + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; + + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; + + return result256; + } } /// Creates a new instance from two instances. /// The value that the lower 128-bits will be initialized to. /// The value that the upper 128-bits will be initialized to. /// A new initialized from and . + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CLSCompliant(false)] public static unsafe Vector256 Create(Vector128 lower, Vector128 upper) { - Vector256 result256 = Vector256.Zero; + if (Avx.IsSupported) + { + Vector256 result = lower.ToVector256Unsafe(); + return result.WithUpper(upper); + } + + return SoftwareFallback(lower, upper); - ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); - result128 = lower; - Unsafe.Add(ref result128, 1) = upper; + Vector256 SoftwareFallback(Vector128 x, Vector128 y) + { + Vector256 result256 = Vector256.Zero; + + ref Vector128 result128 = ref Unsafe.As, Vector128>(ref result256); + result128 = x; + Unsafe.Add(ref result128, 1) = y; - return result256; + return result256; + } } /// Creates a new instance with the first element initialized to the specified value and the remaining elements initialized to zero. 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs index 6c7a108..44f2928 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs @@ -315,11 +315,14 @@ namespace System.Runtime.Intrinsics if (Avx2.IsSupported && ((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)))) { + // All integral types generate the same instruction, so just pick one rather than handling each T separately return Avx2.InsertVector128(AsByte(), value.AsByte(), 0).As(); } if (Avx.IsSupported) { + // All floating-point types generate the same instruction, so just pick one rather than handling each T separately + // We also just fallback to this for integral types if AVX2 isn't supported, since that is still faster than software return Avx.InsertVector128(AsSingle(), value.AsSingle(), 0).As(); } @@ -344,11 +347,14 @@ namespace System.Runtime.Intrinsics if (Avx2.IsSupported && ((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)))) { + // All integral types generate the same instruction, so just pick one rather than handling each T separately return Avx2.ExtractVector128(AsByte(), 1).As(); } if (Avx.IsSupported) { + // All floating-point types generate the same instruction, so just pick one rather than handling each T separately + // We also just fallback to this for integral types if AVX2 isn't supported, since that is still faster than software return Avx.ExtractVector128(AsSingle(), 1).As(); } @@ -373,11 +379,14 @@ namespace System.Runtime.Intrinsics if (Avx2.IsSupported && ((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)))) { + // All integral types generate the same instruction, so just pick one rather than handling each T separately return Avx2.InsertVector128(AsByte(), value.AsByte(), 1).As(); } if 
(Avx.IsSupported) { + // All floating-point types generate the same instruction, so just pick one rather than handling each T separately + // We also just fallback to this for integral types if AVX2 isn't supported, since that is still faster than software return Avx.InsertVector128(AsSingle(), value.AsSingle(), 1).As(); } -- 2.7.4