HARDWARE_INTRINSIC(AVX_GetLowerHalf, "GetLowerHalf", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_HorizontalAdd, "HorizontalAdd", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_HorizontalSubtract, "HorizontalSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_Insert, "Insert", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoCodeGen|HW_Flag_SecondArgMaybe64Bit)
HARDWARE_INTRINSIC(AVX_InsertVector128, "InsertVector128", AVX, -1, 32, 3, {INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128, INS_vinsertf128},HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_OneTypeGeneric|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
return retNode;
}
+//------------------------------------------------------------------------
+// normalizeAndGetHalfIndex: compute the half index of a Vector256<baseType>
+// and normalize the index to the specific range
+//
+// Arguments:
+// indexPtr -- OUT paramter, the pointer to the original index value
+// baseType -- the base type of the Vector256<T>
+//
+// Return Value:
+// retuen the middle index of a Vector256<baseType>
+// return the normalized index via indexPtr
+//
+static int normalizeAndGetHalfIndex(int* indexPtr, var_types baseType)
+{
+ assert(varTypeIsArithmetic(baseType));
+ // clear the unused bits to normalize the index into the range of [0, length of Vector256<baseType>)
+ *indexPtr = (*indexPtr) & (32 / genTypeSize(baseType) - 1);
+ return (16 / genTypeSize(baseType));
+}
+
GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic,
CORINFO_METHOD_HANDLE method,
CORINFO_SIG_INFO* sig,
switch (intrinsic)
{
+ case NI_AVX_Extract:
+ {
+ // Avx.Extract executes software implementation when the imm8 argument is not compile-time constant
+ assert(!mustExpand);
+
+ GenTree* lastOp = impPopStack().val;
+ GenTree* vectorOp = impSIMDPopStack(TYP_SIMD32);
+ assert(lastOp->IsCnsIntOrI());
+ int ival = (int)lastOp->AsIntCon()->IconValue();
+ baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
+ var_types retType = JITtype2varType(sig->retType);
+ assert(varTypeIsArithmetic(baseType));
+
+ int midIndex = normalizeAndGetHalfIndex(&ival, baseType);
+ NamedIntrinsic extractIntrinsic = varTypeIsShort(baseType) ? NI_SSE2_Extract : NI_SSE41_Extract;
+ GenTree* half = nullptr;
+
+ if (ival >= halfIndex)
+ {
+ half = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1), NI_AVX_ExtractVector128,
+ baseType, 32);
+ ival -= halfIndex;
+ }
+ else
+ {
+ half = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, NI_AVX_GetLowerHalf, baseType, 32);
+ }
+
+ retNode = gtNewSimdHWIntrinsicNode(retType, half, gtNewIconNode(ival), extractIntrinsic, baseType, 16);
+ break;
+ }
+
+ case NI_AVX_Insert:
+ {
+ // Avx.Extract executes software implementation when the imm8 argument is not compile-time constant
+ assert(!mustExpand);
+
+ GenTree* lastOp = impPopStack().val;
+ GenTree* dataOp = impPopStack().val;
+ GenTree* vectorOp = impSIMDPopStack(TYP_SIMD32);
+ assert(lastOp->IsCnsIntOrI());
+ int ival = (int)lastOp->AsIntCon()->IconValue();
+ baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
+ assert(varTypeIsArithmetic(baseType));
+
+ int midIndex = normalizeAndGetHalfIndex(&ival, baseType);
+ NamedIntrinsic insertIntrinsic = varTypeIsShort(baseType) ? NI_SSE2_Insert : NI_SSE41_Insert;
+
+ GenTree* clonedVectorOp;
+ vectorOp =
+ impCloneExpr(vectorOp, &clonedVectorOp, info.compCompHnd->getArgClass(sig, sig->args),
+ (unsigned)CHECK_SPILL_ALL, nullptr DEBUGARG("AVX Insert clones the vector operand"));
+
+ if (ival >= halfIndex)
+ {
+ GenTree* halfVector = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1),
+ NI_AVX_ExtractVector128, baseType, 32);
+ GenTree* ModifiedHalfVector =
+ gtNewSimdHWIntrinsicNode(TYP_SIMD16, halfVector, dataOp, gtNewIconNode(ival - halfIndex),
+ insertIntrinsic, baseType, 16);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, clonedVectorOp, ModifiedHalfVector, gtNewIconNode(1),
+ NI_AVX_InsertVector128, baseType, 32);
+ }
+ else
+ {
+ GenTree* halfVector = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, NI_AVX_GetLowerHalf, baseType, 32);
+ GenTree* ModifiedHalfVector =
+ gtNewSimdHWIntrinsicNode(TYP_SIMD32, halfVector, dataOp, gtNewIconNode(ival), insertIntrinsic,
+ baseType, 16);
+ retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, clonedVectorOp, ModifiedHalfVector, gtNewIconNode(15),
+ NI_AVX_Blend, TYP_FLOAT, 32);
+ }
+ break;
+ }
+
case NI_AVX_ExtractVector128:
case NI_AVX2_ExtractVector128:
{
/// __int8 _mm256_extract_epi8 (__m256i a, const int index)
/// HELPER
/// </summary>
public static sbyte Extract(Vector256<sbyte> value, byte index)
{
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0x1F; // 32 sbyte elements -> only the low 5 bits select one
        sbyte* buffer = stackalloc sbyte[32];
        Store(buffer, value);
        return buffer[index];
    }
}
/// __int8 _mm256_extract_epi8 (__m256i a, const int index)
/// HELPER
/// </summary>
public static byte Extract(Vector256<byte> value, byte index)
{
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0x1F; // 32 byte elements -> only the low 5 bits select one
        byte* buffer = stackalloc byte[32];
        Store(buffer, value);
        return buffer[index];
    }
}
/// __int16 _mm256_extract_epi16 (__m256i a, const int index)
/// HELPER
/// </summary>
public static short Extract(Vector256<short> value, byte index)
{
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0xF; // 16 short elements -> only the low 4 bits select one
        short* buffer = stackalloc short[16];
        Store(buffer, value);
        return buffer[index];
    }
}
/// <summary>
/// __int16 _mm256_extract_epi16 (__m256i a, const int index)
/// HELPER
/// </summary>
public static ushort Extract(Vector256<ushort> value, byte index)
{
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0xF; // 16 ushort elements -> only the low 4 bits select one
        ushort* buffer = stackalloc ushort[16];
        Store(buffer, value);
        return buffer[index];
    }
}
/// __int32 _mm256_extract_epi32 (__m256i a, const int index)
/// HELPER
/// </summary>
public static int Extract(Vector256<int> value, byte index)
{
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0x7; // 8 int elements -> only the low 3 bits select one
        int* buffer = stackalloc int[8];
        Store(buffer, value);
        return buffer[index];
    }
}
/// __int32 _mm256_extract_epi32 (__m256i a, const int index)
/// HELPER
/// </summary>
public static uint Extract(Vector256<uint> value, byte index)
{
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0x7; // 8 uint elements -> only the low 3 bits select one
        uint* buffer = stackalloc uint[8];
        Store(buffer, value);
        return buffer[index];
    }
}
/// __int64 _mm256_extract_epi64 (__m256i a, const int index)
/// HELPER
/// </summary>
public static long Extract(Vector256<long> value, byte index)
{
    // The 64-bit-element helper is only available on 64-bit processes.
    if (IntPtr.Size != 8)
    {
        throw new PlatformNotSupportedException();
    }
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0x3; // 4 long elements -> only the low 2 bits select one
        long* buffer = stackalloc long[4];
        Store(buffer, value);
        return buffer[index];
    }
}
/// __int64 _mm256_extract_epi64 (__m256i a, const int index)
/// HELPER
/// </summary>
public static ulong Extract(Vector256<ulong> value, byte index)
{
    // The 64-bit-element helper is only available on 64-bit processes.
    if (IntPtr.Size != 8)
    {
        throw new PlatformNotSupportedException();
    }
    // Software fallback: spill the vector to the stack and index the element.
    unsafe
    {
        index &= 0x3; // 4 ulong elements -> only the low 2 bits select one
        ulong* buffer = stackalloc ulong[4];
        Store(buffer, value);
        return buffer[index];
    }
}
/// __m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
/// HELPER
/// </summary>
public static Vector256<sbyte> Insert(Vector256<sbyte> value, sbyte data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    unsafe
    {
        index &= 0x1F; // 32 sbyte elements -> only the low 5 bits select one
        sbyte* buffer = stackalloc sbyte[32];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}
/// __m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
/// HELPER
/// </summary>
public static Vector256<byte> Insert(Vector256<byte> value, byte data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    unsafe
    {
        index &= 0x1F; // 32 byte elements -> only the low 5 bits select one
        byte* buffer = stackalloc byte[32];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}
/// __m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
/// HELPER
/// </summary>
public static Vector256<short> Insert(Vector256<short> value, short data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    unsafe
    {
        index &= 0xF; // 16 short elements -> only the low 4 bits select one
        short* buffer = stackalloc short[16];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}
/// __m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
/// HELPER
/// </summary>
public static Vector256<ushort> Insert(Vector256<ushort> value, ushort data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    unsafe
    {
        index &= 0xF; // 16 ushort elements -> only the low 4 bits select one
        ushort* buffer = stackalloc ushort[16];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}
/// __m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
/// HELPER
/// </summary>
public static Vector256<int> Insert(Vector256<int> value, int data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    unsafe
    {
        index &= 0x7; // 8 int elements -> only the low 3 bits select one
        int* buffer = stackalloc int[8];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}
/// __m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
/// HELPER
/// </summary>
public static Vector256<uint> Insert(Vector256<uint> value, uint data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    unsafe
    {
        index &= 0x7; // 8 uint elements -> only the low 3 bits select one
        uint* buffer = stackalloc uint[8];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}
/// __m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
/// HELPER
/// </summary>
public static Vector256<long> Insert(Vector256<long> value, long data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    // NOTE(review): unlike Extract(Vector256<long>), this helper does not
    // throw PlatformNotSupportedException on 32-bit -- confirm the asymmetry
    // is intentional.
    unsafe
    {
        index &= 0x3; // 4 long elements -> only the low 2 bits select one
        long* buffer = stackalloc long[4];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}
/// __m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
/// HELPER
/// </summary>
public static Vector256<ulong> Insert(Vector256<ulong> value, ulong data, byte index)
{
    // Software fallback: spill, overwrite the selected element, reload.
    // NOTE(review): unlike Extract(Vector256<ulong>), this helper does not
    // throw PlatformNotSupportedException on 32-bit -- confirm the asymmetry
    // is intentional.
    unsafe
    {
        index &= 0x3; // 4 ulong elements -> only the low 2 bits select one
        ulong* buffer = stackalloc ulong[4];
        Store(buffer, value);
        buffer[index] = data;
        return LoadVector256(buffer);
    }
}