* Adding Vector512.Shuffle().
* Fixing a Vector256.Shuffle() bug and simplifying the zeroing logic by using AND instead of ConditionalSelect.
* Adding a regression test.
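
The ConditionalSelect-to-AND simplification works because selecting against an all-zero "false" operand is just a bitwise AND with the mask. A minimal sketch of the equivalence (illustrative values only, not JIT code):

```csharp
using System;
using System.Runtime.Intrinsics;

// A shuffle result whose lanes 2 and 6 came from out-of-range indices and must end up zero.
Vector256<int> shuffled = Vector256.Create(107, 105, 999, 101, 106, 104, 999, 100);
// All-ones lanes keep their value; all-zero lanes are forced to 0.
Vector256<int> mask = Vector256.Create(-1, -1, 0, -1, -1, -1, 0, -1);

Vector256<int> viaCndSel = Vector256.ConditionalSelect(mask, shuffled, Vector256<int>.Zero);
Vector256<int> viaAnd = mask & shuffled;

Console.WriteLine(viaCndSel == viaAnd); // True: select-against-zero is just AND
```

The AND form also drops the zero constant, so the JIT builds one binary node instead of a three-operand select.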
case INS_psadbw:
case INS_vpermps:
case INS_vpermpd:
+ case INS_vpermpd_reg:
case INS_vpermd:
case INS_vpermq:
+ case INS_vpermq_reg:
case INS_vperm2i128:
case INS_vperm2f128:
case INS_vextractf128:
result.insLatency += PERFSCORE_LATENCY_3C;
break;
+ case INS_vpermw:
+ result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+ result.insLatency += PERFSCORE_LATENCY_6C;
+ break;
+
case INS_pextrb:
case INS_pextrd:
case INS_pextrw:
#if defined(TARGET_XARCH)
uint8_t control = 0;
bool crossLane = false;
- bool needsZero = varTypeIsSmallInt(simdBaseType);
+ bool needsZero = varTypeIsSmallInt(simdBaseType) && (simdSize != 64);
uint64_t value = 0;
simd_t vecCns = {};
simd_t mskCns = {};
retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_AVX2_Permute4x64, simdBaseJitType, simdSize);
}
}
+ else if (simdSize == 64)
+ {
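+ // vecCns holds byte-granularity indices; convert them to one index per element, the form the variable permutes expect.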
+ if (elementSize == 4)
+ {
+ for (uint32_t i = 0; i < elementCount; i++)
+ {
+ vecCns.u32[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize);
+ }
+
+ op2 = gtNewVconNode(type);
+ op2->AsVecCon()->gtSimdVal = vecCns;
+
+ // swap the operands to match the encoding requirements
+ retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512F_PermuteVar16x32, simdBaseJitType, simdSize);
+ }
+ else if (elementSize == 2)
+ {
+ for (uint32_t i = 0; i < elementCount; i++)
+ {
+ vecCns.u16[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize);
+ }
+
+ op2 = gtNewVconNode(type);
+ op2->AsVecCon()->gtSimdVal = vecCns;
+
+ // swap the operands to match the encoding requirements
+ retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512BW_PermuteVar32x16, simdBaseJitType, simdSize);
+ }
+ else
+ {
+ assert(elementSize == 8);
+
+ for (uint32_t i = 0; i < elementCount; i++)
+ {
+ vecCns.u64[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize);
+ }
+
+ op2 = gtNewVconNode(type);
+ op2->AsVecCon()->gtSimdVal = vecCns;
+
+ // swap the operands to match the encoding requirements
+ retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512F_PermuteVar8x64, simdBaseJitType, simdSize);
+ }
+ assert(retNode != nullptr);
+
+ // TODO-XArch-AVX512: Switch to VPERMI2*
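+ // The variable permutes wrap out-of-range indices rather than zeroing them, so AND with the precomputed mask to zero those lanes.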
+ if (needsZero)
+ {
+ op2 = gtNewVconNode(type);
+ op2->AsVecCon()->gtSimdVal = mskCns;
+ retNode = gtNewSimdBinOpNode(GT_AND, type, op2, retNode, simdBaseJitType, simdSize);
+ }
+
+ return retNode;
+ }
else
{
if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
if (needsZero)
{
- assert(!compIsaSupportedDebugOnly(InstructionSet_SSSE3));
+ assert((simdSize == 32) || !compIsaSupportedDebugOnly(InstructionSet_SSSE3));

op2 = gtNewVconNode(type);
- op2->AsVecCon()->gtSimd16Val = mskCns.v128[0];
-
- GenTree* zero = gtNewZeroConNode(type);
- retNode = gtNewSimdCndSelNode(type, op2, retNode, zero, simdBaseJitType, simdSize);
+ op2->AsVecCon()->gtSimdVal = mskCns;
+ retNode = gtNewSimdBinOpNode(GT_AND, type, op2, retNode, simdBaseJitType, simdSize);
}
return retNode;
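
The element-index vectors fed to these permutes are derived from the byte-level control that gtNewSimdShuffleNode has already computed: each element's bytes hold `index * elementSize + offset`, so dividing the first byte of an element by the element size recovers the element index. A scalar sketch of that normalization for 4-byte elements (illustrative arrays, not the JIT's simd_t):

```csharp
using System;

const int ElementSize = 4; // 4-byte (int) elements
// Byte-level shuffle control: element index 5 expands to bytes {20,21,22,23}, index 1 to {4,5,6,7}.
byte[] byteControl = { 20, 21, 22, 23, 4, 5, 6, 7 };
var elementControl = new uint[byteControl.Length / ElementSize];
for (int i = 0; i < elementControl.Length; i++)
{
    // The first byte of each element, divided by the element size, is the element index.
    elementControl[i] = (uint)(byteControl[i * ElementSize] / ElementSize);
}
Console.WriteLine(string.Join(", ", elementControl)); // 5, 1
```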
HARDWARE_INTRINSIC(AVX512F, Multiply, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX512F, MultiplyLow, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512F, PermuteVar8x64, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq_reg, INS_vpermq_reg, INS_invalid, INS_vpermpd_reg}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX512F, PermuteVar16x32, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermd, INS_vpermd, INS_invalid, INS_invalid, INS_vpermps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AVX512F, ShiftLeftLogical, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512F, ShiftRightArithmetic, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_psrad, INS_invalid, INS_vpsraq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512F, ShiftRightLogical, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512BW, MultiplyLow, 64, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX512BW, PackSignedSaturate, 64, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512BW, PackUnsignedSaturate, 64, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX512BW, PermuteVar32x16, 64, 2, {INS_invalid, INS_invalid, INS_vpermw, INS_vpermw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AVX512BW, ShiftLeftLogical, 64, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512BW, ShiftLeftLogical128BitLane, 64, 2, {INS_pslldq, INS_pslldq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512BW, ShiftRightArithmetic, 64, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
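
For reference, the 512-bit forms of these permutes select each destination element using only the low bits of the corresponding control element (four bits for vpermd's 16 dwords, five for vpermw's 32 words), so out-of-range managed indices wrap instead of producing zero; that is why the importer still ANDs with a zeroing mask afterwards. A scalar model of 512-bit vpermd under that reading of the instruction semantics:

```csharp
using System;

int[] source = new int[16];
for (int i = 0; i < 16; i++) source[i] = 100 + i;

// Same indices as the Vector512 regression test below; 99 and -11 are out of range.
int[] control = { 15, 13, 11, 99, 7, 5, 3, 1, 14, 12, 10, 8, -11, 4, 2, 0 };
int[] result = PermuteVar16x32(source, control);

// Lanes 3 and 12 come out as 103 (99 & 15 == 3) and 105 (-11 & 15 == 5),
// where Vector512.Shuffle requires 0, hence the follow-up AND mask.
Console.WriteLine(string.Join(", ", result));

// Scalar model of 512-bit vpermd: result[i] = source[control[i] & 15].
static int[] PermuteVar16x32(int[] source, int[] control)
{
    var result = new int[16];
    for (int i = 0; i < 16; i++)
        result[i] = source[control[i] & 15];
    return result;
}
```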
case NI_Vector128_Shuffle:
case NI_Vector256_Shuffle:
+ case NI_Vector512_Shuffle:
{
assert((sig->numArgs == 2) || (sig->numArgs == 3));
- assert((simdSize == 16) || (simdSize == 32));
+ assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));
GenTree* indices = impStackTop(0).val;
}
}
}
+ else if (simdSize == 64)
+ {
+ if (varTypeIsByte(simdBaseType))
+ {
+ // TYP_BYTE, TYP_UBYTE would need vpermb (AVX512_VBMI); fall back to the software implementation.
+ break;
+ }
+ }
else
{
assert(simdSize == 16);
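
Byte-element shuffles of Vector512 would need vpermb from AVX512_VBMI, which this change does not wire up, so the importer breaks out and such calls take the managed software fallback with identical semantics. For example:

```csharp
using System;
using System.Runtime.Intrinsics;

// A byte-element Vector512 shuffle: not accelerated by this change (needs vpermb),
// so it runs through the software fallback.
Vector512<byte> data = Vector512.Create((byte)42);
Vector512<byte> indices = Vector512<byte>.Zero; // select element 0 into every lane
Console.WriteLine(Vector512.Shuffle(data, indices));
```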
INST3(vpabsq, "pabsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // Packed absolute value of 64-bit integers
INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
+INST3(vpermq_reg, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit elements of input register
+INST3(vpermpd_reg, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit elements of input register
INST3(vpmaxsq, "pmaxsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit signed integers
INST3(vpmaxuq, "pmaxuq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit unsigned integers
INST3(vpminsq, "pminsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 64-bit signed integers
INST3(kortestq, "kortestq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpermw, "permw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Word Elements
INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
--- /dev/null
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using Xunit;
+
+public class Program
+{
+ [Fact]
+ public static int TestEntryPoint()
+ {
+ Vector256<int> v256Shuffle = Vector256.Create(100, 101, 102, 103, 104, 105, 106, 107);
+ Vector256<int> v256ShuffleExpectedResult = Vector256.Create(107, 105, 0, 101, 106, 104, 0, 100);
+ Vector256<int> v256ShuffleActualResult = Vector256Shuffle(v256Shuffle);
+ if (v256ShuffleExpectedResult != v256ShuffleActualResult)
+ {
+ return 1;
+ }
+
+ Vector512<int> v512Shuffle = Vector512.Create(100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115);
+ Vector512<int> v512ShuffleExpectedResult = Vector512.Create(115, 113, 111, 0, 107, 105, 103, 101, 114, 112, 110, 108, 0, 104, 102, 100);
+ Vector512<int> v512ShuffleActualResult = Vector512Shuffle(v512Shuffle);
+ if (v512ShuffleExpectedResult != v512ShuffleActualResult)
+ {
+ return 1;
+ }
+ return 100;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static Vector256<int> Vector256Shuffle(Vector256<int> v1)
+ {
+ return Vector256.Shuffle(v1, Vector256.Create(7, 5, 132, 1, 6, 4, -3, 0));
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static Vector512<int> Vector512Shuffle(Vector512<int> v1)
+ {
+ return Vector512.Shuffle(v1, Vector512.Create(15, 13, 11, 99, 7, 5, 3, 1, 14, 12, 10, 8, -11, 4, 2, 0));
+ }
+}
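
The expected vectors in the test follow directly from Shuffle's contract: result[i] = input[indices[i]] when the index is within [0, Count), else 0. A quick scalar check of the Vector256 case:

```csharp
using System;

int[] input = { 100, 101, 102, 103, 104, 105, 106, 107 };
int[] indices = { 7, 5, 132, 1, 6, 4, -3, 0 }; // 132 and -3 are out of range
int[] expected = new int[8];
for (int i = 0; i < 8; i++)
{
    // Vector256.Shuffle zeroes any lane whose index is outside [0, Count).
    expected[i] = (indices[i] >= 0 && indices[i] < 8) ? input[indices[i]] : 0;
}
Console.WriteLine(string.Join(", ", expected)); // 107, 105, 0, 101, 106, 104, 0, 100
```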
--- /dev/null
+<Project Sdk="Microsoft.NET.Sdk">
+ <PropertyGroup>
+ <DebugType>None</DebugType>
+ <Optimize>True</Optimize>
+ </PropertyGroup>
+ <ItemGroup>
+ <Compile Include="$(MSBuildProjectName).cs" />
+ </ItemGroup>
+</Project>