Adding Shuffle(). (#85129)
author DeepakRajendrakumaran <deepak.rajendrakumaran@intel.com>
Sat, 22 Apr 2023 23:48:08 +0000 (16:48 -0700)
committer GitHub <noreply@github.com>
Sat, 22 Apr 2023 23:48:08 +0000 (16:48 -0700)
* Adding Shuffle().

* Fixing Vector256 bug + Simplifying logic by switching to AND instead of ConditionalSelect

* Adding regression test.

src/coreclr/jit/emitxarch.cpp
src/coreclr/jit/gentree.cpp
src/coreclr/jit/hwintrinsiclistxarch.h
src/coreclr/jit/hwintrinsicxarch.cpp
src/coreclr/jit/instrsxarch.h
src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.cs [new file with mode: 0644]
src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.csproj [new file with mode: 0644]

index 4e25a445bbaa80e680e9b6bc7717efa9f1512272..6eb3bc531efd79aeea52e6f647fd82e288c08858 100644 (file)
@@ -18066,8 +18066,10 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_psadbw:
         case INS_vpermps:
         case INS_vpermpd:
+        case INS_vpermpd_reg:
         case INS_vpermd:
         case INS_vpermq:
+        case INS_vpermq_reg:
         case INS_vperm2i128:
         case INS_vperm2f128:
         case INS_vextractf128:
@@ -18086,6 +18088,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             result.insLatency += PERFSCORE_LATENCY_3C;
             break;
 
+        case INS_vpermw:
+            result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+            result.insLatency += PERFSCORE_LATENCY_6C;
+            break;
+
         case INS_pextrb:
         case INS_pextrd:
         case INS_pextrw:
index 43b580f5596984fff4eaae62b64d9a4924a7fa4d..bd7d5c049599c0ee6a977597a26254d1ba4c459c 100644 (file)
@@ -23217,7 +23217,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
 #if defined(TARGET_XARCH)
     uint8_t  control   = 0;
     bool     crossLane = false;
-    bool     needsZero = varTypeIsSmallInt(simdBaseType);
+    bool     needsZero = varTypeIsSmallInt(simdBaseType) && (simdSize != 64);
     uint64_t value     = 0;
     simd_t   vecCns    = {};
     simd_t   mskCns    = {};
@@ -23351,6 +23351,61 @@ GenTree* Compiler::gtNewSimdShuffleNode(
             retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_AVX2_Permute4x64, simdBaseJitType, simdSize);
         }
     }
+    else if (simdSize == 64)
+    {
+        if (elementSize == 4)
+        {
+            for (uint32_t i = 0; i < elementCount; i++)
+            {
+                vecCns.u32[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize);
+            }
+
+            op2                        = gtNewVconNode(type);
+            op2->AsVecCon()->gtSimdVal = vecCns;
+
+            // swap the operands to match the encoding requirements
+            retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512F_PermuteVar16x32, simdBaseJitType, simdSize);
+        }
+        else if (elementSize == 2)
+        {
+            for (uint32_t i = 0; i < elementCount; i++)
+            {
+                vecCns.u16[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize);
+            }
+
+            op2                        = gtNewVconNode(type);
+            op2->AsVecCon()->gtSimdVal = vecCns;
+
+            // swap the operands to match the encoding requirements
+            retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512BW_PermuteVar32x16, simdBaseJitType, simdSize);
+        }
+        else
+        {
+            assert(elementSize == 8);
+
+            for (uint32_t i = 0; i < elementCount; i++)
+            {
+                vecCns.u64[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize);
+            }
+
+            op2                        = gtNewVconNode(type);
+            op2->AsVecCon()->gtSimdVal = vecCns;
+
+            // swap the operands to match the encoding requirements
+            retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512F_Permute8x64, simdBaseJitType, simdSize);
+        }
+        assert(retNode != nullptr);
+
+        // TODO-XArch-AVX512: Switch to VPERMI2*
+        if (needsZero)
+        {
+            op2                        = gtNewVconNode(type);
+            op2->AsVecCon()->gtSimdVal = mskCns;
+            retNode                    = gtNewSimdBinOpNode(GT_AND, type, op2, retNode, simdBaseJitType, simdSize);
+        }
+
+        return retNode;
+    }
     else
     {
         if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSSE3))
@@ -23397,13 +23452,11 @@ GenTree* Compiler::gtNewSimdShuffleNode(
 
     if (needsZero)
     {
-        assert(!compIsaSupportedDebugOnly(InstructionSet_SSSE3));
-
-        op2                          = gtNewVconNode(type);
-        op2->AsVecCon()->gtSimd16Val = mskCns.v128[0];
+        assert((simdSize == 32) || !compIsaSupportedDebugOnly(InstructionSet_SSSE3));
 
-        GenTree* zero = gtNewZeroConNode(type);
-        retNode       = gtNewSimdCndSelNode(type, op2, retNode, zero, simdBaseJitType, simdSize);
+        op2                        = gtNewVconNode(type);
+        op2->AsVecCon()->gtSimdVal = mskCns;
+        retNode                    = gtNewSimdBinOpNode(GT_AND, type, op2, retNode, simdBaseJitType, simdSize);
     }
 
     return retNode;
index 78d3b011d8023a32c1825fb0b1faa3b370500007..303d9fecbfc39b0b38cdcc1d74bff67fcef3d795 100644 (file)
@@ -864,6 +864,8 @@ HARDWARE_INTRINSIC(AVX512F,         Min,
 HARDWARE_INTRINSIC(AVX512F,         Multiply,                                   64,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_pmuldq,             INS_pmuludq,            INS_mulps,              INS_mulpd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512F,         MultiplyLow,                                64,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_pmulld,             INS_pmulld,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512F,         Or,                                         64,              2,     {INS_por,               INS_por,                INS_por,                INS_por,                INS_por,                INS_por,                INS_vporq,              INS_vporq,              INS_orps,               INS_orpd},              HW_Category_SimpleSIMD,             HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX512F,         Permute8x64,                                64,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_vpermq_reg,         INS_vpermq_reg,         INS_invalid,            INS_vpermpd_reg},       HW_Category_SimpleSIMD,             HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(AVX512F,         PermuteVar16x32,                            64,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_vpermd,             INS_vpermd,             INS_invalid,            INS_invalid,            INS_vpermps,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX512F,         ShiftLeftLogical,                           64,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_pslld,              INS_pslld,              INS_psllq,              INS_psllq,              INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512F,         ShiftRightArithmetic,                       64,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_psrad,              INS_invalid,            INS_vpsraq,             INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512F,         ShiftRightLogical,                          64,              2,     {INS_invalid,           INS_invalid,            INS_invalid,            INS_invalid,            INS_psrld,              INS_psrld,              INS_psrlq,              INS_psrlq,              INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
@@ -913,6 +915,7 @@ HARDWARE_INTRINSIC(AVX512BW,        MultiplyHighRoundScale,
 HARDWARE_INTRINSIC(AVX512BW,        MultiplyLow,                                64,              2,     {INS_invalid,           INS_invalid,            INS_pmullw,             INS_pmullw,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX512BW,        PackSignedSaturate,                         64,              2,     {INS_packsswb,          INS_invalid,            INS_packssdw,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX512BW,        PackUnsignedSaturate,                       64,              2,     {INS_invalid,           INS_packuswb,           INS_invalid,            INS_packusdw,           INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX512BW,        PermuteVar32x16,                            64,              2,     {INS_invalid,           INS_invalid,            INS_vpermw,             INS_vpermw,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX512BW,        ShiftLeftLogical,                           64,              2,     {INS_invalid,           INS_invalid,            INS_psllw,              INS_psllw,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512BW,        ShiftLeftLogical128BitLane,                 64,              2,     {INS_pslldq,            INS_pslldq,             INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX512BW,        ShiftRightArithmetic,                       64,              2,     {INS_invalid,           INS_invalid,            INS_psraw,              INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid,            INS_invalid},           HW_Category_IMM,                    HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
index 4e2335e6f09686564f893396e61da368293e46f8..a3ac250fe3e66d9730eaf5cc7aa9c113e160c36d 100644 (file)
@@ -2219,9 +2219,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
 
         case NI_Vector128_Shuffle:
         case NI_Vector256_Shuffle:
+        case NI_Vector512_Shuffle:
         {
             assert((sig->numArgs == 2) || (sig->numArgs == 3));
-            assert((simdSize == 16) || (simdSize == 32));
+            assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64));
 
             GenTree* indices = impStackTop(0).val;
 
@@ -2277,6 +2278,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
                     }
                 }
             }
+            else if (simdSize == 64)
+            {
+                if (varTypeIsByte(simdBaseType))
+                {
+                    // TYP_BYTE, TYP_UBYTE need AVX512_VBMI.
+                    break;
+                }
+            }
             else
             {
                 assert(simdSize == 16);
index 78bca42d4b98f06f9caaa24d1e22ff4936ba3e4d..9ea0aaba98bdd8f1f87b529525c669db660c7f74 100644 (file)
@@ -620,6 +620,8 @@ INST3(vmovdqu64,        "movdqu64",         IUM_WR, SSEFLT(0x7F),           BAD_
 INST3(vpabsq,           "pabsq",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x1F),                   INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX)                                                                                                                                  // Packed absolute value of 64-bit integers
 INST3(vpandq,           "pandq",            IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xDB),                  INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise AND of two xmm regs
 INST3(vpandnq,          "pandnq",           IUM_WR, BAD_CODE,               BAD_CODE,     PCKDBL(0xDF),                  INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // Packed bit-wise AND NOT of two xmm regs
+INST3(vpermq_reg,       "permq",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x36),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                                                                  // Permute packed 64-bit integer elements using indices from a register
+INST3(vpermpd_reg,      "permpd",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x16),                   INS_TT_FULL,                         Input_64Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                                                                  // Permute packed double-precision floating-point elements using indices from a register
 INST3(vpmaxsq,          "pmaxsq",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x3D),                   INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // packed maximum 64-bit signed integers
 INST3(vpmaxuq,          "pmaxuq",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x3F),                   INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // packed maximum 64-bit unsigned integers
 INST3(vpminsq,          "pminsq",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x39),                   INS_TT_FULL,                         Input_64Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsDstDstSrcAVXInstruction)                                                                                           // packed minimum 64-bit signed integers
@@ -640,6 +642,7 @@ INST3(kortestd,         "kortestd",         IUM_WR, BAD_CODE,               BAD_
 INST3(kortestq,         "kortestq",         IUM_WR, BAD_CODE,               BAD_CODE,     PCKFLT(0x98),                  INS_TT_NONE,                                          REX_W1                       | Encoding_VEX | Resets_OF      | Resets_SF     | Writes_ZF     | Resets_AF     | Resets_PF     | Writes_CF      | KInstruction)
 INST3(vmovdqu8,         "movdqu8",          IUM_WR, SSEFLT(0x7F),           BAD_CODE,     SSEFLT(0x6F),                  INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0_EVEX                  | Encoding_EVEX)
 INST3(vmovdqu16,        "movdqu16",         IUM_WR, SSEFLT(0x7F),           BAD_CODE,     SSEFLT(0x6F),                  INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1_EVEX                  | Encoding_EVEX)
+INST3(vpermw,           "permw",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE38(0x8D),                   INS_TT_FULL_MEM,                     Input_16Bit    | REX_W1                       | Encoding_EVEX  | INS_Flags_IsDstSrcSrcAVXInstruction)                                                                                           // Permute packed word (16-bit) elements
 INST3(vpcmpb,           "pcmpb",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3F),                   INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0_EVEX                  | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)
 INST3(vpcmpw,           "pcmpw",            IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3F),                   INS_TT_FULL,                         Input_16Bit    | REX_W1_EVEX                  | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)
 INST3(vpcmpub,          "pcmpub",           IUM_WR, BAD_CODE,               BAD_CODE,     SSE3A(0x3E),                   INS_TT_FULL_MEM,                     Input_8Bit     | REX_W0_EVEX                  | Encoding_EVEX  | INS_Flags_IsMskSrcSrcEvexInstruction)
diff --git a/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.cs b/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.cs
new file mode 100644 (file)
index 0000000..042df98
--- /dev/null
@@ -0,0 +1,47 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+//
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using Xunit;
+
+public class Program
+{
+    [Fact]
+    public static int TestEntryPoint()
+    {
+
+        Vector256<int> v256Shuffle = Vector256.Create(100, 101, 102, 103, 104, 105, 106, 107);
+        Vector256<int> v256ShuffleExpectedResult = Vector256.Create(107, 105, 0, 101, 106, 104, 0, 100);
+        Vector256<int> v256ShuffleActualResult = Vector256Shuffle(v256Shuffle);
+        if(v256ShuffleExpectedResult != v256ShuffleActualResult)
+        {
+            return 1;
+        }
+
+        Vector512<int> v512Shuffle = Vector512.Create(100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115);
+        Vector512<int> v512ShuffleExpectedResult = Vector512.Create(115, 113, 111, 0, 107, 105, 103, 101, 114, 112, 110, 108, 0, 104, 102, 100);
+        Vector512<int> v512ShuffleActualResult = Vector512Shuffle(v512Shuffle);
+        if (v512ShuffleExpectedResult != v512ShuffleActualResult)
+        {
+            return 1;
+        }
+        return 100;
+    }
+
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    public static Vector256<int> Vector256Shuffle(Vector256<int> v1)
+    {
+        return Vector256.Shuffle(v1, Vector256.Create(7, 5, 132, 1, 6, 4, -3, 0));
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    public static  Vector512<int> Vector512Shuffle(Vector512<int> v1)
+    {
+        return Vector512.Shuffle(v1, Vector512.Create(15, 13, 11, 99, 7, 5, 3, 1, 14, 12, 10, 8, -11, 4, 2, 0));
+    }
+}
diff --git a/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.csproj b/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.csproj
new file mode 100644 (file)
index 0000000..501217e
--- /dev/null
@@ -0,0 +1,9 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>