From 22d1476bfa80a5e418ed3e60fda3a2ffcffe2e12 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 28 Apr 2019 14:31:01 +0000 Subject: [PATCH] [X86][AVX] Combine non-lane crossing binary shuffles using X86ISD::VPERMV3 Some of the combines might be further improved if we lower more shuffles with X86ISD::VPERMV3 directly, instead of waiting to combine the results. llvm-svn: 359400 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 22 ++ .../CodeGen/X86/avx512-shuffles/partial_permute.ll | 296 ++++++++++----------- llvm/test/CodeGen/X86/insertelement-ones.ll | 21 +- .../CodeGen/X86/shuffle-strided-with-offset-256.ll | 95 ++++--- .../test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll | 259 ++++++++++++++---- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 137 +++++++--- .../test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll | 8 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 8 +- llvm/test/CodeGen/X86/vec_smulo.ll | 74 +++--- llvm/test/CodeGen/X86/vec_umulo.ll | 72 ++--- llvm/test/CodeGen/X86/vector-fshl-256.ll | 7 +- llvm/test/CodeGen/X86/vector-fshr-256.ll | 6 +- llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll | 145 +++++++--- llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll | 118 +++++--- llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll | 288 ++++++++++++-------- llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll | 32 ++- llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 25 +- llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll | 97 +++++-- llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll | 191 ++++++++++--- llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll | 5 +- .../X86/vector-shuffle-combining-avx512vbmi.ll | 6 +- llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 14 +- llvm/test/CodeGen/X86/vector-trunc-widen.ll | 36 ++- llvm/test/CodeGen/X86/vector-trunc.ll | 44 ++- 24 files changed, 1329 insertions(+), 677 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 535a7b2..32e3a80 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31874,6 +31874,28 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } + // If we have a dual input shuffle then lower to VPERMV3. + if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros && + ((Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasVLX() && + (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 || + MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || + MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || + (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || + (Subtarget.hasBWI() && Subtarget.hasVLX() && + (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && + (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) { + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); + V1 = DAG.getBitcast(MaskVT, V1); + V2 = DAG.getBitcast(MaskVT, V2); + Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); + return DAG.getBitcast(RootVT, Res); + } + // Failed to find any combines. return SDValue(); } diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 53e0259..c1d37a7 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -150,11 +150,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm0 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -164,12 +162,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -181,13 +177,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7] +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -200,12 +194,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13] -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3] -; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14] +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -217,13 +209,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3] -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7] +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -237,10 +227,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3] -; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,0,3,0,5,0,7,1] +; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm4 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -252,12 +242,12 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm1 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3] -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2 +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,0,3,0,5,0,7,1] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -269,9 +259,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm0 -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5] +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -281,10 +271,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = mem[0],xmm2[1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -296,11 +286,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5] +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -1102,9 +1092,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3] +; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3] +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1116,10 +1107,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3] +; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1132,9 +1124,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0] +; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0] +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1146,10 +1139,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0] +; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1161,9 +1155,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,2,3] -; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7] +; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1172,11 +1166,11 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) { define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,3] -; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm3 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7] +; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1188,11 +1182,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = mem[1,1,2,3] ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1811,12 +1805,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,1,0,2] -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpermps %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,0,2] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3] +; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -2785,10 +2779,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0] -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1] +; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2797,12 +2790,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) { define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 -; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0] -; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1] +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1] +; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2814,12 +2807,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* % define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm1 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1] +; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2832,10 +2825,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* % ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2] +; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2847,11 +2841,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* % define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2] +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 +; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2864,11 +2859,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* % ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 -; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0] -; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7] +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2880,12 +2875,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* % define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm1 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0] +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0] +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2897,10 +2892,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0] -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3] +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2910,11 +2904,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* % ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 -; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0] -; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3] +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2926,12 +2920,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* % define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm1 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0] +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2] +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3136,11 +3130,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 -; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6] +; CHECK-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1 +; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> @@ -3153,12 +3147,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %v ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 +; CHECK-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> @@ -3203,9 +3196,9 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,5,3,7] +; CHECK-NEXT: vpermi2ps %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> @@ -3217,11 +3210,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 -; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,3,7] +; CHECK-NEXT: vpermi2ps %xmm3, %xmm0, %xmm4 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1 +; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> @@ -3235,12 +3228,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %v ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2 -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,5,3,7] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 +; CHECK-NEXT: vpermi2ps %xmm2, %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> @@ -3409,9 +3401,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,3,3] ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = mem[3,1,2,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3] +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3421,12 +3413,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) { define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = mem[0,2,3,3] -; CHECK-NEXT: vpermpd {{.*#+}} ymm3 = mem[3,1,2,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3] +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3439,12 +3431,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,2,3,3] ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3] -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 -; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 +; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll index 2f2acf7..11b065f 100644 --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -296,13 +296,20 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; -; AVX512-LABEL: insert_v16i16_x12345x789ABCDEx: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: retq +; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10,11,12,13],ymm1[14],ymm0[15] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,22,7,8,9,10,11,12,13,14,31] +; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: retq %1 = insertelement <16 x i16> %a, i16 -1, i32 0 %2 = insertelement <16 x i16> %1, i16 -1, i32 6 %3 = insertelement <16 x i16> %2, i16 -1, i32 15 diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 51b8c68..8cbcfc4 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -47,16 +47,46 @@ define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512-LABEL: shuffle_v16i16_to_v8i16_1: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> store <8 x i16> %strided.vec, <8 x i16>* %S @@ -192,12 +222,9 @@ define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -333,12 +360,9 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -485,12 +509,9 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -607,12 +628,9 @@ define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -843,12 +861,9 @@ define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll index 1f2adab..ef5b866 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll @@ -24,16 +24,57 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512-LABEL: shuffle_v32i8_to_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v32i8_to_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, <16 x i8>* %S @@ -115,16 +156,54 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512-LABEL: shuffle_v16i16_to_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v16i16_to_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> store <8 x i16> %strided.vec, <8 x i16>* %S @@ -293,16 +372,57 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX-NEXT: vmovq %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512-LABEL: shuffle_v32i8_to_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v32i8_to_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S @@ -1038,23 +1158,17 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] +; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -1149,16 +1263,57 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX-NEXT: vmovd %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512-LABEL: shuffle_v32i8_to_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovd %xmm0, (%rsi) -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v32i8_to_v4i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vmovd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> store <4 x i8> %strided.vec, <4 x i8>* %S diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 3dd705d..fd9787b 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -24,16 +24,57 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512-LABEL: shuffle_v32i8_to_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v32i8_to_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, <16 x i8>* %S @@ -115,16 +156,54 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq ; -; AVX512-LABEL: shuffle_v16i16_to_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v16i16_to_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BWVL-NEXT: retq +; +; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VBMIVL: # %bb.0: +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> store <8 x i16> %strided.vec, <8 x i16>* %S @@ -329,23 +408,17 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VBMIVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VBMIVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] +; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 +; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll index f26892d..1a6bdd3 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll @@ -855,11 +855,11 @@ define <4 x double> @PR34175(<32 x i16>* %p) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3] +; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 14284fd..19031bb 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -846,11 +846,11 @@ define <4 x double> @PR34175(<32 x i16>* %p) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3] +; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index e4a6524..e46c09d 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -407,15 +407,15 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun ; ; AVX512-LABEL: smulo_v3i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k1 +; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) @@ -537,15 +537,15 @@ define <4 x i32> @smulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) noun ; ; AVX512-LABEL: smulo_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k1 +; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -796,15 +796,15 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun ; ; AVX512-LABEL: smulo_v6i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0 -; AVX512-NEXT: vpcmpneqd %ymm0, %ymm2, %k1 +; AVX512-NEXT: vpcmpneqd %ymm0, %ymm4, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -995,15 +995,15 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; ; AVX512-LABEL: smulo_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0 -; AVX512-NEXT: vpcmpneqd %ymm0, %ymm2, %k1 +; AVX512-NEXT: vpcmpneqd %ymm0, %ymm4, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %ymm1, (%rdi) @@ -2103,19 +2103,19 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; ; AVX512-LABEL: smulo_v4i24: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $8, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpslld $8, %xmm1, %xmm1 ; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX512-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX512-NEXT: vpcmpneqd %xmm0, %xmm2, %k0 +; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k0 ; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 ; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 6920e41..74b7712 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -367,13 +367,13 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun ; ; AVX512-LABEL: umulo_v3i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] -; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 +; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -483,13 +483,13 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) noun ; ; AVX512-LABEL: umulo_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] -; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 +; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} @@ -703,13 +703,13 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun ; ; AVX512-LABEL: umulo_v6i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX512-NEXT: vptestmd %ymm2, %ymm2, %k1 +; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 +; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -873,13 +873,13 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; ; AVX512-LABEL: umulo_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX512-NEXT: vptestmd %ymm2, %ymm2, %k1 +; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 +; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -1878,17 +1878,17 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; AVX512-LABEL: umulo_v4i24: ; AVX512: # %bb.0: ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0 -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index b9f3a65..349e545 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1961,9 +1961,10 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm2 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512VLBW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLBW-NEXT: vpor %ymm1, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VLBW-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 +; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16: diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index d459d5d..c27828b 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1971,9 +1971,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm2 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm2 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VLBW-NEXT: vpermi2w %ymm1, %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 1a2851e..c351387 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -286,11 +286,24 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07( ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23] +; AVX512VLVBMI-NEXT: vpermi2b %xmm0, %xmm1, %xmm2 +; AVX512VLVBMI-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -352,11 +365,23 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20( ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: -; AVX: # %bb.0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] +; AVX1OR2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20] +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -394,12 +419,25 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20( ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] +; AVX1OR2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1OR2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,31,30,29,28,11,10,9,8,23,22,21,20] +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -1136,11 +1174,23 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23( ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: -; AVX: # %bb.0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1OR2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23] +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> ret <16 x i8> %shuffle } @@ -1381,12 +1431,25 @@ define <16 x i8> @shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_2 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq %1 = lshr <8 x i16> %a0, %2 = lshr <8 x i16> %a1, %3 = bitcast <8 x i16> %1 to <16 x i8> @@ -1647,13 +1710,27 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; -; AVX-LABEL: PR12412: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1OR2-LABEL: PR12412: +; AVX1OR2: # %bb.0: # %entry +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1OR2-NEXT: retq +; +; AVX512VLBW-LABEL: PR12412: +; AVX512VLBW: # %bb.0: # %entry +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: PR12412: +; AVX512VLVBMI: # %bb.0: # %entry +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq entry: %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> ret <16 x i8> %0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index d4bb1b2..5fde2d1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -360,11 +360,17 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v4i32_0124: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v4i32_0124: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_0124: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4] +; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -401,12 +407,18 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v4i32_0142: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v4i32_0142: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_0142: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2] +; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -446,12 +458,18 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v4i32_0412: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] -; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v4i32_0412: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_0412: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2] +; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -483,11 +501,17 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_4012: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v4i32_4012: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_4012: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2] +; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -537,12 +561,18 @@ define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v4i32_0451: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v4i32_0451: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_0451: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1] +; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -593,12 +623,18 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v4i32_4015: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v4i32_4015: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_4015: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5] +; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -1841,16 +1877,10 @@ define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) { ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v4f32_bitcast_4401: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v4f32_bitcast_4401: +; AVX: # %bb.0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1] +; AVX-NEXT: retq %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> %2 = bitcast <4 x i32> %1 to <2 x double> %3 = bitcast <4 x float> %a to <2 x double> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index 6cd0aba..b441e0c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -974,11 +974,23 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_0c1d2e3f: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_0c1d2e3f: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,2,14,3,15] +; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1004,11 +1016,23 @@ define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_48596a7b: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i16_48596a7b: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_48596a7b: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8,5,9,6,10,7,11] +; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1021,12 +1045,18 @@ define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_08196e7f: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i16_08196e7f: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_08196e7f: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,6,14,7,15] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1039,12 +1069,18 @@ define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_0c1d6879: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i16_0c1d6879: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3] +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_0c1d6879: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,6,8,7,9] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1077,18 +1113,11 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba: -; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11] -; AVX512VL-FAST-NEXT: retq +; AVX512VL-LABEL: shuffle_v8i16_109832ba: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1165,19 +1194,11 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf: -; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15] -; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-FAST-NEXT: retq +; AVX512VL-LABEL: shuffle_v8i16_0213cedf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1227,18 +1248,11 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX: -; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15] -; AVX512VL-FAST-NEXT: retq +; AVX512VL-LABEL: shuffle_v8i16_443aXXXX: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1272,11 +1286,17 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i16_032dXXXX: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i16_032dXXXX: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_032dXXXX: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,3,2,13,0,13,0,1] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1317,11 +1337,23 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_012dXXXX: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i16_012dXXXX: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dXXXX: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_012dXXXX: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,4,5,6,7] +; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1356,11 +1388,24 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i16_XXXXcde3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXXXcde3: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_XXXXcde3: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,11] +; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2 +; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1389,11 +1434,24 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_cde3XXXX: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,0,1,2,3] +; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2 +; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1430,11 +1488,17 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i16_012dcde3: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i16_012dcde3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_012dcde3: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1520,20 +1584,11 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; AVX2-FAST-NEXT: retq ; -; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579: -; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579: -; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15] -; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX512VL-FAST-NEXT: retq +; AVX512VL-LABEL: shuffle_v8i16_XXX1X579: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,1,4,5,7,9] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1568,12 +1623,18 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i16_XX4X8acX: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i16_XX4X8acX: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_XX4X8acX: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,8,10,12,10] +; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -2449,12 +2510,19 @@ define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_fu3ucc5u: -; AVX: # %bb.0: -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i16_fu3ucc5u: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1OR2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4] +; AVX1OR2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_fu3ucc5u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,5,11,12,4,4,13,14] +; AVX512VL-NEXT: vpermi2w %xmm0, %xmm1, %xmm2 +; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 7c3436b..c578faa7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1005,11 +1005,17 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31] +; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1023,11 +1029,17 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 11a6187..8dfcffb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2978,12 +2978,25 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_ ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15] -; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15] +; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16] +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 3d0244f..4fe07ca 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -318,22 +318,40 @@ define <4 x double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_0423: -; ALL: # %bb.0: -; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v4f64_0423: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4f64_0423: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4f64_0423: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,2,3] +; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_0462: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v4f64_0462: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0462: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,6,2] +; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -483,11 +501,23 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_1076: -; ALL: # %bb.0: -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v4f64_1076: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4f64_1076: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4f64_1076: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,7,6] +; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -906,12 +936,19 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_0142: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2] -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0142: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2] +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_0142: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,2] +; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1185,11 +1222,17 @@ define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_1076: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_1076: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,7,6] +; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 0676820..688ddce 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -283,11 +283,23 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_08084c4c: -; ALL: # %bb.0: -; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_08084c4c: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_08084c4c: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] +; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -933,11 +945,23 @@ define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_3210fedc: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_3210fedc: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210fedc: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1017,11 +1041,24 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) { } define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_ba987654: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_ba987654: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba987654: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1468,11 +1505,23 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_08084c4c: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_08084c4c: +; AVX2: # %bb.0: +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_08084c4c: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] +; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2188,11 +2237,23 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) { } define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_3210fedc: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i32_3210fedc: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210fedc: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2243,21 +2304,47 @@ define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) { } define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_ba987654: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i32_ba987654: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba987654: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_ba983210: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i32_ba983210: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba983210: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2903,14 +2990,32 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: broadcast_concat_crash: -; AVX2OR512VL: # %bb.0: # %entry -; AVX2OR512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3] -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2OR512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX2OR512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: broadcast_concat_crash: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3] +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: broadcast_concat_crash: +; AVX512VL-SLOW: # %bb.0: # %entry +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3] +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: broadcast_concat_crash: +; AVX512VL-FAST: # %bb.0: # %entry +; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3] +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3] +; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 +; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-NEXT: retq entry: %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> %bc = bitcast <8 x float> %tmp to <4 x i64> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index cadb725..2f1c598 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -595,9 +595,8 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_ ; ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VBMI-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512VBMI-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,65,67,69,71,73,75,77,79,17,19,21,23,25,27,29,31,81,83,85,87,89,91,93,95,33,35,37,39,41,43,45,47,97,99,101,103,105,107,109,111,49,51,53,55,57,59,61,63,113,115,117,119,121,123,125,127] +; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %1 = lshr <32 x i16> %a0, %2 = lshr <32 x i16> %a1, diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll index 156bd24..84e531f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -84,10 +84,8 @@ define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1 define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <16 x i8> %x1) { ; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17] -; CHECK-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29] -; CHECK-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] +; CHECK-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> , <16 x i8> %x1, i16 -1) %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> , <16 x i8> %res0, <16 x i8> %res0, i16 -1) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 69e4841..3897e55 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -769,9 +769,9 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper @@ -782,10 +782,10 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 -; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; VL_BW_DQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2 +; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax ; VL_BW_DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-trunc-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-widen.ll index 77fb1c1..6a240ab 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-widen.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-widen.ll @@ -1649,13 +1649,35 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; -; AVX512-LABEL: trunc2x4i32_8i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc2x4i32_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x4i32_8i16: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i32_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x4i32_8i16: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512BWVL-NEXT: retq entry: %0 = trunc <4 x i32> %a to <4 x i16> %1 = trunc <4 x i32> %b to <4 x i16> diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index f40c498..6e8d2b4 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -1560,12 +1560,10 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512BWVL-LABEL: trunc2x4i64_8i16: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm2 ; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2w %xmm1, %xmm2, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq entry: @@ -1657,13 +1655,35 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; -; AVX512-LABEL: trunc2x4i32_8i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc2x4i32_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x4i32_8i16: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i32_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x4i32_8i16: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 +; AVX512BWVL-NEXT: retq entry: %0 = trunc <4 x i32> %a to <4 x i16> %1 = trunc <4 x i32> %b to <4 x i16> -- 2.7.4