From 15a31389b2ead8fa7052a4378b76b5d686d29ad7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 14 Dec 2020 17:49:33 +0000
Subject: [PATCH] [X86][AVX] LowerBUILD_VECTOR - reduce 256/512-bit build
 vectors with zero/undef upper elements + pad.

As discussed on D92645, we don't do a good job of recognising when we don't
require the full width of a ymm/zmm build vector because the upper elements
are undef/zero.

This commit allows us to make use of implicit zeroing of upper elements with
AVX instructions, which we emulate in DAG with an INSERT_SUBVECTOR into the
bottom of an undef/zero vector of the original type.

This exposed a limitation in getTargetConstantBitsFromNode, which didn't
extract bits from INSERT_SUBVECTORs of different element widths; I've
included a fix for that as well to prevent a couple of regressions.
---
Two short illustrative sketches (the pad-with-undef/zero step and the
constant-bit repacking) are appended after the diff.

 llvm/lib/Target/X86/X86ISelLowering.cpp | 84 +++++++----
 .../CodeGen/X86/avx512-shuffles/partial_permute.ll | 159 ++++++++-------------
 llvm/test/CodeGen/X86/pr29112.ll | 10 +-
 llvm/test/CodeGen/X86/pr46532.ll | 2 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 12 +-
 llvm/test/CodeGen/X86/trunc-subvector.ll | 4 +-
 llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll | 14 +-
 llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 30 ++--
 llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll | 24 ++--
 llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll | 10 +-
 llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll | 44 +++---
 11 files changed, 186 insertions(+), 207 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 21af1a5..78d08b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6673,23 +6673,26 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   // Insert constant bits from a base and sub vector sources.
   if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
-    // TODO - support insert_subvector through bitcasts.
-    if (EltSizeInBits != VT.getScalarSizeInBits())
-      return false;
+    // If bitcasts to larger elements we might lose track of undefs - don't
+    // allow any to be safe.
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); + bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits; - APInt UndefSubElts; - SmallVector EltSubBits; - if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, + APInt UndefSrcElts, UndefSubElts; + SmallVector EltSrcBits, EltSubBits; + if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits, UndefSubElts, EltSubBits, - AllowWholeUndefs, AllowPartialUndefs) && - getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, - UndefElts, EltBits, AllowWholeUndefs, - AllowPartialUndefs)) { + AllowWholeUndefs && AllowUndefs, + AllowPartialUndefs && AllowUndefs) && + getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits, + UndefSrcElts, EltSrcBits, + AllowWholeUndefs && AllowUndefs, + AllowPartialUndefs && AllowUndefs)) { unsigned BaseIdx = Op.getConstantOperandVal(2); - UndefElts.insertBits(UndefSubElts, BaseIdx); + UndefSrcElts.insertBits(UndefSubElts, BaseIdx); for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) - EltBits[BaseIdx + i] = EltSubBits[i]; - return true; + EltSrcBits[BaseIdx + i] = EltSubBits[i]; + return CastBitData(UndefSrcElts, EltSrcBits); } } @@ -10165,17 +10168,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) return VectorConstant; - BuildVectorSDNode *BV = cast(Op.getNode()); - if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) - return AddSub; - if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) - return HorizontalOp; - if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) - return Broadcast; - if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) - return BitOp; - unsigned EVTBits = EltVT.getSizeInBits(); + APInt UndefMask = APInt::getNullValue(NumElems); APInt ZeroMask = APInt::getNullValue(NumElems); APInt NonZeroMask = APInt::getNullValue(NumElems); bool IsAllConstants = true; @@ -10183,8 +10177,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); - if (Elt.isUndef()) + if (Elt.isUndef()) { + UndefMask.setBit(i); continue; + } Values.insert(Elt); if (!isa(Elt) && !isa(Elt)) { IsAllConstants = false; @@ -10197,13 +10193,45 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } + // All undef vector. Return an UNDEF. All zero vectors were handled above. + if (NonZeroMask == 0) { + assert(UndefMask.isAllOnesValue() && "Fully undef mask expected"); + return DAG.getUNDEF(VT); + } + + BuildVectorSDNode *BV = cast(Op.getNode()); + + // If the upper elts of a ymm/zmm are undef/zero then we might be better off + // lowering to a smaller build vector and padding with undef/zero. 
+ if ((VT.is256BitVector() || VT.is512BitVector()) && + !isFoldableUseOfShuffle(BV)) { + unsigned UpperElems = NumElems / 2; + APInt UndefOrZeroMask = UndefMask | ZeroMask; + unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes(); + if (NumUpperUndefsOrZeros >= UpperElems) { + if (VT.is512BitVector() && + NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4))) + UpperElems = NumElems - (NumElems / 4); + bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems; + MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems); + SDValue NewBV = + DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems)); + return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl); + } + } + + if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) + return AddSub; + if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) + return HorizontalOp; + if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) + return Broadcast; + if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) + return BitOp; + unsigned NumZero = ZeroMask.countPopulation(); unsigned NumNonZero = NonZeroMask.countPopulation(); - // All undef vector. Return an UNDEF. All zero vectors were handled above. - if (NumNonZero == 0) - return DAG.getUNDEF(VT); - // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index c445a52..58fd4c9 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -6,8 +6,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8] -; CHECK-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -18,8 +17,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8] -; CHECK-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -48,8 +46,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14] -; CHECK-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -78,8 +75,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x 
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9] -; CHECK-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -108,8 +104,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0] -; CHECK-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -120,8 +115,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) { define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0] -; CHECK-NEXT: # ymm3 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -898,8 +892,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,8,4,12,9,4,14,15,14,8,4,12,9,4,14,15] -; CHECK-NEXT: # ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1196,9 +1189,8 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6] +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1207,8 +1199,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1235,8 +1226,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x define <8 x 
i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,15,3,2,3,6,8,3,0,15,3,2,3,6,8] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1263,8 +1253,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,15,15,2,6,10,14,7,2,15,15,2,6,10,14,7] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1291,9 +1280,8 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3] +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1302,8 +1290,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} @@ -1952,8 +1939,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,4,6,1,6,4,6,1] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -1980,8 +1966,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,3,6,3,6,3,6,3] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -1995,8 +1980,7 @@ define <4 x 
i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3] -; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2009,9 +1993,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,0,0,7,6,0,0,7] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7] +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2020,8 +2003,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,0,0,7,6,0,0,7] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2048,8 +2030,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,7,7,5,3,7,7,5] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2076,8 +2057,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,1,0,6,4,1,0,6] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,1,0,6] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2104,9 +2084,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,6,5,3,7,6,5,3] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3] +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2115,8 +2094,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 
x i64> %vec) { define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [7,6,5,3,7,6,5,3] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2896,9 +2874,8 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -2907,10 +2884,9 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) { define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -2937,10 +2913,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %v define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [10,12,3,12,4,15,1,14,10,12,3,12,4,15,1,14] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14] +; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -2996,9 +2971,8 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %v define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3007,10 +2981,9 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) { define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x 
float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -3651,9 +3624,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x doub define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,3,7,3,7,3,7,3] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3] +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -3662,10 +3634,9 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,3,7,3,7,3,7,3] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3] +; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -3678,8 +3649,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,3,7,3] -; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3693,10 +3663,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,7,6,2,0,7,6] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6] +; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -3749,9 +3718,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) { ; CHECK-LABEL: 
test_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,1,4,0,2,1,4] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4] +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -3760,10 +3728,9 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) { define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,0,2,1,4] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4] +; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -3821,10 +3788,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,6,2,2,2,6,2,2] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2] +; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -3851,8 +3817,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,8,7,8,5,8,7,8] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8] ; CHECK-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3862,8 +3827,7 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) { define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,8,7,8,5,8,7,8] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8] ; CHECK-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 @@ -3892,10 +3856,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,5,0,6,3,5,0,6] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vpxor %xmm3, %xmm3, 
%xmm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6] +; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll index 8db4d77..1cfa810 100644 --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -11,16 +11,14 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 ; CHECK-NEXT: vmovaps %xmm1, %xmm9 -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17] -; CHECK-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm14 = [4,22,1,17] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14 -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22] -; CHECK-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm10 = [4,30,1,22] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925] -; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7 ; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8 +; CHECK-NEXT: vmovaps {{.*#+}} xmm7 = <5,20,u,u> +; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7 ; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5 diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll index 3e0638b..39a0449 100644 --- a/llvm/test/CodeGen/X86/pr46532.ll +++ b/llvm/test/CodeGen/X86/pr46532.ll @@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() { ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rax) -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,0,0,0,0,0,0] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) while.1.body.preheader: %0 = load i8*, i8** undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2 diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index e6821da..288fbb6 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -465,9 +465,9 @@ define <4 x double> @PR34175(<32 x i16>* %p) { ; ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u> +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1 +; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BWVL-NEXT: retq @@ -484,9 +484,9 @@ define <4 x double> @PR34175(<32 x i16>* %p) { ; ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] -; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, 
%ymm1 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u> +; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1 +; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMIVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll index 7373bbf..9db2f6b 100644 --- a/llvm/test/CodeGen/X86/trunc-subvector.ll +++ b/llvm/test/CodeGen/X86/trunc-subvector.ll @@ -79,7 +79,7 @@ define <2 x i32> @test5(<8 x i32> %v) { ; ; AVX2-LABEL: test5: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [3,4,4,4] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper @@ -177,7 +177,7 @@ define <2 x i32> @test10(<8 x i32> %v) { ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [3,4,4,4] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 19ea280..ac11448 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -354,7 +354,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -396,7 +396,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,9,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -437,7 +437,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,10,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -478,7 +478,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -519,7 +519,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, 
%ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -560,7 +560,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -601,7 +601,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 4a4a9d1..a9d9798 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -812,7 +812,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -859,7 +859,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -906,7 +906,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -953,7 +953,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1000,7 +1000,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1047,7 +1047,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1094,7 +1094,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1141,7 +1141,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1188,7 +1188,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1235,7 +1235,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1282,7 +1282,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1329,7 +1329,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: 
shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1376,7 +1376,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1423,7 +1423,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1470,7 +1470,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 7bbff87..f1af4fa 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -136,7 +136,7 @@ define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00040000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -153,7 +153,7 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00500000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -170,7 +170,7 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8f32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1470,7 +1470,7 @@ 
define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00040000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1487,7 +1487,7 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00500000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1504,7 +1504,7 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -3280,15 +3280,13 @@ define <8 x i32> @lowhalf_v8i32(<8 x i32> %x, <8 x i32> %y) { ; AVX2-LABEL: lowhalf_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,3,6] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: lowhalf_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14] -; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,14,3,14] ; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> @@ -3308,15 +3306,13 @@ define <8 x float> @lowhalf_v8f32(<8 x float> %x, <8 x float> %y) { ; AVX2-LABEL: lowhalf_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,3,6] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: lowhalf_v8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14] -; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm2 = [2,14,3,14] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index 60ae9b5..8e1abdb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -388,9 +388,8 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) { define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) { ; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,6,7,10,0,1,2,3,4,6,7,10] -; ALL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10] +; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; ALL-NEXT: retq %res = 
shufflevector <16 x float> %v, <16 x float> undef, <8 x i32> @@ -401,9 +400,8 @@ define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) { define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) { ; ALL-LABEL: test_v16f32_0_1_3_6: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,3,6,0,1,3,6,0,1,3,6,0,1,3,6] -; ALL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,3,6] +; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index f582a31..8288ae8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -94,13 +94,13 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00040000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00040000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -110,13 +110,13 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00500000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00500000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -126,13 +126,13 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_06000000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] +; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_06000000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -912,13 +912,13 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00040000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, 
%zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00040000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -928,13 +928,13 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00500000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00500000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -944,13 +944,13 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_06000000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] +; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_06000000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -2187,17 +2187,15 @@ define <8 x double> @shuffle_v2f64_v8f64_01010101(<2 x double> %a) { define <4 x double> @test_v8f64_2346 (<8 x double> %v) { ; AVX512F-LABEL: test_v8f64_2346: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [2,3,4,6,2,3,4,6] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [2,3,4,6] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: test_v8f64_2346: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [2,0,3,0,4,0,6,0,2,0,3,0,4,0,6,0] -; AVX512F-32-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [2,0,3,0,4,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-32-NEXT: retl %res = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> @@ -2221,17 +2219,15 @@ define <2 x double> @test_v8f64_34 (<8 x double> %v) { define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) { ; AVX512F-LABEL: test_v8i64_1257: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,2,5,7,1,2,5,7] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,5,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: test_v8i64_1257: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[1,0,2,0,5,0,7,0,1,0,2,0,5,0,7,0] -; AVX512F-32-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,2,0,5,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-32-NEXT: retl %res = shufflevector <8 x i64> %v, <8 x i64> undef, <4 x i32> -- 2.7.4
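Illustrative note (not part of the patch): the new LowerBUILD_VECTOR path
emits the build vector at a reduced width and then pads it back to the
original 256/512-bit type by inserting it at element 0 of an undef or
all-zeros vector, so the dropped upper elements come for free. The sketch
below shows that padding step with generic SelectionDAG calls; padBuildVector
is a hypothetical helper written only for this note (the patch itself reuses
the existing widenSubVector helper), and it assumes an integer vector type so
that getConstant(0) can materialise the all-zeros base.

  #include "llvm/CodeGen/SelectionDAG.h"

  using namespace llvm;

  // Insert the narrow build vector at element 0 of a wider vector of type VT.
  // If ZeroUpper is set the upper elements become explicit zeros, otherwise
  // they are left undef.
  static SDValue padBuildVector(MVT VT, SDValue NarrowBV, bool ZeroUpper,
                                SelectionDAG &DAG, const SDLoc &dl) {
    SDValue Base = ZeroUpper ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Base, NarrowBV,
                       DAG.getVectorIdxConstant(0, dl));
  }

This is what lets the test diffs above replace full-width broadcast constants
with plain xmm/ymm loads: a VEX/EVEX-encoded write to the narrower register
already zeroes the upper bits of the containing ymm/zmm register.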
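A second illustrative note, for the getTargetConstantBitsFromNode change: when
the inserted subvector uses a different element width than the requested one,
the constant bits are now read at the source width and then re-sliced at the
requested width. repackBits below is a hypothetical, simplified sketch of that
re-slicing only (the real code merges the base and subvector bits first,
routes them through the function's CastBitData lambda, and also tracks undef
elements).

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"

  using namespace llvm;

  // Repack constant bits gathered at one element width (e.g. 8 x 32-bit from
  // a v8i32 subvector) into another element width (e.g. 4 x 64-bit for a
  // v4i64 user). Assumes each SrcBits[i] is SrcEltBits wide and that the
  // total bit width is a multiple of DstEltBits.
  static SmallVector<APInt, 16> repackBits(ArrayRef<APInt> SrcBits,
                                           unsigned SrcEltBits,
                                           unsigned DstEltBits) {
    // Concatenate all source elements into one wide integer...
    unsigned TotalBits = SrcEltBits * SrcBits.size();
    APInt All = APInt::getNullValue(TotalBits);
    for (unsigned i = 0, e = SrcBits.size(); i != e; ++i)
      All.insertBits(SrcBits[i], i * SrcEltBits);
    // ...then slice it back up at the destination element width.
    SmallVector<APInt, 16> DstBits;
    for (unsigned i = 0, e = TotalBits / DstEltBits; i != e; ++i)
      DstBits.push_back(All.extractBits(DstEltBits, i * DstEltBits));
    return DstBits;
  }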