From d0d48a91f8bc8a8c72feb8bec6164ed1ecf6e612 Mon Sep 17 00:00:00 2001 From: Han Zhu Date: Wed, 21 Sep 2022 18:01:49 -0700 Subject: [PATCH] [X86] Lower vector interleave into unpck and perm [This Godbolt link](https://godbolt.org/z/s17Kv1s9T) shows different codegen between clang and gcc for a transpose operation. clang result: ``` vmovdqu xmm0, xmmword ptr [rcx + rax] vmovdqu xmm1, xmmword ptr [rcx + rax + 16] vmovdqu xmm2, xmmword ptr [r8 + rax] vmovdqu xmm3, xmmword ptr [r8 + rax + 16] vpunpckhbw xmm4, xmm2, xmm0 vpunpcklbw xmm0, xmm2, xmm0 vpunpcklbw xmm2, xmm3, xmm1 vpunpckhbw xmm1, xmm3, xmm1 vmovdqu xmmword ptr [rdi + 2*rax + 48], xmm1 vmovdqu xmmword ptr [rdi + 2*rax + 32], xmm2 vmovdqu xmmword ptr [rdi + 2*rax], xmm0 vmovdqu xmmword ptr [rdi + 2*rax + 16], xmm4 ``` gcc result: ``` vmovdqu ymm3, YMMWORD PTR [rdi+rax] vpunpcklbw ymm1, ymm3, YMMWORD PTR [rsi+rax] vpunpckhbw ymm0, ymm3, YMMWORD PTR [rsi+rax] vperm2i128 ymm2, ymm1, ymm0, 32 vperm2i128 ymm1, ymm1, ymm0, 49 vmovdqu YMMWORD PTR [rcx+rax*2], ymm2 vmovdqu YMMWORD PTR [rcx+32+rax*2], ymm1 ``` clang's code is roughly 15% slower than gcc's when evaluated on an internal compression benchmark. The loop vectorizer generates the following `shufflevector` instruction: ``` %interleaved.vec = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> ``` which is lowered to SelectionDAG: ``` t2: v32i8,ch = CopyFromReg t0, Register:v32i8 %0 t6: v64i8 = concat_vectors t2, undef:v32i8 t4: v32i8,ch = CopyFromReg t0, Register:v32i8 %1 t7: v64i8 = concat_vectors t4, undef:v32i8 t8: v64i8 = vector_shuffle<0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95> t6, t7 ``` So far this `vector_shuffle` is good enough for us to pattern-match and transform, but as we go down the SelectionDAG pipeline, it gets split into smaller shuffles.
During dagcombine1, the shuffle is split by `foldShuffleOfConcatUndefs`. ``` // shuffle (concat X, undef), (concat Y, undef), Mask --> // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1) t2: v32i8,ch = CopyFromReg t0, Register:v32i8 %0 t4: v32i8,ch = CopyFromReg t0, Register:v32i8 %1 t19: v32i8 = vector_shuffle<0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47> t2, t4 t15: ch,glue = CopyToReg t0, Register:v32i8 $ymm0, t19 t20: v32i8 = vector_shuffle<16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63> t2, t4 t17: ch,glue = CopyToReg t15, Register:v32i8 $ymm1, t20, t15:1 ``` With `foldShuffleOfConcatUndefs` commented out, the vector is still split later by the type legalizer, which comes after dagcombine1, because v64i8 is not a legal type on AVX2 (64 * 8 = 512 bits while ymm = 256 bits). There doesn't seem to be a good way to avoid this split. Lowering the `vector_shuffle` into unpck and perm during dagcombine1 is too early. Therefore, although somewhat inconvenient, we decided to go with pattern-matching a pair of vector shuffles later in the SelectionDAG pipeline, as part of the 256-bit shuffle lowering (e.g. `lowerV32I8Shuffle`). The code looks at the two operands of the first shuffle it encounters, iterates through the users of the operands, and tries to find two shuffles that are consecutive interleaves. Once the pattern is found, it lowers them into unpcks and perms. It returns the perm for the shuffle that's currently being lowered (letting ISel update the DAG for that node), and replaces the other shuffle in place.
Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D134477 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 115 ++++++++++++++++ llvm/test/CodeGen/X86/slow-pmulld.ll | 14 +- llvm/test/CodeGen/X86/vector-interleave.ll | 133 ++++++++----------- .../X86/vector-interleaved-store-i16-stride-2.ll | 118 +++++++++++------ .../X86/vector-interleaved-store-i32-stride-2.ll | 145 +++++++++------------ .../X86/vector-interleaved-store-i8-stride-2.ll | 43 +++--- 6 files changed, 335 insertions(+), 233 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1bac76e..e7cc2ea 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -17775,6 +17775,90 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, DAG.getIntPtrConstant(0, DL)); } +// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2 +// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2 +// => +// ul = unpckl v1, v2 +// uh = unpckh v1, v2 +// a = vperm ul, uh +// b = vperm ul, uh +// +// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck +// and permute. We cannot directly match v3 because it is split into two +// 256-bit vectors in earlier isel stages. Therefore, this function matches a +// pair of 256-bit shuffles and makes sure the masks are consecutive. +// +// Once unpck and permute nodes are created, the permute corresponding to this +// shuffle is returned, while the other permute replaces the other half of the +// shuffle in the selection dag. 
+static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + SelectionDAG &DAG) { + if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 && + VT != MVT::v32i8) + return SDValue(); + // + auto IsInterleavingPattern = [&](ArrayRef Mask, unsigned Begin0, + unsigned Begin1) { + size_t Size = Mask.size(); + assert(Size % 2 == 0 && "Expected even mask size"); + for (unsigned I = 0; I < Size; I += 2) { + if (Mask[I] != (int)(Begin0 + I / 2) || + Mask[I + 1] != (int)(Begin1 + I / 2)) + return false; + } + return true; + }; + // Check which half is this shuffle node + int NumElts = VT.getVectorNumElements(); + size_t FirstQtr = NumElts / 2; + size_t ThirdQtr = NumElts + NumElts / 2; + bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts); + bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr); + if (!IsFirstHalf && !IsSecondHalf) + return SDValue(); + + // Find the intersection between shuffle users of V1 and V2. + SmallVector Shuffles; + for (SDNode *User : V1->uses()) + if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 && + User->getOperand(1) == V2) + Shuffles.push_back(User); + // Limit user size to two for now. + if (Shuffles.size() != 2) + return SDValue(); + // Find out which half of the 512-bit shuffles is each smaller shuffle + auto *SVN1 = cast(Shuffles[0]); + auto *SVN2 = cast(Shuffles[1]); + SDNode *FirstHalf; + SDNode *SecondHalf; + if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) && + IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) { + FirstHalf = Shuffles[0]; + SecondHalf = Shuffles[1]; + } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) && + IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) { + FirstHalf = Shuffles[1]; + SecondHalf = Shuffles[0]; + } else { + return SDValue(); + } + // Lower into unpck and perm. Return the perm of this shuffle and replace + // the other. 
+ SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh, + DAG.getTargetConstant(0x20, DL, MVT::i8)); + SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh, + DAG.getTargetConstant(0x31, DL, MVT::i8)); + if (IsFirstHalf) { + DAG.ReplaceAllUsesWith(SecondHalf, &Perm2); + return Perm1; + } + DAG.ReplaceAllUsesWith(FirstHalf, &Perm1); + return Perm2; +} /// Handle lowering of 4-lane 64-bit floating point shuffles. /// @@ -18082,6 +18166,16 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask, DAG, Subtarget)) return V; + // Try to match an interleave of two v8f32s and lower them as unpck and + // permutes using ymms. This needs to go before we try to split the vectors. + // + // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits + // this path inadvertently. + if (Subtarget.hasAVX2() && !Subtarget.hasAVX512()) + if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2, + Mask, DAG)) + return V; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. @@ -18120,6 +18214,13 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return ZExt; + // Try to match an interleave of two v8i32s and lower them as unpck and + // permutes using ymms. This needs to go before we try to split the vectors. + if (!Subtarget.hasAVX512()) + if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2, + Mask, DAG)) + return V; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code than vblend by using // vpunpcklwd and vpunpckhwd instrs. 
@@ -18325,6 +18426,13 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; + // Try to match an interleave of two v16i16s and lower them as unpck and + // permutes using ymms. + if (!Subtarget.hasAVX512()) + if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2, + Mask, DAG)) + return V; + // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG); @@ -18438,6 +18546,13 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, Mask, Zeroable, DAG)) return V; + // Try to match an interleave of two v32i8s and lower them as unpck and + // permutes using ymms. + if (!Subtarget.hasAVX512()) + if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2, + Mask, DAG)) + return V; + // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll index 7c69cd3..b6706f9 100644 --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -492,15 +492,11 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: ret{{[l|q]}} ; ; AVX2-32-LABEL: test_mul_v16i32_v16i16: diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll index 1fc847a..2c8c097 100644 --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -91,34 +91,34 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x ; ; AVX2-LABEL: interleave8x8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-NEXT: vpunpcklwd 
{{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm2[0,1] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm4[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm4[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} 
ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm4[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX2-NEXT: retq %ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> %cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> @@ -241,15 +241,10 @@ define <16 x float> @interleave2x8f32(<8 x float> %a, <8 x float> %b) { ; ; AVX2-LABEL: interleave2x8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: retq %result = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> ret <16 x float> %result @@ -283,15 +278,10 @@ define <16 x i32> @interleave2x8i32(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2-LABEL: interleave2x8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: 
vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: retq %result = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %result @@ -325,15 +315,10 @@ define <32 x i16> @interleave2x16i16(<16 x i16> %a, <16 x i16> %b) { ; ; AVX2-LABEL: interleave2x16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: retq %result = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> ret <32 x i16> %result @@ -389,24 +374,15 @@ define <64 x i16> 
@interleave2x32i16(<32 x i16> %a, <32 x i16> %b) { ; ; AVX2-LABEL: interleave2x32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-NEXT: vmovdqa %ymm5, %ymm1 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm4[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm4[2,3] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[0,1],ymm5[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm5[2,3] +; AVX2-NEXT: vmovdqa %ymm4, %ymm1 ; AVX2-NEXT: retq %result = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> ret <64 x i16> %result @@ -440,15 +416,10 @@ define <64 x i8> @interleave2x32i8(<32 x i8> %a, <32 x i8> %b) { ; ; AVX2-LABEL: interleave2x32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: retq %result = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> ret <64 x i8> %result diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll index 536a7a8..8b132fc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -146,21 +146,34 @@ define void @vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { ; SSE-NEXT: movdqa %xmm4, 16(%rdx) ; SSE-NEXT: retq ; -; AVX-LABEL: vf16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm4, 16(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vf16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vf16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: vf16: ; AVX512: # %bb.0: @@ -215,33 +228,54 @@ define void @vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { ; SSE-NEXT: movdqa %xmm8, 16(%rdx) ; SSE-NEXT: retq ; -; AVX-LABEL: vf32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX-NEXT: vmovdqa (%rdi), %xmm4 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX-NEXT: 
vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm5, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm7, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 96(%rdx) -; AVX-NEXT: vmovdqa %xmm6, 112(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX-NEXT: vmovdqa %xmm8, 80(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vf32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovdqa %xmm5, 16(%rdx) +; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-NEXT: vmovdqa %xmm7, 48(%rdx) +; AVX1-NEXT: vmovdqa %xmm3, 96(%rdx) +; AVX1-NEXT: vmovdqa %xmm6, 112(%rdx) +; AVX1-NEXT: vmovdqa %xmm2, 64(%rdx) +; AVX1-NEXT: vmovdqa %xmm8, 80(%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vf32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] +; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx) +; AVX2-NEXT: vmovdqa %ymm3, 96(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vmovdqa %ymm2, 32(%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: vf32: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index a0c196f..9f4c85e 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -136,18 +136,15 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; ; AVX2-LABEL: store_i32_stride2_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %xmm0 -; AVX2-NEXT: vmovaps 16(%rsi), %xmm1 -; AVX2-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX2-NEXT: vmovaps %xmm2, 48(%rdx) -; AVX2-NEXT: vmovaps %xmm0, (%rdx) -; AVX2-NEXT: vmovaps %xmm4, 16(%rdx) +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_i32_stride2_vf8: @@ -234,30 +231,23 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX2-LABEL: store_i32_stride2_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %xmm0 -; AVX2-NEXT: vmovaps 16(%rsi), %xmm1 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm2 -; AVX2-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX2-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX2-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX2-NEXT: vunpcklps 
{{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-NEXT: vmovaps %xmm0, (%rdx) -; AVX2-NEXT: vmovaps %xmm5, 16(%rdx) -; AVX2-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX2-NEXT: vmovaps %xmm7, 48(%rdx) -; AVX2-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX2-NEXT: vmovaps %xmm6, 112(%rdx) -; AVX2-NEXT: vmovaps %xmm2, 64(%rdx) -; AVX2-NEXT: vmovaps %xmm8, 80(%rdx) +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] +; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_i32_stride2_vf16: @@ -401,56 +391,39 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX2-LABEL: store_i32_stride2_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovaps 64(%rsi), %xmm1 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-NEXT: vmovaps 80(%rsi), %xmm3 -; AVX2-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX2-NEXT: vmovaps 48(%rsi), %xmm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-NEXT: vmovaps 48(%rdi), %xmm11 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX2-NEXT: vmovaps 112(%rsi), %xmm10 -; AVX2-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] -; AVX2-NEXT: vmovaps 96(%rsi), %xmm13 -; AVX2-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-NEXT: vmovaps %xmm13, 192(%rdx) -; AVX2-NEXT: vmovaps %xmm0, 208(%rdx) -; AVX2-NEXT: vmovaps %xmm10, 224(%rdx) -; AVX2-NEXT: vmovaps %xmm14, 
240(%rdx) -; AVX2-NEXT: vmovaps %xmm6, 64(%rdx) -; AVX2-NEXT: vmovaps %xmm11, 80(%rdx) -; AVX2-NEXT: vmovaps %xmm7, 96(%rdx) -; AVX2-NEXT: vmovaps %xmm8, 112(%rdx) -; AVX2-NEXT: vmovaps %xmm4, (%rdx) -; AVX2-NEXT: vmovaps %xmm9, 16(%rdx) -; AVX2-NEXT: vmovaps %xmm5, 32(%rdx) -; AVX2-NEXT: vmovaps %xmm12, 48(%rdx) -; AVX2-NEXT: vmovaps %xmm3, 160(%rdx) -; AVX2-NEXT: vmovaps %xmm2, 176(%rdx) -; AVX2-NEXT: vmovaps %xmm1, 128(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm0, 144(%rdx) +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-NEXT: vmovaps 64(%rsi), %ymm6 +; AVX2-NEXT: vmovaps 96(%rsi), %ymm7 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = 
ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[4],ymm7[4],ymm3[5],ymm7[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1] +; AVX2-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX2-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_i32_stride2_vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll index c05b96d..678d63f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll @@ -178,21 +178,34 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; SSE-NEXT: movdqa %xmm4, 16(%rdx) ; SSE-NEXT: retq ; -; AVX-LABEL: store_i8_stride2_vf32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 
48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm4, 16(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: store_i8_stride2_vf32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovdqa %xmm4, 16(%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_i8_stride2_vf32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = 
ymm0[0,1],ymm2[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: store_i8_stride2_vf32: ; AVX512: # %bb.0: -- 2.7.4