From: Michael Zuckerman
Date: Mon, 7 Aug 2017 13:22:39 +0000 (+0000)
Subject: [X86][LLVM]Expanding Supports lowerInterleavedStore() in X86InterleavedAccess (VF16...
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=680ac10aa7bc87c52bbd3e110d3cd227b0821044;p=platform%2Fupstream%2Fllvm.git

[X86][LLVM] Expand support for lowerInterleavedStore() in X86InterleavedAccess (VF16 stride 4).

This patch expands the support of lowerInterleavedStore to the 16x8i, stride-4 case. LLVM
currently creates suboptimal shuffle code-gen for this pattern on AVX2. Overall, this patch is a
specific fix for the pattern (stride=4, VF=16); we plan to support more patterns in the future.

The goal of the patch is to optimize the following sequence. At the end of the computation, we
have ymm2, ymm0, ymm12 and ymm3, each holding 16 chars:

c0, c1, ..., c15
m0, m1, ..., m15
y0, y1, ..., y15
k0, k1, ..., k15

These need to be transposed/interleaved and stored like so:

c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3 ....

Differential Revision: https://reviews.llvm.org/D35829

llvm-svn: 310252
---

diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 6649308..0ebda33 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -69,8 +69,10 @@ class X86InterleavedAccessGroup {
   /// Out-V3 = P4, q4, r4, s4
   void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                      SmallVectorImpl<Value *> &TransposedMatrix);
-  void interleave8bit_32x4(ArrayRef<Instruction *> InputVectors,
-                           SmallVectorImpl<Value *> &TransposedMatrix);
+  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
+                             SmallVectorImpl<Value *> &TransposedMatrix,
+                             unsigned NumSubVecElems);
+
 public:
   /// In order to form an interleaved access group X86InterleavedAccessGroup
   /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
@@ -101,13 +103,14 @@ bool X86InterleavedAccessGroup::isSupported() const {
   Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
   unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
   unsigned SupportedNumElem = 4;
-  if (ShuffleElemSize == 8)
-    SupportedNumElem = 32;
   unsigned WideInstSize;
-  // Currently, lowering is supported for the following vectors:
-  // 1. 4-element vectors of 64 bits on AVX.
-  // 2. 32-element vectors of 8 bits on AVX.
+  // Currently, lowering is supported for the following vectors with stride 4:
+  // 1. Store and load of 4-element vectors of 64 bits on AVX.
+  // 2. Store of 16/32-element vectors of 8 bits on AVX.
+  if (!Subtarget.hasAVX() || Factor != 4)
+    return false;
+
   if (isa<LoadInst>(Inst)) {
     if (DL.getTypeSizeInBits(ShuffleVecTy) !=
         SupportedNumElem * ShuffleElemSize)
@@ -117,11 +120,13 @@ bool X86InterleavedAccessGroup::isSupported() const {
   } else
     WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
-  if (DL.getTypeSizeInBits(ShuffleEltTy) == 8 && !isa<StoreInst>(Inst))
-    return false;
+  // We support shuffle represents stride 4 for byte type with size of
+  // WideInstSize.
+  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) &&
+      (WideInstSize == 512 || WideInstSize == 1024))
+    return true;
-  if (!Subtarget.hasAVX() || Factor != 4 ||
-      (ShuffleElemSize != 64 && ShuffleElemSize != 8) ||
+  if (ShuffleElemSize != 64 ||
       WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem))
     return false;
@@ -192,9 +197,9 @@ static void createConcatShuffleMask(int NumElements,
     Mask.push_back(i + Offset + NumElements);
 }
-void X86InterleavedAccessGroup::interleave8bit_32x4(
-    ArrayRef<Instruction *> Matrix,
-    SmallVectorImpl<Value *> &TransposedMatrix) {
+void X86InterleavedAccessGroup::interleave8bitStride4(
+    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
+    unsigned numberOfElement) {
   // Example: Assuming we start from the following vectors:
   // Matrix[0]= c0 c1 c2 c3 c4 ... c31
@@ -202,6 +207,13 @@ void X86InterleavedAccessGroup::interleave8bit_32x4(
   // Matrix[2]= y0 y1 y2 y3 y4 ... y31
   // Matrix[3]= k0 k1 k2 k3 k4 ... k31
+  Type *VecTyepVt = VectorType::get(Type::getInt8Ty(Shuffles[0]->getContext()),
+                                    numberOfElement);
+  Type *VecTyepVtHalf = VectorType::get(
+      Type::getInt16Ty(Shuffles[0]->getContext()), numberOfElement / 2);
+  MVT VT = MVT::getVT(VecTyepVt);
+  MVT HalfVT = MVT::getVT(VecTyepVtHalf);
+
   TransposedMatrix.resize(4);
   SmallVector MaskHighTemp;
@@ -216,8 +228,8 @@ void X86InterleavedAccessGroup::interleave8bit_32x4(
   // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
   // shuffle pattern.
-  createUnpackShuffleMask(MVT::v32i8, MaskHighTemp, false, false);
-  createUnpackShuffleMask(MVT::v32i8, MaskLowTemp, true, false);
+  createUnpackShuffleMask(VT, MaskHighTemp, false, false);
+  createUnpackShuffleMask(VT, MaskLowTemp, true, false);
   ArrayRef MaskHigh = makeArrayRef(MaskHighTemp);
   ArrayRef MaskLow = makeArrayRef(MaskLowTemp);
@@ -232,8 +244,8 @@ void X86InterleavedAccessGroup::interleave8bit_32x4(
   // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
   // shuffle pattern.
-  createUnpackShuffleMask(MVT::v16i16, MaskLowTemp1, true, false);
-  createUnpackShuffleMask(MVT::v16i16, MaskHighTemp1, false, false);
+  createUnpackShuffleMask(HalfVT, MaskLowTemp1, true, false);
+  createUnpackShuffleMask(HalfVT, MaskHighTemp1, false, false);
   scaleShuffleMask(2, makeArrayRef(MaskHighTemp1), MaskHighTemp2);
   scaleShuffleMask(2, makeArrayRef(MaskLowTemp1), MaskLowTemp2);
   ArrayRef MaskHighWord = makeArrayRef(MaskHighTemp2);
@@ -267,6 +279,13 @@ void X86InterleavedAccessGroup::interleave8bit_32x4(
   Value *Low1 = Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskLowWord);
+  if (VT == MVT::v16i8) {
+    TransposedMatrix[0] = Low;
+    TransposedMatrix[1] = High;
+    TransposedMatrix[2] = Low1;
+    TransposedMatrix[3] = High1;
+    return;
+  }
   // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk4 cmyk5 cmyk6 cmyk7
   // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
   // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
@@ -349,8 +368,9 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
   case 4:
     transpose_4x4(DecomposedVectors, TransposedVectors);
     break;
+  case 16:
   case 32:
-    interleave8bit_32x4(DecomposedVectors, TransposedVectors);
+    interleave8bitStride4(DecomposedVectors, TransposedVectors, NumSubVecElems);
     break;
   default:
     return false;
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 4bef4e2..6a3fe0c 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -375,75 +375,51 @@ ret void
 define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) {
 ; AVX1-LABEL: interleaved_store_vf16_i8_stride4:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovaps %ymm4, (%rdi)
+; AVX1-NEXT: vmovaps %ymm1, (%rdi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
 ; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %XMM2 %XMM2 %YMM2
-; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 =
-; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u,u]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,u,u,255,0,u,u,255,0,u,u,255,0,u,u,0,255,u,u,0,255,u,u,0,255,u,u,0,255,u,u>
-; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7],ymm4[8],ymm2[9],ymm4[10],ymm2[11],ymm4[12],ymm2[13],ymm4[14],ymm2[15]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u,u,u]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
-; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
 ; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM2 %XMM2 %YMM2
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 =
-; AVX512-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u,u]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u,u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,u,u,255,0,u,u,255,0,u,u,255,0,u,u,0,255,u,u,0,255,u,u,0,255,u,u,0,255,u,u>
-; AVX512-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7],ymm4[8],ymm2[9],ymm4[10],ymm2[11],ymm4[12],ymm2[13],ymm4[14],ymm2[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u]
-; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,20,u,u,u,21,u,u,u,22,u,u,u,23,u,u,u]
-; AVX512-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT: vmovdqa32 %zmm0, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
index 91f5ec5..3a94d8f 100644
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
@@ -38,8 +38,22 @@ define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16
 ; CHECK-LABEL: @interleaved_store_vf16_i8_stride4(
 ; CHECK-NEXT: [[V1:%.*]] = shufflevector <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]], <32 x i32>
 ; CHECK-NEXT: [[V2:%.*]] = shufflevector <16 x i8> [[X3:%.*]], <16 x i8> [[X4:%.*]], <32 x i32>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <64 x i32>
-; CHECK-NEXT: store <64 x i8> [[INTERLEAVED_VEC]], <64 x i8>* [[P:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[V1]], <32 x i8> [[V2]], <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP9]], <32 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP10]], <32 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP14]], <64 x i32>
+; CHECK-NEXT: store <64 x i8> [[TMP15]], <64 x i8>* [[P:%.*]]
 ; CHECK-NEXT: ret void
 ;
   %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32>