From 32162cf291d40b8ead01061ea68bcdbc79ba9573 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 15 May 2022 14:57:58 +0100
Subject: [PATCH] [X86] lowerV4I64Shuffle - try harder to lower to PERMQ(BLENDD(V1,V2)) pattern

---
 llvm/lib/Target/X86/X86ISelLowering.cpp          |   5 +
 .../X86/vector-interleaved-load-i64-stride-3.ll  | 168 ++++++++++-----------
 llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll   |  56 ++++---
 3 files changed, 126 insertions(+), 103 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f80886ac..1806e63 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17725,6 +17725,11 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  // Try to lower to PERMQ(BLENDD(V1,V2)).
+  if (SDValue V =
+          lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return V;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index f9452db..279f728 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -121,22 +121,22 @@ define void @load_i64_stride3_vf4(<12 x i64>* %in.vec, <4 x i64>* %out.vec0, <4
 ;
 ; AVX2-LABEL: load_i64_stride3_vf4:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovaps %ymm2, (%rsi)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -251,26 +251,26 @@ define void @load_i64_stride3_vf8(<24 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
 ;
 ; AVX2-LABEL: load_i64_stride3_vf8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 176(%rdi), %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-NEXT: vmovaps 128(%rdi), %ymm2
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm5
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm2
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm2
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3]
@@ -279,10 +279,10 @@ define void @load_i64_stride3_vf8(<24 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm5, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm4, 32(%rsi)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX2-NEXT: vmovaps %ymm5, (%rsi)
+; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
 ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
 ; AVX2-NEXT: vzeroupper
@@ -497,46 +497,46 @@ define void @load_i64_stride3_vf16(<48 x i64>* %in.vec, <16 x i64>* %out.vec0, <
 ;
 ; AVX2-LABEL: load_i64_stride3_vf16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3
-; AVX2-NEXT: vmovdqa 192(%rdi), %ymm6
-; AVX2-NEXT: vmovdqa 320(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8
-; AVX2-NEXT: vmovdqa (%rdi), %ymm9
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm10
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11
-; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm11[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vinserti128 $1, 352(%rdi), %ymm0, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,3,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 272(%rdi), %ymm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 176(%rdi), %ymm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm7[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpbroadcastq 368(%rdi), %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT: vmovaps 224(%rdi), %ymm3
+; AVX2-NEXT: vmovaps 192(%rdi), %ymm5
+; AVX2-NEXT: vmovaps 320(%rdi), %ymm6
+; AVX2-NEXT: vmovaps 288(%rdi), %ymm7
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm8
+; AVX2-NEXT: vmovaps (%rdi), %ymm9
+; AVX2-NEXT: vmovaps 128(%rdi), %ymm10
+; AVX2-NEXT: vmovaps 96(%rdi), %ymm11
+; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm5[0,3,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 272(%rdi), %ymm5
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm10
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm9
+; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vbroadcastsd 368(%rdi), %ymm7
+; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
 ; AVX2-NEXT: vmovaps 112(%rdi), %xmm7
 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
@@ -553,14 +553,14 @@ define void @load_i64_stride3_vf16(<48 x i64>* %in.vec, <16 x i64>* %out.vec0, <
 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm4, 64(%rsi)
-; AVX2-NEXT: vmovdqa %ymm15, 96(%rsi)
-; AVX2-NEXT: vmovdqa %ymm14, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm13, 32(%rsi)
-; AVX2-NEXT: vmovdqa %ymm5, 96(%rdx)
-; AVX2-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm6, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm3, 64(%rdx)
+; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
+; AVX2-NEXT: vmovaps %ymm15, 96(%rsi)
+; AVX2-NEXT: vmovaps %ymm14, (%rsi)
+; AVX2-NEXT: vmovaps %ymm13, 32(%rsi)
+; AVX2-NEXT: vmovaps %ymm6, 96(%rdx)
+; AVX2-NEXT: vmovaps %ymm8, (%rdx)
+; AVX2-NEXT: vmovaps %ymm5, 32(%rdx)
+; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
 ; AVX2-NEXT: vmovaps %ymm2, 64(%rcx)
 ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
 ; AVX2-NEXT: vmovaps %ymm0, (%rcx)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 5bbc6ae..80ba405 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1267,14 +1267,14 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_3254:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_v4i64_3254:
 ; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
 ; AVX512VL-SLOW-NEXT: retq
 ;
 ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3254:
@@ -1285,8 +1285,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_3254:
 ; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
 ; AVX512VL-FAST-PERLANE-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
 ret <4 x i64> %shuffle
@@ -1379,18 +1379,36 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <4 x i64> @shuffle_v4i64_2741(<4 x i64> %a, <4 x i64> %b) {
-; AVX1OR2-LABEL: shuffle_v4i64_2741:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[0,1]
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v4i64_2741:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX1-NEXT: retq
 ;
-; AVX512VL-LABEL: shuffle_v4i64_2741:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1]
-; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v4i64_2741:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_2741:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2741:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1]
+; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_2741:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-FAST-PERLANE-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 7, i32 4, i32 1>
 ret <4 x i64> %shuffle
 }
@@ -1767,8 +1785,8 @@ define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX2-LABEL: shuffle_v4i64_1234:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_1234:
-- 
2.7.4
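
Note (commentary appended after the patch, not part of it): the PERMQ(BLENDD(V1,V2)) pattern decomposes a two-input v4i64 shuffle into a per-lane blend followed by a single-input cross-lane permute, which is why shuffle_v4i64_3254 now lowers to vblendps+vpermpd instead of vperm2f128+vpermilps. The standalone C++ sketch below only models that decomposition on plain arrays under stated assumptions; it is not LLVM's lowerShuffleAsBlendAndPermute, the helper names (decompose, apply, BlendPermute) are hypothetical, and the legality check shown (no lane needed from both inputs, shuffle mask assumed non-negative) may differ from the real routine.

// Standalone sketch (not LLVM source): models the PERMQ(BLENDD(V1,V2)) idea for a
// 4 x i64 shuffle. Result element i is Mask[i] < 4 ? V1[Mask[i]] : V2[Mask[i] - 4].
// The decomposition exists when no lane is required from both inputs at once:
// blend each needed element into its home lane, then permute whole lanes.
#include <array>
#include <cstdint>
#include <cstdio>
#include <optional>

using V4 = std::array<uint64_t, 4>;

struct BlendPermute {
  std::array<bool, 4> TakeV2{}; // per-lane blend control (BLENDD-style)
  std::array<int, 4> Perm{};    // lane permutation (PERMQ-style)
};

// Hypothetical helper: returns the blend+permute decomposition if one exists.
std::optional<BlendPermute> decompose(const std::array<int, 4> &Mask) {
  BlendPermute BP;
  std::array<int, 4> LaneSrc{-1, -1, -1, -1}; // which input owns each lane so far
  for (int i = 0; i < 4; ++i) {
    int Lane = Mask[i] % 4;  // home lane of the requested element
    int Input = Mask[i] / 4; // 0 = V1, 1 = V2
    if (LaneSrc[Lane] >= 0 && LaneSrc[Lane] != Input)
      return std::nullopt; // same lane wanted from both inputs: one blend can't feed it
    LaneSrc[Lane] = Input;
    BP.TakeV2[Lane] = (Input == 1);
    BP.Perm[i] = Lane;
  }
  return BP;
}

V4 apply(const BlendPermute &BP, const V4 &V1, const V4 &V2) {
  V4 Blend, Out;
  for (int i = 0; i < 4; ++i)
    Blend[i] = BP.TakeV2[i] ? V2[i] : V1[i]; // BLENDD: pick per lane, no movement
  for (int i = 0; i < 4; ++i)
    Out[i] = Blend[BP.Perm[i]];              // PERMQ: arbitrary lane permutation
  return Out;
}

int main() {
  // shuffle_v4i64_3254 from the test: result should be <a3, a2, b1, b0>.
  V4 A{10, 11, 12, 13}, B{20, 21, 22, 23};
  if (auto BP = decompose({3, 2, 5, 4})) {
    V4 R = apply(*BP, A, B);
    std::printf("%llu %llu %llu %llu\n", (unsigned long long)R[0],
                (unsigned long long)R[1], (unsigned long long)R[2],
                (unsigned long long)R[3]); // prints: 13 12 21 20
  }
  return 0;
}

For mask <3,2,5,4> the sketch picks the same blend (take V2 in lanes 0-1, V1 in lanes 2-3) and lane permute [3,2,1,0] that appear in the updated AVX2 CHECK lines, so the two instructions replace the previous vperm2f128+vpermilps pair.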