From e9f9467da063875bd684e46660e2ff36ba4f55e2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 23 Apr 2023 11:48:50 +0100
Subject: [PATCH] [X86] X86FixupInstTunings - add VPERMILPDri -> VSHUFPDrri
 mapping

Similar to the original VPERMILPSri -> VSHUFPSrri mapping added in D143787,
replacing VPERMILPDri -> VSHUFPDrri should never be any slower and saves an
encoding byte.

The sibling VPERMILPDmi -> VPSHUFDmi mapping is trickier as we need the same
shuffle mask in every lane (and it needs to be adjusted) - I haven't attempted
that yet but we can investigate it in the future if there's interest.

Fixes #61060

Differential Revision: https://reviews.llvm.org/D148999
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp         |  38 +++
 llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll  |   4 +-
 .../test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll |   8 +-
 llvm/test/CodeGen/X86/avx-intrinsics-x86.ll        |   4 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll            |   4 +-
 llvm/test/CodeGen/X86/avx512-cvt.ll                |   6 +-
 llvm/test/CodeGen/X86/avx512-hadd-hsub.ll          |  16 +-
 .../CodeGen/X86/avx512-intrinsics-fast-isel.ll     |  76 ++---
 llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll |  10 +-
 .../CodeGen/X86/avx512-shuffles/in_lane_permute.ll |  50 +--
 llvm/test/CodeGen/X86/avx512fp16-mov.ll            |   6 +-
 llvm/test/CodeGen/X86/avx512fp16-mscatter.ll       |   4 +-
 .../CodeGen/X86/avx512vl-intrinsics-upgrade.ll     |  20 +-
 llvm/test/CodeGen/X86/combine-and.ll               |   4 +-
 llvm/test/CodeGen/X86/complex-fastmath.ll          |  12 +-
 .../X86/copy-low-subvec-elt-to-high-subvec-elt.ll  |   4 +-
 llvm/test/CodeGen/X86/extract-concat.ll            |   2 +-
 llvm/test/CodeGen/X86/fmaddsub-combine.ll          |  52 ++--
 llvm/test/CodeGen/X86/fmf-reduction.ll             |   2 +-
 llvm/test/CodeGen/X86/haddsub-2.ll                 |   8 +-
 llvm/test/CodeGen/X86/haddsub-3.ll                 |   6 +-
 llvm/test/CodeGen/X86/haddsub-broadcast.ll         |   2 +-
 llvm/test/CodeGen/X86/haddsub-shuf.ll              |  16 +-
 llvm/test/CodeGen/X86/haddsub-undef.ll             |  24 +-
 llvm/test/CodeGen/X86/haddsub.ll                   |  86 +++---
 llvm/test/CodeGen/X86/half.ll                      |   4 +-
 llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll    |  20 +-
 llvm/test/CodeGen/X86/horizontal-sum.ll            |  44 +--
 llvm/test/CodeGen/X86/known-signbits-vector.ll     |   4 +-
 llvm/test/CodeGen/X86/load-partial-dot-product.ll  |   6 +-
 llvm/test/CodeGen/X86/matrix-multiply.ll           |  42 +--
 llvm/test/CodeGen/X86/oddshuffles.ll               |  12 +-
 llvm/test/CodeGen/X86/pr40730.ll                   |   2 +-
 llvm/test/CodeGen/X86/scalar-int-to-fp.ll          |   2 +-
 llvm/test/CodeGen/X86/scalarize-fp.ll              |   4 +-
 .../test/CodeGen/X86/shuffle-of-splat-multiuses.ll |   2 +-
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll       |   4 +-
 llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll |  24 +-
 llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll         |  16 +-
 .../CodeGen/X86/tuning-shuffle-permilpd-avx512.ll  |  18 +-
 llvm/test/CodeGen/X86/tuning-shuffle-permilpd.ll   |   4 +-
 llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll    |  26 +-
 llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll    |  54 ++--
 llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll    |  36 +--
 llvm/test/CodeGen/X86/vec_fp_to_int.ll             |  76 ++---
 llvm/test/CodeGen/X86/vector-half-conversions.ll   |  52 ++--
 llvm/test/CodeGen/X86/vector-interleave.ll         |   4 +-
 .../X86/vector-interleaved-load-i32-stride-5.ll    |  32 +-
 .../X86/vector-interleaved-store-i32-stride-3.ll   |  60 ++--
 .../X86/vector-interleaved-store-i32-stride-4.ll   |   2 +-
 .../X86/vector-interleaved-store-i32-stride-5.ll   |  20 +-
 .../X86/vector-interleaved-store-i64-stride-3.ll   |  94 +++---
 .../X86/vector-interleaved-store-i64-stride-7.ll   |  12 +-
 llvm/test/CodeGen/X86/vector-narrow-binop.ll       |   6 +-
 llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll   | 132 ++++----
 llvm/test/CodeGen/X86/vector-reduce-fadd.ll        | 344 ++++++++-----------
 .../CodeGen/X86/vector-reduce-fmax-fmin-fast.ll    |  28 +-
 llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll   |  32 +-
 llvm/test/CodeGen/X86/vector-reduce-fmax.ll        |  84 ++---
 llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll   |  32 +-
 llvm/test/CodeGen/X86/vector-reduce-fmin.ll        |  84 ++---
 llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll   |  84 ++---
 llvm/test/CodeGen/X86/vector-reduce-fmul.ll        | 264 ++++++++--------
 llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll     |  10 +-
 llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll     |  58 ++--
 llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll     |   4 +-
 llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll    |   2 +-
 llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll     |  26 +-
 .../CodeGen/X86/vector-shuffle-combining-avx.ll    |  10 +-
 .../CodeGen/X86/vector-shuffle-combining-xop.ll    |   4 +-
 llvm/test/CodeGen/X86/vector-shuffle-combining.ll  |   4 +-
 llvm/test/CodeGen/X86/x86-interleaved-access.ll    |   8 +-
 72 files changed, 1197 insertions(+), 1159 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index ba143a7..8ffd971 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -126,6 +126,21 @@ bool X86FixupInstTuningPass::processInstruction(
     return ReplaceInTie;
   };
 
+  // `vpermilpd r, i` -> `vshufpd r, r, i`
+  // `vpermilpd r, i, k` -> `vshufpd r, r, i, k`
+  // `vshufpd` is always as fast or faster than `vpermilpd` and takes
+  // 1 less byte of code size for VEX and EVEX encoding.
+  auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
+    if (!NewOpcPreferable(NewOpc))
+      return false;
+    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+    MI.removeOperand(NumOperands - 1);
+    MI.addOperand(MI.getOperand(NumOperands - 2));
+    MI.setDesc(TII->get(NewOpc));
+    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    return true;
+  };
+
   // `vpermilps r, i` -> `vshufps r, r, i`
   // `vpermilps r, i, k` -> `vshufps r, r, i, k`
   // `vshufps` is always as fast or faster than `vpermilps` and takes
@@ -210,6 +225,29 @@ bool X86FixupInstTuningPass::processInstruction(
   };
 
   switch (Opc) {
+  case X86::VPERMILPDri:
+    return ProcessVPERMILPDri(X86::VSHUFPDrri);
+  case X86::VPERMILPDYri:
+    return ProcessVPERMILPDri(X86::VSHUFPDYrri);
+  case X86::VPERMILPDZ128ri:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ128rri);
+  case X86::VPERMILPDZ256ri:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ256rri);
+  case X86::VPERMILPDZri:
+    return ProcessVPERMILPDri(X86::VSHUFPDZrri);
+  case X86::VPERMILPDZ128rikz:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz);
+  case X86::VPERMILPDZ256rikz:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz);
+  case X86::VPERMILPDZrikz:
+    return ProcessVPERMILPDri(X86::VSHUFPDZrrikz);
+  case X86::VPERMILPDZ128rik:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik);
+  case X86::VPERMILPDZ256rik:
+    return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik);
+  case X86::VPERMILPDZrik:
+    return ProcessVPERMILPDri(X86::VSHUFPDZrrik);
+
   case X86::VPERMILPSri:
     return ProcessVPERMILPSri(X86::VSHUFPSrri);
   case X86::VPERMILPSYri:
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index c054691..b7dea5f 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1257,7 +1257,7 @@ define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind
 define <2 x double> @test_mm_permute_pd(<2 x double> %a0)
nounwind { ; CHECK-LABEL: test_mm_permute_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> ret <2 x double> %res @@ -1266,7 +1266,7 @@ define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind { define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind { ; CHECK-LABEL: test_mm256_permute_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> ret <4 x double> %res diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index 9d4feaa..966662f 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -843,13 +843,13 @@ declare void @llvm.x86.avx.storeu.ps.256(ptr, <8 x float>) nounwind define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) { ; AVX-LABEL: test_x86_avx_vpermil_pd: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; AVX-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; AVX-NEXT: # xmm0 = xmm0[1,0] ; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vpermil_pd: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; AVX512VL-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; AVX512VL-NEXT: # xmm0 = xmm0[1,0] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1] @@ -861,13 +861,13 @@ declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnon define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) { ; AVX-LABEL: test_x86_avx_vpermil_pd_256: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x07] +; AVX-NEXT: vshufpd $7, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xc6,0xc0,0x07] ; AVX-NEXT: # ymm0 = ymm0[1,1,3,2] ; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vpermil_pd_256: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilpd $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x07] +; AVX512VL-NEXT: vshufpd $7, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xc0,0x07] ; AVX512VL-NEXT: # ymm0 = ymm0[1,1,3,2] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll index 80ca682..f0e8987 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -656,13 +656,13 @@ declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) no define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) { ; AVX-LABEL: test_x86_avx_vpermilvar_pd_256_2: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd $9, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09] +; AVX-NEXT: vshufpd $9, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xc6,0xc0,0x09] ; AVX-NEXT: # ymm0 = 
ymm0[1,0,2,3] ; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilpd $9, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09] +; AVX512VL-NEXT: vshufpd $9, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xc0,0x09] ; AVX512VL-NEXT: # ymm0 = ymm0[1,0,2,3] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) ; <<4 x double>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index b275820..f17cbc3 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -882,7 +882,7 @@ define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonl ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; X86-NEXT: vmulpd (%eax), %xmm0, %xmm1 ; X86-NEXT: vmulsd 16(%eax), %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) @@ -895,7 +895,7 @@ define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonl ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; X64-NEXT: vmulpd (%rsi), %xmm0, %xmm1 ; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 4f66f37..ff7f7b3 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -180,13 +180,13 @@ define <4 x i64> @f64to4sl(<4 x double> %a) { ; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1 ; NODQ-NEXT: vcvttsd2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm2 -; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; NODQ-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; NODQ-NEXT: vcvttsd2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm1 ; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; NODQ-NEXT: vcvttsd2si %xmm0, %rax ; NODQ-NEXT: vmovq %rax, %xmm2 -; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; NODQ-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; NODQ-NEXT: vcvttsd2si %xmm0, %rax ; NODQ-NEXT: vmovq %rax, %xmm0 ; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -214,7 +214,7 @@ define <4 x i64> @f32to4sl(<4 x float> %a) { ; NODQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; NODQ-NEXT: vcvttss2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm1 -; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; NODQ-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; NODQ-NEXT: vcvttss2si %xmm2, %rax ; NODQ-NEXT: vmovq %rax, %xmm2 ; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll index c12c1f5..ae37774 100644 --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -59,7 +59,7 @@ define i32 @hsub_16(<16 x i32> %x225) { define float @fhadd_16(<16 x float> %x225) { ; KNL-LABEL: fhadd_16: ; KNL: # %bb.0: -; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; KNL-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -67,7 +67,7 @@ define float 
@fhadd_16(<16 x float> %x225) { ; ; SKX-LABEL: fhadd_16: ; SKX: # %bb.0: -; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -84,7 +84,7 @@ define float @fhadd_16(<16 x float> %x225) { define float @fhsub_16(<16 x float> %x225) { ; KNL-LABEL: fhsub_16: ; KNL: # %bb.0: -; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; KNL-NEXT: vsubss %xmm1, %xmm0, %xmm0 @@ -92,7 +92,7 @@ define float @fhsub_16(<16 x float> %x225) { ; ; SKX-LABEL: fhsub_16: ; SKX: # %bb.0: -; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 @@ -192,13 +192,13 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) { define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_eel: ; KNL: # %bb.0: -; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_eel: ; SKX: # %bb.0: -; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -215,14 +215,14 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fsub_noundef_ee: ; KNL: # %bb.0: ; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm0 -; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; KNL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fsub_noundef_ee: ; SKX: # %bb.0: ; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0 -; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 2313355..f4c6532 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -940,7 +940,7 @@ define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) { define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) { ; CHECK-LABEL: test_mm512_permute_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> ret <8 x double> %res @@ -951,13 +951,13 @@ define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x d ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6] +; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6] ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_mask_permute_pd: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6] +; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6] ; X64-NEXT: retq %arg1 = bitcast i8 %a1 to <8 x i1> %res0 = shufflevector 
<8 x double> %a2, <8 x double> undef, <8 x i32> @@ -970,13 +970,13 @@ define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6] +; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6] ; X86-NEXT: retl ; ; X64-LABEL: test_mm512_maskz_permute_pd: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6] +; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6] ; X64-NEXT: retq %arg0 = bitcast i8 %a0 to <8 x i1> %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> @@ -1841,7 +1841,7 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) { ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; X86-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1 ; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-NEXT: retl @@ -7317,7 +7317,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) { ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -7333,7 +7333,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) { ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7364,7 +7364,7 @@ define double @test_mm512_reduce_mul_pd(<8 x double> %__W) { ; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -7380,7 +7380,7 @@ define double @test_mm512_reduce_mul_pd(<8 x double> %__W) { ; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7406,7 +7406,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) { ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -7423,7 +7423,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) { ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup 
{{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -7456,7 +7456,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) { ; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -7473,7 +7473,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) { ; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -7514,7 +7514,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -7532,7 +7532,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7569,7 +7569,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) ; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -7588,7 +7588,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) ; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7619,7 +7619,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -7638,7 +7638,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -7677,7 +7677,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) ; 
X86-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -7697,7 +7697,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) ; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -7815,7 +7815,7 @@ define double @test_mm512_reduce_max_pd(<8 x double> %__W) { ; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -7831,7 +7831,7 @@ define double @test_mm512_reduce_max_pd(<8 x double> %__W) { ; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7940,7 +7940,7 @@ define double @test_mm512_reduce_min_pd(<8 x double> %__W) { ; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -7956,7 +7956,7 @@ define double @test_mm512_reduce_min_pd(<8 x double> %__W) { ; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -8085,7 +8085,7 @@ define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) ; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -8104,7 +8104,7 @@ define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) ; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -8237,7 +8237,7 @@ define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) ; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: 
fldl (%esp) @@ -8256,7 +8256,7 @@ define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) ; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -8360,7 +8360,7 @@ define float @test_mm512_reduce_max_ps(<16 x float> %__W) { ; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -8377,7 +8377,7 @@ define float @test_mm512_reduce_max_ps(<16 x float> %__W) { ; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -8486,7 +8486,7 @@ define float @test_mm512_reduce_min_ps(<16 x float> %__W) { ; X86-NEXT: vminps %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -8503,7 +8503,7 @@ define float @test_mm512_reduce_min_ps(<16 x float> %__W) { ; X64-NEXT: vminps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -8664,7 +8664,7 @@ define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -8684,7 +8684,7 @@ define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) ; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -8849,7 +8849,7 @@ define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: vminps %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -8869,7 +8869,7 @@ define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, 
<16 x float> %__W) ; X64-NEXT: vminps %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vminss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 8989e2e..5a16760 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -970,7 +970,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 define <8 x double>@test_int_x86_avx512_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermilpd $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xc0,0x16] +; CHECK-NEXT: vshufpd $22, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xc0,0x16] ; CHECK-NEXT: ## zmm0 = zmm0[0,1,3,2,5,4,6,6] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1) @@ -982,7 +982,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 ; X86: ## %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16] +; X86-NEXT: vshufpd $22, %zmm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xc8,0x16] ; X86-NEXT: ## zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6] ; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] @@ -990,7 +990,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16] +; X64-NEXT: vshufpd $22, %zmm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xc8,0x16] ; X64-NEXT: ## zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6] ; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] @@ -1003,14 +1003,14 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i ; X86: ## %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] +; X86-NEXT: vshufpd $22, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc0,0x16] ; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] +; X64-NEXT: vshufpd $22, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc0,0x16] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> 
@llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3) diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll index 56af39c..c876d99 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll @@ -1242,7 +1242,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_imm_mem_mask7(ptr %vp, <16 x fl define <2 x double> @test_2xdouble_perm_mask0(<2 x double> %vec) { ; CHECK-LABEL: test_2xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: retq %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> ret <2 x double> %res @@ -1252,7 +1252,7 @@ define <2 x double> @test_masked_2xdouble_perm_mask0(<2 x double> %vec, <2 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 {%k1} = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1,0] ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: retq %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> @@ -1266,7 +1266,7 @@ define <2 x double> @test_masked_z_2xdouble_perm_mask0(<2 x double> %vec, <2 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0] ; CHECK-NEXT: retq %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer @@ -1278,7 +1278,7 @@ define <2 x double> @test_masked_2xdouble_perm_mask1(<2 x double> %vec, <2 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 {%k1} = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1,0] ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: retq %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> @@ -1292,7 +1292,7 @@ define <2 x double> @test_masked_z_2xdouble_perm_mask1(<2 x double> %vec, <2 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0] ; CHECK-NEXT: retq %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer @@ -1367,7 +1367,7 @@ define <2 x double> @test_masked_z_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) { ; CHECK-LABEL: test_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,3] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res @@ -1377,7 +1377,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,0,2,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1,0,2,3] ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> @@ -1391,7 +1391,7 @@ 
define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,2,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -1403,7 +1403,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2] ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> @@ -1417,7 +1417,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -1429,7 +1429,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,3,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,3,3] ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> @@ -1443,7 +1443,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,3,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,3,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -1453,7 +1453,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x d define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) { ; CHECK-LABEL: test_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res @@ -1463,7 +1463,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2] ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> @@ -1477,7 +1477,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2] ; CHECK-NEXT: retq %shuf = 
shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -1617,7 +1617,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,2,4,5,7,6] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,0,3,2,4,5,7,6] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res @@ -1627,7 +1627,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,3,2,4,5,7,6] +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,3,2,4,5,7,6] ; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -1641,7 +1641,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,3,2,4,5,7,6] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,3,2,4,5,7,6] ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = fcmp oeq <8 x double> %mask, zeroinitializer @@ -1653,7 +1653,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask1(<8 x double> %vec, <8 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,7,6] +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,7,6] ; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -1667,7 +1667,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask1(<8 x double> %vec, <8 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,7,6] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,7,6] ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = fcmp oeq <8 x double> %mask, zeroinitializer @@ -1679,7 +1679,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,3,5,5,6,7] +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,3,5,5,6,7] ; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -1693,7 +1693,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,3,5,5,6,7] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,3,5,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = fcmp oeq <8 x double> %mask, zeroinitializer @@ -1703,7 +1703,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x d define <8 x 
double> @test_8xdouble_perm_mask3(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,7] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,7] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res @@ -1713,7 +1713,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask3(<8 x double> %vec, <8 x dou ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,2,4,4,6,7] +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,2,4,4,6,7] ; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -1727,7 +1727,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask3(<8 x double> %vec, <8 x d ; CHECK: # %bb.0: ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,7] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = fcmp oeq <8 x double> %mask, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll index 1fd6f64..7aebc1b 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1316,7 +1316,7 @@ define half @extract_f16_3(<8 x half> %x) { define half @extract_f16_4(<8 x half> %x) { ; CHECK-LABEL: extract_f16_4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 4 ret half %res @@ -1570,14 +1570,14 @@ define void @extract_store_f16_3(<8 x half> %x, ptr %y) { define void @extract_store_f16_4(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_4: ; X64: # %bb.0: -; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; X64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_4: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; X86-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 4 diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll index 08208ae..fd72126 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll @@ -24,7 +24,7 @@ define void @test_mscatter_v16f16(ptr %base, <16 x i32> %index, <16 x half> %val ; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3 ; CHECK-NEXT: vpextrq $1, %xmm4, %rax ; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; CHECK-NEXT: vmovq %xmm4, %rax ; CHECK-NEXT: vmovsh %xmm3, (%rax) @@ -51,7 +51,7 @@ define void @test_mscatter_v16f16(ptr %base, <16 x i32> %index, <16 x half> %val ; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm1 ; CHECK-NEXT: vpextrq $1, %xmm3, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) -; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3 ; CHECK-NEXT: vmovq %xmm3, %rax ; CHECK-NEXT: vmovsh %xmm1, 
(%rax) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index ff217a5..ee6869b 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -787,7 +787,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 define <4 x double>@test_int_x86_avx512_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x06] +; CHECK-NEXT: vshufpd $6, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xc0,0x06] ; CHECK-NEXT: # ymm0 = ymm0[0,1,3,2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1) @@ -799,7 +799,7 @@ define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06] +; X86-NEXT: vshufpd $6, %ymm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xc8,0x06] ; X86-NEXT: # ymm1 {%k1} = ymm0[0,1,3,2] ; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] @@ -807,7 +807,7 @@ define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06] +; X64-NEXT: vshufpd $6, %ymm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xc8,0x06] ; X64-NEXT: # ymm1 {%k1} = ymm0[0,1,3,2] ; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] @@ -820,14 +820,14 @@ define <4 x double>@test_int_x86_avx512_maskz_vpermil_pd_256(<4 x double> %x0, i ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] +; X86-NEXT: vshufpd $6, %ymm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xc6,0xc0,0x06] ; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] +; X64-NEXT: vshufpd $6, %ymm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xc6,0xc0,0x06] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3) @@ -839,7 +839,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 define <2 x double>@test_int_x86_avx512_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd $1, 
%xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; CHECK-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; CHECK-NEXT: # xmm0 = xmm0[1,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1) @@ -851,7 +851,7 @@ define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01] +; X86-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xc8,0x01] ; X86-NEXT: # xmm1 {%k1} = xmm0[1,0] ; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] @@ -859,7 +859,7 @@ define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01] +; X64-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xc8,0x01] ; X64-NEXT: # xmm1 {%k1} = xmm0[1,0] ; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] @@ -872,14 +872,14 @@ define <2 x double>@test_int_x86_avx512_maskz_vpermil_pd_128(<2 x double> %x0, i ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01] +; X86-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc0,0x01] ; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1,0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01] +; X64-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc0,0x01] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[1,0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3) diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll index da52c62..d821070 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -627,7 +627,7 @@ define <8 x i64> @neg_scalar_broadcast_v8i64(i64 %a0, <2 x i64> %a1) { ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] ; AVX1-NEXT: vmovq %rdi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 @@ -718,7 +718,7 @@ define <4 x i64> @neg_scalar_broadcast_v4i64(i64 %a0, <2 x i64> %a1) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] ; AVX1-NEXT: vandnpd %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/complex-fastmath.ll b/llvm/test/CodeGen/X86/complex-fastmath.ll index 8a5f7f3..29a37a1 100644 --- a/llvm/test/CodeGen/X86/complex-fastmath.ll +++ b/llvm/test/CodeGen/X86/complex-fastmath.ll @@ -70,7 +70,7 @@ define <2 x double> @complex_square_f64(<2 x double>) #0 { ; ; AVX1-LABEL: complex_square_f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vmulsd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vmulsd %xmm0, %xmm0, %xmm0 @@ -81,7 +81,7 @@ define <2 x double> @complex_square_f64(<2 x double>) #0 { ; ; FMA-LABEL: complex_square_f64: ; FMA: # %bb.0: -; FMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; FMA-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; FMA-NEXT: vaddsd %xmm0, %xmm0, %xmm2 ; FMA-NEXT: vmulsd %xmm2, %xmm1, %xmm2 ; FMA-NEXT: vmulsd %xmm1, %xmm1, %xmm1 @@ -176,8 +176,8 @@ define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 { ; ; AVX1-LABEL: complex_mul_f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX1-NEXT: vmulsd %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vmulsd %xmm2, %xmm1, %xmm5 ; AVX1-NEXT: vaddsd %xmm5, %xmm4, %xmm4 @@ -189,8 +189,8 @@ define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 { ; ; FMA-LABEL: complex_mul_f64: ; FMA: # %bb.0: -; FMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; FMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; FMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; FMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; FMA-NEXT: vmulsd %xmm2, %xmm1, %xmm4 ; FMA-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm3 * xmm0) + xmm4 ; FMA-NEXT: vmulsd %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll index 6f05e75..edfcd94 100644 --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -123,7 +123,7 @@ define <16 x i8> @vec128_eltty_i8_source_subvec_0_target_subvec_mask_1_binary(<1 define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_1_unary(<4 x double> %x) nounwind { ; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_1_unary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,3] ; CHECK-NEXT: retq %r = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> ret <4 x double> %r @@ -199,7 +199,7 @@ define <4 x double> @vec256_eltty_double_source_subvec_1_target_subvec_mask_1_bi define <4 x double> @vec256_eltty_double_source_subvec_1_target_subvec_mask_2_unary(<4 x double> %x) nounwind { ; CHECK-LABEL: vec256_eltty_double_source_subvec_1_target_subvec_mask_2_unary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,1,2,2] ; CHECK-NEXT: retq %r = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> ret <4 x double> %r diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll index a156e82..8eb68d3 100644 --- 
a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -74,7 +74,7 @@ define <16 x i64> @catcat(<4 x i64> %x) { ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[1,1,3,3] ; AVX1-NEXT: vmovaps %ymm4, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fmaddsub-combine.ll b/llvm/test/CodeGen/X86/fmaddsub-combine.ll index b832bf1..5219ab3 100644 --- a/llvm/test/CodeGen/X86/fmaddsub-combine.ll +++ b/llvm/test/CodeGen/X86/fmaddsub-combine.ll @@ -482,8 +482,8 @@ define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; NOFMA-NEXT: vaddss %xmm2, %xmm0, %xmm1 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] ; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3 ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] @@ -531,8 +531,8 @@ define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0 ; NOFMA-NEXT: vaddsd %xmm2, %xmm0, %xmm1 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; NOFMA-NEXT: vsubsd %xmm2, %xmm0, %xmm0 ; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; NOFMA-NEXT: retq @@ -564,14 +564,14 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, ; NOFMA: # %bb.0: # %bb ; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; NOFMA-NEXT: vaddss %xmm2, %xmm0, %xmm1 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] ; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3 ; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm4 ; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm5 ; NOFMA-NEXT: vaddss %xmm5, %xmm4, %xmm6 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm8 = xmm5[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm7 = xmm4[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm8 = xmm5[1,0] ; NOFMA-NEXT: vaddss %xmm7, %xmm8, %xmm7 ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm8 = xmm0[1,1,3,3] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm9 = xmm2[1,1,3,3] @@ -648,12 +648,12 @@ define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> ; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm3 ; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm4 ; NOFMA-NEXT: vaddsd %xmm4, %xmm3, %xmm5 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; NOFMA-NEXT: vsubsd %xmm2, %xmm0, %xmm0 ; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm4[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm4[1,0] ; NOFMA-NEXT: vsubsd %xmm2, %xmm1, %xmm1 ; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; NOFMA-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 @@ -695,24 +695,24 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> ; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm2 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0] ; NOFMA-NEXT: vaddss %xmm6, %xmm3, %xmm3 ; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm6 ; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm7 ; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm8 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm9 = xmm6[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm10 = xmm7[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm9 = xmm6[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm10 = xmm7[1,0] ; NOFMA-NEXT: vaddss %xmm10, %xmm9, %xmm9 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0],xmm8[3] ; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm9 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm10 = xmm1[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm11 = xmm5[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm10 = xmm1[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm11 = xmm5[1,0] ; NOFMA-NEXT: vaddss %xmm11, %xmm10, %xmm10 ; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm11 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm12 = xmm11[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm12 = xmm11[1,0] ; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm13 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm14 = xmm13[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm14 = xmm13[1,0] ; NOFMA-NEXT: vaddss %xmm14, %xmm12, %xmm12 ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm14 = xmm0[1,1,3,3] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm15 = xmm4[1,1,3,3] @@ -846,16 +846,16 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> ; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm1 ; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm5 ; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm9 -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm4 = xmm4[1,0] ; NOFMA-NEXT: vsubsd %xmm4, %xmm0, %xmm0 ; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm6[1,0] ; NOFMA-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm7[0],xmm2[0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; NOFMA-NEXT: vshufpd {{.*#+}} xmm3 = xmm5[1,0] ; NOFMA-NEXT: vsubsd %xmm3, %xmm1, %xmm1 ; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/fmf-reduction.ll b/llvm/test/CodeGen/X86/fmf-reduction.ll index 1d669d2..b15e64f 100644 --- a/llvm/test/CodeGen/X86/fmf-reduction.ll +++ b/llvm/test/CodeGen/X86/fmf-reduction.ll @@ -10,7 +10,7 @@ define double @julia_dotf(<4 x double> %x, <4 x double> %y, <4 x double> %z, i1 ; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll 
b/llvm/test/CodeGen/X86/haddsub-2.ll index 2edabaa..1f64590e 100644 --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -906,14 +906,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; ; AVX-LABEL: not_a_hsub_2: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 @@ -954,9 +954,9 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) { ; ; AVX-LABEL: not_a_hsub_3: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll index 4805945..b3518bc 100644 --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -37,7 +37,7 @@ define float @pr26491(<4 x float> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: retq ; @@ -52,7 +52,7 @@ define float @pr26491(<4 x float> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> @@ -112,7 +112,7 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) { ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/haddsub-broadcast.ll b/llvm/test/CodeGen/X86/haddsub-broadcast.ll index bf0ac0ab..7b7fd7d 100644 --- a/llvm/test/CodeGen/X86/haddsub-broadcast.ll +++ b/llvm/test/CodeGen/X86/haddsub-broadcast.ll @@ -9,7 +9,7 @@ define <4 x double> @PR43402(i64 %x) { ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 
2298774..d8abe59 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -185,7 +185,7 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) { ; ; AVX1_SLOW-LABEL: hadd_v2f64: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1_SLOW-NEXT: retq @@ -197,7 +197,7 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) { ; ; AVX2_SLOW-LABEL: hadd_v2f64: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2_SLOW-NEXT: retq @@ -229,7 +229,7 @@ define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) { ; ; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1_SLOW-NEXT: retq @@ -241,7 +241,7 @@ define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) { ; ; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2_SLOW-NEXT: retq @@ -311,7 +311,7 @@ define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) { ; ; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -325,7 +325,7 @@ define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) { ; ; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2_SLOW-NEXT: retq @@ -393,7 +393,7 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) { ; ; AVX1_SLOW-LABEL: hsub_v2f64: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1_SLOW-NEXT: retq @@ -405,7 +405,7 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) { ; ; AVX2_SLOW-LABEL: hsub_v2f64: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2_SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index 2c828b3..58afc473 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -127,7 +127,7 @@ define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) { ; ; AVX-SLOW-LABEL: test5_undef: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = 
xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -207,7 +207,7 @@ define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) { ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] @@ -376,7 +376,7 @@ define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) { ; AVX512-SLOW: # %bb.0: ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] @@ -384,7 +384,7 @@ define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) { ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -451,7 +451,7 @@ define <2 x double> @add_pd_003_2(<2 x double> %x) { ; ; AVX-SLOW-LABEL: add_pd_003_2: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -473,7 +473,7 @@ define <2 x double> @add_pd_010(<2 x double> %x) { ; AVX-SLOW-LABEL: add_pd_010: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -688,7 +688,7 @@ define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) { ; AVX1-SLOW-LABEL: add_pd_011: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: retq ; @@ -975,7 +975,7 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { ; ; AVX-SLOW-LABEL: PR45747_2: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,1,1] ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq @@ -1147,7 +1147,7 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) { ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] ; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] @@ -1197,7 +1197,7 @@ 
define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) { ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] ; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1244,7 +1244,7 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) { ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1296,7 +1296,7 @@ define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) { ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll index f36e959..a077819 100644 --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -55,7 +55,7 @@ define <2 x double> @haddpd3(<2 x double> %x) { ; ; AVX-SLOW-LABEL: haddpd3: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -224,7 +224,7 @@ define <2 x double> @hsubpd2(<2 x double> %x) { ; ; AVX-SLOW-LABEL: hsubpd2: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -501,7 +501,7 @@ define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) { ; ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq @@ -562,7 +562,7 @@ define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) { ; ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq @@ -593,7 +593,7 @@ define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -622,7 +622,7 @@ define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} 
xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -682,7 +682,7 @@ define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) { ; ; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq @@ -728,7 +728,7 @@ define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) { ; ; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -753,7 +753,7 @@ define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -778,7 +778,7 @@ define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) { ; ; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <2 x double> %x, i32 0 @@ -836,7 +836,7 @@ define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) { ; ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper @@ -872,7 +872,7 @@ define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) { ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper @@ -938,7 +938,7 @@ define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) { ; ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper @@ -974,7 +974,7 @@ define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) { ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper @@ -1008,7 +1008,7 @@ define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; 
AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1041,7 +1041,7 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) { ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1073,7 +1073,7 @@ define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1106,7 +1106,7 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) { ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1171,7 +1171,7 @@ define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) { ; ; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper @@ -1260,7 +1260,7 @@ define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1289,7 +1289,7 @@ define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) { ; ; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1376,7 +1376,7 @@ define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1407,7 +1407,7 @@ define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1488,7 +1488,7 @@ define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) { ; ; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1515,7 +1515,7 @@ define double @extract_extract01_v8f64_fsub_f64_commute(<8 
x double> %x) { ; ; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1648,7 +1648,7 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1690,7 +1690,7 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) { ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper @@ -1792,7 +1792,7 @@ define float @hadd32_4(<4 x float> %x225) { ; ; AVX-SLOW-LABEL: hadd32_4: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -1800,7 +1800,7 @@ define float @hadd32_4(<4 x float> %x225) { ; ; AVX-FAST-LABEL: hadd32_4: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: retq @@ -1832,7 +1832,7 @@ define float @hadd32_8(<8 x float> %x225) { ; ; AVX-SLOW-LABEL: hadd32_8: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -1841,7 +1841,7 @@ define float @hadd32_8(<8 x float> %x225) { ; ; AVX-FAST-LABEL: hadd32_8: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper @@ -1874,7 +1874,7 @@ define float @hadd32_16(<16 x float> %x225) { ; ; AVX-SLOW-LABEL: hadd32_16: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -1883,7 +1883,7 @@ define float @hadd32_16(<16 x float> %x225) { ; ; AVX-FAST-LABEL: hadd32_16: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper @@ -1907,7 +1907,7 @@ define float @hadd32_4_optsize(<4 x float> %x225) optsize { ; ; AVX-LABEL: hadd32_4_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vhaddps %xmm0, 
%xmm0, %xmm0 ; AVX-NEXT: retq @@ -1930,7 +1930,7 @@ define float @hadd32_8_optsize(<8 x float> %x225) optsize { ; ; AVX-LABEL: hadd32_8_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -1954,7 +1954,7 @@ define float @hadd32_16_optsize(<16 x float> %x225) optsize { ; ; AVX-LABEL: hadd32_16_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -1978,7 +1978,7 @@ define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 { ; ; AVX-LABEL: hadd32_4_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -2001,7 +2001,7 @@ define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 { ; ; AVX-LABEL: hadd32_8_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -2025,7 +2025,7 @@ define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 { ; ; AVX-LABEL: hadd32_16_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -2058,7 +2058,7 @@ define float @partial_reduction_fadd_v8f32(<8 x float> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -2102,7 +2102,7 @@ define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -2111,7 +2111,7 @@ define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) { ; ; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper @@ -2144,7 +2144,7 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) { ; ; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 2761d3b..5f0fa26c 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1708,12 +1708,12 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, 
<8 x half> %1) #0 { ; BWON-F16C-NEXT: .LBB26_6: ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm2 ; BWON-F16C-NEXT: vmovd %xmm2, %edx -; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; BWON-F16C-NEXT: vpextrw $0, %xmm2, %esi ; BWON-F16C-NEXT: movzwl %si, %esi ; BWON-F16C-NEXT: vmovd %esi, %xmm2 ; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; BWON-F16C-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; BWON-F16C-NEXT: vpextrw $0, %xmm3, %esi ; BWON-F16C-NEXT: movzwl %si, %esi ; BWON-F16C-NEXT: vmovd %esi, %xmm3 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll index 16f3ef5..15cbd01 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -36,7 +36,7 @@ define float @PR37890_v4f32(<4 x float> %a) { ; ; AVX1-SLOW-LABEL: PR37890_v4f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -50,7 +50,7 @@ define float @PR37890_v4f32(<4 x float> %a) { ; ; AVX2-LABEL: PR37890_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -91,7 +91,7 @@ define double @PR37890_v4f64(<4 x double> %a) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -108,7 +108,7 @@ define double @PR37890_v4f64(<4 x double> %a) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -154,7 +154,7 @@ define float @PR37890_v8f32(<8 x float> %a) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -174,7 +174,7 @@ define float @PR37890_v8f32(<8 x float> %a) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -226,7 +226,7 @@ define double @PR37890_v8f64(<8 x double> %a) { ; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; 
AVX1-SLOW-NEXT: retq @@ -245,7 +245,7 @@ define double @PR37890_v8f64(<8 x double> %a) { ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -303,7 +303,7 @@ define float @PR37890_v16f32(<16 x float> %a) { ; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -325,7 +325,7 @@ define float @PR37890_v16f32(<16 x float> %a) { ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index ee4ecce..ab9a1e9 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -230,7 +230,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 @@ -251,7 +251,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2 @@ -274,7 +274,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 @@ -295,7 +295,7 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-FAST-NEXT: 
vhaddps %xmm0, %xmm2, %xmm2 @@ -588,7 +588,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4 ; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -608,7 +608,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4 ; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] @@ -891,27 +891,27 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -921,24 +921,24 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0] ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[2,3] ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1004,13 +1004,13 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float ; ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero @@ -1023,14 +1023,14 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 25f02ac..de71865 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -254,7 +254,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) ; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X86-NEXT: vcvtdq2pd %xmm0, %ymm0 @@ -265,7 +265,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) ; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; 
X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll index 4c98eae..1d7aac7 100644 --- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll +++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll @@ -51,7 +51,7 @@ define float @dot3_float4(ptr dereferenceable(16) %a0, ptr dereferenceable(16) % ; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -106,7 +106,7 @@ define float @dot3_float4_as_float3(ptr dereferenceable(16) %a0, ptr dereference ; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -180,7 +180,7 @@ define float @dot3_float3(ptr dereferenceable(16) %a0, ptr dereferenceable(16) % ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 8686bbf..f2a3094 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -37,7 +37,7 @@ define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind { ; ; AVX1-LABEL: test_mul2x2_f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-NEXT: vmovsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] ; AVX1-NEXT: vmulps %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] @@ -53,7 +53,7 @@ define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind { ; ; AVX2-LABEL: test_mul2x2_f32: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vbroadcastss %xmm1, %xmm3 ; AVX2-NEXT: vmulps %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] @@ -69,7 +69,7 @@ define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind { ; ; AVX512-LABEL: test_mul2x2_f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vbroadcastss %xmm1, %xmm3 ; AVX512-NEXT: vmulps %xmm3, %xmm0, %xmm3 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] @@ -122,13 +122,13 @@ define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwin ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0] ; AVX-NEXT: vmulpd %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,1] ; AVX-NEXT: vmulpd 
%xmm4, %xmm2, %xmm4 ; AVX-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] ; AVX-NEXT: vmulpd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,1] ; AVX-NEXT: vmulpd %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 @@ -348,13 +348,13 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] ; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4 ; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm3 = xmm5[1,0] ; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0] ; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9 ; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3] ; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6 @@ -378,7 +378,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] ; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3] -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] ; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 ; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2 @@ -407,13 +407,13 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6 ; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9 ; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm9 = xmm0[1,0] ; AVX512VL-NEXT: vmulss %xmm1, %xmm9, %xmm10 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3] ; AVX512VL-NEXT: vmulss %xmm5, %xmm4, %xmm5 @@ -437,7 +437,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm8[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 @@ -1363,7 +1363,7 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm4[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm9 = xmm4[1,1] +; AVX1-NEXT: 
vshufpd {{.*#+}} xmm9 = xmm4[1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 ; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 @@ -1371,13 +1371,13 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm4[0,0,2,2] ; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 ; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[1,1,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,3,3] ; AVX1-NEXT: vmulpd %ymm4, %ymm3, %ymm4 ; AVX1-NEXT: vaddpd %ymm4, %ymm8, %ymm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm5[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm9 = xmm5[1,1] +; AVX1-NEXT: vshufpd {{.*#+}} xmm9 = xmm5[1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 ; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 @@ -1385,13 +1385,13 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm5[0,0,2,2] ; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 ; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[1,1,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,1,3,3] ; AVX1-NEXT: vmulpd %ymm5, %ymm3, %ymm5 ; AVX1-NEXT: vaddpd %ymm5, %ymm8, %ymm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm6[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm9 = xmm6[1,1] +; AVX1-NEXT: vshufpd {{.*#+}} xmm9 = xmm6[1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 ; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 @@ -1399,13 +1399,13 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm6[0,0,2,2] ; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 ; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[1,1,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,1,3,3] ; AVX1-NEXT: vmulpd %ymm6, %ymm3, %ymm6 ; AVX1-NEXT: vaddpd %ymm6, %ymm8, %ymm6 ; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm7[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm8 = xmm7[1,1] +; AVX1-NEXT: vshufpd {{.*#+}} xmm8 = xmm7[1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-NEXT: vmulpd %ymm1, %ymm8, %ymm1 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 @@ -1413,7 +1413,7 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; AVX1-NEXT: vmovddup {{.*#+}} ymm7 = ymm1[0,0,2,2] ; AVX1-NEXT: vmulpd %ymm7, %ymm2, %ymm2 ; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,3,3] ; AVX1-NEXT: vmulpd %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm3 ; AVX1-NEXT: vmovapd %ymm4, %ymm0 diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index b3b94e8..49911af 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1720,7 +1720,7 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] ; 
AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1] ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2] @@ -1732,7 +1732,7 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) ; AVX1-NEXT: vmovups %ymm1, (%rdi) @@ -1840,7 +1840,7 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2] ; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,2,3] -; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,3,3] +; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,3] ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] ; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm2[1] ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] @@ -1849,7 +1849,7 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; XOP-NEXT: vbroadcastsd (%rcx), %ymm4 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; XOP-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; XOP-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; XOP-NEXT: vmovups %ymm0, 32(%rdi) ; XOP-NEXT: vmovups %ymm2, (%rdi) @@ -2421,7 +2421,7 @@ define void @D107009(ptr %input, ptr %output) { ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) @@ -2493,7 +2493,7 @@ define void @D107009(ptr %input, ptr %output) { ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2] +; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; XOP-NEXT: vmovdqa %xmm0, 16(%rsi) diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll index ecc6173..1d87a57 100644 --- a/llvm/test/CodeGen/X86/pr40730.ll +++ b/llvm/test/CodeGen/X86/pr40730.ll @@ -9,7 +9,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> diff --git 
a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll index 979b440..0757d30 100644 --- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll +++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll @@ -653,7 +653,7 @@ define double @u64_to_d(i64 %a) nounwind { ; AVX512F_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512F_32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512F_32-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; AVX512F_32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F_32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F_32-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512F_32-NEXT: vmovsd %xmm0, (%esp) ; AVX512F_32-NEXT: fldl (%esp) diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll index 4c4b62c..8379d20 100644 --- a/llvm/test/CodeGen/X86/scalarize-fp.ll +++ b/llvm/test/CodeGen/X86/scalarize-fp.ll @@ -434,7 +434,7 @@ define <2 x double> @fadd_splat_splat_nonzero_v2f64(<2 x double> %vx, <2 x doubl ; AVX-LABEL: fadd_splat_splat_nonzero_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX-NEXT: retq %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> %splaty = shufflevector <2 x double> %vy, <2 x double> undef, <2 x i32> @@ -455,7 +455,7 @@ define <2 x double> @fadd_splat_splat_mismatch_v2f64(<2 x double> %vx, <2 x doub ; AVX-LABEL: fadd_splat_splat_mismatch_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,1] ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll index 52148ad..ecd9435 100644 --- a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -7,7 +7,7 @@ define <2 x double> @foo2(<2 x double> %v, ptr%p) nounwind { ; AVX2-LABEL: foo2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX2-NEXT: vmovapd %xmm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll index 806b209..47d35f3 100644 --- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1489,7 +1489,7 @@ define float @PR26515(<4 x float> %0) nounwind { ; X86-AVX-LABEL: PR26515: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %eax -; X86-AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovss %xmm0, (%esp) ; X86-AVX-NEXT: flds (%esp) @@ -1505,7 +1505,7 @@ define float @PR26515(<4 x float> %0) nounwind { ; ; X64-AVX-LABEL: PR26515: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: retq %2 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index eca4e81..eba3907 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ 
b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -6552,7 +6552,7 @@ define void @test_mm_storeh_sd(ptr%a0, <2 x double> %a1) { ; X86-AVX1-LABEL: test_mm_storeh_sd: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X86-AVX1-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X86-AVX1-NEXT: # xmm0 = xmm0[1,0] ; X86-AVX1-NEXT: vmovsd %xmm0, (%eax) # encoding: [0xc5,0xfb,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -6560,7 +6560,7 @@ define void @test_mm_storeh_sd(ptr%a0, <2 x double> %a1) { ; X86-AVX512-LABEL: test_mm_storeh_sd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X86-AVX512-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X86-AVX512-NEXT: # xmm0 = xmm0[1,0] ; X86-AVX512-NEXT: vmovsd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -6574,14 +6574,14 @@ define void @test_mm_storeh_sd(ptr%a0, <2 x double> %a1) { ; ; X64-AVX1-LABEL: test_mm_storeh_sd: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X64-AVX1-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X64-AVX1-NEXT: # xmm0 = xmm0[1,0] ; X64-AVX1-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_storeh_sd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X64-AVX512-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X64-AVX512-NEXT: # xmm0 = xmm0[1,0] ; X64-AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] @@ -6595,14 +6595,14 @@ define void @test_mm_storeh_sd(ptr%a0, <2 x double> %a1) { ; ; X32-AVX1-LABEL: test_mm_storeh_sd: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X32-AVX1-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X32-AVX1-NEXT: # xmm0 = xmm0[1,0] ; X32-AVX1-NEXT: vmovsd %xmm0, (%edi) # encoding: [0x67,0xc5,0xfb,0x11,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_storeh_sd: ; X32-AVX512: # %bb.0: -; X32-AVX512-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X32-AVX512-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X32-AVX512-NEXT: # xmm0 = xmm0[1,0] ; X32-AVX512-NEXT: vmovsd %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x11,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] @@ -6736,7 +6736,7 @@ define void @test_mm_storer_pd(ptr%a0, <2 x double> %a1) { ; X86-AVX1-LABEL: test_mm_storer_pd: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X86-AVX1-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X86-AVX1-NEXT: # xmm0 = 
xmm0[1,0] ; X86-AVX1-NEXT: vmovapd %xmm0, (%eax) # encoding: [0xc5,0xf9,0x29,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -6744,7 +6744,7 @@ define void @test_mm_storer_pd(ptr%a0, <2 x double> %a1) { ; X86-AVX512-LABEL: test_mm_storer_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X86-AVX512-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X86-AVX512-NEXT: # xmm0 = xmm0[1,0] ; X86-AVX512-NEXT: vmovapd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x29,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -6758,14 +6758,14 @@ define void @test_mm_storer_pd(ptr%a0, <2 x double> %a1) { ; ; X64-AVX1-LABEL: test_mm_storer_pd: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X64-AVX1-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X64-AVX1-NEXT: # xmm0 = xmm0[1,0] ; X64-AVX1-NEXT: vmovapd %xmm0, (%rdi) # encoding: [0xc5,0xf9,0x29,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_storer_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X64-AVX512-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X64-AVX512-NEXT: # xmm0 = xmm0[1,0] ; X64-AVX512-NEXT: vmovapd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x29,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] @@ -6779,14 +6779,14 @@ define void @test_mm_storer_pd(ptr%a0, <2 x double> %a1) { ; ; X32-AVX1-LABEL: test_mm_storer_pd: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X32-AVX1-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X32-AVX1-NEXT: # xmm0 = xmm0[1,0] ; X32-AVX1-NEXT: vmovapd %xmm0, (%edi) # encoding: [0x67,0xc5,0xf9,0x29,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_storer_pd: ; X32-AVX512: # %bb.0: -; X32-AVX512-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; X32-AVX512-NEXT: vshufpd $1, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc0,0x01] ; X32-AVX512-NEXT: # xmm0 = xmm0[1,0] ; X32-AVX512-NEXT: vmovapd %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf9,0x29,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll index 2328ecd..f94dd2e 100644 --- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -276,16 +276,16 @@ define <4 x float> @test11(<4 x float> %A, <4 x float> %B) { ; ; AVX1-LABEL: test11: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: retq ; ; AVX512-LABEL: test11: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = 
xmm1[1,0] ; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -373,8 +373,8 @@ define <4 x float> @test14(<4 x float> %A, <4 x float> %B) { ; AVX-LABEL: test14: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] ; AVX-NEXT: retq @@ -462,8 +462,8 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] -; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4 ; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2 diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll index 90a7671..7d8bb56 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll @@ -8,7 +8,7 @@ define <8 x double> @transform_VPERMILPSZrr(<8 x double> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPSZrr: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,7,6] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,7,6] ; CHECK-NEXT: retq %shufp = shufflevector <8 x double> %a, <8 x double> poison, <8 x i32> ret <8 x double> %shufp @@ -17,7 +17,7 @@ define <8 x double> @transform_VPERMILPSZrr(<8 x double> %a) nounwind { define <4 x double> @transform_VPERMILPSYrr(<4 x double> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPSYrr: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,3] ; CHECK-NEXT: retq %shufp = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> ret <4 x double> %shufp @@ -26,7 +26,7 @@ define <4 x double> @transform_VPERMILPSYrr(<4 x double> %a) nounwind { define <2 x double> @transform_VPERMILPSrr(<2 x double> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPSrr: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: retq %shufp = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> ret <2 x double> %shufp @@ -36,7 +36,7 @@ define <8 x double> @transform_VPERMILPSZrrkz(<8 x double> %a, i8 %mask_int) nou ; CHECK-LABEL: transform_VPERMILPSZrrkz: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,5,4,6,7] +; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,5,4,6,7] ; CHECK-NEXT: retq %mask = bitcast i8 %mask_int to <8 x i1> %shufp = shufflevector <8 x double> %a, <8 x double> poison, <8 x i32> @@ -48,7 +48,7 @@ define <4 x double> @transform_VPERMILPSYrrkz(<4 x double> %a, i4 %mask_int) nou ; CHECK-LABEL: transform_VPERMILPSYrrkz: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,3,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,3,2] ; CHECK-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %shufp = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> @@ -60,7 
+60,7 @@ define <2 x double> @transform_VPERMILPSrrkz(<2 x double> %a, i2 %mask_int) noun ; CHECK-LABEL: transform_VPERMILPSrrkz: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0] ; CHECK-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %shufp = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> @@ -72,7 +72,7 @@ define <8 x double> @transform_VPERMILPSZrrk(<8 x double> %a, <8 x double> %b, i ; CHECK-LABEL: transform_VPERMILPSZrrk: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,4,5,6,7] +; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,4,5,6,7] ; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = bitcast i8 %mask_int to <8 x i1> @@ -85,7 +85,7 @@ define <4 x double> @transform_VPERMILPSYrrk(<4 x double> %a, <4 x double> %b, i ; CHECK-LABEL: transform_VPERMILPSYrrk: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,0,3,2] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1,0,3,2] ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> @@ -98,7 +98,7 @@ define <2 x double> @transform_VPERMILPSrrk(<2 x double> %a, <2 x double> %b, i2 ; CHECK-LABEL: transform_VPERMILPSrrk: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 {%k1} = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1,0] ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd.ll index 211c2ee..b3deb51 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd.ll @@ -11,7 +11,7 @@ define <4 x double> @transform_VPERMILPDYrr(<4 x double> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPDYrr: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,3] ; CHECK-NEXT: retq %shufp = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> ret <4 x double> %shufp @@ -20,7 +20,7 @@ define <4 x double> @transform_VPERMILPDYrr(<4 x double> %a) nounwind { define <2 x double> @transform_VPERMILPDrr(<2 x double> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPDrr: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: retq %shufp = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> ret <2 x double> %shufp diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll index 76ea9f9..a11f13e 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -119,7 +119,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX-64: # %bb.0: ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX-64-NEXT: vmovq %rax, %xmm1 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX-64-NEXT: vmovq %rax, %xmm0 ; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -154,7 +154,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, 
%xmm1 -; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm0 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -189,7 +189,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -322,7 +322,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX-32-NEXT: vmovapd %xmm1, %xmm3 @@ -379,7 +379,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm3 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-64-NEXT: vcomisd %xmm1, %xmm0 ; AVX-64-NEXT: jb .LBB1_4 ; AVX-64-NEXT: # %bb.3: @@ -404,7 +404,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512F-32-NEXT: andl $-8, %esp ; AVX512F-32-NEXT: subl $16, %esp -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512F-32-NEXT: xorl %eax, %eax ; AVX512F-32-NEXT: vcomisd %xmm2, %xmm1 @@ -442,7 +442,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 -; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm0 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -457,7 +457,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp ; AVX512VL-32-NEXT: andl $-8, %esp ; AVX512VL-32-NEXT: subl $16, %esp -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm1 @@ -495,7 +495,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -1630,7 +1630,7 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 { ; ; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm1 = 
xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx ; AVX-64-NEXT: vmovd %ecx, %xmm0 @@ -2327,7 +2327,7 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 { ; AVX-64: # %bb.0: ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX-64-NEXT: vmovq %rax, %xmm1 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX-64-NEXT: vmovq %rax, %xmm0 ; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -2484,7 +2484,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 { ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX-32-NEXT: vmovapd %xmm1, %xmm3 @@ -2541,7 +2541,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 { ; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm3 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-64-NEXT: vcomisd %xmm1, %xmm0 ; AVX-64-NEXT: jb .LBB19_4 ; AVX-64-NEXT: # %bb.3: diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll index c86fcd5..d072072 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -77,13 +77,13 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX-64-NEXT: vmovq %rax, %xmm2 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX-64-NEXT: vmovq %rax, %xmm1 ; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX-64-NEXT: vmovq %rax, %xmm2 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX-64-NEXT: vmovq %rax, %xmm0 ; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -132,13 +132,13 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512F-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512F-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm2 -; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512F-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm2 -; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm0 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -187,13 +187,13 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-64-NEXT: 
vcvttsd2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -226,7 +226,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX-32-NEXT: .cfi_def_cfa_register %ebp ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX-32-NEXT: vmovapd %xmm1, %xmm3 @@ -244,7 +244,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX-32-NEXT: vmovapd %xmm1, %xmm4 ; AVX-32-NEXT: jae .LBB1_4 @@ -320,7 +320,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX-64-NEXT: movzbl %al, %eax ; AVX-64-NEXT: shlq $63, %rax ; AVX-64-NEXT: xorq %rcx, %rax -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX-64-NEXT: vcomisd %xmm1, %xmm4 ; AVX-64-NEXT: vxorpd %xmm5, %xmm5, %xmm5 ; AVX-64-NEXT: jb .LBB1_4 @@ -349,7 +349,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX-64-NEXT: shlq $63, %rcx ; AVX-64-NEXT: xorq %rax, %rcx ; AVX-64-NEXT: vmovq %rcx, %xmm4 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-64-NEXT: vcomisd %xmm1, %xmm0 ; AVX-64-NEXT: jb .LBB1_8 ; AVX-64-NEXT: # %bb.7: @@ -378,7 +378,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512F-32-NEXT: subl $40, %esp ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 ; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512F-32-NEXT: xorl %eax, %eax ; AVX512F-32-NEXT: vcomisd %xmm1, %xmm3 @@ -394,7 +394,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512F-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512F-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-32-NEXT: xorl %ecx, %ecx ; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512F-32-NEXT: setae %cl @@ -446,13 +446,13 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512F-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512F-64-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm2 -; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512F-64-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm2 
-; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm0 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -471,7 +471,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512VL-32-NEXT: subl $40, %esp ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 @@ -487,7 +487,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %ecx, %ecx ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %cl @@ -539,13 +539,13 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -610,7 +610,7 @@ define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-64-NEXT: vcvttss2si %xmm1, %rax ; AVX-64-NEXT: vmovq %rax, %xmm1 -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-64-NEXT: vcvttss2si %xmm2, %rax ; AVX-64-NEXT: vmovq %rax, %xmm2 ; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -664,7 +664,7 @@ define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-64-NEXT: vcvttss2si %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 -; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm2 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -718,7 +718,7 @@ define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -790,7 +790,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX-32-NEXT: 
movzbl %cl, %ecx ; AVX-32-NEXT: shll $31, %ecx ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX-32-NEXT: vmovaps %xmm1, %xmm3 ; AVX-32-NEXT: jae .LBB3_6 @@ -851,7 +851,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX-64-NEXT: movzbl %al, %eax ; AVX-64-NEXT: shlq $63, %rax ; AVX-64-NEXT: xorq %rcx, %rax -; AVX-64-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-64-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-64-NEXT: vcomiss %xmm1, %xmm4 ; AVX-64-NEXT: vxorps %xmm5, %xmm5, %xmm5 ; AVX-64-NEXT: jb .LBB3_4 @@ -917,7 +917,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX512F-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512F-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512F-32-NEXT: vmovss %xmm2, (%esp) -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-32-NEXT: xorl %edx, %edx ; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512F-32-NEXT: setae %dl @@ -977,7 +977,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-64-NEXT: vcvttss2usi %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 -; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm2 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -1010,7 +1010,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, (%esp) -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %edx, %edx ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %dl @@ -1070,7 +1070,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 { ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll index 27b84bc..a7ac1a1 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -95,14 +95,14 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-64-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] 
; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] @@ -110,13 +110,13 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] @@ -150,7 +150,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 @@ -169,7 +169,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al @@ -186,7 +186,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al @@ -202,7 +202,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %ebx, %ebx ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %bl @@ -284,14 +284,14 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-64-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vshufpd 
{{.*#+}} xmm2 = xmm2[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] @@ -299,13 +299,13 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] @@ -388,7 +388,7 @@ define <8 x i64> @strict_vector_fptosi_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-64-NEXT: vcvttss2si %xmm3, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] @@ -402,7 +402,7 @@ define <8 x i64> @strict_vector_fptosi_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttss2si %xmm3, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] @@ -452,7 +452,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al @@ -486,7 +486,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %ecx, %ecx ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %cl @@ -577,7 +577,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-64-NEXT: vcvttss2usi %xmm3, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] @@ -591,7 +591,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = 
xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512VL-64-NEXT: vcvttss2usi %xmm3, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 610c7d4..04609f0 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -32,7 +32,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; VEX: # %bb.0: ; VEX-NEXT: vcvttsd2si %xmm0, %rax ; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; VEX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; VEX-NEXT: vcvttsd2si %xmm0, %rax ; VEX-NEXT: vmovq %rax, %xmm0 ; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -42,7 +42,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -52,7 +52,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -144,13 +144,13 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vcvttsd2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-NEXT: vcvttsd2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vcvttsd2si %xmm0, %rax ; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vcvttsd2si %xmm0, %rax ; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -162,13 +162,13 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vcvttsd2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-NEXT: vcvttsd2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: vcvttsd2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vcvttsd2si %xmm0, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -180,13 +180,13 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vcvttsd2si %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512F-NEXT: vcvttsd2si %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: 
vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-NEXT: vcvttsd2si %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -198,13 +198,13 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -284,7 +284,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; VEX-NEXT: andq %rax, %rdx ; VEX-NEXT: orq %rcx, %rdx ; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; VEX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm1 ; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx @@ -300,7 +300,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -310,7 +310,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -564,7 +564,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4 ; AVX1-NEXT: vcvttsd2si %xmm4, %rax ; AVX1-NEXT: vcvttsd2si %xmm2, %rcx @@ -582,7 +582,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vcvttsd2si %xmm1, %rax ; AVX1-NEXT: vcvttsd2si %xmm0, %rcx @@ -607,7 +607,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vcvttsd2si %xmm4, %rax ; AVX2-NEXT: vcvttsd2si %xmm2, %rcx @@ -625,7 +625,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq 
%rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vcvttsd2si %xmm1, %rax ; AVX2-NEXT: vcvttsd2si %xmm0, %rcx @@ -643,13 +643,13 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -661,13 +661,13 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -957,7 +957,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-NEXT: vcvttss2si %xmm2, %rax ; AVX1-NEXT: vmovq %rax, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -975,7 +975,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -993,7 +993,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2si %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2si %xmm2, %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -1011,7 +1011,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2si %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -1065,7 +1065,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x 
float> %a) { ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-NEXT: vcvttss2si %xmm2, %rax ; AVX1-NEXT: vmovq %rax, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -1083,7 +1083,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vcvttss2si %xmm2, %rax ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -1101,7 +1101,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512F-NEXT: vcvttss2si %xmm1, %rax ; AVX512F-NEXT: vcvttss2si %xmm0, %rcx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2si %xmm1, %rdx ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2si %xmm0, %rsi @@ -1119,7 +1119,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vcvttss2si %xmm1, %rax ; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi @@ -1576,7 +1576,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vcvttss2si %xmm4, %rax ; AVX1-NEXT: vcvttss2si %xmm3, %rcx @@ -1619,7 +1619,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: vcvttss2si %xmm3, %rcx @@ -1655,7 +1655,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2usi %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2usi %xmm2, %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -1673,7 +1673,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -1760,7 +1760,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: 
vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vcvttss2si %xmm4, %rax ; AVX1-NEXT: vcvttss2si %xmm3, %rcx @@ -1803,7 +1803,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: vcvttss2si %xmm3, %rcx @@ -1839,7 +1839,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512F-NEXT: vcvttss2usi %xmm1, %rax ; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi @@ -1857,7 +1857,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index f39f4a9..611ad2d 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3084,7 +3084,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: callq __truncdfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3117,7 +3117,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: subq $88, %rsp ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3129,7 +3129,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3148,7 +3148,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: subq $88, %rsp ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3160,7 +3160,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; 
AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3249,7 +3249,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: subq $88, %rsp ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3261,7 +3261,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3280,7 +3280,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: subq $88, %rsp ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3292,7 +3292,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3382,7 +3382,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: subq $88, %rsp ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3394,7 +3394,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3413,7 +3413,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: subq $88, %rsp ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3425,7 +3425,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, 
%xmm0 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3736,7 +3736,7 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind { ; AVX-NEXT: subq $32, %rsp ; AVX-NEXT: movq %rdi, %rbx ; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: callq __truncdfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload @@ -3796,14 +3796,14 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind { ; AVX1-NEXT: subq $80, %rsp ; AVX1-NEXT: movq %rdi, %rbx ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3831,14 +3831,14 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind { ; AVX2-NEXT: subq $80, %rsp ; AVX2-NEXT: movq %rdi, %rbx ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3940,7 +3940,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX1-NEXT: subq $80, %rsp ; AVX1-NEXT: movq %rdi, %rbx ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3952,7 +3952,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3975,7 +3975,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX2-NEXT: subq $80, %rsp ; AVX2-NEXT: movq %rdi, %rbx ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 
+; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3987,7 +3987,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4090,7 +4090,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { ; AVX1-NEXT: subq $80, %rsp ; AVX1-NEXT: movq %rdi, %rbx ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4102,7 +4102,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4125,7 +4125,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { ; AVX2-NEXT: subq $80, %rsp ; AVX2-NEXT: movq %rdi, %rbx ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4137,7 +4137,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll index 51e101a..63ca7c6 100644 --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -582,9 +582,9 @@ define void @splat2_i64(ptr %s, ptr %d) { ; AVX1-LABEL: splat2_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3] ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi) ; AVX1-NEXT: vmovupd %ymm1, (%rsi) ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 2961c99..6016de9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll 
@@ -233,7 +233,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm7 = xmm7[1,0] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],mem[1,3] @@ -443,7 +443,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] @@ -886,7 +886,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] @@ -900,7 +900,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm7[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] @@ -1852,7 +1852,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm2 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] @@ -1869,7 +1869,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] @@ -1883,7 +1883,7 @@ define void 
@load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm4 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm4 = xmm4[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] @@ -1899,7 +1899,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm4 = xmm4[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3,0,1] @@ -3865,7 +3865,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3884,7 +3884,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3903,7 +3903,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3922,7 +3922,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill @@ -3940,7 +3940,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3959,7 +3959,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3,0,1] @@ -3975,7 +3975,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] @@ -3991,7 +3991,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm5 = xmm5[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 44362bc..fede3ba 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -248,12 +248,12 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) @@ -476,7 +476,7 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1],xmm6[0,2] @@ -490,17 +490,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) @@ -918,7 +918,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 @@ -935,7 +935,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm12[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm12 = ymm12[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[1,1],xmm12[0,2] @@ -951,7 +951,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1],xmm12[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm12 = ymm12[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = 
ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 @@ -967,27 +967,27 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = ymm8[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) @@ -1804,7 +1804,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm2[1] @@ -1821,7 +1821,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm0[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 @@ -1841,7 +1841,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 @@ -1861,7 +1861,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 @@ -1882,7 +1882,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 @@ -1902,7 +1902,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 @@ -1921,7 +1921,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 @@ -1940,47 +1940,47 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: 
vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = ymm8[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm9[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm11 = ymm11[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm12 = ymm12[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm12[1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 608(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index 19e04fb..94f9a01 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -208,7 +208,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm4[1,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1,0,3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index fd46e1c..be54bf1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -168,7 +168,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[u,u,1,0,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm7[0,0,3,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 @@ -444,7 +444,7 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] @@ -962,14 +962,14 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm9[0,1],ymm3[2],ymm9[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] @@ -2131,7 +2131,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -2150,14 +2150,14 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm11[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm11[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm14[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] @@ -4570,7 +4570,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = ymm15[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm15[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4593,7 +4593,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2],ymm15[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload @@ -4644,7 +4644,7 @@ define void @store_i32_stride5_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm5[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll index 42892da..3619d7f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -117,10 +117,10 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm4[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rcx) @@ -239,18 +239,18 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = ymm0[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm0[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3],ymm8[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rcx) @@ -475,34 +475,34 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 
= ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm6[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm8[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm14 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm2[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3] ; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3] ; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm7[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2],ymm6[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) @@ -939,13 +939,13 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm3[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -953,7 +953,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm10 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm10[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -961,7 +961,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm9[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -969,7 +969,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 144(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = ymm8[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm6[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill @@ -977,7 +977,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm11 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -985,14 +985,14 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm5[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm15 ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm11 = ymm3[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -1003,23 +1003,23 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] ; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = ymm10[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2],ymm10[3] ; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm9[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2],ymm9[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = ymm8[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2],ymm8[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm7[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2],ymm3[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovapd %ymm11, 736(%rcx) @@ -1903,7 +1903,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1913,7 +1913,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1923,7 +1923,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1933,7 +1933,7 @@ 
define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1943,7 +1943,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1952,7 +1952,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1961,7 +1961,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1970,7 +1970,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1979,7 +1979,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1988,7 +1988,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1997,7 +1997,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2006,7 +2006,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2015,7 +2015,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2023,7 +2023,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm13[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2031,7 +2031,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm10 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm10[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm10[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm7 = ymm2[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2039,7 +2039,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm7[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill @@ -2104,13 +2104,13 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2],ymm13[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = ymm10[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2],ymm10[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm7[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index 7b271b3..ef9165d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -592,7 +592,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = ymm15[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] @@ -2174,7 +2174,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm8 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3],ymm4[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3] @@ -5074,7 +5074,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm15 ; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2],ymm15[3] @@ -11161,7 +11161,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] @@ -11181,7 +11181,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] @@ -11200,7 +11200,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm11 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3] diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll index 1dbdda8..ad34521 100644 --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -165,7 +165,7 @@ define <4 x double> @fmul_v2f64(<2 x double> %x, <2 x double> %y) { ; AVX1-NEXT: vmulpd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmulpd %xmm2, %xmm2, %xmm1 ; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: fmul_v2f64: @@ -175,7 +175,7 @@ define <4 x double> @fmul_v2f64(<2 x double> %x, <2 x double> %y) { ; AVX2-NEXT: vmulpd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmulpd %xmm2, %xmm2, %xmm1 ; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: retq ; ; AVX512-LABEL: fmul_v2f64: @@ -184,7 +184,7 @@ define <4 x double> @fmul_v2f64(<2 x double> %x, <2 x double> %y) { ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmulpd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm2 * xmm2) + xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: retq %s = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> %bo = fmul fast <4 x double> %s, %s diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll 
b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll index 7d123ab..51dd366 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -81,7 +81,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; ; AVX1-SLOW-LABEL: test_v4f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -97,7 +97,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; ; AVX2-LABEL: test_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -106,7 +106,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -144,7 +144,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -166,7 +166,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -178,7 +178,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -222,7 +222,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -235,7 +235,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -247,7 +247,7 @@ define float @test_v16f32(float %a0, <16 
x float> %a1) { ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -261,7 +261,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -338,7 +338,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; ; AVX1-SLOW-LABEL: test_v4f32_zero: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -352,7 +352,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; ; AVX2-LABEL: test_v4f32_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -360,7 +360,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; ; AVX512-LABEL: test_v4f32_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -395,7 +395,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -415,7 +415,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -426,7 +426,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -467,7 +467,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: 
vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -479,7 +479,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper @@ -490,7 +490,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -503,7 +503,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -579,7 +579,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; ; AVX1-SLOW-LABEL: test_v4f32_undef: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -593,7 +593,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; ; AVX2-LABEL: test_v4f32_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -601,7 +601,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; ; AVX512-LABEL: test_v4f32_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -636,7 +636,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -656,7 +656,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -667,7 +667,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = 
xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -708,7 +708,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -720,7 +720,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper @@ -731,7 +731,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -744,7 +744,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -769,7 +769,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; ; AVX1-SLOW-LABEL: test_v2f64: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq @@ -782,14 +782,14 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; ; AVX2-LABEL: test_v2f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -811,7 +811,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper @@ -830,7 +830,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; 
AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -840,7 +840,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -866,7 +866,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper @@ -887,7 +887,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -899,7 +899,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -931,7 +931,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper @@ -956,7 +956,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -969,7 +969,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -992,7 +992,7 @@ define double @test_v2f64_zero(<2 x double> %a0) { ; ; AVX1-SLOW-LABEL: test_v2f64_zero: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq ; @@ -1003,13 +1003,13 @@ define double @test_v2f64_zero(<2 x double> %a0) { ; ; AVX2-LABEL: test_v2f64_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = 
xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2f64_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) @@ -1029,7 +1029,7 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1046,7 +1046,7 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1055,7 +1055,7 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1079,7 +1079,7 @@ define double @test_v8f64_zero(<8 x double> %a0) { ; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1098,7 +1098,7 @@ define double @test_v8f64_zero(<8 x double> %a0) { ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1109,7 +1109,7 @@ define double @test_v8f64_zero(<8 x double> %a0) { ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1139,7 +1139,7 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1162,7 +1162,7 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1174,7 +1174,7 @@ define double 
@test_v16f64_zero(<16 x double> %a0) { ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1196,7 +1196,7 @@ define double @test_v2f64_undef(<2 x double> %a0) { ; ; AVX1-SLOW-LABEL: test_v2f64_undef: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq ; @@ -1207,13 +1207,13 @@ define double @test_v2f64_undef(<2 x double> %a0) { ; ; AVX2-LABEL: test_v2f64_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) @@ -1233,7 +1233,7 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1250,7 +1250,7 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1259,7 +1259,7 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1283,7 +1283,7 @@ define double @test_v8f64_undef(<8 x double> %a0) { ; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1302,7 +1302,7 @@ define double @test_v8f64_undef(<8 x double> %a0) { ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1313,7 +1313,7 @@ define double @test_v8f64_undef(<8 x double> %a0) { ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq @@ -1343,7 +1343,7 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1366,7 +1366,7 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1378,7 +1378,7 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll index 90aaf42..883806d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -74,7 +74,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -85,7 +85,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -142,7 +142,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -150,7 +150,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -162,7 +162,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: 
vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -170,7 +170,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -262,7 +262,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 @@ -270,14 +270,14 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -285,7 +285,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -297,7 +297,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -305,7 +305,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -313,7 +313,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ 
-321,7 +321,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -403,7 +403,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -412,7 +412,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; AVX1-FAST-LABEL: test_v4f32_zero: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -422,7 +422,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -432,7 +432,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -486,7 +486,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -494,7 +494,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -504,7 +504,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX1-FAST-LABEL: test_v8f32_zero: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -512,7 +512,7 @@ define float 
@test_v8f32_zero(<8 x float> %a0) { ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -523,7 +523,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -531,7 +531,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -542,7 +542,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -550,7 +550,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -639,7 +639,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 @@ -647,14 +647,14 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -662,7 +662,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; 
AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -672,7 +672,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX1-FAST-LABEL: test_v16f32_zero: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm2 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 @@ -680,14 +680,14 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -695,7 +695,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -706,7 +706,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 @@ -714,14 +714,14 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm2 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -729,7 +729,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; 
AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -740,7 +740,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -748,7 +748,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -756,7 +756,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -764,7 +764,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -834,7 +834,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -844,7 +844,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -898,7 +898,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -906,7 +906,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: 
vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -917,7 +917,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -925,7 +925,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -1014,7 +1014,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 @@ -1022,14 +1022,14 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -1037,7 +1037,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -1048,7 +1048,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1056,7 +1056,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, 
%xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1064,7 +1064,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1072,7 +1072,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 @@ -1097,14 +1097,14 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) @@ -1125,11 +1125,11 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1137,11 +1137,11 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1169,18 +1169,18 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: 
vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1188,19 +1188,19 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1268,32 +1268,32 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1301,34 +1301,34 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: 
vextractf32x4 $2, %zmm1, %xmm3 ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1350,7 +1350,7 @@ define double @test_v2f64_zero(<2 x double> %a0) { ; ; AVX1-SLOW-LABEL: test_v2f64_zero: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq ; @@ -1361,13 +1361,13 @@ define double @test_v2f64_zero(<2 x double> %a0) { ; ; AVX2-LABEL: test_v2f64_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2f64_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %a0) @@ -1387,11 +1387,11 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; ; AVX1-SLOW-LABEL: test_v4f64_zero: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1401,29 +1401,29 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v4f64_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; 
AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4f64_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1450,18 +1450,18 @@ define double @test_v8f64_zero(<8 x double> %a0) { ; ; AVX1-SLOW-LABEL: test_v8f64_zero: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm2 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm0 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1471,51 +1471,51 @@ define double @test_v8f64_zero(<8 x double> %a0) { ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm2 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm0 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v8f64_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8f64_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; 
AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1554,32 +1554,32 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; ; AVX1-SLOW-LABEL: test_v16f64_zero: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm4 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm4 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm0 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -1589,94 +1589,94 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm4 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm4 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm0 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm4, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, 
%xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vaddsd %xmm3, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v16f64_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm4 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm0 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16f64_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: 
vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1697,13 +1697,13 @@ define double @test_v2f64_undef(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0) @@ -1722,22 +1722,22 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; ; AVX-LABEL: test_v4f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1763,37 +1763,37 @@ define double @test_v8f64_undef(<8 x double> %a0) { ; ; AVX-LABEL: test_v8f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd 
{{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1831,66 +1831,66 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; ; AVX-LABEL: test_v16f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = 
xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll index 5f24935..5bd9b02 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll @@ -65,7 +65,7 @@ define float @test_v4f32(<4 x float> %a0) { ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -73,7 +73,7 @@ define float @test_v4f32(<4 x float> %a0) { ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -108,7 +108,7 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -119,7 +119,7 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -160,7 +160,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -173,7 +173,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -197,13 +197,13 @@ define double @test_v2f64(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64: ; AVX: # 
%bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) @@ -223,7 +223,7 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -232,7 +232,7 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -256,7 +256,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -267,7 +267,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -297,7 +297,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -309,7 +309,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll index 4166db3..71c4427 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -70,7 +70,7 @@ define float @test_v4f32(<4 x float> %a0) { ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -78,7 +78,7 @@ define float @test_v4f32(<4 x float> %a0) { ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss 
%xmm1, %xmm0, %xmm0 @@ -113,7 +113,7 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -124,7 +124,7 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -165,7 +165,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -178,7 +178,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 @@ -202,13 +202,13 @@ define double @test_v2f64(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0) @@ -238,7 +238,7 @@ define double @test_v3f64(<3 x double> %a0) { ; ; AVX-LABEL: test_v3f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 @@ -247,7 +247,7 @@ define double @test_v3f64(<3 x double> %a0) { ; ; AVX512-LABEL: test_v3f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 @@ -270,7 +270,7 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -279,7 +279,7 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: 
vzeroupper ; AVX512-NEXT: retq @@ -303,7 +303,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -314,7 +314,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -344,7 +344,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -356,7 +356,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index 53eefbe..4c4a2fc 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -113,7 +113,7 @@ define float @test_v3f32(<3 x float> %a0) { ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq @@ -125,7 +125,7 @@ define float @test_v3f32(<3 x float> %a0) { ; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq @@ -198,7 +198,7 @@ define float @test_v4f32(<4 x float> %a0) { ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm4 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 @@ -214,7 +214,7 @@ define float @test_v4f32(<4 x float> %a0) { ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm4 ; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -303,10 +303,10 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; 
AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm8 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 @@ -336,10 +336,10 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -369,10 +369,10 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -502,7 +502,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -518,7 +518,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -535,7 +535,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm3 ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -550,7 +550,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -566,7 +566,7 @@ define float 
@test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} @@ -582,7 +582,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} @@ -597,18 +597,18 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm6[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm14 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm16 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -682,7 +682,7 @@ define double @test_v2f64(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 @@ -690,7 +690,7 @@ define double @test_v2f64(<2 x double> %a0) { ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm1 ; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} @@ -740,8 +740,8 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0 @@ -757,8 +757,8 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 
+; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 ; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} @@ -838,7 +838,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 ; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 ; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 @@ -847,7 +847,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper @@ -856,12 +856,12 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX512BW-LABEL: test_v8f64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] ; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} @@ -889,12 +889,12 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX512VL-LABEL: test_v8f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] ; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} @@ -1036,7 +1036,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 ; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 ; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 @@ -1045,7 +1045,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper @@ -1056,7 +1056,7 @@ define double 
@test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vmaxpd %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1 ; AVX512-NEXT: vmovapd %zmm1, %zmm2 {%k1} -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm2[1,0] ; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm1 ; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} @@ -1065,7 +1065,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 @@ -1073,7 +1073,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 @@ -1081,7 +1081,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll index 14c2b01..0b2f9d6 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -74,7 +74,7 @@ define float @test_v3f32(<3 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -82,7 +82,7 @@ define float @test_v3f32(<3 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a0) @@ -111,7 +111,7 @@ define float @test_v4f32(<4 x float> %a0) { ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -119,7 +119,7 @@ define float @test_v4f32(<4 x float> %a0) { ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -154,7 +154,7 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: 
vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -165,7 +165,7 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -206,7 +206,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -219,7 +219,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 @@ -243,13 +243,13 @@ define double @test_v2f64(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) @@ -269,7 +269,7 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -278,7 +278,7 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -302,7 +302,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -313,7 +313,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -343,7 +343,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vminpd 
%ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -355,7 +355,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll index afc7919..e622899 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -131,7 +131,7 @@ define float @test_v4f32(<4 x float> %a0) { ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm0, %xmm3, %xmm4 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 @@ -147,7 +147,7 @@ define float @test_v4f32(<4 x float> %a0) { ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm4 ; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -236,10 +236,10 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm0, %xmm7, %xmm8 ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 @@ -269,10 +269,10 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -302,10 +302,10 @@ define float @test_v8f32(<8 x float> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vminss %xmm0, 
%xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -435,7 +435,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -451,7 +451,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -468,7 +468,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm3 ; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -483,7 +483,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -499,7 +499,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} @@ -515,7 +515,7 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} @@ -530,18 +530,18 @@ define float @test_v16f32(<16 x float> %a0) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm6[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = 
xmm9[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm14 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm16 ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 @@ -615,7 +615,7 @@ define double @test_v2f64(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 @@ -623,7 +623,7 @@ define double @test_v2f64(<2 x double> %a0) { ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm1 ; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} @@ -676,7 +676,7 @@ define double @test_v3f64(<3 x double> %a0) { ; ; AVX-LABEL: test_v3f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 @@ -689,7 +689,7 @@ define double @test_v3f64(<3 x double> %a0) { ; ; AVX512-LABEL: test_v3f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm2 ; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} @@ -743,8 +743,8 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm0, %xmm3, %xmm4 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0 @@ -760,8 +760,8 @@ define double @test_v4f64(<4 x double> %a0) { ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm0, %xmm3, %xmm4 ; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} @@ -841,7 +841,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2 ; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 ; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 @@ -850,7 +850,7 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: 
vzeroupper @@ -859,12 +859,12 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX512BW-LABEL: test_v8f64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] ; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm8 ; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} @@ -892,12 +892,12 @@ define double @test_v8f64(<8 x double> %a0) { ; AVX512VL-LABEL: test_v8f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0] ; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm8 ; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1} @@ -1039,7 +1039,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2 ; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 ; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 ; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 @@ -1048,7 +1048,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper @@ -1059,7 +1059,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vminpd %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1 ; AVX512-NEXT: vmovapd %zmm1, %zmm2 {%k1} -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm2[1,0] ; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm1 ; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} @@ -1068,7 +1068,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 @@ -1076,7 +1076,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; 
AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 @@ -1084,7 +1084,7 @@ define double @test_v16f64(<16 x double> %a0) { ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll index 8c89782..249cbaf 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -67,7 +67,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -76,7 +76,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -114,7 +114,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -126,7 +126,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -170,7 +170,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -184,7 +184,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -250,7 +250,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; ; AVX-LABEL: test_v4f32_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = 
xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -258,7 +258,7 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; ; AVX512-LABEL: test_v4f32_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -293,7 +293,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -304,7 +304,7 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -345,7 +345,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -358,7 +358,7 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -423,7 +423,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; ; AVX-LABEL: test_v4f32_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -431,7 +431,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; ; AVX512-LABEL: test_v4f32_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -466,7 +466,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -477,7 +477,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; 
AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -518,7 +518,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -531,7 +531,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -556,14 +556,14 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -585,7 +585,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -595,7 +595,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -621,7 +621,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -633,7 +633,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -665,7 +665,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -678,7 
+678,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -701,13 +701,13 @@ define double @test_v2f64_zero(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) @@ -727,7 +727,7 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -736,7 +736,7 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -760,7 +760,7 @@ define double @test_v8f64_zero(<8 x double> %a0) { ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -771,7 +771,7 @@ define double @test_v8f64_zero(<8 x double> %a0) { ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -801,7 +801,7 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -813,7 +813,7 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -835,13 +835,13 @@ define double @test_v2f64_undef(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_undef: ; AVX512: # %bb.0: -; 
AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) @@ -861,7 +861,7 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -870,7 +870,7 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -894,7 +894,7 @@ define double @test_v8f64_undef(<8 x double> %a0) { ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -905,7 +905,7 @@ define double @test_v8f64_undef(<8 x double> %a0) { ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -935,7 +935,7 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -947,7 +947,7 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll index 1d48ec0..17fb2ed 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -73,7 +73,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -84,7 +84,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; 
AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -141,7 +141,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -149,7 +149,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -161,7 +161,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -169,7 +169,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -261,7 +261,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 @@ -269,14 +269,14 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -284,7 +284,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -296,7 +296,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vmulss 
%xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -304,7 +304,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -312,7 +312,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -320,7 +320,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -391,7 +391,7 @@ define float @test_v4f32_one(<4 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -401,7 +401,7 @@ define float @test_v4f32_one(<4 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -455,7 +455,7 @@ define float @test_v8f32_one(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -463,7 +463,7 @@ define float @test_v8f32_one(<8 x float> %a0) { ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -474,7 +474,7 @@ define float @test_v8f32_one(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 -; 
AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -482,7 +482,7 @@ define float @test_v8f32_one(<8 x float> %a0) { ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -571,7 +571,7 @@ define float @test_v16f32_one(<16 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 @@ -579,14 +579,14 @@ define float @test_v16f32_one(<16 x float> %a0) { ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -594,7 +594,7 @@ define float @test_v16f32_one(<16 x float> %a0) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -605,7 +605,7 @@ define float @test_v16f32_one(<16 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -613,7 +613,7 @@ define float @test_v16f32_one(<16 x float> %a0) { ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -621,7 +621,7 @@ define float @test_v16f32_one(<16 x float> %a0) { ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = 
xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -629,7 +629,7 @@ define float @test_v16f32_one(<16 x float> %a0) { ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -699,7 +699,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -709,7 +709,7 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -763,7 +763,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -771,7 +771,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -782,7 +782,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -790,7 +790,7 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -879,7 +879,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, 
%xmm2 @@ -887,14 +887,14 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -902,7 +902,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -913,7 +913,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -921,7 +921,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -929,7 +929,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -937,7 +937,7 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 @@ -962,14 +962,14 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, 
%xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) @@ -990,11 +990,11 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1002,11 +1002,11 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1034,18 +1034,18 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1053,19 +1053,19 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1133,32 +1133,32 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm5, %xmm0, 
%xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm4[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1166,34 +1166,34 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3 ; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1215,13 +1215,13 @@ define double @test_v2f64_one(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64_one: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: 
vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_one: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) @@ -1241,22 +1241,22 @@ define double @test_v4f64_one(<4 x double> %a0) { ; ; AVX-LABEL: test_v4f64_one: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64_one: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1283,37 +1283,37 @@ define double @test_v8f64_one(<8 x double> %a0) { ; ; AVX-LABEL: test_v8f64_one: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64_one: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1352,66 +1352,66 @@ define double @test_v16f64_one(<16 x double> %a0) { ; ; AVX-LABEL: test_v16f64_one: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm4 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4 
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64_one: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1432,13 +1432,13 @@ define double @test_v2f64_undef(<2 x double> %a0) { ; ; AVX-LABEL: test_v2f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd 
{{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0) @@ -1457,22 +1457,22 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; ; AVX-LABEL: test_v4f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1498,37 +1498,37 @@ define double @test_v8f64_undef(<8 x double> %a0) { ; ; AVX-LABEL: test_v8f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1566,66 +1566,66 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; ; AVX-LABEL: test_v16f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4 
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll index 85ac2eb..55f7dec 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -142,7 +142,7 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) { ; ; AVX-LABEL: 
shuffle_v2f64_10: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> @@ -156,7 +156,7 @@ define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) { ; ; AVX-LABEL: shuffle_v2f64_11: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -199,7 +199,7 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) { ; ; AVX-LABEL: shuffle_v2f64_32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> @@ -214,7 +214,7 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { ; ; AVX-LABEL: shuffle_v2f64_33: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -312,7 +312,7 @@ define <2 x double> @shuffle_v2f64_3u(<2 x double> %a, <2 x double> %b) { ; ; AVX-LABEL: shuffle_v2f64_3u: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index e14c218..b818845 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -92,7 +92,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_1000: ; AVX1: # %bb.0: ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4f64_1000: @@ -174,7 +174,7 @@ define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_2233: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4f64_2233: @@ -215,7 +215,7 @@ define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_3210: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4f64_3210: @@ -234,7 +234,7 @@ define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: shuffle_v4f64_0023: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,2,3] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -263,7 +263,7 @@ define <4 x double> @shuffle_v4f64mem_0022(ptr %ptr, <4 x double> %b) { define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: shuffle_v4f64_1032: ; ALL: # %bb.0: -; 
ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -272,7 +272,7 @@ define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: shuffle_v4f64_1133: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] +; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,3,3] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -281,7 +281,7 @@ define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: shuffle_v4f64_1023: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,3] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -290,7 +290,7 @@ define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: shuffle_v4f64_1022: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -450,13 +450,13 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { ; AVX1OR2-LABEL: shuffle_v4f64_1054: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1OR2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4f64_1054: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1054: @@ -468,7 +468,7 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_1054: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -478,13 +478,13 @@ define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) { ; AVX1OR2-LABEL: shuffle_v4f64_3254: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1OR2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4f64_3254: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3254: @@ -496,7 +496,7 @@ define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) { ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_3254: ; 
AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX512VL-FAST-PERLANE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -506,13 +506,13 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { ; AVX1OR2-LABEL: shuffle_v4f64_3276: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1OR2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4f64_3276: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3276: @@ -524,7 +524,7 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_3276: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-FAST-PERLANE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -534,13 +534,13 @@ define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) { ; AVX1OR2-LABEL: shuffle_v4f64_1076: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1OR2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1OR2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4f64_1076: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4f64_1076: @@ -614,7 +614,7 @@ define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: shuffle_v4f64_11uu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; ALL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -644,7 +644,7 @@ define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_3333: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,3,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4f64_3333: @@ -771,7 +771,7 @@ define <4 x double> @shuffle_v4f64_1032_v2f64(<2 x double> %a, <2 x double> %b) ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; ALL-NEXT: retq %1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %2 = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> @@ -896,7 
+896,7 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_1000: ; AVX1: # %bb.0: ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_1000: @@ -957,7 +957,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_3210: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_3210: @@ -1238,7 +1238,7 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_1054: ; AVX1: # %bb.0: ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_1054: @@ -1272,7 +1272,7 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_3254: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_3254: @@ -1306,7 +1306,7 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_3276: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_3276: @@ -1340,7 +1340,7 @@ define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_1076: ; AVX1: # %bb.0: ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_1076: @@ -1537,7 +1537,7 @@ define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_3333: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,3,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_3333: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index ec55616..ad25c80 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -2613,7 +2613,7 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX1-NEXT: retq ; @@ -3237,7 +3237,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = 
ymm1[0,0,3,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index 7158b40..994ff3b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -132,7 +132,7 @@ define <16 x float> @shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15 define <16 x float> @shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,3,4,5,6,6] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,0,3,3,4,5,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 4f40ae3..de66f67 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -453,7 +453,7 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00234467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -471,7 +471,7 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -480,7 +480,7 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_11335577: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -489,7 +489,7 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10235467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -498,7 +498,7 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10225466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -651,7 +651,7 @@ define <8 
x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10324567: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -660,7 +660,7 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_11334567: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -669,7 +669,7 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01235467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -678,7 +678,7 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01235466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -719,7 +719,7 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_103245uu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -728,7 +728,7 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_1133uu67: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -737,7 +737,7 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_0uu354uu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -746,7 +746,7 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_uuu3uu66: ; ALL: # %bb.0: -; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = 
zmm0[0,0,2,3,4,4,6,6] +; ALL-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 5d7984f..0ad7b89 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -439,7 +439,7 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X86-AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2] +; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,1,2,2] ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5 ; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3] @@ -466,7 +466,7 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] -; X86-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; X86-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] ; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] @@ -512,7 +512,7 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X64-AVX1-LABEL: PR48908: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2] +; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,1,2,2] ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5 ; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3] @@ -536,7 +536,7 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] -; X64-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; X64-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] ; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] @@ -592,7 +592,7 @@ define <4 x i64> @concat_self_v4i64(<2 x i64> %x) { ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3] ; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: concat_self_v4i64: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll index bc6f2c7..483191f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -137,7 +137,7 @@ define <4 x double> @demandedelts_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x ; X86-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; X86-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2 ; X86-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0 -; X86-NEXT: 
vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3] +; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,3] ; X86-NEXT: retl ; ; X64-AVX-LABEL: demandedelts_vpermil2pd256_as_shufpd: @@ -148,7 +148,7 @@ define <4 x double> @demandedelts_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x ; X64-AVX2-LABEL: demandedelts_vpermil2pd256_as_shufpd: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3] -; X64-AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3] +; X64-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,3] ; X64-AVX2-NEXT: retq %res0 = insertelement <4 x i64> , i64 %a2, i32 0 %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %res0, i8 0) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 6e566c0..c87869e6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2235,7 +2235,7 @@ define <4 x float> @combine_undef_input_test9(<4 x float> %a) { ; ; AVX-LABEL: combine_undef_input_test9: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> @@ -2423,7 +2423,7 @@ define <4 x float> @combine_undef_input_test19(<4 x float> %a) { ; ; AVX-LABEL: combine_undef_input_test19: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 279a3b7..a97a82c 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1625,9 +1625,9 @@ define void @splat2_v4f64_load_store(ptr %s, ptr %d) nounwind { ; AVX1-LABEL: splat2_v4f64_load_store: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3] ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi) ; AVX1-NEXT: vmovupd %ymm1, (%rsi) ; AVX1-NEXT: vzeroupper @@ -1662,9 +1662,9 @@ define void @splat2_v4i64_load_store(ptr %s, ptr %d) nounwind { ; AVX1-LABEL: splat2_v4i64_load_store: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3] ; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX1-NEXT: vmovupd %ymm0, 32(%rsi) ; AVX1-NEXT: vmovupd %ymm1, (%rsi) ; AVX1-NEXT: vzeroupper -- 2.7.4