X86FastISel.cpp
X86FixupBWInsts.cpp
X86FixupLEAs.cpp
+ X86FixupInstTuning.cpp
X86AvoidStoreForwardingBlocks.cpp
X86DynAllocaExpander.cpp
X86FixupSetCC.cpp
/// instructions, in order to eliminate execution delays in some processors.
FunctionPass *createX86FixupLEAs();
+/// Return a pass that replaces slower instructions with faster, equivalent
+/// ones.
+FunctionPass *createX86FixupInstTuning();
+
/// Return a pass that removes redundant LEA instructions and redundant address
/// recalculations.
FunctionPass *createX86OptimizeLEAs();
void initializeFPSPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
+void initializeX86FixupInstTuningPassPass(PassRegistry &);
void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
--- /dev/null
+//===-- X86FixupInstTuning.cpp - replace instructions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a tuning pass that replaces slower machine instructions
+// with faster ones. We do this here, as opposed to during normal ISel, because
+// attempting to pick the "right" instruction there can break existing ISel
+// patterns. This pass is not meant to search for special cases where an
+// instruction can sometimes be transformed into another; it is only meant to
+// do transformations where the old instruction is always replaceable with the
+// new one. For example:
+//
+// `vpermq ymm` -> `vshufd ymm`
+// -- BAD, not always valid (the mask may cross 128-bit lanes or not repeat per lane)
+//
+// `vpermilps ymm` -> `vshufd ymm`
+// -- GOOD, always replaceable
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-inst-tuning"
+
+STATISTIC(NumInstChanges, "Number of instruction changes");
+
+namespace {
+class X86FixupInstTuningPass : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86FixupInstTuningPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &I);
+
+ // This pass runs after regalloc and doesn't support VReg operands.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ const X86InstrInfo *TII = nullptr;
+ const X86Subtarget *ST = nullptr;
+};
+} // end anonymous namespace
+
+char X86FixupInstTuningPass::ID = 0;
+
+INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
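+// Note: the pass is registered under the command-line name
+// "x86-fixup-inst-tuning", so (assuming it is initialized along with the rest
+// of the X86 passes) it should be runnable in isolation on MIR input with
+// `llc -run-pass=x86-fixup-inst-tuning` when writing targeted tests.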
+
+FunctionPass *llvm::createX86FixupInstTuning() {
+ return new X86FixupInstTuningPass();
+}
+
+bool X86FixupInstTuningPass::processInstruction(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
+ unsigned Opc = MI.getOpcode();
+ unsigned NumOperands = MI.getDesc().getNumOperands();
+
+  // `vpermilps r, i` -> `vshufps r, r, i`
+  // `vshufps` is always as fast or faster than `vpermilps`, and its VEX
+  // encoding is one byte shorter.
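+  // For example (the mask value below is illustrative; see also the updated
+  // tests in this patch):
+  //   vpermilps $27, %ymm0, %ymm0  ->  vshufps $27, %ymm0, %ymm0, %ymm0
+  // The immediate shuffle mask is kept and the single source register is
+  // simply repeated as both shuffle inputs.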
+ auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
+ unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+ MI.removeOperand(NumOperands - 1);
+ MI.addOperand(MI.getOperand(1));
+ MI.setDesc(TII->get(NewOpc));
+ MI.addOperand(MachineOperand::CreateImm(MaskImm));
+ return true;
+ };
+
+  // `vpermilps m, i` -> `vpshufd m, i` iff there is no domain-crossing delay
+  // penalty on shuffles.
+  // `vpshufd` is always as fast or faster than `vpermilps`, and its VEX
+  // encoding is one byte shorter.
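+  // An illustrative rewrite (the operands are hypothetical):
+  //   vpermilps $3, (%rdi), %xmm0  ->  vpshufd $3, (%rdi), %xmm0
+  // The memory operand and immediate are reused unchanged, so only the opcode
+  // descriptor needs to be swapped.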
+ auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
+    // TODO: Might be worth accepting the bypass delay when -Os/-Oz is enabled,
+    // as `vpshufd` saves a byte of code size.
+ if (!ST->hasNoDomainDelayShuffle())
+ return false;
+ MI.setDesc(TII->get(NewOpc));
+ return true;
+ };
+
+ // TODO: Add masked predicate execution variants.
+ switch (Opc) {
+ case X86::VPERMILPSri:
+ return ProcessVPERMILPSri(X86::VSHUFPSrri);
+ case X86::VPERMILPSYri:
+ return ProcessVPERMILPSri(X86::VSHUFPSYrri);
+ case X86::VPERMILPSZ128ri:
+ return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
+ case X86::VPERMILPSZ256ri:
+ return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
+ case X86::VPERMILPSZri:
+ return ProcessVPERMILPSri(X86::VSHUFPSZrri);
+ case X86::VPERMILPSmi:
+ return ProcessVPERMILPSmi(X86::VPSHUFDmi);
+ case X86::VPERMILPSYmi:
+ // TODO: See if there is a more generic way we can test if the replacement
+ // instruction is supported.
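+    // Note: 256-bit `vpshufd` requires AVX2, whereas 256-bit `vpermilps` only
+    // requires AVX, hence the extra guard here.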
+ return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
+ case X86::VPERMILPSZ128mi:
+ return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
+ case X86::VPERMILPSZ256mi:
+ return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
+ case X86::VPERMILPSZmi:
+ return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
+ default:
+ return false;
+ }
+}
+
+bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);
+ bool Changed = false;
+ ST = &MF.getSubtarget<X86Subtarget>();
+ TII = ST->getInstrInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (processInstruction(MF, MBB, I)) {
+ ++NumInstChanges;
+ Changed = true;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
+ return Changed;
+}
addPass(createX86FixupBWInsts());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
+ addPass(createX86FixupInstTuning());
}
addPass(createX86EvexToVexInsts());
addPass(createX86DiscriminateMemOpsPass());
; AVX1-LABEL: endless_loop:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps (%eax), %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,2]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,2]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,2,2]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,2]
; CHECK-NEXT: vxorps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%A = load <4 x i32>, ptr %pA
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovaps 48(%rdi), %xmm0
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_ps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %res
define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test2_mm_permute_ps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
ret <4 x float> %res
define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_ps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %res
; X86-LABEL: test_mm256_set1_epi32:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X86-LABEL: test_mm256_set1_ps:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_ps:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x float> undef, float %a0, i32 0
define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
; AVX-LABEL: test_x86_avx_vpermil_ps:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps $7, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x07]
+; AVX-NEXT: vshufps $7, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x07]
; AVX-NEXT: # xmm0 = xmm0[3,1,0,0]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermil_ps:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps $7, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x07]
+; AVX512VL-NEXT: vshufps $7, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x07]
; AVX512VL-NEXT: # xmm0 = xmm0[3,1,0,0]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
; AVX-LABEL: test_x86_avx_vpermil_ps_256:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x07]
+; AVX-NEXT: vshufps $7, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0xc6,0xc0,0x07]
; AVX-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermil_ps_256:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x07]
+; AVX512VL-NEXT: vshufps $7, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc0,0x07]
; AVX512VL-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcG:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
; CHECK-LABEL: funcH:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
; CHECK-NEXT: ret{{[l|q]}}
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
define <4 x i32> @H(<4 x i32> %a) {
; X86-LABEL: H:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT: retl
;
; X64-LABEL: H:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: retq
entry:
%x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; X64: ## %bb.0: ## %bb
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: movq2dq %mm0, %xmm0
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
bb:
%tmp1 = bitcast x86_mmx %tmp to i64
; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: vmovups %xmm0, (%rax)
; CHECK-NEXT: retq
allocas:
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
define <4 x i64> @f32to4sl(<4 x float> %a) {
; NODQ-LABEL: f32to4sl:
; NODQ: # %bb.0:
-; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; NODQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; NODQ-NEXT: vcvttss2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; NOVL: # %bb.0:
; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
; NOVL-NEXT: retq
;
define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
ret <16 x float> %res
define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <8 x i64> %a0 to <16 x i32>
%res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpermilps $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x16]
+; CHECK-NEXT: vshufps $22, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xc0,0x16]
; CHECK-NEXT: ## zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_pshuf_d_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpermilps $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x03]
+; CHECK-NEXT: vshufps $3, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xc0,0x03]
; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
define <4 x float> @test_4xfloat_perm_mask0(<4 x float> %vec) {
; CHECK-LABEL: test_4xfloat_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,1]
; CHECK-NEXT: retq
%res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 1>
ret <4 x float> %res
define <4 x float> @test_4xfloat_perm_mask3(<4 x float> %vec) {
; CHECK-LABEL: test_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,2]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,2]
; CHECK-NEXT: retq
%res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
ret <4 x float> %res
define <8 x float> @test_8xfloat_perm_imm_mask3(<8 x float> %vec) {
; CHECK-LABEL: test_8xfloat_perm_imm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4]
; CHECK-NEXT: retq
%res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 2, i32 1, i32 0, i32 6, i32 6, i32 5, i32 4>
ret <8 x float> %res
define <16 x float> @test_16xfloat_perm_imm_mask3(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_perm_imm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
; CHECK-NEXT: retq
%res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 2, i32 5, i32 5, i32 4, i32 6, i32 9, i32 9, i32 8, i32 10, i32 13, i32 13, i32 12, i32 14>
ret <16 x float> %res
define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
; CHECK-LABEL: test_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,3,0]
; CHECK-NEXT: retq
%res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
ret <4 x i32> %res
define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
; CHECK-LABEL: test_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,3]
; CHECK-NEXT: retq
%res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
ret <4 x i32> %res
define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
ret <8 x i32> %res
define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
ret <8 x i32> %res
define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
ret <16 x i32> %res
define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
ret <16 x i32> %res
define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 {
; ALL-LABEL: trunc_qd_128:
; ALL: ## %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; ALL-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i32>
ret <2 x i32> %x
define void @trunc_qd_128_mem(<2 x i64> %i, ptr %res) #0 {
; KNL-LABEL: trunc_qd_128_mem:
; KNL: ## %bb.0:
-; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL-NEXT: vmovlps %xmm0, (%rdi)
; KNL-NEXT: retq
;
; AVX512-LABEL: test46:
; AVX512: ## %bb.0:
; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
-; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
+; AVX512-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4]
; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3]
; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A]
; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
define half @extract_f16_6(<8 x half> %x) {
; CHECK-LABEL: extract_f16_6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 6
ret half %res
define void @extract_store_f16_6(<8 x half> %x, ptr %y) {
; X64-LABEL: extract_store_f16_6:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 6
; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpextrq $1, %xmm4, %rax
; CHECK-NEXT: vmovsh %xmm3, (%rax)
-; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: vmovsh %xmm3, (%rax)
; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpextrq $1, %xmm3, %rax
; CHECK-NEXT: vmovsh %xmm1, (%rax)
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; CHECK-NEXT: vextracti32x4 $3, %zmm2, %xmm2
; CHECK-NEXT: vmovq %xmm2, %rax
; CHECK-NEXT: vmovsh %xmm1, (%rax)
define <8 x float>@test_int_x86_avx512_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_256:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x16]
+; CHECK-NEXT: vshufps $22, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc0,0x16]
; CHECK-NEXT: # ymm0 = ymm0[2,1,1,0,6,5,5,4]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
define <4 x float>@test_int_x86_avx512_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_128:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x16]
+; CHECK-NEXT: vshufps $22, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x16]
; CHECK-NEXT: # xmm0 = xmm0[2,1,1,0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
define <4 x i32>@test_int_x86_avx512_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_pshuf_d_128:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x03]
+; CHECK-NEXT: vshufps $3, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x03]
; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
define <8 x i32>@test_int_x86_avx512_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_pshuf_d_256:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x03]
+; CHECK-NEXT: vshufps $3, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc0,0x03]
; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
;
; AVX-LABEL: extract1_i32_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX-LABEL: extract2_i32_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX-LABEL: extract0_i32_zext_insert1_i64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-NEXT: retq
; AVX1-LABEL: neg_scalar_broadcast_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3]
;
; AVX1-LABEL: casted_neg_scalar_broadcast_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vmovaps (%rsi), %ymm1
; AVX1-NEXT: vmovaps (%rdx), %ymm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7]
define <4 x float> @vec128_eltty_float_source_subvec_0_target_subvec_mask_1_unary(<4 x float> %x) nounwind {
; CHECK-LABEL: vec128_eltty_float_source_subvec_0_target_subvec_mask_1_unary:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; CHECK-NEXT: retq
%r = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
ret <4 x float> %r
define <4 x i32> @vec128_eltty_i32_source_subvec_0_target_subvec_mask_1_unary(<4 x i32> %x) nounwind {
; CHECK-LABEL: vec128_eltty_i32_source_subvec_0_target_subvec_mask_1_unary:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; CHECK-NEXT: retq
%r = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
ret <4 x i32> %r
define <4 x i64> @vec256_eltty_i64_source_subvec_1_target_subvec_mask_2_binary(<4 x i64> %x, <4 x i64> %y) nounwind {
; CHECK-LABEL: vec256_eltty_i64_source_subvec_1_target_subvec_mask_2_binary:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; CHECK-NEXT: retq
%r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
;
; CHECK-FAST-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_2_binary:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; CHECK-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; CHECK-FAST-NEXT: retq
%r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 12>
;
; CHECK-FAST-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_binary:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; CHECK-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; CHECK-FAST-NEXT: retq
%r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 12>
; CHECK-SLOW-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_3_binary:
; CHECK-SLOW: # %bb.0:
; CHECK-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; CHECK-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; CHECK-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
; CHECK-SLOW-NEXT: retq
;
;
; AVX1-LABEL: catcat:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2]
; AVX-X86-LABEL: extract_i64_1:
; AVX-X86: # %bb.0:
; AVX-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-X86-NEXT: vmovlps %xmm0, (%eax)
; AVX-X86-NEXT: retl
;
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vy = insertelement <4 x float> undef, float %y, i32 0
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vy = insertelement <4 x float> undef, float %y, i32 0
; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT: retq
; NOFMA-NEXT: vsubss %xmm9, %xmm8, %xmm8
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm1, %xmm1
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT: vsubss %xmm15, %xmm14, %xmm14
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm7[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm1, %xmm1
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[3,3,3,3]
-; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm11[3,3,3,3]
+; NOFMA-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-LABEL: not_a_hsub_2:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
%lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse3_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
%shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT: vmovaps %ymm3, %ymm0
; AVX2-NEXT: retq
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vmovaps %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovaps %ymm3, %ymm0
; AVX2-NEXT: retq
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: test8_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX-FAST-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-SLOW-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_030:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_030:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
; AVX-LABEL: add_ps_016:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3]
; AVX-NEXT: retq
%3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 0, i32 6>
%4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 1, i32 7>
;
; AVX-SLOW-LABEL: add_ps_017:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
;
; AVX-SLOW-LABEL: PR45747_1:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-SLOW-LABEL: PR45747_2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,1,1]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <4 x float> %x, i32 2
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX1-NEXT: vmovd %xmm0, %eax
; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-AVX2-NEXT: vmovd %xmm0, %eax
; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, %rax
;
; X86-AVX1-LABEL: test_reduce_v2i64:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
;
; X86-AVX1-LABEL: test_reduce_v2i64:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
; X64-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX-LABEL: test_unpacklo_hadd_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vhaddps %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: ret{{[l|q]}}
%5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4
%6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4
; AVX-LABEL: test_unpackhi_hadd_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vhaddps %xmm3, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: ret{{[l|q]}}
%5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4
%6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4
; AVX-LABEL: test_unpacklo_hsub_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vhsubps %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: ret{{[l|q]}}
%5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4
%6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4
; AVX-LABEL: test_unpackhi_hsub_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vhsubps %xmm3, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: ret{{[l|q]}}
%5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4
%6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4
; AVX-LABEL: test_unpacklo_hadd_v4f32_unary:
; AVX: ## %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: ret{{[l|q]}}
%2 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) #4
%3 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; CHECK-LABEL: test_unpacklo_hadd_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vhaddps %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; CHECK-NEXT: ret{{[l|q]}}
%5 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %1) #4
%6 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %2, <8 x float> %3) #4
; CHECK-LABEL: test_unpackhi_hadd_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vhaddps %ymm3, %ymm1, %ymm0
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; CHECK-NEXT: ret{{[l|q]}}
%5 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %1) #4
%6 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %2, <8 x float> %3) #4
; CHECK-LABEL: test_unpacklo_hsub_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vhsubps %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; CHECK-NEXT: ret{{[l|q]}}
%5 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %0, <8 x float> %1) #4
%6 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %2, <8 x float> %3) #4
; CHECK-LABEL: test_unpackhi_hsub_v8f32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vhsubps %ymm3, %ymm1, %ymm0
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; CHECK-NEXT: ret{{[l|q]}}
%5 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %0, <8 x float> %1) #4
%6 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %2, <8 x float> %3) #4
; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vhaddps %ymm3, %ymm2, %ymm1
; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: retq
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT: retq
; X86-AVX-NEXT: # xmm1 = mem[0,0]
; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; X64-AVX-NEXT: retq
;
; AVX1-LABEL: arg_f32_v4f32_undef:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_f32_v4f32_undef:
;
; AVX1-LABEL: arg_f32_v8f32_undef:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1-LABEL: arg_f32_v4f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
;
; AVX1-LABEL: arg_f32_v8f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X86-LABEL: knownbits_mask_shuffle_uitofp:
; X86: # %bb.0:
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: knownbits_mask_shuffle_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
define <4 x float> @knownbits_mask_xor_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X86-LABEL: knownbits_mask_xor_shuffle_uitofp:
; X86: # %bb.0:
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
;
; X64-LABEL: knownbits_mask_xor_shuffle_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; X86-LABEL: knownbits_mask_concat_uitofp:
; X86: # %bb.0:
-; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3]
+; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; X86-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071]
; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X86-NEXT: vandps %xmm2, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
;
; X64-LABEL: knownbits_mask_concat_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3]
+; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; X64-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071]
; X64-NEXT: vandps %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X64-NEXT: vandps %xmm2, %xmm0, %xmm0
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3
; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a2, <i32 65535, i32 -1, i32 255, i32 -1>
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3
; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = lshr <4 x i32> %a2, <i32 5, i32 1, i32 5, i32 1>
; X86-LABEL: signbits_ashr_extract_sitofp_0:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
;
; X64-LABEL: signbits_ashr_extract_sitofp_0:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = ashr <2 x i64> %a0, <i64 32, i64 32>
; X86-LABEL: signbits_ashr_extract_sitofp_1:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
;
; X64-LABEL: signbits_ashr_extract_sitofp_1:
; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = ashr <2 x i64> %a0, <i64 32, i64 63>
define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = ashr <2 x i64> %a0, <i64 16, i64 16>
; X86-NEXT: vpsrad $1, %xmm2, %xmm2
; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; X86-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3]
+; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
; X86-NEXT: vextractf128 $1, %ymm1, %xmm1
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
-; X64-AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; X64-AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax)
; X86-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
-; X86-AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X86-AVX512-NEXT: vmovlps %xmm0, 88(%eax)
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT: vmulps %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX1-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX2-NEXT: vmulps %xmm4, %xmm2, %xmm4
; AVX2-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX2-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulps %xmm4, %xmm2, %xmm4
; AVX512-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX512-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[1,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4
; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9
; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11
; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7
; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm9[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3]
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2]
; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0
; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2
; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6
; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9
; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm0[1,0]
; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11
; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm7 = xmm8[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm8[1,0]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2]
; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0
; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2
; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
;
; AVX1-LABEL: test_mul4x4_f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
; AVX1-NEXT: vmulps %ymm4, %ymm5, %ymm4
-; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
; AVX1-NEXT: vmulps %ymm6, %ymm7, %ymm0
; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm6
; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4
; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
; AVX1-NEXT: vmulps %ymm2, %ymm5, %ymm2
-; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vmulps %ymm4, %ymm7, %ymm4
; AVX1-NEXT: vaddps %ymm2, %ymm4, %ymm2
-; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4
; AVX1-NEXT: vaddps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_mul4x4_f32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,3,2,3]
; AVX2-NEXT: vmulps %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,0,1]
; AVX2-NEXT: vmulps %ymm6, %ymm7, %ymm0
; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,1,0,1]
; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4
; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmulps %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
; AVX2-NEXT: vmulps %ymm4, %ymm7, %ymm4
; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4
; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
; AVX2-NEXT: vmulps %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
; AVX512-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm2
-; AVX512-NEXT: vpermilps {{.*#+}} zmm4 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
+; AVX512-NEXT: vshufps {{.*#+}} zmm4 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm5 = zmm0[2,3,2,3,2,3,2,3]
; AVX512-NEXT: vmulps %zmm4, %zmm5, %zmm4
-; AVX512-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1]
; AVX512-NEXT: vmulps %zmm1, %zmm3, %zmm1
; AVX512-NEXT: vaddps %zmm4, %zmm1, %zmm1
-; AVX512-NEXT: vpermilps {{.*#+}} zmm3 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512-NEXT: vshufps {{.*#+}} zmm3 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm4 = zmm0[4,5,4,5,4,5,4,5]
; AVX512-NEXT: vmulps %zmm3, %zmm4, %zmm3
; AVX512-NEXT: vaddps %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpermilps {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512-NEXT: vshufps {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512-NEXT: vmulps %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0
;
; AVX512F-LABEL: test_mul8x8_f32:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm9
; AVX512F-NEXT: vextractf64x4 $1, %zmm4, %ymm8
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10
; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm9, %zmm16
; AVX512F-NEXT: vbroadcastss %xmm4, %ymm10
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm8[1,1,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm18
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm4[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm13
; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm12, %zmm19
; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm12
; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm14 = xmm5[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm13, %zmm20
; AVX512F-NEXT: vextractf64x4 $1, %zmm5, %ymm13
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm15 = xmm13[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT: vinsertf64x4 $1, %ymm15, %zmm14, %zmm21
; AVX512F-NEXT: vbroadcastss %xmm5, %ymm15
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm13[1,1,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10
; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm15, %zmm10
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm15 = xmm5[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm5[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm11
; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm13[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm15, %zmm15
; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm13
; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm6[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm11, %zmm11
; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm13
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm13[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm8, %zmm8
; AVX512F-NEXT: vbroadcastss %xmm6, %ymm12
; AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm26
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm27 = zmm2[4,5,6,7,4,5,6,7]
; AVX512F-NEXT: vmovshdup {{.*#+}} zmm0 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512F-NEXT: vinsertf64x4 $1, %ymm3, %zmm3, %zmm4
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7]
; AVX512F-NEXT: vmulps %zmm17, %zmm14, %zmm17
; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm2, %zmm27, %zmm2
; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm2 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm2, %zmm4, %zmm2
; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm2 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm2, %zmm3, %zmm2
; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
; AVX512F-NEXT: vmulps %zmm8, %zmm24, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm5, %ymm5
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm13[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm5, %zmm5
; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm8
; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm5
; AVX512F-NEXT: vbroadcastss %xmm5, %ymm5
; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm8, %zmm5
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm7[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
; AVX512F-NEXT: vmulps %zmm5, %zmm26, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm5, %zmm27, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm5 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm5 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm5, %zmm4, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
; AVX512F-NEXT: vextractf64x4 $1, %zmm7, %ymm5
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm6, %zmm3, %zmm6
; AVX512F-NEXT: vaddps %zmm6, %zmm2, %zmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX512F-NEXT: vinsertf64x4 $1, %ymm6, %zmm8, %zmm6
; AVX512F-NEXT: vbroadcastss %xmm7, %ymm8
; AVX512F-NEXT: vmulps %zmm9, %zmm23, %zmm9
; AVX512F-NEXT: vaddps %zmm9, %zmm8, %zmm8
; AVX512F-NEXT: vaddps %zmm6, %zmm8, %zmm6
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm7[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm8, %zmm5
; AVX512F-NEXT: vmulps %zmm5, %zmm25, %zmm5
; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm6, %zmm27, %zmm6
; AVX512F-NEXT: vaddps %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm6 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm6 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm6, %zmm4, %zmm4
; AVX512F-NEXT: vaddps %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vshufps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
; AVX512F-NEXT: vmulps %zmm5, %zmm3, %zmm3
; AVX512F-NEXT: vaddps %zmm3, %zmm4, %zmm3
; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm11
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm4[1,1,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm10
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm14
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm4[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm9
; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm8
; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8
; AVX512VL-NEXT: vextractf64x4 $1, %zmm4, %ymm12
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm12[1,1,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm16 = xmm12[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm12[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm12, %ymm12
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm16, %zmm14, %zmm16
; AVX512VL-NEXT: vextractf32x4 $3, %zmm4, %xmm14
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm16
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm12, %zmm9, %zmm12
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm9, %ymm18
; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm9 = zmm1[4,5,6,7,4,5,6,7]
; AVX512VL-NEXT: vmulps %zmm12, %zmm9, %zmm1
; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm12 = zmm2[4,5,6,7,4,5,6,7]
; AVX512VL-NEXT: vmulps %zmm1, %zmm12, %zmm1
; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm3, %zmm3, %zmm13
; AVX512VL-NEXT: vmulps %zmm1, %zmm13, %zmm1
; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vextractf64x4 $1, %zmm5, %ymm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7]
; AVX512VL-NEXT: vmulps %zmm2, %zmm3, %zmm2
; AVX512VL-NEXT: vaddps %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm18, %zmm2
; AVX512VL-NEXT: vextractf32x4 $2, %zmm5, %xmm4
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm1[1,1,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm15, %zmm16, %zmm15
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm16 = xmm5[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm5[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
; AVX512VL-NEXT: vmulps %zmm4, %zmm11, %zmm4
; AVX512VL-NEXT: vmulps %zmm15, %zmm10, %zmm15
; AVX512VL-NEXT: vaddps %zmm15, %zmm4, %zmm4
; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm15
; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX512VL-NEXT: vmulps %zmm2, %zmm14, %zmm2
; AVX512VL-NEXT: vaddps %zmm2, %zmm4, %zmm2
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm6[1,1,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm15, %zmm4
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm15 = xmm6[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT: vmulps %zmm4, %zmm8, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm4, %zmm12, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm4 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm4, %zmm13, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vextractf64x4 $1, %zmm6, %ymm4
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm5 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm5
; AVX512VL-NEXT: vaddps %zmm5, %zmm1, %zmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm15, %zmm5
; AVX512VL-NEXT: vextractf32x4 $2, %zmm6, %xmm15
; AVX512VL-NEXT: vaddps %zmm2, %zmm15, %zmm2
; AVX512VL-NEXT: vmulps %zmm5, %zmm14, %zmm5
; AVX512VL-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm5
; AVX512VL-NEXT: vextractf32x4 $3, %zmm6, %xmm4
; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm7[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX512VL-NEXT: vmulps %zmm4, %zmm8, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm4, %zmm12, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm4 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm4, %zmm13, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2
; AVX512VL-NEXT: vextractf64x4 $1, %zmm7, %ymm4
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm6, %zmm3, %zmm6
; AVX512VL-NEXT: vaddps %zmm6, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[2,2,2,2]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm6, %ymm6
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm6, %zmm5, %zmm5
; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm6
; AVX512VL-NEXT: vmulps %zmm11, %zmm10, %zmm10
; AVX512VL-NEXT: vaddps %zmm10, %zmm6, %zmm6
; AVX512VL-NEXT: vaddps %zmm5, %zmm6, %zmm5
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm6, %ymm6
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm6, %zmm4
; AVX512VL-NEXT: vmulps %zmm4, %zmm9, %zmm4
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm5, %zmm12, %zmm5
; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm5 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm5, %zmm13, %zmm5
; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vshufps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm3
; AVX512VL-NEXT: vaddps %zmm3, %zmm4, %zmm3
;
; AVX-LABEL: v5i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,2,3]
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi)
; AVX-NEXT: vmovaps %xmm1, (%rdi)
; AVX-LABEL: v5f32:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi)
; AVX-NEXT: vmovaps %xmm1, (%rdi)
; AVX-NEXT: retq
; AVX-LABEL: v7i32:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vmovss %xmm1, 24(%rdi)
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vmovaps %xmm2, (%rdi)
; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm2, (%rdi)
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi)
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = <u,3,7,u,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdi)
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdi)
; XOP: # %bb.0:
; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
-; XOP-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1]
+; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
-; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; XOP-NEXT: vmovaps %xmm0, 32(%rdi)
; XOP-NEXT: vmovaps %ymm2, (%rdi)
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
; AVX1-NEXT: vmovups 16(%rdi), %xmm6
; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsi)
; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
-; XOP-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
; XOP-NEXT: vmovups 16(%rdi), %xmm6
; XOP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
-; XOP-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
+; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm3
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm5
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovups (%rdx), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups (%rcx), %ymm2
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; XOP-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: vpsrld $16, %xmm0, %xmm0
; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
-; XOP-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
+; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: X86 Atom pad short functions
; CHECK-NEXT: X86 LEA Fixup
+; CHECK-NEXT: X86 Fixup Inst Tuning
; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible
; CHECK-NEXT: X86 Discriminate Memory Operands
; CHECK-NEXT: X86 Insert Cache Prefetches
; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vzeroupper
;
; CHECK-AVX-LABEL: test1:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,0]
; CHECK-AVX-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 >
ret <4 x i32> %C
; CHECK-NEXT: vmovaps G2(%rip), %xmm0
; CHECK-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; CHECK-NEXT: retq
entry:
%V = load <2 x float>, ptr @G1, align 8
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7]
; CHECK-NEXT: retq
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,1,0]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vpsrad $2, %xmm2, %xmm2
; CHECK-NEXT: vcvtdq2ps %ymm2, %ymm2
-; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; CHECK-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
; CHECK-NEXT: vmaskmovps %ymm2, %ymm1, 32(%rdi)
define <4 x i32> @rot_v4i32_zero_non_splat(<4 x i32> %x) {
; XOPAVX1-LABEL: rot_v4i32_zero_non_splat:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: rot_v4i32_zero_non_splat:
; AVX-LABEL: fmul_splat_splat_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
%splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer
%splaty = shufflevector <4 x float> %vy, <4 x float> undef, <4 x i32> zeroinitializer
; AVX-LABEL: fdiv_splat_splat_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
%splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer
; AVX-LABEL: fmul_splat_const_op1_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
%splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer
%r = fmul fast <4 x float> %splatx, <float 17.0, float 17.0, float 17.0, float 17.0>
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
%splatx = shufflevector <8 x float> <float 4.5, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, <8 x float> undef, <8 x i32> zeroinitializer
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
%splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer
; AVX-LABEL: splat0_fmul_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
%b = fmul fast <4 x float> %vx, %vy
%r = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
; AVX-LABEL: splat0_fdiv_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
%b = fdiv fast <8 x float> %vx, %vy
; AVX-LABEL: splat0_fmul_const_op1_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
%b = fmul fast <4 x float> %vx, <float 6.0, float -1.0, float 1.0, float 7.0>
%r = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
;
; AVX-LABEL: splat0_fdiv_const_op1_v8f32:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
%b = fdiv fast <8 x float> %vx, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
%b = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %vx
; AVX-LABEL: multi_use_binop:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
; X64-AVX2-NEXT: pushq %rax
; X64-AVX2-NEXT: movl $63, %edi
; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-AVX2-NEXT: popq %rax
; X64-AVX2-NEXT: retq
;
; X86-AVX2-NEXT: pushl $63
; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT: addl $4, %esp
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X86-AVX2-NEXT: retl
%i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
%i2 = bitcast <2 x i64> %i1 to <4 x i32>
; X64-AVX2-NEXT: pushq %rax
; X64-AVX2-NEXT: movl $63, %edi
; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT
-; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: popq %rax
; X64-AVX2-NEXT: retq
;
; X86-AVX2-NEXT: pushl $63
; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT: addl $4, %esp
-; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: retl
%i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
%i2 = bitcast <2 x i64> %i1 to <2 x i64>
define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask3:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3>
define <4 x i32> @undef_splatmask4(<4 x i32> %v, ptr %p) nounwind {
; AVX2-LABEL: undef_splatmask4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rdi)
; AVX2-NEXT: vmovaps %xmm1, %xmm0
; AVX2-NEXT: retq
; AVX-NEXT: vmovapd (%rdi), %xmm0
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2
; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmovapd %xmm0, (%rdi)
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
;
; X64-AVX1-LABEL: test_mm_set_ps1:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: retq # encoding: [0xc3]
;
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
;
; X64-AVX1-LABEL: test_mm_set1_ps:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: retq # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store_ps1:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store_ps1:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT: retq # encoding: [0xc3]
; X86-AVX1-LABEL: test_mm_store1_ps:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store1_ps:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT: retq # encoding: [0xc3]
; X86-AVX1-LABEL: test_mm_storer_ps:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
+; X86-AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b]
; X86-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0]
; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
; X86-AVX512-LABEL: test_mm_storer_ps:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
+; X86-AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x1b]
; X86-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0]
; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT: retl # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storer_ps:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
+; X64-AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b]
; X64-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0]
; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storer_ps:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
+; X64-AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x1b]
; X64-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0]
; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT: retq # encoding: [0xc3]
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
;
; AVX1-LABEL: test_mm_shuffle_epi32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
;
; AVX1-LABEL: test_x86_sse2_pshuf_d:
; AVX1: ## %bb.0: ## %entry
-; AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
+; AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x1b]
; AVX1-NEXT: ## xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: test_x86_sse2_pshuf_d:
; AVX512: ## %bb.0: ## %entry
-; AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
+; AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x1b]
; AVX512-NEXT: ## xmm0 = xmm0[3,2,1,0]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
entry:
; X86-AVX-LABEL: test4:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
;
; X64-AVX-LABEL: test4:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT: retq
%tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovaps (%edx), %xmm0
; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
-; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdx), %xmm0
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
-; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT: retq
%tmp3 = load <4 x float>, ptr %B ; <<4 x float>> [#uses=1]
;
; AVX1-LABEL: test13:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX512-LABEL: test13:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss %xmm2, %xmm1
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX-NEXT: retq
; X86-AVX1-LABEL: ext_1:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50]
-; X86-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X86-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512-LABEL: ext_1:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50]
-; X86-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X86-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
;
; X64-AVX1-LABEL: ext_1:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X64-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X64-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
;
; X64-AVX512-LABEL: ext_1:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X64-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X64-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X86-AVX1-LABEL: ext_2:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50]
-; X86-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX512-LABEL: ext_2:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50]
-; X86-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X86-AVX512-NEXT: vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24]
;
; X64-AVX1-LABEL: ext_2:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X64-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: ext_2:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; X64-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%s = extractelement <4 x float> %v, i32 3
;
; AVX1-LABEL: insertps_from_shufflevector_i32_2:
; AVX1: ## %bb.0: ## %entry
-; AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
;
; AVX512-LABEL: insertps_from_shufflevector_i32_2:
; AVX512: ## %bb.0: ## %entry
-; AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
;
; AVX1-LABEL: i32_shuf_XYY0:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
+; AVX1-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4]
; AVX1-NEXT: ## xmm0 = xmm0[0,1,1,3]
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
;
; AVX512-LABEL: i32_shuf_XYY0:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
+; AVX512-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xd4]
; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3]
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
;
; AVX1-LABEL: i32_shuf_XYW0:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpermilps $244, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4]
+; AVX1-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xf4]
; AVX1-NEXT: ## xmm0 = xmm0[0,1,3,3]
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
;
; AVX512-LABEL: i32_shuf_XYW0:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpermilps $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4]
+; AVX512-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xf4]
; AVX512-NEXT: ## xmm0 = xmm0[0,1,3,3]
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
;
; AVX1-LABEL: i32_shuf_W00W:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
;
; AVX512-LABEL: i32_shuf_W00W:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff]
+; AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff]
; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
; AVX1-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX1-NEXT: vpermilps $0, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x00]
+; AVX1-NEXT: vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00]
; AVX1-NEXT: ## xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-LABEL: i32_shuf_X00X:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
+; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; X86-AVX1-LABEL: insertps_pr20411:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X86-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-AVX512-LABEL: insertps_pr20411:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X86-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
;
; X64-AVX1-LABEL: insertps_pr20411:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X64-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
;
; X64-AVX512-LABEL: insertps_pr20411:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee]
+; X64-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3]
; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
define <8 x i32> @swizzle_2(<8 x i32> %v) {
; CHECK-LABEL: swizzle_2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
define <8 x i32> @swizzle_3(<8 x i32> %v) {
; CHECK-LABEL: swizzle_3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
define <16 x float> @transform_VPERMILPSZrr(<16 x float> %a) nounwind {
; CHECK-LABEL: transform_VPERMILPSZrr:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
%shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
ret <16 x float> %shufp
define <8 x float> @transform_VPERMILPSYrr(<8 x float> %a) nounwind {
; CHECK-LABEL: transform_VPERMILPSYrr:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shufp
define <4 x float> @transform_VPERMILPSrr(<4 x float> %a) nounwind {
; CHECK-LABEL: transform_VPERMILPSrr:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shufp
}
define <16 x float> @transform_VPERMILPSZrm(ptr %ap) nounwind {
-; CHECK-LABEL: transform_VPERMILPSZrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSZrm:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSZrm:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSZrm:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSZrm:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-ZNVER4-NEXT: retq
%a = load <16 x float>, ptr %ap
%shufp = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
ret <16 x float> %shufp
}
define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind {
-; CHECK-LABEL: transform_VPERMILPSYrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSYrm:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSYrm:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSYrm:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSYrm:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-ZNVER4-NEXT: retq
%a = load <8 x float>, ptr %ap
%shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shufp
}
define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
-; CHECK-LABEL: transform_VPERMILPSrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
-; CHECK-NEXT: retq
+; CHECK-ICX-LABEL: transform_VPERMILPSrm:
+; CHECK-ICX: # %bb.0:
+; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-ICX-NEXT: retq
+;
+; CHECK-V4-LABEL: transform_VPERMILPSrm:
+; CHECK-V4: # %bb.0:
+; CHECK-V4-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-V4-NEXT: retq
+;
+; CHECK-AVX512-LABEL: transform_VPERMILPSrm:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-ZNVER4-LABEL: transform_VPERMILPSrm:
+; CHECK-ZNVER4: # %bb.0:
+; CHECK-ZNVER4-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-ZNVER4-NEXT: retq
%a = load <4 x float>, ptr %ap
%shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shufp
%res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> %b
ret <4 x float> %res
}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-AVX512: {{.*}}
-; CHECK-ICX: {{.*}}
-; CHECK-V4: {{.*}}
-; CHECK-ZNVER4: {{.*}}
define <8 x float> @transform_VPERMILPSYrr(<8 x float> %a) nounwind {
; CHECK-LABEL: transform_VPERMILPSYrr:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT: retq
%shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shufp
define <4 x float> @transform_VPERMILPSrr(<4 x float> %a) nounwind {
; CHECK-LABEL: transform_VPERMILPSrr:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
%shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shufp
}
define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind {
-; CHECK-LABEL: transform_VPERMILPSYrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: retq
+; CHECK-AVX1-LABEL: transform_VPERMILPSYrm:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX1-DELAY-LABEL: transform_VPERMILPSYrm:
+; CHECK-AVX1-DELAY: # %bb.0:
+; CHECK-AVX1-DELAY-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX1-DELAY-NEXT: retq
+;
+; CHECK-AVX2-LABEL: transform_VPERMILPSYrm:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX2-DELAY-LABEL: transform_VPERMILPSYrm:
+; CHECK-AVX2-DELAY: # %bb.0:
+; CHECK-AVX2-DELAY-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-AVX2-DELAY-NEXT: retq
%a = load <8 x float>, ptr %ap
%shufp = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shufp
}
define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind {
-; CHECK-LABEL: transform_VPERMILPSrm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
-; CHECK-NEXT: retq
+; CHECK-AVX1-LABEL: transform_VPERMILPSrm:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX1-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-AVX1-DELAY: # %bb.0:
+; CHECK-AVX1-DELAY-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-AVX1-DELAY-NEXT: retq
+;
+; CHECK-AVX2-LABEL: transform_VPERMILPSrm:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX2-DELAY-LABEL: transform_VPERMILPSrm:
+; CHECK-AVX2-DELAY: # %bb.0:
+; CHECK-AVX2-DELAY-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; CHECK-AVX2-DELAY-NEXT: retq
%a = load <4 x float>, ptr %ap
%shufp = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shufp
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-AVX1: {{.*}}
-; CHECK-AVX1-DELAY: {{.*}}
-; CHECK-AVX2: {{.*}}
-; CHECK-AVX2-DELAY: {{.*}}
;
; AVX-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-64-NEXT: vcvttss2si %xmm1, %rax
; AVX-64-NEXT: vmovq %rax, %xmm1
; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX512F-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64:
; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512F-64-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm1
; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX512VL-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64:
; AVX512VL-64: # %bb.0:
-; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-32-NEXT: movzbl %al, %eax
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-32-NEXT: vcomiss %xmm1, %xmm2
; AVX-32-NEXT: vmovaps %xmm1, %xmm3
; AVX-32-NEXT: jae .LBB3_4
;
; AVX-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-64-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-64-NEXT: vcomiss %xmm1, %xmm3
; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512F-32-NEXT: andl $-8, %esp
; AVX512F-32-NEXT: subl $40, %esp
; AVX512F-32-NEXT: .cfi_offset %ebx, -12
-; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512F-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: xorl %eax, %eax
; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2
;
; AVX512F-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64:
; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512F-64-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm1
; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512VL-32-NEXT: andl $-8, %esp
; AVX512VL-32-NEXT: subl $40, %esp
; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
;
; AVX512VL-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64:
; AVX512VL-64: # %bb.0:
-; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512VL-64-LABEL: strict_vector_fptosi_v8f32_to_v8i64:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512VL-32-NEXT: .cfi_offset %edi, -16
; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,3,3,3]
+; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z}
; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %al
; AVX512VL-64-LABEL: strict_vector_fptoui_v8f32_to_v8i64:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $24, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstps (%esp)
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm1
-; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512DQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX512DQ-32-NEXT: vzeroupper
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $24, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm1
-; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512DQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX512DQ-32-NEXT: vzeroupper
;
; AVX1-32-LABEL: uitofp_v2i1_v2f64:
; AVX1-32: # %bb.0:
-; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX1-32-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: uitofp_v2i1_v2f64:
; AVX1-64: # %bb.0:
-; AVX1-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-64-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX1-64-NEXT: retq
;
; AVX512F-LABEL: uitofp_v2i1_v2f64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; AVX512F-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0
;
; AVX512DQ-LABEL: uitofp_v2i1_v2f64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512DQ-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0
;
; AVX512DQVL-32-LABEL: uitofp_v2i1_v2f64:
; AVX512DQVL-32: # %bb.0:
-; AVX512DQVL-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512DQVL-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512DQVL-32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; AVX512DQVL-32-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX512DQVL-32-NEXT: retl
;
; AVX512DQVL-64-LABEL: uitofp_v2i1_v2f64:
; AVX512DQVL-64: # %bb.0:
-; AVX512DQVL-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512DQVL-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512DQVL-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512DQVL-64-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX512DQVL-64-NEXT: retq
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $32, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $32, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $64, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $64, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $48, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX-32-NEXT: fstps {{[0-9]+}}(%esp)
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $48, %esp
; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, %eax
; AVX-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: subl $128, %esp
; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm0
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: subl $128, %esp
; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2
; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractps $1, %xmm3, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: andl $-8, %esp
; NODQ-32-NEXT: subl $96, %esp
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: andl $-8, %esp
; NODQ-32-NEXT: subl $96, %esp
; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm3
; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3]
; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: vextractps $1, %xmm0, %eax
; NODQ-32-NEXT: shrl $31, %eax
;
; AVX1-LABEL: fptosi_4f32_to_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX2-LABEL: fptosi_4f32_to_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX512F-LABEL: fptosi_4f32_to_4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX512VL-LABEL: fptosi_4f32_to_4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX1-LABEL: fptosi_8f32_to_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX2-LABEL: fptosi_8f32_to_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512F-NEXT: vcvttss2si %xmm1, %rdx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512F-NEXT: vcvttss2si %xmm0, %rsi
; AVX512F-NEXT: vmovq %rsi, %xmm0
; AVX512F-NEXT: vmovq %rdx, %xmm1
; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi
; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: vmovq %rdx, %xmm1
;
; AVX1-LABEL: fptoui_4f32_to_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vcvttss2si %xmm3, %rax
;
; AVX2-LABEL: fptoui_4f32_to_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
;
; AVX512F-LABEL: fptoui_4f32_to_4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX512VL-LABEL: fptoui_4f32_to_4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
;
; AVX1-LABEL: fptoui_8f32_to_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vcvttss2si %xmm3, %rax
;
; AVX2-LABEL: fptoui_8f32_to_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vcvttss2si %xmm3, %rax
; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi
; AVX512F-NEXT: vmovq %rsi, %xmm0
; AVX512F-NEXT: vmovq %rdx, %xmm1
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi
; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
;
; AVX-LABEL: extract3_sitofp_v4i32_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 3
;
; AVX-LABEL: extract3_sitofp_v4i32_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 3
;
; AVX512F-LABEL: extract3_uitofp_v4i32_f32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
;
; AVX512VL-LABEL: extract3_uitofp_v4i32_f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
;
; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32:
; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VLDQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 3
;
; AVX512F-LABEL: extract3_uitofp_v4i32_f64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
;
; AVX512VL-LABEL: extract3_uitofp_v4i32_f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
;
; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64:
; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VLDQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 3
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsllq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __truncsfhf2@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %xmm0
; AVX1-NEXT: vmovups 16(%rdi), %xmm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vmovups %xmm1, 48(%rsi)
; AVX1-NEXT: vmovups %xmm3, 32(%rsi)
; AVX1-NEXT: vmovups %xmm0, 16(%rsi)
; AVX2-LABEL: splat2_i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
; AVX2-NEXT: vmovups %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm12
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1]
; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm11, %ymm13
; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm12
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm14, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm6, %ymm11
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[0,1,0,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,3,3]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1]
; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[3,3,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm12[3,3,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm12[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = mem[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm10[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm7[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[3,3,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3]
; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm13[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[3,3,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-LABEL: load_i32_stride2_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vmovlps %xmm1, (%rsi)
; AVX-NEXT: vmovlps %xmm0, (%rdx)
; AVX-NEXT: retq
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,0,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vmovlps %xmm2, (%rsi)
; AVX1-ONLY-NEXT: vmovlps %xmm3, (%rdx)
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0
; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-ONLY-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rsi)
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovaps 16(%rdi), %xmm1
-; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3]
+; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512F-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512F-SLOW-NEXT: vmovlps %xmm2, (%rsi)
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovaps 16(%rdi), %xmm1
-; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3]
+; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512BW-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512BW-SLOW-NEXT: vmovlps %xmm2, (%rsi)
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,3,2,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,0,3,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi)
; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rdx)
; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rcx)
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi)
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4]
; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi)
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi)
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm10[2,0],ymm8[5,4],ymm10[6,4]
; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4]
; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4]
; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm12[3,0],ymm2[6,4],ymm12[7,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4]
; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm3[0,3],ymm2[5,6],ymm3[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,2],ymm4[0,3],ymm5[5,6],ymm4[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm6[0,3],ymm15[5,6],ymm6[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm7[0,3],ymm14[5,6],ymm7[4,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm12, %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm12, %ymm6
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm12, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm12, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm2[2,0],ymm0[7,4],ymm2[6,4]
; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm2[3,0],ymm11[4,4],ymm2[7,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm1[0,0],ymm5[5,4],ymm1[4,4]
; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm7[2,0],ymm13[7,4],ymm7[6,4]
; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,0],ymm14[2,0],ymm10[7,4],ymm14[6,4]
; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,0],ymm10[0,0],ymm4[5,4],ymm10[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm11[1,0],ymm5[6,4],ymm11[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm10[1,0],ymm4[6,4],ymm10[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm10[0,0],ymm1[5,4],ymm10[4,4]
; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm7[0,0],ymm1[5,4],ymm7[4,4]
; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm9[0,0],ymm3[5,4],ymm9[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm1[0,0],ymm8[5,4],ymm1[4,4]
; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm10[1,0],ymm1[6,4],ymm10[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm7[1,0],ymm1[6,4],ymm7[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm2[1,0],ymm8[6,4],ymm2[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,0],ymm9[1,0],ymm3[6,4],ymm9[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX1-ONLY-NEXT: vmovlps %xmm3, (%rsi)
; AVX1-ONLY-NEXT: vmovlps %xmm4, (%rdx)
; AVX1-ONLY-NEXT: vmovlps %xmm5, (%rcx)
; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,2,3,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <4,2,u,u>
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[3,3]
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[1]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsi)
; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx)
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[0],ymm10[3],ymm4[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u>
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm10, %ymm6
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm9
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u>
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm6
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <u,u,u,4,2,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm10, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm10
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0],ymm14[0],ymm7[3],ymm14[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm11[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm1
; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm12
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[0,1]
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm12[0],ymm5[3],ymm12[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[3,0],ymm6[1,0],ymm13[7,4],ymm6[5,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm2[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,1],ymm14[1,3],ymm7[7,5],ymm14[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm4
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1],ymm12[1,3],ymm5[7,5],ymm12[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm7
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2
; AVX1-ONLY-NEXT: vshufps $215, (%rsp), %ymm7, %ymm5 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm5 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1],ymm5[2,0],ymm14[5,5],ymm5[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm12[1,0],ymm9[7,4],ymm12[5,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm3[1,3],ymm2[7,5],ymm3[5,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm2[2,0],ymm13[5,5],ymm2[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm6
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm5
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm14
; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm15
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u>
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm11
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm6
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm5
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14
; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm15
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u>
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1
; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm11
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm14, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm6
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm5
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm14
; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[0,1]
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[0],ymm5[0],ymm12[3],ymm5[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm5
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm13[0,1]
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[3],ymm0[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm11[1,3],ymm12[7,5],ymm11[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm8[1,3],ymm0[7,5],ymm8[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1],ymm7[1,3],ymm15[7,5],ymm7[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm12
; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0],xmm5[1],xmm10[2,3]
; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm5
; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm15[1,1],ymm8[2,0],ymm15[5,5],ymm8[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4]
; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm6[2,0],ymm8[5,5],ymm6[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4]
; AVX1-ONLY-NEXT: # ymm6 = ymm12[3,1],mem[1,3],ymm12[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm6[2,0],ymm8[5,5],ymm6[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm14[1,0],ymm4[7,4],ymm14[5,4]
; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1],ymm5[2,0],ymm7[5,5],ymm5[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm2
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm15
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1]
; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u>
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm13
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9
; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm1
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm11
; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm15
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1]
; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u>
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm3
; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4
; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm13
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm13
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: # ymm13 = mem[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm11
; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm10, %ymm3, %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm15, %ymm3, %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm2
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm15
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u>
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm13
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9
; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm11
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm3, %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm15[0,1]
; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm14[1,3],ymm1[7,5],ymm14[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm7 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm5 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm2[1,3],ymm1[7,5],ymm2[5,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm13
; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3,4],ymm8[5,6,7]
; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,1],ymm15[2,0],ymm8[5,5],ymm15[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4],ymm6[5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,1],ymm15[2,0],ymm8[5,5],ymm15[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4]
; AVX1-ONLY-NEXT: # ymm13 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm8[1,1],ymm13[2,0],ymm8[5,5],ymm13[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4]
; AVX1-ONLY-NEXT: # ymm12 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[1,1],ymm12[2,0],ymm8[5,5],ymm12[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm2[1,0],ymm10[7,4],ymm2[5,4]
; AVX1-ONLY-NEXT: # ymm11 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[1,1],ymm11[2,0],ymm8[5,5],ymm11[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm1[1,0],ymm9[7,4],ymm1[5,4]
; AVX1-ONLY-NEXT: # ymm10 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,1],ymm10[2,0],ymm8[5,5],ymm10[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm0[1,0],ymm7[7,4],ymm0[5,4]
; AVX1-ONLY-NEXT: # ymm9 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[2,0],ymm8[5,5],ymm9[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm9, %ymm12
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm11
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm9, %ymm7
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm3
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm6
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4],ymm13[5],ymm6[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm4 = mem[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm6 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm12
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm8, %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm1, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm7
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm6
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm3
; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm4
; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4],ymm7[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7]
; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm4
; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm4
; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,3,2,3]
; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7]
; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8
; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,3,2,3]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm4[0,0,2,3,4,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,0,2,3,4,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,0,1,4,5,4,5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm12
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm13
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3],ymm7[4,5,6,7]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm9, %ymm12
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm11
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm9, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4],ymm13[5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm12
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm8, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm1, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,2,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vmovlps %xmm5, (%rsi)
; AVX1-ONLY-NEXT: vmovlps %xmm6, (%rdx)
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,0,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
; AVX2-ONLY-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = <4,3,u,u>
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7]
; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rsi)
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3]
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,1,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,1,0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[0,1],xmm5[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-SLOW-NEXT: vbroadcastss 84(%rdi), %xmm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3
; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3]
; AVX2-FAST-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 84(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5
; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm10[2]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm9[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm11[0,3],ymm2[7,5],ymm11[4,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,0],ymm12[0,0],ymm11[5,4],ymm12[4,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1],ymm11[0,2],ymm12[7,5],ymm11[4,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm13[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[0,1],ymm4[1,3],ymm14[4,5],ymm4[5,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[0,2],ymm12[2,0],ymm8[4,6],ymm12[6,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[0,0],ymm8[7,4],ymm13[4,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm12[2,0],ymm8[6,4],ymm12[6,4]
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[0,1],xmm9[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm14[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,0],ymm9[0,0],ymm14[7,4],ymm9[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,0],ymm5[4,5],ymm9[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm4[0,3],ymm12[7,5],ymm4[4,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm15[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm8[1,3],ymm7[4,5],ymm8[5,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm13[0,0],ymm4[5,4],ymm13[4,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,1],ymm2[0,2],ymm13[7,5],ymm2[4,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm9[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm3[1,3],ymm5[4,5],ymm3[5,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4]
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm10[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm14[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,1],xmm2[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm6[0,0],ymm7[7,4],ymm6[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm7[0,0],ymm5[7,4],ymm7[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4]
; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm14[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm14
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm12
; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm10[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,3,2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,3,2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm6[0,3],ymm15[7,5],ymm6[4,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,3,2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm0[1],xmm6[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[3,1],ymm6[0,2],ymm10[7,5],ymm6[4,6]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm10 = xmm5[0,1,2],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm6[0,2],ymm8[7,5],ymm6[4,6]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1,2],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm6
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[0,0],ymm3[5,4],ymm7[4,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,1],ymm8[0,2],ymm7[7,5],ymm8[4,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm13[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm9
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm15[1,3],ymm9[4,5],ymm15[5,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm15
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm3[2,0],ymm2[6,4],ymm3[6,4]
; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4]
; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm11[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm13 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[0,1],xmm10[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[0,0],ymm4[1,0],ymm3[4,4],ymm4[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[0,1],xmm12[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0],ymm14[1,0],ymm10[4,4],ymm14[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[0,1],xmm1[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,1],xmm13[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm1[0,0],ymm6[7,4],ymm1[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4]
; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[1,0],ymm4[2,0],ymm3[5,4],ymm4[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: # xmm7 = xmm8[0,1,2],mem[3]
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm11
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5
; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm8
; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3
; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm6
; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm10[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm11[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm7[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm7[2,3,2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm11[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,3,2,3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm4[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm8[0,2],ymm0[7,5],ymm8[4,6]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm14 = xmm11[0,1,2],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm8[0,2],ymm0[7,5],ymm8[4,6]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm14 = xmm9[0,1,2],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm2[0,2],ymm3[7,5],ymm2[4,6]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1,2],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0
; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm14
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = xmm9[0,1,2],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = xmm4[0,1,2],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm4[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4]
; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3]
; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1]
; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3]
; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm14[0,0],ymm15[1,0],ymm14[4,4],ymm15[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[0,1],xmm10[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1],xmm8[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,1],xmm7[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm15[2,0],ymm14[5,4],ymm15[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm6 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm7 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm9 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7]
; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14
; AVX2-SLOW-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7]
; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7]
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2]
; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3]
; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm12
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
; AVX2-FAST-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],mem[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7]
; AVX2-FAST-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm7[1,3],ymm9[4,6],ymm7[5,7]
; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm1[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm2[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm8[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14
; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7]
; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7]
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm12
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vmovq %xmm4, (%rsi)
; AVX1-ONLY-NEXT: vmovq %xmm5, (%rdx)
; AVX2-ONLY-NEXT: vbroadcastss 48(%rdi), %xmm3
; AVX2-ONLY-NEXT: vbroadcastss 16(%rdi), %ymm6
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm4, (%rsi)
; AVX2-ONLY-NEXT: vmovq %xmm5, (%rdx)
; AVX512F-SLOW-NEXT: vmovaps (%rdi), %ymm4
; AVX512F-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX512F-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %ymm4
; AVX512BW-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX512BW-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm9[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm6[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsi)
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm6[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm6[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1,2],xmm10[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm14
; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm15
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm10[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm13[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm12[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm15[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm15
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm3[2],xmm9[3],xmm3[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm12[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm15[2],xmm7[3],xmm15[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4
; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7
; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10
; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[2],ymm10[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0
; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm13
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm2[1,0],ymm0[5,4],ymm2[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm14[1,0],ymm13[5,4],ymm14[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm9[1],ymm10[1],ymm9[3],ymm10[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0],ymm14[3,0],ymm13[7,4],ymm14[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1]
; AVX2-ONLY-NEXT: vmovaps %xmm15, %xmm10
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm9[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm15[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm3[0,1,2],xmm13[3]
; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm10[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5
; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm7
; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11
; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm12[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm7
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm3[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm10[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm13[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm11[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0
; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm2[1,0],ymm5[5,4],ymm2[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm13 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm3[1],ymm12[1],ymm3[3],ymm12[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm13 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = ymm5[1],mem[1],ymm5[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[6],ymm15[6],ymm9[7],ymm15[7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = xmm8[0],mem[0],xmm8[1],mem[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm7[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm11[2],mem[2],xmm11[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm12[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm10[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm9[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm9 = xmm4[2],mem[2],xmm4[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm14[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm14[2],xmm5[3],xmm14[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm14[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm12
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7]
; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm11
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm15[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm12[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm12[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm12[0],ymm2[2],ymm12[2]
; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2]
; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm3
; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm5[1,0],ymm1[5,4],ymm5[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm10[1],ymm6[1],ymm10[3],ymm6[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm10[1],ymm3[3],ymm10[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm5[1],ymm12[1],ymm5[3],ymm12[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm2[1],ymm9[3],ymm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm13[1],ymm15[1],ymm13[3],ymm15[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm1[1],mem[1],ymm1[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm3[1],mem[1],ymm3[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm12[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps $2, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm12[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,2,2,2]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm9[0,1,2],xmm15[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm7
; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm5
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm5
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3],ymm14[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm5
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1]
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[0,1,0,1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm14[0,1,0,1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm13[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm4[2],xmm11[3],xmm4[3]
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[0,0,0,0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12
; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm11[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm14
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[0,0,0,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[0,0,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,0,1,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-ONLY-NEXT: vzeroupper
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7]
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,u,1,u,5,u,u]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm7
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3]
; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm9
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm4
; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm6
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7]
; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm12
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm14
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3]
; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm15
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2]
; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm12
; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm14
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7]
; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7]
; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7]
; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm12
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm15
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm13
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm10
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 152(%rdi), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm7
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 184(%rdi), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 216(%rdi), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm11
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm13 = mem[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm11
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7]
; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm2
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm4
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm9
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2]
; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm11
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm11
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm13
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm10
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 184(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm11
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8)
; AVX1-ONLY-NEXT: vzeroupper
; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512F-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX512F-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-SLOW-NEXT: vmovaps %ymm0, (%r8)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX512BW-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-SLOW-NEXT: vmovaps %ymm0, (%r8)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} ymm6 = ymm5[0,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7]
; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm4[1,0,3,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm5[0],xmm7[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm0[1],xmm1[1],zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm8[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,0],xmm2[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm15[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm8[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm7[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[3,0],xmm13[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm12
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm9
; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm7
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4
; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11
; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm12
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm9
; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm8
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm5
; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm3
; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm1[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[3,0],xmm14[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,0],xmm13[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[3,0],xmm9[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[3,0],xmm0[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm10[3,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,0,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r8)
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,0,2,u,5,7,u]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX1-ONLY-NEXT: vmovlps %xmm1, 32(%r9)
; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9)
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <u,3,7,u,u,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r9)
; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9)
; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm14
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6],ymm13[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4],ymm8[5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm6
; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm8
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm9
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm5
; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm9
; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm10
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm10[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2],xmm11[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6],ymm8[7]
; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm8
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm12[0],xmm11[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm13
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6],ymm9[7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm15
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm8[0],xmm7[0],zero,zero
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4,5,6],ymm0[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm6[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm13[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13
; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm14
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7]
; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm15
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm14
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7]
; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15
; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm13
; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14
; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm15
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm11, %ymm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm15[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm15[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm13[2],xmm8[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm12
; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2,3],ymm6[4],ymm10[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4],ymm14[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm14
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5],ymm10[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2],ymm4[3,4],ymm14[5,6],ymm4[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3,4],ymm0[5,6],ymm4[7]
; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4
; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4],ymm15[5],ymm5[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm15
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm10[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm4[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm6
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm13
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm1[1],xmm2[1],zero
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm13
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm13
; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm11[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4,5,6],ymm7[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm14[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4
; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9
; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6
; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7]
; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm0
; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm4
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm6
; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm5
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-SLOW-NEXT: vmovaps 96(%r8), %ymm3
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm12
; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm11
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm10
; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm9
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm8
; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm7
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6
; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4
; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm3
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm15
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm15
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4]
; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm1
; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8
; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm8[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm10
; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm0
; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm3[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2],xmm11[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm11
; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2],xmm11[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5
; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm7
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm8
; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm13
; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm12
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm10
; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm9
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm8
; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3,4],ymm8[5,6],ymm2[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm6
; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14
; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4],ymm15[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2,3,4],ymm15[5],ymm11[6,7]
; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3,4],ymm11[5,6],ymm15[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 120(%r8), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4],ymm12[5,6,7]
; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4,5],ymm13[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm5
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm11
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm10
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm8
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm15
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm15
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4]
; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm5
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm13[1],xmm10[1],zero
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3
; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11
; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 132(%rdx), %xmm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4
; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0
; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm6
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6
; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm6
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0
; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; AVX1-ONLY-NEXT: vbroadcastss 228(%rdx), %xmm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1
; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3],ymm14[4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm11 = mem[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4]
; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm5[1,0,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4
; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9
; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6
; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7]
; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm0
; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm4
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vmovaps 96(%r8), %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %xmm0
; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %xmm4
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-SLOW-NEXT: vmovaps 128(%r8), %ymm3
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %xmm0
; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %xmm4
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %xmm0
; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm4
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %xmm0
; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm4
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm14
; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm13
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 144(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm12
; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm11
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm9
; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm8
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 176(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm7
; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %ymm6
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5
; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 208(%r8), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm3
; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7]
; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7]
; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm8
; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14
; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm14[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm6
; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm7
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm12 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2],xmm12[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7]
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2],xmm15[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm0
; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm15
; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2],ymm11[3,4],ymm15[5,6],ymm11[7]
; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm11
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm10[1,2,3,4],ymm11[5],ymm10[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0
; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm9
; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm10
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2],ymm5[3,4],ymm9[5,6],ymm5[7]
; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm0
; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm5
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7]
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6
; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm7
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3,4],ymm6[5,6],ymm4[7]
; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5],ymm3[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4],ymm3[5,6],ymm4[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1
; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4
; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5
; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm6
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm13
; AVX2-FAST-NEXT: vmovaps 128(%rcx), %ymm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm4
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm13
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 144(%r8), %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1
; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm12
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3
; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm10
; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm9
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 176(%r8), %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7
; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm6
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5
; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm4
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 208(%r8), %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3
; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4],ymm11[5,6,7]
; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm11
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4],ymm11[5],ymm8[6,7]
; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1,2],ymm11[3,4],ymm8[5,6],ymm11[7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5],ymm14[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7]
; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4]
; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm12 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r8), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm14
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm13
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 144(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm11
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 176(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm7
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 208(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3]
; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm11
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax)
; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax)
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7]
; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm9
; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,2],xmm9[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[1,2],ymm3[1,2],ymm2[5,6],ymm3[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm2
; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10
; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11
; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm12
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7]
; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0
; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7]
; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm6
; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7]
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax)
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2]
; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm6[0],ymm14[2],ymm6[2]
; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm15[1,2]
; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13
; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm11
; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,2],ymm10[1,2],ymm0[5,6],ymm10[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,2],ymm14[1,2],ymm6[5,6],ymm14[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[0,0,0,0]
; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,0,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm1[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm1
; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm2
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm10[3,0],ymm9[7,4],ymm10[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm3
; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm5
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,0],ymm12[3,0],ymm13[7,4],ymm12[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6],ymm9[7]
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax)
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7
; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6
; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm2
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1
; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2]
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0
; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm15 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm11 = ymm11[1,2],mem[1,2],ymm11[5,6],mem[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm15[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,0,0,0]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm14[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,0,0,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[0,0,0,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,0,0,0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm2 = ymm3[1,2],mem[1,2],ymm3[5,6],mem[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm5[1,2],ymm1[5,6],ymm5[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vbroadcastss 244(%r8), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm2 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm5 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7]
; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm4
; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm11
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm11 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm12
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm12[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4,5,6],ymm12[7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm13
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm13 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm9 = mem[2,3],ymm13[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7]
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm13 = xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm3[3,0],ymm0[7,4],ymm3[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm14[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4,5,6],ymm14[7]
; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14
; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm15[0,2,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,2,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4,5,6],ymm7[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm6 = mem[2,3],ymm14[2,3]
; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6],ymm14[7]
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: vmovaps %ymm6, 1504(%rax)
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13
; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm14 = mem[2,3],ymm14[2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13
; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3],ymm14[2,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u>
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u>
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,0],xmm3[0,0]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6],ymm3[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3]
; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = <u,1,5,u,u,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <u,u,0,4>
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,3,7,u,u,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax)
; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax)
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3]
; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9
; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4
; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm5[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero
; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm14
; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm15
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm15[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm9
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3]
; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm10
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5]
; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6
; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4
; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm9, %ymm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm13
; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm14
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4,5],ymm7[6],ymm9[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3,4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6],ymm6[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm15
; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3]
; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6],ymm0[7]
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm5[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm14
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm15
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm15[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm9
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm14
; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6],ymm0[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm1[3,3],ymm2[7,7],ymm1[7,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3],ymm1[1,2],ymm4[6,7],ymm1[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8
; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm12
; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm7
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm5
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm14
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7]
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm10
; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm2
; AVX2-SLOW-NEXT: vbroadcastss %xmm5, %xmm1
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm4
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7]
; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8
; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1]
; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10
; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm10[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps (%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7]
; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3]
; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm11[1,1],ymm2[5,5],ymm11[5,5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7]
; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3]
; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3]
; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm12
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm14
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm10
; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm2
; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm5, %xmm1
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm4
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm15[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm13
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[2,3],ymm11[1,2],ymm2[6,7],ymm11[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1,2,3,4],ymm11[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm12[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3]
; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm10
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3],ymm4[1,2],ymm2[6,7],ymm4[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm8
; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm9
; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm4
; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12
; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm6
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%r9), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm13
; AVX2-SLOW-NEXT: vmovaps 96(%r8), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm11
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 48(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 80(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm9
; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm15
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7]
; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm3
; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm3
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm11
; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5
; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm13
; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8
; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%r9), %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm2
; AVX2-FAST-NEXT: vmovaps 96(%r8), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm12
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps (%r8), %ymm6
; AVX2-FAST-NEXT: vmovaps (%r9), %ymm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4,5],ymm6[6],ymm1[7]
; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm15
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm15
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3
; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm2
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm14
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7]
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FAST-NEXT: vbroadcastss 108(%r8), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm12[2,2,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm3
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3]
; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2,3,4],ymm4[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm0
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6],ymm0[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3]
; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm6[2,3,4],ymm0[5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm6
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6],ymm9[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4],ymm11[5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm11
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7]
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm8
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm9
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm13
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm11
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 48(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm9
; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm15
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm5
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm11
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm0[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm11[0]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm3
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6]
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm3
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm4
; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5
; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11
; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm7
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%r9), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-SLOW-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 128(%r9), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps 128(%rax), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %xmm3
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 160(%r9), %xmm0
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps 160(%rax), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %xmm3
; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%r8), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-SLOW-NEXT: vmovaps 192(%rax), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %xmm1
; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %xmm3
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 48(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 64(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 80(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 96(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 112(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 128(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 144(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 160(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 176(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 192(%r9), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vmovaps 208(%rax), %xmm2
; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm4
; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm2
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm15
; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %ymm13
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7]
; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[1,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm4[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2],xmm13[3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm1[1],xmm0[1],zero
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2],ymm13[3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm12
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm13
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm1
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm4
; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm2
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm10
; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm13
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3],xmm14[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm15[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm7
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm5
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm6
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm2
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7]
; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm7
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4
; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7
; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%r9), %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-FAST-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 128(%r9), %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-NEXT: vmovaps 128(%rax), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm3
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 160(%r9), %xmm0
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-NEXT: vmovaps 160(%rax), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm3
; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 192(%r8), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-NEXT: vmovaps 192(%rax), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm1
; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm3
; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps (%r9), %ymm12
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm2
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 64(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm2
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 96(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 112(%rax), %xmm2
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 128(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 144(%rax), %xmm2
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 160(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 176(%rax), %xmm2
; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 192(%r9), %ymm1
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-NEXT: vmovaps 208(%rax), %xmm2
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm3
; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm7
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm7[1,1,1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7]
; AVX2-FAST-NEXT: vmovaps 224(%rax), %xmm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 232(%rax), %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm2
; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm1
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2]
; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm4[1,2,3,4,5,6],ymm15[7]
; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm4
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7]
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm15
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4,5,6,7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[2,2,3,3]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,2,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7]
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = ymm14[1,1],mem[1,1],ymm14[5,5],mem[5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4]
; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vbroadcastsd 136(%rax), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vbroadcastsd 200(%rax), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm0
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm0
; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm3
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm5
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm5
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4]
; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm7
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm7
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rax), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rax), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%r9), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rax), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%r8), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rax), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 48(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 112(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 144(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 176(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%r9), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovaps 208(%rax), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm15
; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %ymm13
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[1,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm4[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2],xmm13[3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm1[1],xmm0[1],zero
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2],ymm13[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm12
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm13
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm10
; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm13
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3],xmm14[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm15[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm7
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm6
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm7
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13
; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm11[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm12[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm9[1],ymm2[3],ymm9[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm9[0],ymm2[2],ymm9[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15
; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm13[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1]
; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3]
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9
; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm9[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm7
; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10
; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3]
; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm9[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2
; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1]
; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm5
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm6
; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm14[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm8
; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm5
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2
; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1,2],xmm13[3]
; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm11[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[4],ymm7[4],ymm2[5],ymm7[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm2[1],ymm7[3],ymm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4]
; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4
; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10
; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8
; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm3[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm7
; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm8
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm0[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm4
; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm7
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm7[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0,1],xmm13[2,3]
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm2[0],ymm9[2],ymm2[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[4],ymm9[4],ymm2[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm2[1],ymm9[3],ymm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4]
; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4
; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm8
; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm9
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0
; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm3
; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm4
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3
; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm4
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3
; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm4
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3
; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm4
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm6
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm2[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm2[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm1[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm0[1],xmm10[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm9
; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm8
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7]
; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm4[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm7
; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm8
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm9[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm15
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm0[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm9[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm4
; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm7
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[0,1,0,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7]
; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm14
; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,2,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm5[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm7[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3]
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[1,1,1,1]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[2,2,2,2]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3
; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4
; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7]
; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1
; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm10
; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm15
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm1
; AVX2-ONLY-NEXT: vbroadcastsd 192(%rdx), %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm4
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm4
; AVX2-ONLY-NEXT: vbroadcastsd 224(%rdx), %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm7
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm7
; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm15
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm11[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5],ymm14[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm9[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm8
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm5
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm1
; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdx), %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm1
; AVX2-ONLY-NEXT: vbroadcastsd 384(%rdx), %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm1
; AVX2-ONLY-NEXT: vbroadcastsd 416(%rdx), %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm1
; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm14
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm14
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5],ymm13[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3]
; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14
; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm12
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
+; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],mem[0],ymm8[2],mem[2]
; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2]
; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2
; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm0
; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm15
; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9
; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm10[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7]
; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm4[1],ymm11[1],ymm4[3],ymm11[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6
; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm2[1],ymm9[3],ymm2[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8
; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm15[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm7[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11
; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,3,2,3]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm3
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 144(%r8), %xmm0
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm8[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5
; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm2
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 256(%r9), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 272(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 304(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm1
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 368(%rcx), %xmm12
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 384(%r9), %xmm13
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3]
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm3
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0
; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm5
; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm9[0,0,1,1]
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,2,2]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[0,0,1,1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[1,0,2,2,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [281474976710655,281474976710655,281474976710655,281474976710655]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5
; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm9
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12
; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4
; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4
; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5
; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8
; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3
; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6
; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm0
; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [281474976710655,281474976710655,281474976710655,281474976710655]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855]
; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm13
; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm8
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13
; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm8
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm13
; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm7
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm5
; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,6,5]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7]
; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm4, %ymm5
; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3
; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0)
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm2
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm4
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm4
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm8
; AVX512BW-LABEL: test_v8f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm8
; AVX512VL-LABEL: test_v8f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm8
; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3
; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3
; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2
; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2
; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-LABEL: test_v16f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3]
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm16
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vminss %xmm0, %xmm3, %xmm4
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm4
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX-NEXT: vminss %xmm0, %xmm7, %xmm8
; AVX512BW-LABEL: test_v8f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm8
; AVX512VL-LABEL: test_v8f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm8
; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3
; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3
; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm2
; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2
; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm0
; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-LABEL: test_v16f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3]
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3]
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm16
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm5, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2
; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
; XOPAVX1-LABEL: vector_variable_shift_right:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOPAVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; XOPAVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
; XOPAVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64:
;
; AVX1-LABEL: shuffle_v2i64_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_00:
;
; AVX-LABEL: shuffle_v2i64_10:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0>
ret <2 x i64> %shuffle
;
; AVX-LABEL: shuffle_v2i64_11:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %shuffle
;
; AVX1-LABEL: shuffle_v2i64_22:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_22:
;
; AVX-LABEL: shuffle_v2i64_32:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2>
ret <2 x i64> %shuffle
;
; AVX-LABEL: shuffle_v2i64_33:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3>
ret <2 x i64> %shuffle
;
; AVX-LABEL: shuffle_v4i32_0001:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_0020:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_0112:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_0300:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_1000:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_2200:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_3330:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_3210:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4i32_2121:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
ret <4 x i32> %shuffle
;
; AVX-LABEL: shuffle_v4f32_0001:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_0020:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_0300:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_1000:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_2200:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_3330:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_3210:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_0011:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
ret <4 x float> %shuffle
;
; AVX-LABEL: shuffle_v4f32_2233:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
ret <4 x float> %shuffle
;
; AVX1-LABEL: shuffle_v4i32_0124:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT: retq
;
;
; AVX1-LABEL: shuffle_v4i32_0142:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i32_0142:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-NEXT: retq
;
;
; AVX1-LABEL: shuffle_v4i32_0412:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i32_0412:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: retq
;
;
; AVX1OR2-LABEL: shuffle_v4i32_4012:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2]
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1OR2-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4i32_0451:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_0451:
; AVX1OR2-LABEL: shuffle_v4i32_4015:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_4015:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
;
; AVX1-LABEL: shuffle_v4i32_z6zz:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-SLOW-NEXT: retq
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovaps %xmm1, (%rsi)
; AVX1-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
;
; AVX-LABEL: shuffle_v8i16_01012323:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
ret <8 x i16> %shuffle
;
; AVX-LABEL: shuffle_v8i16_67452301:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
ret <8 x i16> %shuffle
;
; AVX-LABEL: shuffle_v8i16_23016745:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
;
; AVX-LABEL: shuffle_v8i16_XXXdXXXX:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0001:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1054:
; AVX2: # %bb.0:
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_1054:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1054:
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1054:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
ret <4 x i64> %shuffle
; AVX2-LABEL: shuffle_v4i64_3276:
; AVX2: # %bb.0:
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_3276:
; AVX2-LABEL: shuffle_v4i64_1076:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_1076:
define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_11uu:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
ret <4 x i64> %shuffle
; AVX1-LABEL: shuffle_v4i64_22uu:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_22uu:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX512VL-NEXT: retq
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
%2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; ALL-NEXT: retq
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
%2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000010:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_00000010:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000010:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000010:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000010:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000200:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_00000200:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000200:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000200:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000200:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00003000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_00003000:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00003000:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_00003000:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00003000:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00112233:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00001111:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_00001111:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00001111:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_00001111:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00001111:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; AVX1-LABEL: shuffle_v8f32_08080808:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8f32_08084c4c:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_08084c4c:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c:
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: retq
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: retq
define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_091b2d3f:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_09ab1def:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_09ab1def:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: retq
define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00014445:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00204464:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_03004744:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10005444:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_22006644:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_33307774:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_32107654:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00234467:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x float> %shuffle
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; ALL-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %1
define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10325476:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10235467:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x float> %shuffle
define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10225466:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x float> %shuffle
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0]
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX2-SLOW-NEXT: retq
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0]
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: retq
; AVX1-LABEL: shuffle_v8f32_f511235a:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[3],ymm0[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_f511235a:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: retq
define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_32103210:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_32103210:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_32103210:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_32103210:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_32103210:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
; AVX1-LABEL: shuffle_v8f32_76547654:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_76547654:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76547654:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76547654:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
; AVX1-LABEL: shuffle_v8f32_76543210:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_76543210:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76543210:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76543210:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1OR2-LABEL: shuffle_v8f32_3210ba98:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210ba98:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_3210ba98:
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_3210ba98:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
ret <8 x float> %shuffle
; AVX1OR2-LABEL: shuffle_v8f32_3210fedc:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210fedc:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc:
; AVX1OR2-LABEL: shuffle_v8f32_7654fedc:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_7654fedc:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_7654fedc:
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_7654fedc:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
ret <8 x float> %shuffle
; AVX1OR2-LABEL: shuffle_v8f32_fedc7654:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_fedc7654:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_fedc7654:
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_fedc7654:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
; AVX1OR2-LABEL: shuffle_v8f32_ba987654:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba987654:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654:
; AVX1OR2-LABEL: shuffle_v8f32_ba983210:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba983210:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_ba983210:
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_ba983210:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
; AVX1-LABEL: shuffle_v8f32_084c195d:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,0,3,5,5,4,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,0,3,5,5,4,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_uuuu1111:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
; AVX1-LABEL: shuffle_v8f32_44444444:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8f32_44444444:
define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_uuuu3210:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_1111uuuu:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %shuffle
; ALL-LABEL: shuffle_v8f32_5555uuuu:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %shuffle
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%2 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000010:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_00000010:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000010:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000010:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000010:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000200:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_00000200:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000200:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000200:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000200:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00003000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_00003000:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00003000:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_00003000:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00003000:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
;
; AVX2OR512VL-LABEL: shuffle_v8i32_01014545:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00112233:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_00112233:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00112233:
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00112233:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00112233:
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00112233:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00001111:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_00001111:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00001111:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_00001111:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00001111:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; AVX1-LABEL: shuffle_v8i32_08080808:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i32_08084c4c:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08084c4c:
; AVX2: # %bb.0:
; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_08084c4c:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c:
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_09ab1def:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_00014445:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_00204464:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_03004744:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_10005444:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_22006644:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_33307774:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_32107654:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_00234467:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i32> %shuffle
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00224466:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_10325476:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i32> %shuffle
;
; AVX2OR512VL-LABEL: shuffle_v8i32_11335577:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_10235467:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_10225466:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i32> %shuffle
; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-SLOW-NEXT: retq
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_6caa87e5:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: retq
define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_32103210:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_32103210:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_32103210:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_32103210:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_32103210:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
; AVX1-LABEL: shuffle_v8i32_76547654:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_76547654:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76547654:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76547654:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
; AVX1-LABEL: shuffle_v8i32_76543210:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_76543210:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76543210:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-SLOW-NEXT: retq
;
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76543210:
; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1OR2-LABEL: shuffle_v8i32_3210ba98:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210ba98:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_3210ba98:
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_3210ba98:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
ret <8 x i32> %shuffle
; AVX1OR2-LABEL: shuffle_v8i32_3210fedc:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210fedc:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc:
; AVX1OR2-LABEL: shuffle_v8i32_7654fedc:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_7654fedc:
; AVX1OR2-LABEL: shuffle_v8i32_fedc7654:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_fedc7654:
; AVX1OR2-LABEL: shuffle_v8i32_ba987654:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba987654:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654:
; AVX1OR2-LABEL: shuffle_v8i32_ba983210:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba983210:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210:
define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_uuuu1111:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_2222uuuu:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
; AVX1-LABEL: shuffle_v8i32_44444444:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_44444444:
; AVX1-LABEL: shuffle_v8i32_44444444_bc:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i32_44444444_bc:
; ALL-LABEL: shuffle_v8i32_5555uuuu:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
;
; AVX2OR512VL-LABEL: shuffle_v8i32_uuuuuu7u:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
+; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef>
ret <8 x i32> %shuffle
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%2 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; ALL-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i32> %1
define <8 x float> @splat_v8f32(<4 x float> %r) {
; AVX1-LABEL: splat_v8f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_z0U2zUz6:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_1U3z5zUU:
define <8 x i32> @shuffle_v8i32_30127456(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_30127456:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
ret <8 x i32> %shuffle
define <8 x i32> @shuffle_v8i32_12305674(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_12305674:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
ret <8 x i32> %shuffle
define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) {
; AVX1-LABEL: broadcast_concat_crash:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512VL-SLOW: # %bb.0: # %entry
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-SLOW-NEXT: retq
; AVX1-LABEL: lowhalf_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,2,2]
; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: retq
;
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,2]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: lowhalf_v8f32:
define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4, i32 8, i32 8, i32 10, i32 8, i32 12, i32 12, i32 14, i32 12>
ret <16 x float> %shuffle
define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12>
ret <16 x float> %shuffle
define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
; ALL-NEXT: retq
%1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
ret <16 x i32> %c
define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
ret <16 x i32> %c
define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00224466:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10325476:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i64> %shuffle
define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_11335577:
; ALL: # %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i64> %shuffle
;
; KNL-LABEL: expand:
; KNL: # %bb.0:
-; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
; KNL-NEXT: ret{{[l|q]}}
;
; KNL-LABEL: expand15:
; KNL: # %bb.0:
-; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; KNL-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; KNL-NEXT: ret{{[l|q]}}
define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
; CHECK-LABEL: combine_vpermilvar_4f32_unpckh:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
ret <4 x float> %1
define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
; CHECK-LABEL: combine_vpermilvar_4f32_unpckl:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; CHECK-NEXT: ret{{[l|q]}}
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
ret <4 x float> %1
; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0]
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
-; AVX512-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX512-NEXT: ret{{[l|q]}}
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
%2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
; CHECK-LABEL: combine_vpermilvar_4f32_4stage:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; CHECK-NEXT: ret{{[l|q]}}
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
%2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
; CHECK-LABEL: combine_vpermilvar_8f32_4stage:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; CHECK-NEXT: ret{{[l|q]}}
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
ret <16 x float> %res0
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
%2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
;
; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb:
;
; AVX1-LABEL: combine_pshufd6:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufd6:
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
;
; AVX1-LABEL: combine_nested_undef_test4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test4:
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
;
; AVX1-LABEL: combine_nested_undef_test12:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test12:
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
;
; AVX-LABEL: combine_nested_undef_test16:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; AVX-LABEL: combine_nested_undef_test17:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
;
; AVX-LABEL: combine_nested_undef_test18:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
; AVX-LABEL: combine_nested_undef_test19:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
; AVX-LABEL: combine_nested_undef_test20:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
; AVX1-LABEL: combine_nested_undef_test21:
; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test21:
;
; AVX-LABEL: combine_nested_undef_test22:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,1,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
;
; AVX-LABEL: combine_nested_undef_test23:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
;
; AVX-LABEL: combine_nested_undef_test24:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
;
; AVX1-LABEL: combine_nested_undef_test25:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test25:
;
; AVX-LABEL: combine_nested_undef_test26:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
;
; AVX1-LABEL: combine_nested_undef_test27:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test27:
;
; AVX-LABEL: combine_nested_undef_test28:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
;
; AVX-LABEL: combine_test1b:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
; AVX-LABEL: combine_test3b:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
;
; AVX-LABEL: combine_test4b:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
entry:
%s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
;
; AVX-LABEL: PR22390:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; AVX-LABEL: concat_a_to_shuf_of_a:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vmovaps %ymm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-LABEL: concat_shuf_of_a_to_a:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vmovaps %ymm0, (%rdx)
; AVX-NEXT: vzeroupper
; AVX-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vmovaps %xmm1, (%rdx)
; AVX-NEXT: vmovaps %xmm0, 16(%rsi)
; AVX-NEXT: vmovaps %xmm1, (%rsi)
; AVX2-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vmovaps %xmm1, (%rdx)
; AVX2-NEXT: vmovaps %xmm0, 16(%rsi)
; AVX2-NEXT: vmovaps %xmm1, (%rsi)
; AVX512F-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vmovaps %xmm1, (%rdx)
; AVX512F-NEXT: vmovaps %xmm0, 16(%rsi)
; AVX512F-NEXT: vmovaps %xmm1, (%rsi)
; AVX512BW-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
; AVX512BW-NEXT: vmovaps %xmm0, 16(%rsi)
; AVX512BW-NEXT: vmovaps %xmm1, (%rsi)
; AVX-LABEL: concat_aaa_to_shuf_of_a:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX-NEXT: vmovaps %ymm0, 32(%rsi)
; AVX-NEXT: vmovaps %ymm1, (%rsi)
; AVX-LABEL: concat_shuf_of_a_to_aaa:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vmovaps %ymm0, (%rsi)
; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v2i64_v2i32:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovlpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
; AVX2-SLOW-NEXT: retq
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i32:
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovlpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
; AVX2-SLOW-NEXT: retq
; AVX1-LABEL: vselect_concat_splat:
; AVX1: ## %bb.0: ## %entry
; AVX1-NEXT: vmovups (%rax), %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3,2,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX1-NEXT: vmovups 16, %xmm2
; AVX1-NEXT: vmovups 32, %xmm3
; AVX1-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm3[1],mem[2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,3,2,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,0,3,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcmpneqps %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm1, %xmm1
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups (%rdi), %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups (%rdi), %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3]
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovaps 48(%rdi), %xmm1
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3]
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT: vbroadcastss (%rdi), %ymm2
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
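All of the vpermilps -> vshufps updates above follow the same in-place rewrite: a single-source
VPERMILPS with an immediate mask is turned into a VSHUFPS that repeats its source register and
keeps the same imm8, which is always legal and is preferred on targets where VSHUFPS is cheaper.
The snippet below is a minimal sketch of how such a rewrite can be expressed inside
processInstruction; it is illustrative only, not the exact upstream code, and it assumes NewOpc
already holds the matching VSHUFPSrri/VSHUFPSYrri opcode and that profitability has been checked.

  // Illustrative sketch: rewrite VPERMILPSri (dst, src, imm8) in place to
  // VSHUFPSrri (dst, src, src, imm8) by repeating the source operand.
  auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
    int64_t MaskImm = MI.getOperand(NumOperands - 1).getImm();
    MI.removeOperand(NumOperands - 1);             // drop the old imm8
    MI.addOperand(MI.getOperand(NumOperands - 2)); // duplicate the source reg
    MI.setDesc(TII->get(NewOpc));                  // switch to the VSHUFPS form
    MI.addOperand(MachineOperand::CreateImm(MaskImm)); // re-append the imm8
    ++NumInstChanges;
    return true;
  };

Because both VSHUFPS sources are the same register, the printed shuffle comment collapses to a
single-source mask (e.g. "xmm0 = xmm0[0,2,2,3]"), which is why the test diffs change only the
mnemonic and leave the mask annotations untouched.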
"X86FastTileConfig.cpp",
"X86FixupBWInsts.cpp",
"X86FixupLEAs.cpp",
+ "X86FixupInstTuning.cpp",
"X86FixupSetCC.cpp",
"X86FlagsCopyLowering.cpp",
"X86FloatingPoint.cpp",