// registers. e.g. r1 = move v1024.
DenseMap<Register, Register> DstRegMap;
+ void removeClobberedSrcRegMap(MachineInstr *MI);
+
bool isRevCopyChain(Register FromReg, Register ToReg, int Maxlen);
bool noUseAfterLastDef(Register Reg, unsigned Dist, unsigned &LastDef);
if (!MRI->hasOneNonDBGUse(Reg))
// None or more than one use.
return nullptr;
- MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(Reg);
+ MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg);
+ MachineInstr &UseMI = *UseOp.getParent();
if (UseMI.getParent() != MBB)
return nullptr;
Register SrcReg;
IsDstPhys = DstReg.isPhysical();
return &UseMI;
}
+ if (UseMI.isCommutable()) {
+ unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex;
+ unsigned Src2 = UseMI.getOperandNo(&UseOp);
+ if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) {
+ MachineOperand &MO = UseMI.getOperand(Src1);
+ if (MO.isReg() && MO.isUse() &&
+ isTwoAddrUse(UseMI, MO.getReg(), DstReg)) {
+ IsDstPhys = DstReg.isPhysical();
+ return &UseMI;
+ }
+ }
+ }
return nullptr;
}
return TRI->regsOverlap(RegA, RegB);
}
+/// From RegMap remove entries mapped to a physical register which overlaps MO.
+static void removeMapRegEntry(const MachineOperand &MO,
+ DenseMap<Register, Register> &RegMap,
+ const TargetRegisterInfo *TRI) {
+ assert(
+ (MO.isReg() || MO.isRegMask()) &&
+ "removeMapRegEntry must be called with a register or regmask operand.");
+
+ SmallVector<Register, 2> Srcs;
+ for (auto SI : RegMap) {
+ Register ToReg = SI.second;
+ if (ToReg.isVirtual())
+ continue;
+
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ if (TRI->regsOverlap(ToReg, Reg))
+ Srcs.push_back(SI.first);
+ } else if (MO.clobbersPhysReg(ToReg))
+ Srcs.push_back(SI.first);
+ }
+
+ for (auto SrcReg : Srcs)
+ RegMap.erase(SrcReg);
+}
+
+/// If a physical register is clobbered, old entries mapped to it should be
+/// deleted. For example
+///
+/// %2:gr64 = COPY killed $rdx
+/// MUL64r %3:gr64, implicit-def $rax, implicit-def $rdx
+///
+/// After the MUL instruction, $rdx contains different value than in the COPY
+/// instruction. So %2 should not map to $rdx after MUL.
+void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) {
+ if (MI->isCopy()) {
+ // If a virtual register is copied to its mapped physical register, it
+ // doesn't change the potential coalescing between them, so we don't remove
+ // entries mapped to the physical register. For example
+ //
+ // %100 = COPY $r8
+ // ...
+ // $r8 = COPY %100
+ //
+ // The first copy constructs SrcRegMap[%100] = $r8, the second copy doesn't
+ // destroy the content of $r8, and should not impact SrcRegMap.
+ Register Dst = MI->getOperand(0).getReg();
+ if (!Dst || Dst.isVirtual())
+ return;
+
+ Register Src = MI->getOperand(1).getReg();
+ if (regsAreCompatible(Dst, getMappedReg(Src, SrcRegMap), TRI))
+ return;
+ }
+
+ for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isRegMask()) {
+ removeMapRegEntry(MO, SrcRegMap, TRI);
+ continue;
+ }
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg || Reg.isVirtual())
+ continue;
+ removeMapRegEntry(MO, SrcRegMap, TRI);
+ }
+}
+
// Returns true if Reg is equal or aliased to at least one register in Set.
static bool regOverlapsSet(const SmallVectorImpl<Register> &Set, Register Reg,
const TargetRegisterInfo *TRI) {
VirtRegPairs.push_back(NewReg);
break;
}
- bool isNew = SrcRegMap.insert(std::make_pair(NewReg, Reg)).second;
- if (!isNew)
- assert(SrcRegMap[NewReg] == Reg && "Can't map to two src registers!");
+ SrcRegMap[NewReg] = Reg;
VirtRegPairs.push_back(NewReg);
Reg = NewReg;
}
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
-
- // Propagate SrcRegMap.
- SrcRegMap[RegA] = RegB;
}
if (AllUsesCopied) {
LV->addVirtualRegisterKilled(RegB, *PrevMI);
}
+ if (RemovedKillFlag && ReplacedAllUntiedUses)
+ SrcRegMap[LastCopiedReg] = RegB;
+
// Update LiveIntervals.
if (LIS) {
LiveInterval &LI = LIS->getInterval(RegB);
// First scan through all the tied register uses in this instruction
// and record a list of pairs of tied operands for each register.
if (!collectTiedOperands(&*mi, TiedOperands)) {
+ removeClobberedSrcRegMap(&*mi);
mi = nmi;
continue;
}
// The tied operands have been eliminated or shifted further down
// the block to ease elimination. Continue processing with 'nmi'.
TiedOperands.clear();
+ removeClobberedSrcRegMap(&*mi);
mi = nmi;
continue;
}
// Clear TiedOperands here instead of at the top of the loop
// since most instructions do not have tied operands.
TiedOperands.clear();
+ removeClobberedSrcRegMap(&*mi);
mi = nmi;
}
}
define i32 @no_sat_incorrect_constant(i32 %x) #0 {
; V4T-LABEL: no_sat_incorrect_constant:
; V4T: @ %bb.0: @ %entry
-; V4T-NEXT: mov r2, #1065353216
+; V4T-NEXT: mov r1, #1065353216
; V4T-NEXT: cmn r0, #8388608
-; V4T-NEXT: orr r2, r2, #-1073741824
-; V4T-NEXT: mov r1, r0
-; V4T-NEXT: orrlt r1, r2, #1
-; V4T-NEXT: ldr r2, .LCPI11_0
+; V4T-NEXT: orr r1, r1, #-1073741824
+; V4T-NEXT: mov r2, r0
+; V4T-NEXT: orrlt r2, r1, #1
+; V4T-NEXT: ldr r1, .LCPI11_0
; V4T-NEXT: cmp r0, #8388608
-; V4T-NEXT: movge r1, r2
+; V4T-NEXT: movlt r1, r2
; V4T-NEXT: mov r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
define i32 @no_unsigned_sat_incorrect_compare(i32 %x, i32 %y) #0 {
; V4T-LABEL: no_unsigned_sat_incorrect_compare:
; V4T: @ %bb.0: @ %entry
-; V4T-NEXT: ldr r2, .LCPI14_0
; V4T-NEXT: cmp r1, #0
-; V4T-NEXT: mov r1, r0
-; V4T-NEXT: movmi r1, #0
+; V4T-NEXT: mov r2, r0
+; V4T-NEXT: movmi r2, #0
+; V4T-NEXT: ldr r1, .LCPI14_0
; V4T-NEXT: cmp r0, #8388608
-; V4T-NEXT: movge r1, r2
+; V4T-NEXT: movlt r1, r2
; V4T-NEXT: mov r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
;
; V6-LABEL: no_unsigned_sat_incorrect_compare:
; V6: @ %bb.0: @ %entry
-; V6-NEXT: ldr r2, .LCPI14_0
; V6-NEXT: cmp r1, #0
-; V6-NEXT: mov r1, r0
-; V6-NEXT: movmi r1, #0
+; V6-NEXT: mov r2, r0
+; V6-NEXT: movmi r2, #0
+; V6-NEXT: ldr r1, .LCPI14_0
; V6-NEXT: cmp r0, #8388608
-; V6-NEXT: movge r1, r2
+; V6-NEXT: movlt r1, r2
; V6-NEXT: mov r0, r1
; V6-NEXT: bx lr
; V6-NEXT: .p2align 2
; CHECK: add {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
; CHECK: mov {{.*}}[[SCRATCH]], {{.*}}[[RESULT]]
; CHECK: swap {{.*}}[[SCRATCH]]
-; CHECK: add {{.*}}[[SCRATCH]], {{.*}}[[RESULT]]
-; CHECK: andi {{.*}}[[SCRATCH]], 15
-; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
+; CHECK: add {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
+; CHECK: andi {{.*}}[[RESULT]], 15
; CHECK: ret
define i16 @mult16(i16 %a, i16 %b) {
; CHECK-LABEL: mult16:
; CHECK: muls r22, r25
-; CHECK: mov r18, r0
+; CHECK: mov r20, r0
; CHECK: mul r22, r24
-; CHECK: mov r19, r0
-; CHECK: mov r20, r1
+; CHECK: mov r21, r0
+; CHECK: mov r18, r1
; CHECK: clr r1
-; CHECK: add r20, r18
+; CHECK: add r18, r20
; CHECK: muls r23, r24
; CHECK: clr r1
-; CHECK: mov r22, r0
-; CHECK: add r22, r20
+; CHECK: add r18, r0
; :TODO: finish after reworking shift instructions
%mul = mul nsw i16 %b, %a
ret i16 %mul
; CHECK-LABEL: fun6:
; CHECK: afi
; CHECK-NEXT: chi
-; CHECK-NEXT: locrlh
+; CHECK-NEXT: locre
bb:
%tmp = add i32 %arg, -2147483648
%tmp1 = icmp eq i32 %tmp, 0
; CHECK-LABEL: fun7:
; CHECK: afi
; CHECK-NEXT: chi
-; CHECK-NEXT: locrle
+; CHECK-NEXT: locrh
bb:
%tmp = add i32 %arg, -2147483648
%tmp1 = icmp sgt i32 %tmp, 0
%mul = mul i128 %add18, %add
ret i128 %mul
}
-; CHECK: adds r5, r1, r7
+; CHECK: adds r5, r1, r6
; CHECK: mov r5, r4
-; CHECK: adcs r5, r6
+; CHECK: adcs r5, r7
; CHECK: ldr r5, [sp, #12] @ 4-byte Reload
; CHECK: adcs r2, r5
; CHECK: ldr r5, [sp, #16] @ 4-byte Reload
; CHECK: adcs r3, r5
-; CHECK: adds r5, r1, r7
-; CHECK: adcs r4, r6
+; CHECK: adds r6, r1, r6
+; CHECK: adcs r4, r7
; CHECK: ldr r1, [r0, #20]
; CHECK: str r1, [sp, #16] @ 4-byte Spill
-; CHECK: ldr r6, [r0, #28]
+; CHECK: ldr r5, [r0, #28]
; CHECK: ldr r1, [r0, #16]
; CHECK: ldr r7, [r0, #24]
; CHECK: adcs r7, r1
; CHECK: ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK: adcs r6, r0
+; CHECK: adcs r5, r0
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; CHECK-NEXT: movl %ebp, %esi
; CHECK-NEXT: calll callee@PLT
-; CHECK-NEXT: leal (%eax,%ebx), %esi
-; CHECK-NEXT: addl %ebp, %esi
+; CHECK-NEXT: addl %eax, %ebx
+; CHECK-NEXT: addl %ebp, %ebx
+; CHECK-NEXT: movl %ebx, %esi
; CHECK-NEXT: addl $12, %esp
; CHECK-NEXT: retl
%b1 = call x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0)
define i128 @test_i128(i128 %a) nounwind {
; X64-LABEL: test_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rsi, %rdx
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: xorq %rcx, %rdx
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: sarq $63, %rdx
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: xorq %rsi, %rdx
; X64-NEXT: retq
;
; X86-LABEL: test_i128:
; CHECK-LABEL: select_max32_2_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: leaq 2(%rdi), %rax
-; CHECK-NEXT: addq $2147483647, %rdi # imm = 0x7FFFFFFF
+; CHECK-NEXT: leaq 2147483647(%rdi), %rcx
; CHECK-NEXT: cmpq $41, %rsi
-; CHECK-NEXT: cmovneq %rdi, %rax
+; CHECK-NEXT: cmovneq %rcx, %rax
; CHECK-NEXT: retq
%b = icmp ne i64 %x, 41
%s = select i1 %b, i64 2147483647, i64 2
; CHECK-LABEL: select_20_43_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal 43(%rdi), %eax
-; CHECK-NEXT: addl $20, %edi
+; CHECK-NEXT: leal 43(%rdi), %ecx
+; CHECK-NEXT: leal 20(%rdi), %eax
; CHECK-NEXT: cmpq $42, %rsi
-; CHECK-NEXT: cmovgel %edi, %eax
+; CHECK-NEXT: cmovll %ecx, %eax
; CHECK-NEXT: retq
%b = icmp sgt i64 %x, 41
%s = select i1 %b, i32 20, i32 43
; CHECK-LABEL: select_n2_17_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal 17(%rdi), %eax
-; CHECK-NEXT: addl $65534, %edi # imm = 0xFFFE
+; CHECK-NEXT: leal 17(%rdi), %ecx
+; CHECK-NEXT: leal 65534(%rdi), %eax
; CHECK-NEXT: testb $1, %sil
-; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: cmovel %ecx, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
%s = select i1 %b, i16 -2, i16 17
; X64-NEXT: movl %edi, %ebx
; X64-NEXT: leal 8(%rbx), %edi
; X64-NEXT: callq use@PLT
-; X64-NEXT: leal 10(%rbx), %eax
+; X64-NEXT: addl $10, %ebx
+; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
; X64-NEXT: movl %edi, %ebx
; X64-NEXT: leal 8(%rbx), %edi
; X64-NEXT: callq use@PLT
-; X64-NEXT: leal 6(%rbx), %eax
+; X64-NEXT: addl $6, %ebx
+; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
; X64-NEXT: movl %edi, %ebx
; X64-NEXT: leal -8(%rbx), %edi
; X64-NEXT: callq use@PLT
-; X64-NEXT: leal -6(%rbx), %eax
+; X64-NEXT: addl $-6, %ebx
+; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
; X64-NEXT: movl %edi, %ebx
; X64-NEXT: leal -8(%rbx), %edi
; X64-NEXT: callq use@PLT
-; X64-NEXT: leal -10(%rbx), %eax
+; X64-NEXT: addl $-10, %ebx
+; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
;
; NOBMI-X64-LABEL: n8_not_lowbit_mask:
; NOBMI-X64: # %bb.0:
-; NOBMI-X64-NEXT: movl %esi, %eax
-; NOBMI-X64-NEXT: incl %eax
+; NOBMI-X64-NEXT: # kill: def $esi killed $esi def $rsi
+; NOBMI-X64-NEXT: leal 1(%rsi), %eax
; NOBMI-X64-NEXT: notl %eax
; NOBMI-X64-NEXT: andl %edi, %eax
; NOBMI-X64-NEXT: retq
;
; X64-LABEL: n9_sub_is_not_commutative:
; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: decl %eax
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %eax
; X64-NEXT: andl %edi, %eax
; X64-NEXT: subl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: addsd %xmm0, %xmm0
; X64-NEXT: movapd %xmm0, %xmm1
; X64-NEXT: #ARITH_FENCE
-; X64-NEXT: addsd %xmm0, %xmm1
-; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: addsd %xmm1, %xmm0
; X64-NEXT: retq
%1 = fadd fast double %a, %a
%t = call double @llvm.arithmetic.fence.f64(double %1)
; X86-NEXT: addps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, %xmm1
; X86-NEXT: #ARITH_FENCE
-; X86-NEXT: addps %xmm0, %xmm1
-; X86-NEXT: movaps %xmm1, %xmm0
+; X86-NEXT: addps %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: f4:
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, %xmm1
; X64-NEXT: #ARITH_FENCE
-; X64-NEXT: addps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: retq
%1 = fadd fast <2 x float> %a, %a
%t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1)
; X86-NEXT: #ARITH_FENCE
; X86-NEXT: movaps %xmm0, %xmm3
; X86-NEXT: #ARITH_FENCE
-; X86-NEXT: addps %xmm0, %xmm3
-; X86-NEXT: addps %xmm1, %xmm2
-; X86-NEXT: movaps %xmm3, %xmm0
-; X86-NEXT: movaps %xmm2, %xmm1
+; X86-NEXT: addps %xmm3, %xmm0
+; X86-NEXT: addps %xmm2, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: f6:
; X64-NEXT: #ARITH_FENCE
; X64-NEXT: movaps %xmm0, %xmm3
; X64-NEXT: #ARITH_FENCE
-; X64-NEXT: addps %xmm0, %xmm3
-; X64-NEXT: addps %xmm1, %xmm2
-; X64-NEXT: movaps %xmm3, %xmm0
-; X64-NEXT: movaps %xmm2, %xmm1
+; X64-NEXT: addps %xmm3, %xmm0
+; X64-NEXT: addps %xmm2, %xmm1
; X64-NEXT: retq
%1 = fadd fast <8 x float> %a, %a
%t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1)
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE2-NEXT: paddd %xmm4, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm5
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: psubd %xmm1, %xmm5
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: pslld $15, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pslld $15, %xmm0
define i32 @test(i32 %a, i32 %b) {
; CHECK-LABEL: test:
; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def $esi killed $esi def $rsi
; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal -1(%rdi), %eax
-; CHECK-NEXT: addl $1, %esi
-; CHECK-NEXT: imull %esi, %eax
+; CHECK-NEXT: leal -1(%rdi), %ecx
+; CHECK-NEXT: leal 1(%rsi), %eax
+; CHECK-NEXT: imull %ecx, %eax
; CHECK-NEXT: retq
%a1 = add i32 %a, -1
%b1 = add i32 %b, 1
; CHECK-LABEL: mand16:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: andl %esi, %ecx
-; CHECK-NEXT: xorl %esi, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: andl %esi, %eax
+; CHECK-NEXT: xorl %esi, %edi
+; CHECK-NEXT: orl %edi, %eax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: retq
;
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
-; X32-NEXT: subl $20, %esp
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: leal (%edx,%esi), %eax
; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: subl %ecx, %ebx
+; X32-NEXT: subl %esi, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: subl %ecx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, %ecx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: imull %eax, %ebx
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: subl %edi, %eax
+; X32-NEXT: imull %ebx, %eax
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, %ecx
; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: imull %eax, %ecx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %esi
-; X32-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: imull %esi, %eax
-; X32-NEXT: addl %eax, %ebx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: imull %eax, %ebp
+; X32-NEXT: imull %ebp, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl %esi, %edi
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: imull %esi, %edx
-; X32-NEXT: addl %ebp, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: imull %edi, %ecx
-; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: addl $20, %esp
+; X32-NEXT: imull %eax, %edx
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: addl $12, %esp
; X32-NEXT: popl %ebx
; X32-NEXT: popl %ebp
; X32-NEXT: retl
; WIN64-NEXT: pushq %r13
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: movl %eax, %r13d
-; WIN64-NEXT: subl %ecx, %eax
-; WIN64-NEXT: movl %edx, %ebp
-; WIN64-NEXT: subl %edi, %ebp
-; WIN64-NEXT: movl %r9d, %ebx
-; WIN64-NEXT: subl %r10d, %ebx
-; WIN64-NEXT: imull %ebx, %eax
+; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
+; WIN64-NEXT: movl %ecx, %ebx
+; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
+; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15
+; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
+; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12
+; WIN64-NEXT: # kill: def $r11d killed $r11d def $r11
+; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10
+; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
+; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT: leal (%rdx,%rdi), %r13d
+; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx
+; WIN64-NEXT: subl %edi, %edx
+; WIN64-NEXT: leal (%rsi,%r8), %ecx
+; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi
+; WIN64-NEXT: subl %r8d, %esi
+; WIN64-NEXT: leal (%r9,%r10), %r8d
+; WIN64-NEXT: movl %r9d, %ebp
+; WIN64-NEXT: subl %r10d, %ebp
+; WIN64-NEXT: movl %eax, %edi
+; WIN64-NEXT: movl %ebx, %r9d
+; WIN64-NEXT: subl %ebx, %edi
+; WIN64-NEXT: imull %edi, %ebp
+; WIN64-NEXT: leal (%r11,%r12), %edi
; WIN64-NEXT: movl %r11d, %ebx
; WIN64-NEXT: subl %r12d, %ebx
-; WIN64-NEXT: imull %ebp, %ebx
-; WIN64-NEXT: movl %esi, %ebp
-; WIN64-NEXT: subl %r8d, %ebp
-; WIN64-NEXT: addl %ebx, %eax
-; WIN64-NEXT: movl %r14d, %ebx
-; WIN64-NEXT: subl %r15d, %ebx
-; WIN64-NEXT: imull %ebp, %ebx
-; WIN64-NEXT: addl %ebx, %eax
-; WIN64-NEXT: addl %ecx, %r13d
-; WIN64-NEXT: addl %edi, %edx
-; WIN64-NEXT: addl %r8d, %esi
-; WIN64-NEXT: addl %r10d, %r9d
-; WIN64-NEXT: imull %r13d, %r9d
-; WIN64-NEXT: addl %r12d, %r11d
-; WIN64-NEXT: imull %edx, %r11d
-; WIN64-NEXT: addl %r9d, %r11d
-; WIN64-NEXT: addl %r15d, %r14d
-; WIN64-NEXT: imull %esi, %r14d
-; WIN64-NEXT: addl %r11d, %r14d
-; WIN64-NEXT: addl %r14d, %eax
+; WIN64-NEXT: imull %edx, %ebx
+; WIN64-NEXT: addl %ebp, %ebx
+; WIN64-NEXT: leal (%r14,%r15), %edx
+; WIN64-NEXT: movl %r14d, %ebp
+; WIN64-NEXT: subl %r15d, %ebp
+; WIN64-NEXT: imull %esi, %ebp
+; WIN64-NEXT: addl %ebx, %ebp
+; WIN64-NEXT: addl %r9d, %eax
+; WIN64-NEXT: imull %r8d, %eax
+; WIN64-NEXT: imull %r13d, %edi
+; WIN64-NEXT: addl %edi, %eax
+; WIN64-NEXT: imull %ecx, %edx
+; WIN64-NEXT: addl %edx, %eax
+; WIN64-NEXT: addl %ebp, %eax
; WIN64-NEXT: popq %rbx
; WIN64-NEXT: popq %rbp
; WIN64-NEXT: popq %r13
; LINUXOSX64: # %bb.0:
; LINUXOSX64-NEXT: pushq %rbp
; LINUXOSX64-NEXT: pushq %rbx
-; LINUXOSX64-NEXT: movl %eax, %r10d
-; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d
-; LINUXOSX64-NEXT: subl %ecx, %eax
-; LINUXOSX64-NEXT: movl %edx, %ebx
-; LINUXOSX64-NEXT: subl %edi, %ebx
-; LINUXOSX64-NEXT: movl %r9d, %ebp
-; LINUXOSX64-NEXT: subl %r12d, %ebp
-; LINUXOSX64-NEXT: imull %ebp, %eax
-; LINUXOSX64-NEXT: movl %r13d, %ebp
-; LINUXOSX64-NEXT: subl %r14d, %ebp
-; LINUXOSX64-NEXT: imull %ebx, %ebp
-; LINUXOSX64-NEXT: movl %esi, %ebx
-; LINUXOSX64-NEXT: subl %r8d, %ebx
-; LINUXOSX64-NEXT: addl %ebp, %eax
-; LINUXOSX64-NEXT: movl %r15d, %ebp
-; LINUXOSX64-NEXT: subl %r11d, %ebp
-; LINUXOSX64-NEXT: imull %ebx, %ebp
+; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx
+; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi
+; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14
+; LINUXOSX64-NEXT: # kill: def $r13d killed $r13d def $r13
+; LINUXOSX64-NEXT: # kill: def $r12d killed $r12d def $r12
+; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d def $r9
+; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8
+; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi
+; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d
+; LINUXOSX64-NEXT: movl %edx, %ebp
+; LINUXOSX64-NEXT: subl %edi, %ebp
+; LINUXOSX64-NEXT: leal (%rsi,%r8), %r11d
+; LINUXOSX64-NEXT: # kill: def $esi killed $esi killed $rsi
+; LINUXOSX64-NEXT: subl %r8d, %esi
+; LINUXOSX64-NEXT: leal (%r9,%r12), %r8d
+; LINUXOSX64-NEXT: movl %r9d, %edi
+; LINUXOSX64-NEXT: subl %r12d, %edi
+; LINUXOSX64-NEXT: movl %eax, %edx
+; LINUXOSX64-NEXT: subl %ecx, %edx
+; LINUXOSX64-NEXT: imull %edx, %edi
+; LINUXOSX64-NEXT: leal (%r13,%r14), %edx
+; LINUXOSX64-NEXT: movl %r13d, %ebx
+; LINUXOSX64-NEXT: subl %r14d, %ebx
+; LINUXOSX64-NEXT: imull %ebp, %ebx
+; LINUXOSX64-NEXT: movl 24(%rsp), %ebp
+; LINUXOSX64-NEXT: addl %edi, %ebx
+; LINUXOSX64-NEXT: movl %r15d, %edi
+; LINUXOSX64-NEXT: subl %ebp, %edi
+; LINUXOSX64-NEXT: imull %esi, %edi
+; LINUXOSX64-NEXT: addl %ebx, %edi
+; LINUXOSX64-NEXT: addl %ecx, %eax
+; LINUXOSX64-NEXT: imull %r8d, %eax
+; LINUXOSX64-NEXT: imull %r10d, %edx
+; LINUXOSX64-NEXT: addl %edx, %eax
+; LINUXOSX64-NEXT: addl %r15d, %ebp
+; LINUXOSX64-NEXT: imull %r11d, %ebp
; LINUXOSX64-NEXT: addl %ebp, %eax
-; LINUXOSX64-NEXT: addl %ecx, %r10d
-; LINUXOSX64-NEXT: addl %edi, %edx
-; LINUXOSX64-NEXT: addl %r8d, %esi
-; LINUXOSX64-NEXT: addl %r12d, %r9d
-; LINUXOSX64-NEXT: imull %r10d, %r9d
-; LINUXOSX64-NEXT: addl %r14d, %r13d
-; LINUXOSX64-NEXT: imull %edx, %r13d
-; LINUXOSX64-NEXT: addl %r9d, %r13d
-; LINUXOSX64-NEXT: addl %r11d, %r15d
-; LINUXOSX64-NEXT: imull %esi, %r15d
-; LINUXOSX64-NEXT: addl %r13d, %r15d
-; LINUXOSX64-NEXT: addl %r15d, %eax
+; LINUXOSX64-NEXT: addl %edi, %eax
; LINUXOSX64-NEXT: popq %rbx
; LINUXOSX64-NEXT: popq %rbp
; LINUXOSX64-NEXT: retq
; CHECK-LABEL: mand32:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: andl %esi, %ecx
-; CHECK-NEXT: xorl %esi, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: andl %esi, %eax
+; CHECK-NEXT: xorl %esi, %edi
+; CHECK-NEXT: orl %edi, %eax
; CHECK-NEXT: retq
%ma = bitcast i32 %x to <32 x i1>
%mb = bitcast i32 %y to <32 x i1>
; CHECK-LABEL: mand64:
; CHECK: ## %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: xorq %rsi, %rax
-; CHECK-NEXT: orq %rcx, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: xorq %rsi, %rdi
+; CHECK-NEXT: orq %rdi, %rax
; CHECK-NEXT: retq
%ma = bitcast i64 %x to <64 x i1>
%mb = bitcast i64 %y to <64 x i1>
; CHECK-LABEL: mand8:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: andb %sil, %cl
-; CHECK-NEXT: xorb %sil, %al
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: ## kill: def $al killed $al killed $eax
+; CHECK-NEXT: andb %sil, %al
+; CHECK-NEXT: xorb %sil, %dil
+; CHECK-NEXT: orb %dil, %al
; CHECK-NEXT: retq
%ma = bitcast i8 %x to <8 x i1>
%mb = bitcast i8 %y to <8 x i1>
;
; X64-LABEL: test_bitreverse_i8:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: rolb $4, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $51, %al
; X64-NEXT: addb %al, %al
; X64-NEXT: shrb %dil
; X64-NEXT: andb $85, %dil
-; X64-NEXT: addl %edi, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: orb %dil, %al
; X64-NEXT: retq
;
; X86XOP-LABEL: test_bitreverse_i8:
;
; X64-LABEL: test_bitreverse_i4:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: rolb $4, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $51, %al
; X64-NEXT: addb %al, %al
; X64-NEXT: shrb %dil
; X64-NEXT: andb $80, %dil
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: orb %dil, %al
; X64-NEXT: shrb $4, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86XOP-LABEL: test_bitreverse_i4:
;
; CHECK64-LABEL: test1:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: movl %edi, %eax
; CHECK64-NEXT: movl %edi, %ecx
; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000
-; CHECK64-NEXT: movl %edi, %edx
-; CHECK64-NEXT: orl $-16777216, %edx # imm = 0xFF000000
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; CHECK64-NEXT: shll $8, %ecx
-; CHECK64-NEXT: shrl $8, %edx
-; CHECK64-NEXT: orl %ecx, %edx
-; CHECK64-NEXT: bswapl %eax
-; CHECK64-NEXT: shrl $16, %eax
-; CHECK64-NEXT: orl %edx, %eax
+; CHECK64-NEXT: shrl $8, %eax
+; CHECK64-NEXT: orl %ecx, %eax
+; CHECK64-NEXT: bswapl %edi
+; CHECK64-NEXT: shrl $16, %edi
+; CHECK64-NEXT: orl %edi, %eax
; CHECK64-NEXT: retq
%byte0 = and i32 %x, 255 ; 0x000000ff
%byte1 = and i32 %x, 65280 ; 0x0000ff00
; CHECK64-NEXT: andl $-16777216, %edi # imm = 0xFF000000
; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000
; CHECK64-NEXT: orl %edi, %eax
-; CHECK64-NEXT: addl %ecx, %eax
+; CHECK64-NEXT: orl %ecx, %eax
; CHECK64-NEXT: retq
%byte1 = lshr i32 %x, 8
%byte0 = shl i32 %x, 8
; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: pcmpeqw %xmm0, %xmm1
-; CHECK-NEXT: pcmpeqw %xmm2, %xmm0
-; CHECK-NEXT: packsswb %xmm1, %xmm0
+; CHECK-NEXT: pcmpeqw %xmm0, %xmm2
+; CHECK-NEXT: packsswb %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%zx = zext <16 x i8> %x to <16 x i16>
%zy = zext <16 x i8> %y to <16 x i16>
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllq $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psllq $4, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: psllq $2, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
; SSE-NEXT: paddd %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: pxor %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; CHECK-NEXT: por %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
; SSE41-LABEL: combine_vec_sdiv_by_pos1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $4, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld $2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrld $4, %xmm0
-; SSE41-NEXT: psrld $2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $15, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
-; SSE41-NEXT: pmulhuw %xmm4, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
+; SSE41-NEXT: pmulhuw %xmm3, %xmm2
; SSE41-NEXT: paddw %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768>
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pmulhw %xmm5, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,16384,32768,4096,8192,2048,1024,32768>
+; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: pmulhw %xmm4, %xmm5
; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psraw $15, %xmm3
-; SSE41-NEXT: pmulhuw %xmm4, %xmm3
-; SSE41-NEXT: paddw %xmm1, %xmm3
-; SSE41-NEXT: pmulhw %xmm3, %xmm5
-; SSE41-NEXT: psraw $1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $15, %xmm2
+; SSE41-NEXT: pmulhuw %xmm3, %xmm2
+; SSE41-NEXT: paddw %xmm1, %xmm2
+; SSE41-NEXT: pmulhw %xmm2, %xmm4
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psraw $15, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
-; SSE41-NEXT: pmulhuw %xmm7, %xmm0
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768>
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmulhw %xmm6, %xmm5
-; SSE41-NEXT: psraw $1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psraw $15, %xmm1
-; SSE41-NEXT: pmulhuw %xmm7, %xmm1
-; SSE41-NEXT: paddw %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmulhw %xmm6, %xmm5
-; SSE41-NEXT: psraw $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $15, %xmm4
-; SSE41-NEXT: pmulhuw %xmm7, %xmm4
-; SSE41-NEXT: paddw %xmm2, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pmulhw %xmm6, %xmm5
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psraw $15, %xmm5
-; SSE41-NEXT: pmulhuw %xmm7, %xmm5
-; SSE41-NEXT: paddw %xmm3, %xmm5
-; SSE41-NEXT: pmulhw %xmm5, %xmm6
-; SSE41-NEXT: psraw $1, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: psraw $15, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,4,2,16,8,32,64,2>
+; SSE41-NEXT: pmulhuw %xmm5, %xmm6
+; SSE41-NEXT: paddw %xmm0, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,16384,32768,4096,8192,2048,1024,32768>
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pmulhw %xmm4, %xmm7
+; SSE41-NEXT: psraw $1, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psraw $15, %xmm6
+; SSE41-NEXT: pmulhuw %xmm5, %xmm6
+; SSE41-NEXT: paddw %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pmulhw %xmm4, %xmm7
+; SSE41-NEXT: psraw $1, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: psraw $15, %xmm6
+; SSE41-NEXT: pmulhuw %xmm5, %xmm6
+; SSE41-NEXT: paddw %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pmulhw %xmm4, %xmm7
+; SSE41-NEXT: psraw $1, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psraw $15, %xmm6
+; SSE41-NEXT: pmulhuw %xmm5, %xmm6
+; SSE41-NEXT: paddw %xmm3, %xmm6
+; SSE41-NEXT: pmulhw %xmm6, %xmm4
+; SSE41-NEXT: psraw $1, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: psrad $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrld $28, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrld $30, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
-; SSE41-NEXT: paddd %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld $29, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrad $4, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrad $2, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: psrad $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: psrad $3, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrld $28, %xmm5
-; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: psrld $30, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrld $29, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrad $4, %xmm5
-; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: psrad $2, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: psrad $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrad $3, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrld $28, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: psrld $30, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
-; SSE41-NEXT: paddd %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrld $29, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; SSE41-NEXT: paddd %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrad $4, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: psrad $2, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: psrad $3, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psrad $31, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: psrld $28, %xmm2
-; SSE41-NEXT: movdqa %xmm5, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: psrld $28, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: psrld $30, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
-; SSE41-NEXT: paddd %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: psrad $4, %xmm2
-; SSE41-NEXT: movdqa %xmm5, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrld $29, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; SSE41-NEXT: paddd %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: psrad $4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: psrad $2, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrad $3, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: psrlq $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: psrlq $62, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: psrlq $62, %xmm2
+; SSE41-NEXT: paddq %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrad $2, %xmm3
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: psrlq $2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: psrlq $62, %xmm0
-; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $2, %xmm2
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrlq $62, %xmm2
-; SSE41-NEXT: paddq %xmm5, %xmm2
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: psrlq $62, %xmm1
+; SSE41-NEXT: paddq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrad $2, %xmm5
+; SSE41-NEXT: psrlq $2, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrad $2, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: psrlq $62, %xmm1
+; SSE41-NEXT: paddq %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrad $2, %xmm5
+; SSE41-NEXT: psrlq $2, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubd %xmm3, %xmm2
; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_2:
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $15, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_nonuniform:
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8>
-; SSE41-NEXT: pmulhw %xmm0, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <4,256,256,u,u,512,256,8>
+; SSE41-NEXT: pmulhw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT: psrlw $15, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
+; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
define i32 @combine_sdiv_two(i32 %x) {
; CHECK-LABEL: combine_sdiv_two:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
define i32 @combine_sdiv_negtwo(i32 %x) {
; CHECK-LABEL: combine_sdiv_negtwo:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: addl %edi, %eax
define i8 @combine_i8_sdiv_pow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_pow2:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $4, %al
-; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $4, %al
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = sdiv i8 %x, 16
ret i8 %1
define i8 @combine_i8_sdiv_negpow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_negpow2:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarb $7, %al
; CHECK-NEXT: shrb $2, %al
-; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: addb %dil, %al
; CHECK-NEXT: sarb $6, %al
; CHECK-NEXT: negb %al
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%1 = sdiv i8 %x, -64
ret i8 %1
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pmaxub %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i8_nosignbit:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pmaxsb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pmaxsb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: test_v16i8_nosignbit:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE42-NEXT: pand %xmm2, %xmm0
-; SSE42-NEXT: pand %xmm2, %xmm1
-; SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; SSE42-NEXT: pand %xmm1, %xmm2
+; SSE42-NEXT: pmaxsb %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: test_v16i8_nosignbit:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pminub %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pminub %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i8_nosignbit:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pminsb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pminsb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: test_v16i8_nosignbit:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE42-NEXT: pand %xmm2, %xmm0
-; SSE42-NEXT: pand %xmm2, %xmm1
-; SSE42-NEXT: pminsb %xmm1, %xmm0
+; SSE42-NEXT: pand %xmm1, %xmm2
+; SSE42-NEXT: pminsb %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: test_v16i8_nosignbit:
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: psrad $1, %xmm0
; SSE-NEXT: psrad $3, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: psrad $1, %xmm0
; SSE-NEXT: psrad $3, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_srem_by_pow2b_neg:
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pshufb %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $4, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pshufb %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrlw $4, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pshufb %xmm1, %xmm2
+; SSE-NEXT: pshufb %xmm3, %xmm1
+; SSE-NEXT: pcmpeqb %xmm4, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: paddb %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: paddb %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: psrlw $8, %xmm3
+; SSE-NEXT: paddw %xmm1, %xmm3
; SSE-NEXT: pcmpeqw %xmm4, %xmm0
; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: psrld $5, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: psrld $16, %xmm3
+; SSE-NEXT: paddd %xmm3, %xmm0
+; SSE-NEXT: psrld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
;
; SSE41-LABEL: combine_vec_udiv_by_pow2b:
; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $4, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld $2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrld $4, %xmm0
-; SSE41-NEXT: psrld $2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2b:
define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform2:
define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform4:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-LABEL: pr38477:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115]
-; SSE2-NEXT: pmulhuw %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38477:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115]
-; SSE41-NEXT: pmulhuw %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psubw %xmm2, %xmm1
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,1024,1024,16,4,1024,u,4096>
-; SSE41-NEXT: pmulhuw %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115]
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubw %xmm1, %xmm2
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: paddw %xmm1, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,1024,1024,16,4,1024,u,4096>
+; SSE41-NEXT: pmulhuw %xmm2, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: pr38477:
;
; NO-POPCOUNT-LABEL: test4:
; NO-POPCOUNT: # %bb.0:
-; NO-POPCOUNT-NEXT: # kill: def $edi killed $edi def $rdi
; NO-POPCOUNT-NEXT: andb $127, %dil
; NO-POPCOUNT-NEXT: movl %edi, %eax
; NO-POPCOUNT-NEXT: shrb %al
; NO-POPCOUNT-NEXT: addb %al, %dil
; NO-POPCOUNT-NEXT: movl %edi, %eax
; NO-POPCOUNT-NEXT: shrb $4, %al
-; NO-POPCOUNT-NEXT: addl %edi, %eax
+; NO-POPCOUNT-NEXT: addb %dil, %al
; NO-POPCOUNT-NEXT: andb $15, %al
-; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax
; NO-POPCOUNT-NEXT: retq
%x2 = and i8 %x, 127
%count = tail call i8 @llvm.ctpop.i8(i8 %x2)
; X64-FAST-LABEL: var_shift_i128:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: movq %r8, %r9
-; X64-FAST-NEXT: movq %rcx, %r10
-; X64-FAST-NEXT: movq %rdx, %r8
-; X64-FAST-NEXT: movq %rsi, %rdx
+; X64-FAST-NEXT: movq %rcx, %r8
; X64-FAST-NEXT: movl %r9d, %ecx
-; X64-FAST-NEXT: shldq %cl, %rdi, %rdx
-; X64-FAST-NEXT: shrdq $1, %r10, %r8
-; X64-FAST-NEXT: shrq %r10
+; X64-FAST-NEXT: shldq %cl, %rdi, %rsi
+; X64-FAST-NEXT: shrdq $1, %r8, %rdx
+; X64-FAST-NEXT: shrq %r8
; X64-FAST-NEXT: notb %cl
-; X64-FAST-NEXT: shrdq %cl, %r10, %r8
-; X64-FAST-NEXT: shrq %cl, %r10
+; X64-FAST-NEXT: shrdq %cl, %r8, %rdx
+; X64-FAST-NEXT: shrq %cl, %r8
; X64-FAST-NEXT: xorl %eax, %eax
; X64-FAST-NEXT: testb $64, %cl
-; X64-FAST-NEXT: cmovneq %r10, %r8
-; X64-FAST-NEXT: cmovneq %rax, %r10
+; X64-FAST-NEXT: cmovneq %r8, %rdx
+; X64-FAST-NEXT: cmovneq %rax, %r8
; X64-FAST-NEXT: movl %r9d, %ecx
; X64-FAST-NEXT: shlq %cl, %rdi
; X64-FAST-NEXT: testb $64, %r9b
-; X64-FAST-NEXT: cmovneq %rdi, %rdx
+; X64-FAST-NEXT: cmovneq %rdi, %rsi
; X64-FAST-NEXT: cmoveq %rdi, %rax
-; X64-FAST-NEXT: orq %r8, %rax
-; X64-FAST-NEXT: orq %r10, %rdx
+; X64-FAST-NEXT: orq %rdx, %rax
+; X64-FAST-NEXT: orq %rsi, %r8
+; X64-FAST-NEXT: movq %r8, %rdx
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i128:
;
; X64-FAST-LABEL: var_shift_i128:
; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movq %r8, %r10
-; X64-FAST-NEXT: movq %rcx, %r9
-; X64-FAST-NEXT: movq %rdx, %r8
-; X64-FAST-NEXT: movq %rsi, %rdx
-; X64-FAST-NEXT: movl %r10d, %ecx
-; X64-FAST-NEXT: shrdq %cl, %r9, %r8
-; X64-FAST-NEXT: shrq %cl, %r9
+; X64-FAST-NEXT: movq %r8, %r9
+; X64-FAST-NEXT: movq %rcx, %r8
+; X64-FAST-NEXT: movl %r9d, %ecx
+; X64-FAST-NEXT: shrdq %cl, %r8, %rdx
+; X64-FAST-NEXT: shrq %cl, %r8
; X64-FAST-NEXT: xorl %eax, %eax
-; X64-FAST-NEXT: testb $64, %r10b
-; X64-FAST-NEXT: cmovneq %r9, %r8
-; X64-FAST-NEXT: cmovneq %rax, %r9
-; X64-FAST-NEXT: shldq $1, %rdi, %rdx
+; X64-FAST-NEXT: testb $64, %r9b
+; X64-FAST-NEXT: cmovneq %r8, %rdx
+; X64-FAST-NEXT: cmovneq %rax, %r8
+; X64-FAST-NEXT: shldq $1, %rdi, %rsi
; X64-FAST-NEXT: addq %rdi, %rdi
-; X64-FAST-NEXT: notb %r10b
-; X64-FAST-NEXT: movl %r10d, %ecx
-; X64-FAST-NEXT: shldq %cl, %rdi, %rdx
+; X64-FAST-NEXT: notb %r9b
+; X64-FAST-NEXT: movl %r9d, %ecx
+; X64-FAST-NEXT: shldq %cl, %rdi, %rsi
; X64-FAST-NEXT: shlq %cl, %rdi
-; X64-FAST-NEXT: testb $64, %r10b
-; X64-FAST-NEXT: cmovneq %rdi, %rdx
+; X64-FAST-NEXT: testb $64, %r9b
+; X64-FAST-NEXT: cmovneq %rdi, %rsi
; X64-FAST-NEXT: cmoveq %rdi, %rax
-; X64-FAST-NEXT: orq %r8, %rax
-; X64-FAST-NEXT: orq %r9, %rdx
+; X64-FAST-NEXT: orq %rdx, %rax
+; X64-FAST-NEXT: orq %rsi, %r8
+; X64-FAST-NEXT: movq %r8, %rdx
; X64-FAST-NEXT: retq
;
; X64-SLOW-LABEL: var_shift_i128:
; X64-AVX2-LABEL: fshl_i128:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movq %r8, %r9
-; X64-AVX2-NEXT: movq %rcx, %r10
-; X64-AVX2-NEXT: movq %rdx, %r8
-; X64-AVX2-NEXT: movq %rsi, %rdx
+; X64-AVX2-NEXT: movq %rcx, %r8
; X64-AVX2-NEXT: movl %r9d, %ecx
-; X64-AVX2-NEXT: shldq %cl, %rdi, %rdx
-; X64-AVX2-NEXT: shrdq $1, %r10, %r8
-; X64-AVX2-NEXT: shrq %r10
+; X64-AVX2-NEXT: shldq %cl, %rdi, %rsi
+; X64-AVX2-NEXT: shrdq $1, %r8, %rdx
+; X64-AVX2-NEXT: shrq %r8
; X64-AVX2-NEXT: notb %cl
-; X64-AVX2-NEXT: shrdq %cl, %r10, %r8
-; X64-AVX2-NEXT: shrq %cl, %r10
+; X64-AVX2-NEXT: shrdq %cl, %r8, %rdx
+; X64-AVX2-NEXT: shrq %cl, %r8
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: testb $64, %cl
-; X64-AVX2-NEXT: cmovneq %r10, %r8
-; X64-AVX2-NEXT: cmovneq %rax, %r10
+; X64-AVX2-NEXT: cmovneq %r8, %rdx
+; X64-AVX2-NEXT: cmovneq %rax, %r8
; X64-AVX2-NEXT: movl %r9d, %ecx
; X64-AVX2-NEXT: shlq %cl, %rdi
; X64-AVX2-NEXT: testb $64, %r9b
-; X64-AVX2-NEXT: cmovneq %rdi, %rdx
+; X64-AVX2-NEXT: cmovneq %rdi, %rsi
; X64-AVX2-NEXT: cmoveq %rdi, %rax
-; X64-AVX2-NEXT: orq %r8, %rax
-; X64-AVX2-NEXT: orq %r10, %rdx
+; X64-AVX2-NEXT: orq %rdx, %rax
+; X64-AVX2-NEXT: orq %rsi, %r8
+; X64-AVX2-NEXT: movq %r8, %rdx
; X64-AVX2-NEXT: retq
%f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
ret i128 %f
; SSE3-NEXT: movaps %xmm0, %xmm5
; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE3-NEXT: paddd %xmm4, %xmm2
+; SSE3-NEXT: paddd %xmm2, %xmm4
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT: paddd %xmm5, %xmm0
-; SSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE3-NEXT: movdqa %xmm4, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_8i32_v8i32_shuffle:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test5_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
-; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
-; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_003_2:
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT: movapd %xmm0, %xmm3
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
-; SSE-SLOW-NEXT: addpd %xmm0, %xmm3
+; SSE-SLOW-NEXT: addpd %xmm3, %xmm0
; SSE-SLOW-NEXT: addpd %xmm2, %xmm1
-; SSE-SLOW-NEXT: movapd %xmm3, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_011:
; SSE-SLOW-LABEL: PR45747_2:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
-; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_8:
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_16:
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v4f32:
; SSE2-NEXT: addpd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addsd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: addsd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v4f64:
; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-SLOW-NEXT: addsd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v4f64:
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-SLOW-NEXT: addss %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v8f32:
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
-; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
+; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm0
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSSE3-FAST-NEXT: addps %xmm2, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSSE3-FAST-NEXT: addps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2]
-; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0
+; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
+; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; X86-NOBMI-NEXT: pushl %ebx
; X86-NOBMI-NEXT: pushl %edi
; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT: movl %esi, %eax
-; X86-NOBMI-NEXT: mull %ebx
-; X86-NOBMI-NEXT: movl %edx, %edi
-; X86-NOBMI-NEXT: movl %ebp, %eax
-; X86-NOBMI-NEXT: mull %ebx
+; X86-NOBMI-NEXT: movl %edi, %eax
+; X86-NOBMI-NEXT: mull %esi
; X86-NOBMI-NEXT: movl %edx, %ebx
+; X86-NOBMI-NEXT: movl %ebp, %eax
+; X86-NOBMI-NEXT: mull %esi
+; X86-NOBMI-NEXT: movl %edx, %esi
; X86-NOBMI-NEXT: movl %eax, %ebp
-; X86-NOBMI-NEXT: addl %edi, %ebp
-; X86-NOBMI-NEXT: adcl $0, %ebx
-; X86-NOBMI-NEXT: movl %esi, %eax
+; X86-NOBMI-NEXT: addl %ebx, %ebp
+; X86-NOBMI-NEXT: adcl $0, %esi
+; X86-NOBMI-NEXT: movl %edi, %eax
; X86-NOBMI-NEXT: mull %ecx
-; X86-NOBMI-NEXT: movl %edx, %esi
; X86-NOBMI-NEXT: addl %ebp, %eax
-; X86-NOBMI-NEXT: adcl %ebx, %esi
+; X86-NOBMI-NEXT: adcl %edx, %esi
; X86-NOBMI-NEXT: setb %al
; X86-NOBMI-NEXT: movzbl %al, %edi
; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
;
; X64-LABEL: test_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rsi, %rdx
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: xorq %rcx, %rdx
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: sarq $63, %rdx
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: xorq %rsi, %rdx
; X64-NEXT: retq
%tmp1neg = sub i128 0, %a
%b = icmp sgt i128 %a, -1
define i32 @mul33_32(i32 %A) {
; X64-LABEL: mul33_32:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: addl %edi, %eax
define i32 @test2(i32 %a) {
; X64-LABEL: test2:
; X64: # %bb.0: # %entry
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: addl %edi, %eax
define i32 @test3(i32 %a) {
; X64-LABEL: test3:
; X64: # %bb.0: # %entry
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: addl %edi, %eax
; SSE41-NEXT: psubd %xmm3, %xmm1
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm2
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-FALLBACK-LABEL: vec128_i32_signed_reg_reg:
;
; SSE41-LABEL: vec128_i32_signed_reg_mem:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pminsd %xmm2, %xmm3
-; SSE41-NEXT: pmaxsd %xmm0, %xmm2
-; SSE41-NEXT: psubd %xmm3, %xmm2
-; SSE41-NEXT: psrld $1, %xmm2
+; SSE41-NEXT: pminsd %xmm1, %xmm3
+; SSE41-NEXT: pmaxsd %xmm0, %xmm1
+; SSE41-NEXT: psubd %xmm3, %xmm1
+; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: pmulld %xmm2, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-FALLBACK-LABEL: vec128_i32_signed_reg_mem:
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: psubq %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psrlq $1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: psrlq $33, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrlq $32, %xmm1
-; SSE2-NEXT: pmuludq %xmm4, %xmm1
-; SSE2-NEXT: paddq %xmm3, %xmm1
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: paddq %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm1, %xmm4
+; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_reg_reg:
; SSE41-NEXT: pmuludq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
+; SSE41-NEXT: pmuludq %xmm4, %xmm3
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: psubq %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psrlq $1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: psrlq $33, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrlq $32, %xmm1
-; SSE2-NEXT: pmuludq %xmm4, %xmm1
-; SSE2-NEXT: paddq %xmm3, %xmm1
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: paddq %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm1, %xmm4
+; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_unsigned_reg_reg:
; SSE41-NEXT: pmuludq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
+; SSE41-NEXT: pmuludq %xmm4, %xmm3
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa (%rdi), %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: pxor %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; SSE41-NEXT: pand %xmm4, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: por %xmm7, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: por %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq $1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $1, %xmm2
; SSE41-NEXT: psrlq $33, %xmm1
; SSE41-NEXT: pmuludq %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: psrlq $32, %xmm2
-; SSE41-NEXT: pmuludq %xmm0, %xmm2
-; SSE41-NEXT: paddq %xmm1, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: pmuludq %xmm4, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: psrlq $32, %xmm0
+; SSE41-NEXT: pmuludq %xmm2, %xmm0
+; SSE41-NEXT: paddq %xmm1, %xmm0
+; SSE41-NEXT: psllq $32, %xmm0
+; SSE41-NEXT: pmuludq %xmm4, %xmm2
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
; SSE41-NEXT: retq
;
define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr) nounwind {
; SSE2-LABEL: vec128_i64_signed_reg_mem:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pand %xmm7, %xmm5
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psrlq $1, %xmm4
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrlq $32, %xmm1
-; SSE2-NEXT: pmuludq %xmm4, %xmm1
-; SSE2-NEXT: paddq %xmm3, %xmm1
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: paddq %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: psubq %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq $1, %xmm3
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: paddq %xmm2, %xmm4
+; SSE2-NEXT: psllq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_reg_mem:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa (%rdi), %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; SSE41-NEXT: pand %xmm4, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: por %xmm7, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
; SSE41-NEXT: por %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: psrlw $1, %xmm1
; SSE-NEXT: pmullw %xmm1, %xmm2
-; SSE-NEXT: paddw %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind {
; SSE2-LABEL: vec128_i16_unsigned_reg_reg:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtw %xmm2, %xmm3
-; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm1, %xmm3
; SSE2-NEXT: psubusw %xmm0, %xmm1
-; SSE2-NEXT: psubw %xmm0, %xmm2
-; SSE2-NEXT: paddw %xmm1, %xmm2
-; SSE2-NEXT: paddw %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pmullw %xmm3, %xmm2
-; SSE2-NEXT: paddw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psubw %xmm0, %xmm3
+; SSE2-NEXT: paddw %xmm1, %xmm3
+; SSE2-NEXT: paddw %xmm0, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i16_unsigned_reg_reg:
define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, <8 x i16>* %a2_addr) nounwind {
; SSE-LABEL: vec128_i16_signed_reg_mem:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pcmpgtw %xmm2, %xmm1
-; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpgtw %xmm1, %xmm2
+; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pminsw %xmm2, %xmm3
-; SSE-NEXT: pmaxsw %xmm0, %xmm2
-; SSE-NEXT: psubw %xmm3, %xmm2
-; SSE-NEXT: psrlw $1, %xmm2
+; SSE-NEXT: pminsw %xmm1, %xmm3
+; SSE-NEXT: pmaxsw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm3, %xmm1
+; SSE-NEXT: psrlw $1, %xmm1
; SSE-NEXT: pmullw %xmm2, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind {
; SSE2-LABEL: vec128_i8_signed_reg_reg:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: psubb %xmm4, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: psubb %xmm4, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm1, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: packuswb %xmm4, %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i8_signed_reg_reg:
; SSE2-LABEL: vec128_i8_signed_reg_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: psubb %xmm4, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: psubb %xmm4, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i8_signed_reg_mem:
;
; X64-LABEL: test_mul_by_17:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $4, %eax
; X64-NEXT: addl %edi, %eax
;
; X64-LABEL: test_mul_by_17:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $4, %eax
; X64-NEXT: addl %edi, %eax
;
; X64-SLM-LABEL: test_mul_by_66:
; X64-SLM: # %bb.0:
-; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: movl %edi, %eax
; X64-SLM-NEXT: shll $6, %eax
; X64-SLM-NEXT: addl %edi, %eax
define i64 @test_mul_spec(i64 %x) nounwind {
; X86-LABEL: test_mul_spec:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl $9, %ecx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: leal (%ebx,%ebx,8), %edi
-; X86-NEXT: addl $42, %ecx
-; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: leal (%ebp,%ebp,8), %eax
+; X86-NEXT: addl $42, %esi
+; X86-NEXT: adcl %eax, %ecx
; X86-NEXT: movl $5, %edx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: leal (%ebx,%ebx,4), %ebx
-; X86-NEXT: addl $2, %esi
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: imull %ecx, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: leal (%ebp,%ebp,4), %eax
+; X86-NEXT: addl $2, %edi
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: imull %esi, %ebx
; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: imull %edi, %esi
-; X86-NEXT: addl %esi, %edx
+; X86-NEXT: imull %ecx, %edi
+; X86-NEXT: addl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X86-NOOPT-LABEL: test_mul_spec:
; X86-NOOPT: # %bb.0:
+; X86-NOOPT-NEXT: pushl %ebp
; X86-NOOPT-NEXT: pushl %ebx
; X86-NOOPT-NEXT: pushl %edi
; X86-NOOPT-NEXT: pushl %esi
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NOOPT-NEXT: movl $9, %ecx
-; X86-NOOPT-NEXT: movl %esi, %eax
+; X86-NOOPT-NEXT: movl %edi, %eax
; X86-NOOPT-NEXT: mull %ecx
-; X86-NOOPT-NEXT: movl %eax, %ecx
-; X86-NOOPT-NEXT: leal (%ebx,%ebx,8), %edi
-; X86-NOOPT-NEXT: addl $42, %ecx
-; X86-NOOPT-NEXT: adcl %edx, %edi
+; X86-NOOPT-NEXT: movl %eax, %esi
+; X86-NOOPT-NEXT: movl %edx, %ecx
+; X86-NOOPT-NEXT: leal (%ebp,%ebp,8), %eax
+; X86-NOOPT-NEXT: addl $42, %esi
+; X86-NOOPT-NEXT: adcl %eax, %ecx
; X86-NOOPT-NEXT: movl $5, %edx
-; X86-NOOPT-NEXT: movl %esi, %eax
+; X86-NOOPT-NEXT: movl %edi, %eax
; X86-NOOPT-NEXT: mull %edx
-; X86-NOOPT-NEXT: movl %eax, %esi
-; X86-NOOPT-NEXT: leal (%ebx,%ebx,4), %ebx
-; X86-NOOPT-NEXT: addl $2, %esi
-; X86-NOOPT-NEXT: adcl %edx, %ebx
-; X86-NOOPT-NEXT: movl %ecx, %eax
-; X86-NOOPT-NEXT: mull %esi
-; X86-NOOPT-NEXT: imull %ecx, %ebx
+; X86-NOOPT-NEXT: movl %eax, %edi
+; X86-NOOPT-NEXT: movl %edx, %ebx
+; X86-NOOPT-NEXT: leal (%ebp,%ebp,4), %eax
+; X86-NOOPT-NEXT: addl $2, %edi
+; X86-NOOPT-NEXT: adcl %eax, %ebx
+; X86-NOOPT-NEXT: movl %esi, %eax
+; X86-NOOPT-NEXT: mull %edi
+; X86-NOOPT-NEXT: imull %esi, %ebx
; X86-NOOPT-NEXT: addl %ebx, %edx
-; X86-NOOPT-NEXT: imull %edi, %esi
-; X86-NOOPT-NEXT: addl %esi, %edx
+; X86-NOOPT-NEXT: imull %ecx, %edi
+; X86-NOOPT-NEXT: addl %edi, %edx
; X86-NOOPT-NEXT: popl %esi
; X86-NOOPT-NEXT: popl %edi
; X86-NOOPT-NEXT: popl %ebx
+; X86-NOOPT-NEXT: popl %ebp
; X86-NOOPT-NEXT: retl
;
; X64-HSW-LABEL: test_mul_spec:
define i8 @test_mul_by_17(i8 %x) {
; X64-LABEL: test_mul_by_17:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $4, %eax
; X64-NEXT: addl %edi, %eax
; X64-NEXT: imulq %rdi, %rcx
; X64-NEXT: mulq %rdx
; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: imulq %r8, %rsi
-; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: imulq %rsi, %r8
+; X64-NEXT: addq %r8, %rdx
; X64-NEXT: retq
;
; X86-LABEL: foo:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: andl $1, %ecx
-; X64-NEXT: leaq (%rcx,%rdx), %rax
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
%1 = zext i64 %a to i128
; CHECK-SSE2-LABEL: test9:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: retl
;
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pslld $16, %xmm1
-; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper_left:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
-; SSE2-NEXT: paddq %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: psrlq $32, %xmm1
-; SSE2-NEXT: pmuludq %xmm4, %xmm1
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: psllq $32, %xmm3
+; SSE2-NEXT: paddq %xmm0, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
+; SSE2-NEXT: movaps %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_upper_left:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: paddq %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pmuludq %xmm1, %xmm0
; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: pmuludq %xmm4, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: psllq $32, %xmm3
+; SSE41-NEXT: paddq %xmm0, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_upper_left:
;
; SSE41-LABEL: and_mulhuw_v16i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
-; SSE41-NEXT: pand %xmm8, %xmm3
-; SSE41-NEXT: pand %xmm8, %xmm2
-; SSE41-NEXT: pand %xmm8, %xmm1
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm8, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm8
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767]
+; SSE41-NEXT: pand %xmm6, %xmm3
+; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pand %xmm6, %xmm7
; SSE41-NEXT: pmaddwd %xmm3, %xmm7
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: pmaddwd %xmm2, %xmm6
-; SSE41-NEXT: pand %xmm8, %xmm5
+; SSE41-NEXT: pand %xmm6, %xmm8
+; SSE41-NEXT: pmaddwd %xmm2, %xmm8
+; SSE41-NEXT: pand %xmm6, %xmm5
; SSE41-NEXT: pmaddwd %xmm1, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: pmaddwd %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm6
+; SSE41-NEXT: pmaddwd %xmm6, %xmm0
; SSE41-NEXT: psrld $16, %xmm7
-; SSE41-NEXT: psrld $16, %xmm6
-; SSE41-NEXT: packusdw %xmm7, %xmm6
+; SSE41-NEXT: psrld $16, %xmm8
+; SSE41-NEXT: packusdw %xmm7, %xmm8
; SSE41-NEXT: psrld $16, %xmm5
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: and_mulhuw_v16i16:
;
; X64-LABEL: cnt8:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrb %al
; X64-NEXT: andb $85, %al
; X64-NEXT: addb %al, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrb $4, %al
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: addb %dil, %al
; X64-NEXT: andb $15, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-POPCNT-LABEL: cnt8:
; X64-NEXT: mulsd %xmm1, %xmm1
; X64-NEXT: mulsd %xmm1, %xmm0
; X64-NEXT: mulsd %xmm1, %xmm1
-; X64-NEXT: mulsd %xmm0, %xmm1
-; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: mulsd %xmm1, %xmm0
; X64-NEXT: retq
%ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; <double> [#uses=1]
ret double %ret
define i64 @imm1_Oz(i32 %x, i32 %y) minsize nounwind {
; CHECK-LABEL: imm1_Oz:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal 1(%rdi), %eax
-; CHECK-NEXT: incl %esi
-; CHECK-NEXT: addq %rsi, %rax
+; CHECK: incl %edi
+; CHECK-NEXT: leal 1(%rsi), %eax
+; CHECK-NEXT: addq %rdi, %rax
; CHECK-NEXT: retq
%x1 = add i32 %x, 1
%y1 = add i32 %y, 1
define i64 @imm1_Os(i32 %x, i32 %y) optsize nounwind {
; CHECK-LABEL: imm1_Os:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal 1(%rdi), %eax
-; CHECK-NEXT: incl %esi
-; CHECK-NEXT: addq %rsi, %rax
+; CHECK: incl %edi
+; CHECK-NEXT: leal 1(%rsi), %eax
+; CHECK-NEXT: addq %rdi, %rax
; CHECK-NEXT: retq
%x1 = add i32 %x, 1
%y1 = add i32 %y, 1
; FAST-INCDEC: # %bb.0:
; FAST-INCDEC-NEXT: # kill: def $esi killed $esi def $rsi
; FAST-INCDEC-NEXT: # kill: def $edi killed $edi def $rdi
-; FAST-INCDEC-NEXT: leal 1(%rdi), %eax
-; FAST-INCDEC-NEXT: incl %esi
-; FAST-INCDEC-NEXT: addq %rsi, %rax
+; FAST-INCDEC-NEXT: incl %edi
+; FAST-INCDEC-NEXT: leal 1(%rsi), %eax
+; FAST-INCDEC-NEXT: addq %rdi, %rax
; FAST-INCDEC-NEXT: retq
;
; SLOW-INCDEC-LABEL: imm1_O2:
; SLOW-INCDEC: # %bb.0:
-; SLOW-INCDEC-NEXT: # kill: def $esi killed $esi def $rsi
; SLOW-INCDEC-NEXT: # kill: def $edi killed $edi def $rdi
-; SLOW-INCDEC-NEXT: leal 1(%rdi), %eax
-; SLOW-INCDEC-NEXT: addl $1, %esi
-; SLOW-INCDEC-NEXT: addq %rsi, %rax
+; SLOW-INCDEC-NEXT: # kill: def $esi killed $esi def $rsi
+; SLOW-INCDEC-NEXT: addl $1, %edi
+; SLOW-INCDEC-NEXT: leal 1(%rsi), %eax
+; SLOW-INCDEC-NEXT: addq %rdi, %rax
; SLOW-INCDEC-NEXT: retq
%x1 = add i32 %x, 1
%y1 = add i32 %y, 1
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %xmm0, %xmm2
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
-; HASWELL-NEXT: vmovaps %xmm1, %xmm0
+; HASWELL-NEXT: vrcpps %xmm0, %xmm1
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
;
; KNL-LABEL: v4f32_one_step:
; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %xmm0, %xmm2
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
-; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
-; KNL-NEXT: vmovaps %xmm1, %xmm0
+; KNL-NEXT: vrcpps %xmm0, %xmm1
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
+; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; KNL-NEXT: retq
;
; SKX-LABEL: v4f32_one_step:
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm2
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
-; HASWELL-NEXT: vmovaps %ymm1, %ymm0
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
;
; KNL-LABEL: v8f32_one_step:
; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %ymm0, %ymm2
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
-; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
-; KNL-NEXT: vmovaps %ymm1, %ymm0
+; KNL-NEXT: vrcpps %ymm0, %ymm1
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
+; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: v8f32_one_step:
;
; X64-LABEL: not_rev16:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $8, %eax
; X64-NEXT: shrl $8, %edi
; X64-NEXT: andl $65280, %edi # imm = 0xFF00
; X64-NEXT: andl $16711680, %eax # imm = 0xFF0000
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: orl %edi, %eax
; X64-NEXT: retq
%l8 = shl i32 %a, 8
%r8 = lshr i32 %a, 8
;
; X64-LABEL: different_shift_amount:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $9, %eax
; X64-NEXT: shrl $8, %edi
; X64-NEXT: andl $-16712192, %eax # imm = 0xFF00FE00
; X64-NEXT: andl $16711935, %edi # imm = 0xFF00FF
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: orl %edi, %eax
; X64-NEXT: retq
%l8 = shl i32 %a, 9
%r8 = lshr i32 %a, 8
; X64-LABEL: rot16_trunc:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: shrl $11, %ecx
-; X64-NEXT: shll $5, %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: shrl $11, %eax
+; X64-NEXT: shll $5, %edi
+; X64-NEXT: orl %edi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%t0 = lshr i32 %x, 11
; X64-NEXT: shlq $5, %rax
; X64-NEXT: shlq $10, %rdi
; X64-NEXT: shrq $57, %rax
-; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: orq %rdi, %rax
; X64-NEXT: retq
%lhs_mul = shl i64 %i, 5
%rhs_mul = shl i64 %i, 10
;
; X64-LABEL: no_extract_shrl:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $-8, %eax
; X64-NEXT: shll $25, %eax
; X64-NEXT: shrl $9, %edi
-; X64-NEXT: addl %edi, %eax
+; X64-NEXT: orl %edi, %eax
; X64-NEXT: retq
%lhs_div = lshr i32 %i, 3
%rhs_div = lshr i32 %i, 9
; CHECK-LABEL: f0:
; CHECK: # %bb.0: # %b0
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: roll $7, %ecx
-; CHECK-NEXT: roll $9, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: roll $7, %eax
+; CHECK-NEXT: roll $9, %edi
+; CHECK-NEXT: orl %edi, %eax
; CHECK-NEXT: retq
b0:
%v0 = shl i32 %a0, 7
; CHECK-LABEL: f1:
; CHECK: # %bb.0: # %b0
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: shll $7, %ecx
-; CHECK-NEXT: roll $9, %eax
-; CHECK-NEXT: orl %esi, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: shll $7, %eax
+; CHECK-NEXT: roll $9, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: orl %edi, %eax
; CHECK-NEXT: retq
b0:
%v0 = shl i32 %a0, 7
define i32 @f2(i32 %a0, i32 %a1) #0 {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: shll $11, %ecx
; CHECK-NEXT: shrl $21, %edi
-; CHECK-NEXT: movl %esi, %edx
-; CHECK-NEXT: shll $19, %edx
-; CHECK-NEXT: shrl $13, %eax
-; CHECK-NEXT: orl %edi, %eax
-; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: shll $19, %eax
+; CHECK-NEXT: shrl $13, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: retq
%v0 = shl i32 %a0, 11
; CHECK-LABEL: f3:
; CHECK: # %bb.0: # %b0
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal (,%rdi,8), %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: shll $5, %ecx
+; CHECK-NEXT: leal (,%rdi,8), %ecx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shll $5, %eax
; CHECK-NEXT: movl %edi, %edx
; CHECK-NEXT: shll $7, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: shll $13, %ecx
-; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shll $13, %eax
+; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: movl %edi, %edx
; CHECK-NEXT: shll $19, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: shrl $2, %ecx
-; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: movl %edi, %edx
; CHECK-NEXT: shrl $15, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: shrl $23, %ecx
-; CHECK-NEXT: orl %edx, %ecx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shrl $25, %edx
-; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: movl %edi, %esi
+; CHECK-NEXT: shrl $23, %esi
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shrl $25, %eax
+; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: shrl $30, %edi
-; CHECK-NEXT: orl %edx, %edi
; CHECK-NEXT: orl %edi, %eax
+; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: retq
b0:
%v0 = shl i32 %a0, 3
define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
; ANY-LABEL: unsigned_sat_variable_i16_using_min:
; ANY: # %bb.0:
-; ANY-NEXT: # kill: def $esi killed $esi def $rsi
; ANY-NEXT: movl %esi, %eax
; ANY-NEXT: notl %eax
; ANY-NEXT: cmpw %ax, %di
define i32 @unsigned_sat_variable_i32_using_min(i32 %x, i32 %y) {
; ANY-LABEL: unsigned_sat_variable_i32_using_min:
; ANY: # %bb.0:
-; ANY-NEXT: # kill: def $esi killed $esi def $rsi
; ANY-NEXT: movl %esi, %eax
; ANY-NEXT: notl %eax
; ANY-NEXT: cmpl %eax, %edi
;
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: pxor %xmm1, %xmm2
; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE42-NEXT: pxor %xmm0, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
-; SSE42-NEXT: por %xmm0, %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm0, %xmm1
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
+; SSE42-NEXT: por %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
;
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: pxor %xmm1, %xmm2
; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE42-NEXT: pxor %xmm0, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
-; SSE42-NEXT: por %xmm0, %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm0, %xmm1
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
+; SSE42-NEXT: por %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; X64-NEXT: movq %r14, %rdx
; X64-NEXT: movq %r12, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: decq %rax
; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: leaq -1(%rax), %rbp
; X64-NEXT: testq %rbx, %rbx
; X64-NEXT: sets %al
; X64-NEXT: testq %r12, %r12
-; X64-NEXT: sets %bpl
-; X64-NEXT: xorb %al, %bpl
+; X64-NEXT: sets %r13b
+; X64-NEXT: xorb %al, %r13b
; X64-NEXT: movq %r15, %rdi
; X64-NEXT: movq %rbx, %rsi
; X64-NEXT: movq %r14, %rdx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: testb %bpl, %al
-; X64-NEXT: cmovneq (%rsp), %r13 # 8-byte Folded Reload
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: testb %r13b, %al
+; X64-NEXT: cmoveq (%rsp), %rbp # 8-byte Folded Reload
+; X64-NEXT: movq %rbp, %rax
; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
define i32 @oneusecmp(i32 %a, i32 %b, i32 %d) {
; CHECK-LABEL: oneusecmp:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: sarl $31, %eax
; CHECK-NEXT: xorl $127, %eax
define i64 @PR51612(i64 %x, i64 %y) {
; CHECK-LABEL: PR51612:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: incl %esi
-; CHECK-NEXT: incq %rax
-; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: leal 1(%rsi), %eax
+; CHECK-NEXT: incq %rdi
+; CHECK-NEXT: cmovnel %edi, %eax
; CHECK-NEXT: andl 10, %eax
; CHECK-NEXT: retq
;
define i8 @shl_and(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: shl_and:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shlb $2, %sil
-; CHECK-NEXT: shlb $5, %al
-; CHECK-NEXT: andb %sil, %al
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: leal (,%rsi,4), %eax
+; CHECK-NEXT: shlb $5, %dil
+; CHECK-NEXT: andb %dil, %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%sh0 = shl i8 %x, 3
define i128 @test_i128(i128 %a, i128 %b) nounwind {
; X64-LABEL: test_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %rax
; X64-NEXT: cmpq %rdx, %rdi
-; X64-NEXT: cmovaq %rdi, %rdx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: cmovaq %rdi, %rax
; X64-NEXT: cmpq %rcx, %rsi
-; X64-NEXT: cmovgq %rdi, %rax
-; X64-NEXT: cmoveq %rdx, %rax
+; X64-NEXT: cmovgq %rdi, %rdx
+; X64-NEXT: cmovneq %rdx, %rax
; X64-NEXT: cmovgq %rsi, %rcx
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: retq
; SSE-NEXT: pcmpgtd %xmm1, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; SSE-NEXT: pcmpgtd %xmm1, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v3i32:
; SSE-NEXT: pcmpgtd %xmm1, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i32:
; SSE-NEXT: pcmpgtd %xmm2, %xmm4
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm0, %xmm4
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm3, %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
; SSE-NEXT: pcmpgtb %xmm1, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16i8:
define i128 @test_i128(i128 %a, i128 %b) nounwind {
; X64-LABEL: test_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %rax
; X64-NEXT: cmpq %rdx, %rdi
-; X64-NEXT: cmovbq %rdi, %rdx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: cmovbq %rdi, %rax
; X64-NEXT: cmpq %rcx, %rsi
-; X64-NEXT: cmovlq %rdi, %rax
-; X64-NEXT: cmoveq %rdx, %rax
+; X64-NEXT: cmovlq %rdi, %rdx
+; X64-NEXT: cmovneq %rdx, %rax
; X64-NEXT: cmovlq %rsi, %rcx
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: retq
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %esi, %ebx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: imull %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: imull %esi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: adcl %ebp, %edx
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %edi, %edx
+; X86-NEXT: addl %esi, %edx
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: sbbl $0, %ebp
-; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnsl %ebx, %ebp
-; X86-NEXT: cmovnsl %edx, %edi
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: cmovnsl %edx, %esi
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebp, %edx
-; X86-NEXT: sbbl $0, %edx
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: sbbl $0, %edi
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %ebp, %edx
-; X86-NEXT: cmovnsl %edi, %ecx
-; X86-NEXT: testl %edx, %edx
+; X86-NEXT: cmovnsl %ebp, %edi
+; X86-NEXT: cmovnsl %esi, %ecx
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: setg %bl
; X86-NEXT: sete %bh
; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: setae %al
-; X86-NEXT: andb %bh, %al
-; X86-NEXT: orb %bl, %al
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: shrdl $2, %esi, %ebx
-; X86-NEXT: shrdl $2, %ecx, %esi
-; X86-NEXT: testb %al, %al
-; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF
-; X86-NEXT: cmovel %esi, %edi
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: cmovnel %eax, %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: cmpl $-1, %edx
+; X86-NEXT: setae %dl
+; X86-NEXT: andb %bh, %dl
+; X86-NEXT: orb %bl, %dl
+; X86-NEXT: movl (%esp), %ebx
+; X86-NEXT: shrdl $2, %eax, %ebx
+; X86-NEXT: shrdl $2, %ecx, %eax
+; X86-NEXT: testb %dl, %dl
+; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF
+; X86-NEXT: cmovel %eax, %esi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovel %ebx, %edx
+; X86-NEXT: cmpl $-1, %edi
; X86-NEXT: setl %bl
-; X86-NEXT: sete %dl
+; X86-NEXT: sete %al
; X86-NEXT: cmpl $-2, %ecx
; X86-NEXT: setb %cl
-; X86-NEXT: andb %dl, %cl
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: andb %al, %cl
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: orb %bl, %cl
-; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: cmovel %edx, %eax
; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
-; X86-NEXT: cmovel %edi, %edx
+; X86-NEXT: cmovel %esi, %edx
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: imull %ebx, %edi
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: imull %ebp, %ebx
-; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: imull %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: imull %esi, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: imull %edi, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: imull %ebx, %edi
; X86-NEXT: addl %edx, %edi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: imull %ecx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: imull %ebp, %esi
+; X86-NEXT: addl %edx, %esi
; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebp, %edi
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ebx, %ebp
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movzbl %bl, %edi
; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF
; X86-NEXT: orl %edx, %edi
; X86-NEXT: notl %ecx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: cmovel %ebx, %esi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %esi, %edx
; X86-NEXT: addl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: andb %bh, %bl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: movl $-1, %esi
-; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: cmovel %edx, %esi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovnel %edx, %eax
; X86-NEXT: cmpl $-1, %ecx
; X86-NEXT: setl %cl
-; X86-NEXT: sete %ch
-; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: orb %cl, %ch
-; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: sete %dl
+; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: orb %cl, %dl
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT: cmovel %esi, %edx
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: shrdl $31, %ecx, %edx
; X86-NEXT: cmpl $1073741824, %ecx # imm = 0x40000000
; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT: cmovgel %esi, %edx
-; X86-NEXT: movl $-1, %esi
-; X86-NEXT: cmovgel %esi, %eax
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmovll %edx, %esi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovgel %edx, %eax
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: cmpl $-1073741824, %ecx # imm = 0xC0000000
-; X86-NEXT: cmovll %esi, %eax
-; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT: cmovll %ecx, %edx
+; X86-NEXT: cmovll %edx, %eax
+; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT: cmovgel %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 {
; NHM-LABEL: v4f32_no_daz:
; NHM: # %bb.0:
-; NHM-NEXT: rsqrtps %xmm0, %xmm2
-; NHM-NEXT: movaps %xmm0, %xmm1
-; NHM-NEXT: mulps %xmm2, %xmm1
+; NHM-NEXT: rsqrtps %xmm0, %xmm1
+; NHM-NEXT: movaps %xmm0, %xmm2
+; NHM-NEXT: mulps %xmm1, %xmm2
; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; NHM-NEXT: mulps %xmm1, %xmm3
-; NHM-NEXT: mulps %xmm2, %xmm1
-; NHM-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; NHM-NEXT: mulps %xmm2, %xmm3
+; NHM-NEXT: mulps %xmm1, %xmm2
+; NHM-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; NHM-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; NHM-NEXT: mulps %xmm3, %xmm1
-; NHM-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
-; NHM-NEXT: cmpleps %xmm0, %xmm2
+; NHM-NEXT: mulps %xmm3, %xmm2
+; NHM-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; NHM-NEXT: cmpleps %xmm0, %xmm1
; NHM-NEXT: andps %xmm2, %xmm1
; NHM-NEXT: movaps %xmm1, %xmm0
; NHM-NEXT: retq
define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
; NHM-LABEL: v8f32_no_daz:
; NHM: # %bb.0:
-; NHM-NEXT: movaps %xmm0, %xmm2
-; NHM-NEXT: rsqrtps %xmm0, %xmm3
-; NHM-NEXT: mulps %xmm3, %xmm0
-; NHM-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; NHM-NEXT: movaps %xmm0, %xmm5
-; NHM-NEXT: mulps %xmm4, %xmm5
-; NHM-NEXT: mulps %xmm3, %xmm0
-; NHM-NEXT: movaps {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; NHM-NEXT: addps %xmm3, %xmm0
-; NHM-NEXT: mulps %xmm5, %xmm0
-; NHM-NEXT: movaps {{.*#+}} xmm5 = [NaN,NaN,NaN,NaN]
-; NHM-NEXT: andps %xmm5, %xmm2
-; NHM-NEXT: movaps {{.*#+}} xmm6 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
-; NHM-NEXT: movaps %xmm6, %xmm7
-; NHM-NEXT: cmpleps %xmm2, %xmm7
-; NHM-NEXT: andps %xmm7, %xmm0
-; NHM-NEXT: rsqrtps %xmm1, %xmm7
-; NHM-NEXT: movaps %xmm1, %xmm2
-; NHM-NEXT: mulps %xmm7, %xmm2
+; NHM-NEXT: rsqrtps %xmm0, %xmm2
+; NHM-NEXT: movaps %xmm0, %xmm4
; NHM-NEXT: mulps %xmm2, %xmm4
-; NHM-NEXT: mulps %xmm7, %xmm2
-; NHM-NEXT: addps %xmm3, %xmm2
-; NHM-NEXT: mulps %xmm4, %xmm2
-; NHM-NEXT: andps %xmm5, %xmm1
-; NHM-NEXT: cmpleps %xmm1, %xmm6
-; NHM-NEXT: andps %xmm6, %xmm2
+; NHM-NEXT: movaps {{.*#+}} xmm5 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; NHM-NEXT: movaps %xmm4, %xmm3
+; NHM-NEXT: mulps %xmm5, %xmm3
+; NHM-NEXT: mulps %xmm2, %xmm4
+; NHM-NEXT: movaps {{.*#+}} xmm6 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; NHM-NEXT: addps %xmm6, %xmm4
+; NHM-NEXT: mulps %xmm3, %xmm4
+; NHM-NEXT: movaps {{.*#+}} xmm7 = [NaN,NaN,NaN,NaN]
+; NHM-NEXT: andps %xmm7, %xmm0
+; NHM-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; NHM-NEXT: movaps %xmm2, %xmm3
+; NHM-NEXT: cmpleps %xmm0, %xmm3
+; NHM-NEXT: andps %xmm4, %xmm3
+; NHM-NEXT: rsqrtps %xmm1, %xmm0
+; NHM-NEXT: movaps %xmm1, %xmm4
+; NHM-NEXT: mulps %xmm0, %xmm4
+; NHM-NEXT: mulps %xmm4, %xmm5
+; NHM-NEXT: mulps %xmm0, %xmm4
+; NHM-NEXT: addps %xmm6, %xmm4
+; NHM-NEXT: mulps %xmm5, %xmm4
+; NHM-NEXT: andps %xmm7, %xmm1
+; NHM-NEXT: cmpleps %xmm1, %xmm2
+; NHM-NEXT: andps %xmm4, %xmm2
+; NHM-NEXT: movaps %xmm3, %xmm0
; NHM-NEXT: movaps %xmm2, %xmm1
; NHM-NEXT: retq
;
define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ninf:
; SSE: # %bb.0:
-; SSE-NEXT: rsqrtps %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: mulps %xmm2, %xmm1
+; SSE-NEXT: rsqrtps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; SSE-NEXT: mulps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: mulps %xmm2, %xmm3
+; SSE-NEXT: mulps %xmm1, %xmm2
+; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: mulps %xmm3, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
-; SSE-NEXT: cmpleps %xmm0, %xmm2
+; SSE-NEXT: mulps %xmm3, %xmm2
+; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; SSE-NEXT: cmpleps %xmm0, %xmm1
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: mulss %xmm0, %xmm2
-; SSE-NEXT: mulss %xmm1, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: mulss %xmm2, %xmm0
+; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: mulss %xmm0, %xmm1
-; SSE-NEXT: mulss %xmm2, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_f32:
; STRICT-NEXT: cmplesd %xmm1, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm0
; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm0, %xmm2
-; STRICT-NEXT: movapd %xmm2, %xmm0
+; STRICT-NEXT: orpd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole:
define double @ole_x(double %x) {
; STRICT-LABEL: ole_x:
; STRICT: # %bb.0:
-; STRICT-NEXT: xorpd %xmm2, %xmm2
-; STRICT-NEXT: movapd %xmm0, %xmm1
-; STRICT-NEXT: cmplesd %xmm2, %xmm1
-; STRICT-NEXT: andpd %xmm0, %xmm1
-; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: xorpd %xmm1, %xmm1
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: cmplesd %xmm1, %xmm2
+; STRICT-NEXT: andpd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole_x:
; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm0
; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm0, %xmm2
-; STRICT-NEXT: movapd %xmm2, %xmm0
+; STRICT-NEXT: orpd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt:
define double @ugt_x(double %x) {
; STRICT-LABEL: ugt_x:
; STRICT: # %bb.0:
-; STRICT-NEXT: xorpd %xmm2, %xmm2
-; STRICT-NEXT: movapd %xmm0, %xmm1
-; STRICT-NEXT: cmpnlesd %xmm2, %xmm1
-; STRICT-NEXT: andpd %xmm0, %xmm1
-; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: xorpd %xmm1, %xmm1
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
+; STRICT-NEXT: andpd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt_x:
define double @ole_y(double %x) {
; STRICT-LABEL: ole_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT: movapd %xmm0, %xmm1
-; STRICT-NEXT: cmplesd %xmm2, %xmm1
-; STRICT-NEXT: andpd %xmm1, %xmm0
-; STRICT-NEXT: andnpd %xmm2, %xmm1
-; STRICT-NEXT: orpd %xmm0, %xmm1
-; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: cmplesd %xmm1, %xmm2
+; STRICT-NEXT: andpd %xmm2, %xmm0
+; STRICT-NEXT: andnpd %xmm1, %xmm2
+; STRICT-NEXT: orpd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole_y:
define double @ugt_y(double %x) {
; STRICT-LABEL: ugt_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT: movapd %xmm0, %xmm1
-; STRICT-NEXT: cmpnlesd %xmm2, %xmm1
-; STRICT-NEXT: andpd %xmm1, %xmm0
-; STRICT-NEXT: andnpd %xmm2, %xmm1
-; STRICT-NEXT: orpd %xmm0, %xmm1
-; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
+; STRICT-NEXT: andpd %xmm2, %xmm0
+; STRICT-NEXT: andnpd %xmm1, %xmm2
+; STRICT-NEXT: orpd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt_y:
; X64-LABEL: func5:
; X64: # %bb.0:
; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: sets %dl
-; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: movq %rdi, %rdx
-; X64-NEXT: shlq %cl, %rdx
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: sets %al
+; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: addq %rax, %rdx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shlq %cl, %rax
+; X64-NEXT: movq %rax, %rsi
; X64-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-NEXT: sarq %cl, %rsi
; X64-NEXT: cmpq %rsi, %rdi
-; X64-NEXT: cmoveq %rdx, %rax
+; X64-NEXT: cmovneq %rdx, %rax
; X64-NEXT: retq
;
; X86-LABEL: func5:
;
; X64-LABEL: vec:
; X64: # %bb.0:
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: psubd %xmm1, %xmm2
-; X64-NEXT: pcmpgtd %xmm3, %xmm1
-; X64-NEXT: pcmpgtd %xmm2, %xmm0
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: psubd %xmm1, %xmm3
+; X64-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-NEXT: pcmpgtd %xmm3, %xmm0
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: pandn %xmm2, %xmm1
-; X64-NEXT: psrad $31, %xmm2
-; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-NEXT: pand %xmm0, %xmm2
-; X64-NEXT: por %xmm1, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: pandn %xmm3, %xmm1
+; X64-NEXT: psrad $31, %xmm3
+; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; X64-NEXT: pand %xmm3, %xmm0
+; X64-NEXT: por %xmm1, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %tmp
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: psubd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: psubd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i32:
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm5, %xmm2
+; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm3, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: psubd %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pandn %xmm0, %xmm2
-; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: psubd %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm5, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: pxor %xmm6, %xmm0
-; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: psubd %xmm3, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pandn %xmm2, %xmm3
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i32:
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: psubd %xmm4, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm9, %xmm4
+; SSE2-NEXT: psrad $31, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm10, %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: psubd %xmm5, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm8
-; SSE2-NEXT: pxor %xmm5, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pxor %xmm10, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psubd %xmm6, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: pxor %xmm10, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: psubd %xmm7, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psubd %xmm7, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm5, %xmm2
-; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pxor %xmm10, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm1, %xmm8
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm9
-; SSSE3-NEXT: psubd %xmm4, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm0, %xmm4
-; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pxor %xmm8, %xmm8
+; SSSE3-NEXT: movdqa %xmm0, %xmm9
+; SSSE3-NEXT: psubd %xmm4, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pandn %xmm9, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm9
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm10, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm0
; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm8, %xmm1
-; SSSE3-NEXT: psubd %xmm5, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8
-; SSSE3-NEXT: pxor %xmm5, %xmm8
-; SSSE3-NEXT: movdqa %xmm8, %xmm4
-; SSSE3-NEXT: pandn %xmm1, %xmm4
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: pand %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: psubd %xmm5, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pxor %xmm10, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: psubd %xmm6, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pandn %xmm4, %xmm5
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: pxor %xmm10, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: por %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: psubd %xmm7, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: psubd %xmm7, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT: pxor %xmm7, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm5, %xmm2
-; SSSE3-NEXT: psrad $31, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm5
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pxor %xmm10, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v16i32:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandb %k0, %k1, %k1
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload
+; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandb %k0, %k1, %k1
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload
+; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandw %k0, %k1, %k1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandw %k0, %k1, %k1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
define i64 @test__blcic_u64(i64 %a0) {
; X64-LABEL: test__blcic_u64:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: xorq $-1, %rcx
-; X64-NEXT: addq $1, %rax
-; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: retq
%1 = xor i64 %a0, -1
%2 = add i64 %a0, 1
define i64 @test__blsic_u64(i64 %a0) {
; X64-LABEL: test__blsic_u64:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: xorq $-1, %rcx
-; X64-NEXT: subq $1, %rax
-; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: leaq -1(%rdi), %rax
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: orq %rdi, %rax
; X64-NEXT: retq
%1 = xor i64 %a0, -1
%2 = sub i64 %a0, 1
define i64 @test__t1mskc_u64(i64 %a0) {
; X64-LABEL: test__t1mskc_u64:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: xorq $-1, %rcx
-; X64-NEXT: addq $1, %rax
-; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: orq %rdi, %rax
; X64-NEXT: retq
%1 = xor i64 %a0, -1
%2 = add i64 %a0, 1
define i64 @test__tzmsk_u64(i64 %a0) {
; X64-LABEL: test__tzmsk_u64:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: xorq $-1, %rcx
-; X64-NEXT: subq $1, %rax
-; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: leaq -1(%rdi), %rax
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: andq %rdi, %rax
; X64-NEXT: retq
%1 = xor i64 %a0, -1
%2 = sub i64 %a0, 1
;
; X64-LABEL: test__blcic_u32:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal 1(%rdi), %eax
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: xorl $-1, %ecx
-; X64-NEXT: addl $1, %eax
; X64-NEXT: andl %ecx, %eax
; X64-NEXT: retq
%1 = xor i32 %a0, -1
;
; X64-LABEL: test__blsic_u32:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal -1(%rdi), %eax
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: xorl $-1, %ecx
-; X64-NEXT: subl $1, %eax
; X64-NEXT: orl %ecx, %eax
; X64-NEXT: retq
%1 = xor i32 %a0, -1
;
; X64-LABEL: test__t1mskc_u32:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal 1(%rdi), %eax
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: xorl $-1, %ecx
-; X64-NEXT: addl $1, %eax
; X64-NEXT: orl %ecx, %eax
; X64-NEXT: retq
%1 = xor i32 %a0, -1
;
; X64-LABEL: test__tzmsk_u32:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal -1(%rdi), %eax
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: xorl $-1, %ecx
-; X64-NEXT: subl $1, %eax
; X64-NEXT: andl %ecx, %eax
; X64-NEXT: retq
%1 = xor i32 %a0, -1
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movq %rsi, %rdx
-; X64-NEXT: leaq (%rdi,%rdi), %rsi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $63, %rax
-; X64-NEXT: shrdq $33, %rax, %rsi
+; X64-NEXT: leaq (%rdi,%rdi), %rax
+; X64-NEXT: movq %rdi, %rsi
+; X64-NEXT: shrq $63, %rsi
+; X64-NEXT: shldq $31, %rax, %rsi
; X64-NEXT: shlq $32, %rdi
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: callq __udivti3@PLT
; X64-NEXT: cmpq $2, %rdx
; X64-NEXT: movq $-1, %rcx
-; X64-NEXT: cmovbq %rax, %rcx
+; X64-NEXT: cmovaeq %rcx, %rax
; X64-NEXT: cmpq $1, %rdx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbq %rdx, %rax
-; X64-NEXT: shldq $63, %rcx, %rax
+; X64-NEXT: movl $1, %ecx
+; X64-NEXT: cmovbq %rdx, %rcx
+; X64-NEXT: shrdq $1, %rcx, %rax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
;
define i128 @test_i128(i128 %a, i128 %b) nounwind {
; X64-LABEL: test_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %rax
; X64-NEXT: cmpq %rdx, %rdi
-; X64-NEXT: cmovaq %rdi, %rdx
-; X64-NEXT: cmpq %rcx, %rsi
+; X64-NEXT: movq %rdx, %rax
; X64-NEXT: cmovaq %rdi, %rax
-; X64-NEXT: cmoveq %rdx, %rax
+; X64-NEXT: cmpq %rcx, %rsi
+; X64-NEXT: cmovaq %rdi, %rdx
+; X64-NEXT: cmovneq %rdx, %rax
; X64-NEXT: cmovaq %rsi, %rcx
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: retq
define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; SSE-LABEL: test_v8i32:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pxor %xmm5, %xmm6
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pxor %xmm5, %xmm4
-; SSE-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm5, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm5
-; SSE-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm6
+; SSE-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: por %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm4
+; SSE-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pandn %xmm3, %xmm4
+; SSE-NEXT: por %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
define i128 @test_i128(i128 %a, i128 %b) nounwind {
; X64-LABEL: test_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %rax
; X64-NEXT: cmpq %rdx, %rdi
-; X64-NEXT: cmovbq %rdi, %rdx
-; X64-NEXT: cmpq %rcx, %rsi
+; X64-NEXT: movq %rdx, %rax
; X64-NEXT: cmovbq %rdi, %rax
-; X64-NEXT: cmoveq %rdx, %rax
+; X64-NEXT: cmpq %rcx, %rsi
+; X64-NEXT: cmovbq %rdi, %rdx
+; X64-NEXT: cmovneq %rdx, %rax
; X64-NEXT: cmovbq %rsi, %rcx
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: retq
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: adcl %edi, %edx
; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: shldl $30, %eax, %ecx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: shldl $30, %eax, %edx
; X86-NEXT: shldl $30, %esi, %eax
-; X86-NEXT: movl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %ebx, %ecx
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: setne %dl
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %bl
; X86-NEXT: andb %dl, %bl
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: seto %bh
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: movl %eax, %esi
; X86-NEXT: seto %cl
; X86-NEXT: orb %bh, %cl
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %eax, %esi
; X86-NEXT: movl %edx, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: addl %esi, %edx
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %edx, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: cmpl $1, %edi
+; X86-NEXT: cmpl $1, %ebx
; X86-NEXT: sbbl %ecx, %ecx
; X86-NEXT: notl %ecx
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %edx, %edi
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: adcl %esi, %edx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: shrdl $31, %edx, %eax
; X86-NEXT: movl %edx, %esi
; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: notl %edi
; X86-NEXT: orl %edi, %eax
-; X86-NEXT: shldl $1, %edx, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrdl $31, %ecx, %edx
+; X86-NEXT: orl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X64-NEXT: seto %r10b
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %rcx
; X64-NEXT: seto %r11b
; X64-NEXT: orb %r10b, %r11b
-; X64-NEXT: addq %rsi, %rcx
+; X64-NEXT: addq %rax, %rsi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: addq %rsi, %rdx
; X64-NEXT: setb %cl
; X64-NEXT: orb %r11b, %cl
; X64-NEXT: orb %r9b, %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: setne %dl
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %bl
; X86-NEXT: andb %dl, %bl
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: seto %bh
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: movl %eax, %esi
; X86-NEXT: seto %ch
; X86-NEXT: orb %bh, %ch
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %eax, %esi
; X86-NEXT: movl %edx, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: addl %esi, %edx
define i32 @out_constant_varx_mone(i32 %x, i32 %y, i32 %mask) {
; CHECK-NOBMI-LABEL: out_constant_varx_mone:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: movl %edi, %eax
-; CHECK-NOBMI-NEXT: andl %edx, %eax
-; CHECK-NOBMI-NEXT: notl %edx
-; CHECK-NOBMI-NEXT: orl %edx, %eax
+; CHECK-NOBMI-NEXT: andl %edx, %edi
+; CHECK-NOBMI-NEXT: movl %edx, %eax
+; CHECK-NOBMI-NEXT: notl %eax
+; CHECK-NOBMI-NEXT: orl %edi, %eax
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI-LABEL: out_constant_varx_mone:
; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: movl %edi, %eax
-; CHECK-BMI-NEXT: andl %edx, %eax
-; CHECK-BMI-NEXT: notl %edx
-; CHECK-BMI-NEXT: orl %edx, %eax
+; CHECK-BMI-NEXT: andl %edx, %edi
+; CHECK-BMI-NEXT: movl %edx, %eax
+; CHECK-BMI-NEXT: notl %eax
+; CHECK-BMI-NEXT: orl %edi, %eax
; CHECK-BMI-NEXT: retq
%notmask = xor i32 %mask, -1
%mx = and i32 %mask, %x
; CHECK-NOBMI-LABEL: out_constant_varx_42_invmask:
; CHECK-NOBMI: # %bb.0:
; CHECK-NOBMI-NEXT: movl %edx, %eax
-; CHECK-NOBMI-NEXT: movl %edx, %ecx
-; CHECK-NOBMI-NEXT: notl %ecx
-; CHECK-NOBMI-NEXT: andl %edi, %ecx
-; CHECK-NOBMI-NEXT: andl $42, %eax
-; CHECK-NOBMI-NEXT: orl %ecx, %eax
+; CHECK-NOBMI-NEXT: notl %eax
+; CHECK-NOBMI-NEXT: andl %edi, %eax
+; CHECK-NOBMI-NEXT: andl $42, %edx
+; CHECK-NOBMI-NEXT: orl %edx, %eax
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI-LABEL: out_constant_varx_42_invmask:
define i32 @out_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
; CHECK-NOBMI-LABEL: out_constant_mone_vary_invmask:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: movl %esi, %eax
-; CHECK-NOBMI-NEXT: andl %edx, %eax
-; CHECK-NOBMI-NEXT: notl %edx
-; CHECK-NOBMI-NEXT: orl %edx, %eax
+; CHECK-NOBMI-NEXT: andl %edx, %esi
+; CHECK-NOBMI-NEXT: movl %edx, %eax
+; CHECK-NOBMI-NEXT: notl %eax
+; CHECK-NOBMI-NEXT: orl %esi, %eax
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI-LABEL: out_constant_mone_vary_invmask:
; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: movl %esi, %eax
-; CHECK-BMI-NEXT: andl %edx, %eax
-; CHECK-BMI-NEXT: notl %edx
-; CHECK-BMI-NEXT: orl %edx, %eax
+; CHECK-BMI-NEXT: andl %edx, %esi
+; CHECK-BMI-NEXT: movl %edx, %eax
+; CHECK-BMI-NEXT: notl %eax
+; CHECK-BMI-NEXT: orl %esi, %eax
; CHECK-BMI-NEXT: retq
%notmask = xor i32 %mask, -1
%mx = and i32 %notmask, -1
define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
; CHECK-NOBMI-LABEL: out_constant_42_vary_invmask:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: movl %esi, %eax
-; CHECK-NOBMI-NEXT: andl %edx, %eax
-; CHECK-NOBMI-NEXT: notl %edx
-; CHECK-NOBMI-NEXT: andl $42, %edx
-; CHECK-NOBMI-NEXT: orl %edx, %eax
+; CHECK-NOBMI-NEXT: andl %edx, %esi
+; CHECK-NOBMI-NEXT: movl %edx, %eax
+; CHECK-NOBMI-NEXT: notl %eax
+; CHECK-NOBMI-NEXT: andl $42, %eax
+; CHECK-NOBMI-NEXT: orl %esi, %eax
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI-LABEL: out_constant_42_vary_invmask:
; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: movl %esi, %eax
-; CHECK-BMI-NEXT: andl %edx, %eax
-; CHECK-BMI-NEXT: notl %edx
-; CHECK-BMI-NEXT: andl $42, %edx
-; CHECK-BMI-NEXT: orl %edx, %eax
+; CHECK-BMI-NEXT: andl %edx, %esi
+; CHECK-BMI-NEXT: movl %edx, %eax
+; CHECK-BMI-NEXT: notl %eax
+; CHECK-BMI-NEXT: andl $42, %eax
+; CHECK-BMI-NEXT: orl %esi, %eax
; CHECK-BMI-NEXT: retq
%notmask = xor i32 %mask, -1
%mx = and i32 %notmask, 42
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: imulq $1491936009, %rax, %rax # imm = 0x58ED2309
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: subl %eax, %ecx
-; CHECK-NEXT: shrl %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: shrl $6, %ecx
-; CHECK-NEXT: imull $95, %ecx, %eax
-; CHECK-NEXT: subl %eax, %edi
-; CHECK-NEXT: leal (%rdi,%rcx), %eax
+; CHECK-NEXT: imulq $1491936009, %rax, %rcx # imm = 0x58ED2309
+; CHECK-NEXT: shrq $32, %rcx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %ecx, %eax
+; CHECK-NEXT: shrl %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: shrl $6, %eax
+; CHECK-NEXT: imull $95, %eax, %ecx
+; CHECK-NEXT: subl %ecx, %edi
+; CHECK-NEXT: addl %edi, %eax
; CHECK-NEXT: retq
%1 = urem i32 %x, 95
%2 = udiv i32 %x, 95
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pxor %xmm0, %xmm0
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t32_tautological:
define <4 x i32> @strict_vector_fptoui_v4f32_to_v4i32(<4 x float> %a) #0 {
; SSE-32-LABEL: strict_vector_fptoui_v4f32_to_v4i32:
; SSE-32: # %bb.0:
-; SSE-32-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-32-NEXT: movaps %xmm0, %xmm3
-; SSE-32-NEXT: cmpltps %xmm2, %xmm3
-; SSE-32-NEXT: movaps %xmm3, %xmm1
-; SSE-32-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; SSE-32-NEXT: andnps %xmm2, %xmm3
-; SSE-32-NEXT: subps %xmm3, %xmm0
+; SSE-32-NEXT: movaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; SSE-32-NEXT: movaps %xmm0, %xmm2
+; SSE-32-NEXT: cmpltps %xmm1, %xmm2
+; SSE-32-NEXT: movaps %xmm2, %xmm3
+; SSE-32-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; SSE-32-NEXT: andnps %xmm1, %xmm2
+; SSE-32-NEXT: subps %xmm2, %xmm0
; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-32-NEXT: xorps %xmm0, %xmm1
-; SSE-32-NEXT: movaps %xmm1, %xmm0
+; SSE-32-NEXT: xorps %xmm3, %xmm0
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: strict_vector_fptoui_v4f32_to_v4i32:
; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-64-NEXT: movaps %xmm0, %xmm3
-; SSE-64-NEXT: cmpltps %xmm2, %xmm3
-; SSE-64-NEXT: movaps %xmm3, %xmm1
-; SSE-64-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-64-NEXT: andnps %xmm2, %xmm3
-; SSE-64-NEXT: subps %xmm3, %xmm0
+; SSE-64-NEXT: movaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; SSE-64-NEXT: movaps %xmm0, %xmm2
+; SSE-64-NEXT: cmpltps %xmm1, %xmm2
+; SSE-64-NEXT: movaps %xmm2, %xmm3
+; SSE-64-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE-64-NEXT: andnps %xmm1, %xmm2
+; SSE-64-NEXT: subps %xmm2, %xmm0
; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-64-NEXT: xorps %xmm0, %xmm1
-; SSE-64-NEXT: movaps %xmm1, %xmm0
+; SSE-64-NEXT: xorps %xmm3, %xmm0
; SSE-64-NEXT: retq
;
; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i32:
; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $4, %xmm1
-; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: psadbw %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: paddb %xmm1, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
ret <2 x i64> %c
; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $4, %xmm1
-; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: psadbw %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: paddb %xmm1, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
ret <2 x i64> %c
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v4i32:
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v8i32:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v16i8:
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v4i32:
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v8i32:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v16i8:
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v32i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_gt_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_ge_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v8i32:
; SSE2-NEXT: pslld $8, %xmm2
; SSE2-NEXT: psrad $8, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pslld $8, %xmm0
-; SSE2-NEXT: psrad $8, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pslld $8, %xmm1
+; SSE2-NEXT: psrad $8, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: pslld $8, %xmm2
; SSSE3-NEXT: psrad $8, %xmm2
; SSSE3-NEXT: paddd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pslld $8, %xmm0
-; SSSE3-NEXT: psrad $8, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pslld $8, %xmm1
+; SSSE3-NEXT: psrad $8, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movd %xmm2, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
;
; SSE41-LABEL: saddo_v4i24:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pslld $8, %xmm1
; SSE41-NEXT: psrad $8, %xmm1
-; SSE41-NEXT: pslld $8, %xmm2
-; SSE41-NEXT: psrad $8, %xmm2
-; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pslld $8, %xmm0
; SSE41-NEXT: psrad $8, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pslld $8, %xmm2
+; SSE41-NEXT: psrad $8, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pextrd $3, %xmm2, %eax
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
; SSE41-NEXT: movw %ax, 9(%rdi)
-; SSE41-NEXT: pextrd $2, %xmm2, %ecx
+; SSE41-NEXT: pextrd $2, %xmm0, %ecx
; SSE41-NEXT: movw %cx, 6(%rdi)
-; SSE41-NEXT: pextrd $1, %xmm2, %edx
+; SSE41-NEXT: pextrd $1, %xmm0, %edx
; SSE41-NEXT: movw %dx, 3(%rdi)
-; SSE41-NEXT: movd %xmm2, %esi
+; SSE41-NEXT: movd %xmm0, %esi
; SSE41-NEXT: movw %si, (%rdi)
; SSE41-NEXT: shrl $16, %eax
; SSE41-NEXT: movb %al, 11(%rdi)
; SSE41-NEXT: movb %dl, 5(%rdi)
; SSE41-NEXT: shrl $16, %esi
; SSE41-NEXT: movb %sil, 2(%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: saddo_v4i24:
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
-; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: saddo_v4i1:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm1
; SSE-NEXT: psrlw $11, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: psraw $5, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: psraw $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_vec8x16:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm1
; SSE-NEXT: psrlw $11, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: psraw $5, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: psraw $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_vec8x16_minsize:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $28, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: psrad $4, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: psrad $4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_vec4x32:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: psrld $26, %xmm2
-; SSE-NEXT: paddd %xmm0, %xmm2
-; SSE-NEXT: psrad $6, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: psrld $26, %xmm3
-; SSE-NEXT: paddd %xmm1, %xmm3
-; SSE-NEXT: psrad $6, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: psrad $6, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: psrld $26, %xmm2
+; SSE-NEXT: paddd %xmm2, %xmm1
+; SSE-NEXT: psrad $6, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sdiv8x32:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psraw $15, %xmm2
; SSE-NEXT: psrlw $14, %xmm2
-; SSE-NEXT: paddw %xmm0, %xmm2
-; SSE-NEXT: psraw $2, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psraw $15, %xmm3
-; SSE-NEXT: psrlw $14, %xmm3
-; SSE-NEXT: paddw %xmm1, %xmm3
-; SSE-NEXT: psraw $2, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: psraw $2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psraw $15, %xmm2
+; SSE-NEXT: psrlw $14, %xmm2
+; SSE-NEXT: paddw %xmm2, %xmm1
+; SSE-NEXT: psraw $2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sdiv16x16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $1, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: test4:
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, (%rdi)
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pslld $8, %xmm3
-; SSE41-NEXT: psrad $8, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pslld $8, %xmm0
+; SSE41-NEXT: psrad $8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: pextrd $3, %xmm1, %eax
; SSE41-NEXT: pextrd $2, %xmm1, %ecx
; SSE41-NEXT: pextrd $1, %xmm1, %edx
; SSE41-NEXT: movd %xmm1, %esi
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movw %ax, 9(%rdi)
; SSE41-NEXT: movw %cx, 6(%rdi)
; SSE2-NEXT: pslld $8, %xmm2
; SSE2-NEXT: psrad $8, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pslld $8, %xmm0
-; SSE2-NEXT: psrad $8, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pslld $8, %xmm1
+; SSE2-NEXT: psrad $8, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: pslld $8, %xmm2
; SSSE3-NEXT: psrad $8, %xmm2
; SSSE3-NEXT: psubd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pslld $8, %xmm0
-; SSSE3-NEXT: psrad $8, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pslld $8, %xmm1
+; SSSE3-NEXT: psrad $8, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movd %xmm2, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
;
; SSE41-LABEL: ssubo_v4i24:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pslld $8, %xmm1
; SSE41-NEXT: psrad $8, %xmm1
-; SSE41-NEXT: pslld $8, %xmm2
-; SSE41-NEXT: psrad $8, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pslld $8, %xmm0
; SSE41-NEXT: psrad $8, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pslld $8, %xmm2
+; SSE41-NEXT: psrad $8, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pextrd $3, %xmm2, %eax
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
; SSE41-NEXT: movw %ax, 9(%rdi)
-; SSE41-NEXT: pextrd $2, %xmm2, %ecx
+; SSE41-NEXT: pextrd $2, %xmm0, %ecx
; SSE41-NEXT: movw %cx, 6(%rdi)
-; SSE41-NEXT: pextrd $1, %xmm2, %edx
+; SSE41-NEXT: pextrd $1, %xmm0, %edx
; SSE41-NEXT: movw %dx, 3(%rdi)
-; SSE41-NEXT: movd %xmm2, %esi
+; SSE41-NEXT: movd %xmm0, %esi
; SSE41-NEXT: movw %si, (%rdi)
; SSE41-NEXT: shrl $16, %eax
; SSE41-NEXT: movb %al, 11(%rdi)
; SSE41-NEXT: movb %dl, 5(%rdi)
; SSE41-NEXT: shrl $16, %esi
; SSE41-NEXT: movb %sil, 2(%rdi)
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ssubo_v4i24:
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movb %al, (%rdi)
-; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ssubo_v4i1:
;
; SSE41-LABEL: umulo_v4i24:
; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
-; SSE41-NEXT: pand %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pmuludq %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pmulld %xmm2, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm4
+; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: pextrd $3, %xmm1, %eax
; SSE41-NEXT: pextrd $2, %xmm1, %ecx
; SSE41-NEXT: pextrd $1, %xmm1, %edx
; SSE41-NEXT: movd %xmm1, %esi
-; SSE41-NEXT: psrld $24, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $24, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movw %ax, 9(%rdi)
; SSE41-NEXT: movw %cx, 6(%rdi)
; SSE41-NEXT: movw %dx, 3(%rdi)
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolb $4, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
; SSE-NEXT: addb %al, %al
; SSE-NEXT: shrb %dil
; SSE-NEXT: andb $85, %dil
-; SSE-NEXT: addl %edi, %eax
-; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: orb %dil, %al
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolb $4, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
; AVX-NEXT: addb %al, %al
; AVX-NEXT: shrb %dil
; AVX-NEXT: andb $85, %dil
-; AVX-NEXT: addl %edi, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: orb %dil, %al
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
; GFNISSE-NEXT: rolb $4, %dil
; GFNISSE-NEXT: movl %edi, %eax
; GFNISSE-NEXT: andb $51, %al
; GFNISSE-NEXT: addb %al, %al
; GFNISSE-NEXT: shrb %dil
; GFNISSE-NEXT: andb $85, %dil
-; GFNISSE-NEXT: addl %edi, %eax
-; GFNISSE-NEXT: # kill: def $al killed $al killed $eax
+; GFNISSE-NEXT: orb %dil, %al
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX: # %bb.0:
-; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX-NEXT: rolb $4, %dil
; GFNIAVX-NEXT: movl %edi, %eax
; GFNIAVX-NEXT: andb $51, %al
; GFNIAVX-NEXT: addb %al, %al
; GFNIAVX-NEXT: shrb %dil
; GFNIAVX-NEXT: andb $85, %dil
-; GFNIAVX-NEXT: addl %edi, %eax
-; GFNIAVX-NEXT: # kill: def $al killed $al killed $eax
+; GFNIAVX-NEXT: orb %dil, %al
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX2-NEXT: rolb $4, %dil
; GFNIAVX2-NEXT: movl %edi, %eax
; GFNIAVX2-NEXT: andb $51, %al
; GFNIAVX2-NEXT: addb %al, %al
; GFNIAVX2-NEXT: shrb %dil
; GFNIAVX2-NEXT: andb $85, %dil
-; GFNIAVX2-NEXT: addl %edi, %eax
-; GFNIAVX2-NEXT: # kill: def $al killed $al killed $eax
+; GFNIAVX2-NEXT: orb %dil, %al
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i8:
; GFNIAVX512F: # %bb.0:
-; GFNIAVX512F-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512F-NEXT: rolb $4, %dil
; GFNIAVX512F-NEXT: movl %edi, %eax
; GFNIAVX512F-NEXT: andb $51, %al
; GFNIAVX512F-NEXT: addb %al, %al
; GFNIAVX512F-NEXT: shrb %dil
; GFNIAVX512F-NEXT: andb $85, %dil
-; GFNIAVX512F-NEXT: addl %edi, %eax
-; GFNIAVX512F-NEXT: # kill: def $al killed $al killed $eax
+; GFNIAVX512F-NEXT: orb %dil, %al
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi
; GFNIAVX512BW-NEXT: rolb $4, %dil
; GFNIAVX512BW-NEXT: movl %edi, %eax
; GFNIAVX512BW-NEXT: andb $51, %al
; GFNIAVX512BW-NEXT: addb %al, %al
; GFNIAVX512BW-NEXT: shrb %dil
; GFNIAVX512BW-NEXT: andb $85, %dil
-; GFNIAVX512BW-NEXT: addl %edi, %eax
-; GFNIAVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; GFNIAVX512BW-NEXT: orb %dil, %al
; GFNIAVX512BW-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
define <8 x i32> @bool_sext_and(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_sext_and:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pslld $31, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
define <8 x i32> @bool_sext_or(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_sext_or:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pslld $31, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
define <8 x i32> @bool_sext_xor(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_sext_xor:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pslld $31, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE2-NEXT: psrlq %xmm4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psllq %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: psllq %xmm2, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm5, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE41-NEXT: psrlq %xmm4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pand %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllq %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
;
; X86-SSE2-LABEL: var_funnnel_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm5
; X86-SSE2-NEXT: psrlq $1, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
-; X86-SSE2-NEXT: pand %xmm3, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psllq %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psllq %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm2, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE2-NEXT: orpd %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
ret <2 x i64> %res
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: pmulld %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41-NEXT: paddd %xmm4, %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
-; SSE41-NEXT: pmullw %xmm0, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pmullw %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v8i16:
; X86-SSE2-NEXT: por %xmm5, %xmm3
; X86-SSE2-NEXT: paddw %xmm4, %xmm4
; X86-SSE2-NEXT: psraw $15, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
-; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm1
; X86-SSE2-NEXT: psrlw $1, %xmm3
; X86-SSE2-NEXT: pand %xmm4, %xmm3
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pslld $23, %xmm1
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm4, %xmm1
-; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pslld $23, %xmm4
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT: paddd %xmm5, %xmm4
+; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
-; X86-SSE2-NEXT: paddd %xmm4, %xmm2
-; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
-; X86-SSE2-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: por %xmm3, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: paddd %xmm5, %xmm2
+; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
ret <8 x i16> %res
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm5
; X86-SSE2-NEXT: psrlq $1, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
-; X86-SSE2-NEXT: pand %xmm3, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psllq %xmm2, %xmm3
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X86-SSE2-NEXT: psllq %xmm2, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psllq %xmm3, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT: psllq %xmm3, %xmm0
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE2-NEXT: orpd %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlq $60, %xmm2
; SSE2-NEXT: psrlq $50, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq $4, %xmm2
+; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllq $4, %xmm1
; SSE2-NEXT: psllq $14, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v2i64:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlq $50, %xmm2
; SSE41-NEXT: psrlq $60, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $14, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $14, %xmm1
; SSE41-NEXT: psllq $4, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v2i64:
;
; X86-SSE2-LABEL: constant_funnnel_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u>
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <4,u,14,u>
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
; X86-SSE2-NEXT: psrlq $1, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
-; X86-SSE2-NEXT: pand %xmm2, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: psllq %xmm3, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psllq %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm3, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE2-NEXT: orpd %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
ret <2 x i64> %res
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrld $26, %xmm2
; SSE41-NEXT: psrld $28, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v4i32:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllw %xmm3, %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw %xmm1, %xmm3
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: psllw %xmm3, %xmm5
+; SSE2-NEXT: psllw %xmm1, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw %xmm3, %xmm2
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT: psllw %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufb %xmm3, %xmm5
-; SSE41-NEXT: pand %xmm5, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT: psubb %xmm1, %xmm3
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm2, %xmm5
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm2, %xmm5
+; SSE41-NEXT: pand %xmm3, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: psrlw %xmm1, %xmm4
; SSE41-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X86-SSE2-NEXT: psubb %xmm1, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psllw %xmm3, %xmm1
+; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: psllw %xmm1, %xmm3
; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; X86-SSE2-NEXT: psllw %xmm3, %xmm5
+; X86-SSE2-NEXT: psllw %xmm1, %xmm5
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-SSE2-NEXT: pand %xmm3, %xmm1
; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X86-SSE2-NEXT: pand %xmm0, %xmm2
-; X86-SSE2-NEXT: por %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
%res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: splatconstant_funnnel_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $28, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $4, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: psrld $28, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pslld $4, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movsd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_funnnel_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrld $28, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslld $4, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $28, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pslld $4, %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatconstant_funnnel_v2i32:
;
; X86-SSE2-LABEL: splatconstant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: psrld $28, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: pslld $4, %xmm1
-; X86-SSE2-NEXT: por %xmm2, %xmm1
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X86-SSE2-NEXT: movaps %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $28, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pslld $4, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movsd %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> <i32 4, i32 4>)
ret <2 x i32> %res
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE2-NEXT: psrlq %xmm4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: psllq $1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psllq %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: psllq %xmm2, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm5, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE41-NEXT: psrlq %xmm4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pandn %xmm3, %xmm2
; SSE41-NEXT: psllq $1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllq %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
;
; X86-SSE2-LABEL: var_funnnel_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: pand %xmm3, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
-; X86-SSE2-NEXT: pandn %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
+; X86-SSE2-NEXT: pandn %xmm4, %xmm2
; X86-SSE2-NEXT: psllq $1, %xmm0
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psllq %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psllq %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm2, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE2-NEXT: orpd %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
ret <2 x i64> %res
; SSE41-NEXT: psrld %xmm4, %xmm6
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
; SSE41-NEXT: pandn %xmm8, %xmm2
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
; SSE41-NEXT: pslld $1, %xmm0
-; SSE41-NEXT: pmulld %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm5, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: psllw $1, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
; X86-SSE2-NEXT: pandn %xmm1, %xmm4
; X86-SSE2-NEXT: psrlw $1, %xmm1
-; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pslld $23, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm5, %xmm3
-; X86-SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X86-SSE2-NEXT: paddd %xmm5, %xmm1
+; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
; X86-SSE2-NEXT: paddd %xmm5, %xmm2
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X86-SSE2-NEXT: psllw $1, %xmm0
; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
-; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
ret <8 x i16> %res
define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: psllw $5, %xmm6
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: pandn %xmm1, %xmm7
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: paddb %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: paddb %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: pandn %xmm1, %xmm7
; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: paddb %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm2
; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v16i8:
;
; X86-SSE2-LABEL: var_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
-; X86-SSE2-NEXT: pand %xmm4, %xmm5
-; X86-SSE2-NEXT: psllw $5, %xmm5
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: psllw $5, %xmm6
; X86-SSE2-NEXT: pxor %xmm3, %xmm3
-; X86-SSE2-NEXT: pxor %xmm6, %xmm6
-; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm7
; X86-SSE2-NEXT: pandn %xmm1, %xmm7
; X86-SSE2-NEXT: psrlw $4, %xmm1
-; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: por %xmm7, %xmm1
-; X86-SSE2-NEXT: paddb %xmm5, %xmm5
-; X86-SSE2-NEXT: pxor %xmm6, %xmm6
-; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: paddb %xmm6, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm7
; X86-SSE2-NEXT: pandn %xmm1, %xmm7
; X86-SSE2-NEXT: psrlw $2, %xmm1
-; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: por %xmm7, %xmm1
-; X86-SSE2-NEXT: paddb %xmm5, %xmm5
-; X86-SSE2-NEXT: pxor %xmm6, %xmm6
-; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6
-; X86-SSE2-NEXT: movdqa %xmm6, %xmm5
-; X86-SSE2-NEXT: pandn %xmm1, %xmm5
+; X86-SSE2-NEXT: paddb %xmm6, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm6
+; X86-SSE2-NEXT: pandn %xmm1, %xmm6
; X86-SSE2-NEXT: psrlw $1, %xmm1
-; X86-SSE2-NEXT: pand %xmm6, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: pandn %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
+; X86-SSE2-NEXT: por %xmm6, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm2
; X86-SSE2-NEXT: psllw $5, %xmm2
-; X86-SSE2-NEXT: pxor %xmm4, %xmm4
-; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
; X86-SSE2-NEXT: pandn %xmm0, %xmm5
; X86-SSE2-NEXT: psllw $4, %xmm0
-; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm5, %xmm0
; X86-SSE2-NEXT: paddb %xmm2, %xmm2
-; X86-SSE2-NEXT: pxor %xmm4, %xmm4
-; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
; X86-SSE2-NEXT: pandn %xmm0, %xmm5
; X86-SSE2-NEXT: psllw $2, %xmm0
-; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm5, %xmm0
; X86-SSE2-NEXT: paddb %xmm2, %xmm2
; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
-; X86-SSE2-NEXT: pandn %xmm0, %xmm2
-; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
; X86-SSE2-NEXT: pand %xmm3, %xmm0
-; X86-SSE2-NEXT: por %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
ret <16 x i8> %res
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: pand %xmm3, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
-; X86-SSE2-NEXT: pandn %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; X86-SSE2-NEXT: pandn %xmm4, %xmm3
; X86-SSE2-NEXT: psllq $1, %xmm0
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psllq %xmm2, %xmm3
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X86-SSE2-NEXT: psllq %xmm2, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psllq %xmm3, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT: psllq %xmm3, %xmm0
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE2-NEXT: orpd %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlq $4, %xmm2
; SSE2-NEXT: psrlq $14, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq $60, %xmm2
+; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllq $60, %xmm1
; SSE2-NEXT: psllq $50, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v2i64:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlq $14, %xmm2
; SSE41-NEXT: psrlq $4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $50, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $50, %xmm1
; SSE41-NEXT: psllq $60, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v2i64:
;
; X86-SSE2-LABEL: constant_funnnel_v2i64:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u>
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT: pand %xmm2, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
-; X86-SSE2-NEXT: pandn %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <4,u,14,u>
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pand %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
; X86-SSE2-NEXT: psllq $1, %xmm0
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: psllq %xmm3, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psllq %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm3, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE2-NEXT: orpd %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
ret <2 x i64> %res
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrld $6, %xmm2
; SSE41-NEXT: psrld $4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v4i32:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE2-NEXT: psubb %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: psubb %xmm2, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw %xmm2, %xmm3
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; SSE2-NEXT: psllw %xmm2, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm3, %xmm0
-; SSE2-NEXT: psrlw %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: psrlw %xmm1, %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: psubb %xmm1, %xmm3
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllw %xmm4, %xmm1
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psllw %xmm1, %xmm4
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT: psllw %xmm4, %xmm6
+; SSE41-NEXT: psllw %xmm1, %xmm6
; SSE41-NEXT: pshufb %xmm2, %xmm6
-; SSE41-NEXT: pand %xmm6, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT: psubb %xmm3, %xmm2
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm2, %xmm0
-; SSE41-NEXT: psrlw %xmm2, %xmm5
+; SSE41-NEXT: pand %xmm4, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT: psubb %xmm3, %xmm1
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: psrlw %xmm1, %xmm5
; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm0, %xmm5
-; SSE41-NEXT: por %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2-NEXT: pxor %xmm2, %xmm2
; X86-SSE2-NEXT: psubb %xmm1, %xmm2
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X86-SSE2-NEXT: psubb %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X86-SSE2-NEXT: psubb %xmm2, %xmm1
; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psllw %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: psllw %xmm2, %xmm3
; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; X86-SSE2-NEXT: psllw %xmm2, %xmm5
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X86-SSE2-NEXT: pand %xmm2, %xmm1
-; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT: psrlw %xmm3, %xmm0
-; X86-SSE2-NEXT: psrlw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm4
; X86-SSE2-NEXT: psrlw $8, %xmm4
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X86-SSE2-NEXT: pand %xmm0, %xmm2
-; X86-SSE2-NEXT: por %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: por %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: splatconstant_funnnel_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $4, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $28, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: psrld $4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pslld $28, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movsd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_funnnel_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrld $4, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslld $28, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $4, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pslld $28, %xmm2
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatconstant_funnnel_v2i32:
;
; X86-SSE2-LABEL: splatconstant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: psrld $4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: pslld $28, %xmm1
-; X86-SSE2-NEXT: por %xmm2, %xmm1
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X86-SSE2-NEXT: movaps %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pslld $28, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movsd %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> <i32 4, i32 4>)
ret <2 x i32> %res
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pslld $3, %xmm2
; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE2-NEXT: psllw $3, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: psubb %xmm2, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_16i8:
; SSE41-NEXT: psllw $3, %xmm2
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: psrlq $32, %xmm1
-; SSSE3-NEXT: paddq %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: paddq %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: paddq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64:
; X32-SSE-NEXT: psrlq $32, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm1
-; X32-SSE-NEXT: paddq %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: paddq %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: psrlq $32, %xmm1
-; SSSE3-NEXT: paddq %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: paddq %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64u:
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: paddq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64u:
; X32-SSE-NEXT: psrlq $32, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm1
-; X32-SSE-NEXT: paddq %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: paddq %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1)
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pshufb %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: psrlw $4, %xmm1
-; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: paddb %xmm1, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: paddb %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
-; SSSE3-NEXT: psrlw $8, %xmm2
-; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm1
-; SSSE3-NEXT: paddw %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: psrlw $8, %xmm3
+; SSSE3-NEXT: paddw %xmm1, %xmm3
; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0
; SSSE3-NEXT: psrld $16, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: psrld $16, %xmm1
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: psrld $16, %xmm3
+; SSSE3-NEXT: paddd %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pshufb %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pshufb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm3
; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: psrld $16, %xmm3
+; SSE41-NEXT: paddd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32:
; X32-SSE-NEXT: psrld $16, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm1
-; X32-SSE-NEXT: paddd %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: paddd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0)
;
; SSSE3-LABEL: testv4i32u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pshufb %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: psrlw $4, %xmm1
-; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: paddb %xmm1, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: paddb %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
-; SSSE3-NEXT: psrlw $8, %xmm2
-; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: psrlw $8, %xmm1
-; SSSE3-NEXT: paddw %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: psrlw $8, %xmm3
+; SSSE3-NEXT: paddw %xmm1, %xmm3
; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0
; SSSE3-NEXT: psrld $16, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: psrld $16, %xmm1
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: psrld $16, %xmm3
+; SSSE3-NEXT: paddd %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pshufb %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pshufb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm3
; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: psrld $16, %xmm3
+; SSE41-NEXT: paddd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32u:
; X32-SSE-NEXT: psrld $16, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm1
-; X32-SSE-NEXT: paddd %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: paddd %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1)
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pshufb %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: psrlw $4, %xmm1
-; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pshufb %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: paddb %xmm1, %xmm3
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: psrlw $8, %xmm1
-; SSSE3-NEXT: paddw %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm3
+; SSSE3-NEXT: paddw %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pshufb %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
+; SSE41-NEXT: pshufb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm3
; SSE41-NEXT: pcmpeqb %xmm4, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlw $8, %xmm1
-; X32-SSE-NEXT: paddw %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
ret <8 x i16> %out
;
; SSSE3-LABEL: testv8i16u:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pshufb %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: psrlw $4, %xmm1
-; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pshufb %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: paddb %xmm1, %xmm3
; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: psrlw $8, %xmm1
-; SSSE3-NEXT: paddw %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm3
+; SSSE3-NEXT: paddw %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16u:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pshufb %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
+; SSE41-NEXT: pshufb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm3
; SSE41-NEXT: pcmpeqb %xmm4, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16u:
; X32-SSE-NEXT: psrlw $8, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlw $8, %xmm1
-; X32-SSE-NEXT: paddw %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
ret <8 x i16> %out
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $4, %xmm1
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_17:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $4, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_17:
; X64-SSE4-SLOW: # %bb.0:
; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-SLOW-NEXT: pslld $4, %xmm1
-; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm1
-; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm0
; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_17:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: psllw $4, %xmm1
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT: paddb %xmm0, %xmm1
-; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: paddb %xmm1, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: mul_v16i8_17:
; X64-SSE-NEXT: movdqa %xmm0, %xmm1
; X64-SSE-NEXT: psllw $4, %xmm1
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE-NEXT: paddb %xmm0, %xmm1
-; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: paddb %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i8_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllq $4, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psllq $4, %xmm3
-; SSE-NEXT: paddq %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllq $4, %xmm2
+; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_17:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pslld $4, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pslld $4, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pslld $4, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v8i32_17:
; X64-SSE4-SLOW: # %bb.0:
; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm2
; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2
-; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm2
-; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm3
-; X64-SSE4-SLOW-NEXT: pslld $4, %xmm3
-; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm3
-; X64-SSE4-SLOW-NEXT: movdqa %xmm2, %xmm0
-; X64-SSE4-SLOW-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE4-SLOW-NEXT: paddd %xmm2, %xmm0
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2
+; X64-SSE4-SLOW-NEXT: paddd %xmm2, %xmm1
; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v8i32_17:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllw $4, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: paddb %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psllw $4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: paddb %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT: pand %xmm3, %xmm2
+; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllw $4, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm2
+; SSE-NEXT: paddb %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_17:
define <16 x i8> @ugt_1_v16i8(<16 x i8> %0) {
; SSE-LABEL: ugt_1_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: paddb %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddb %xmm1, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ugt_1_v16i8:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_2_v16i8:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_2_v16i8:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_3_v16i8:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_3_v16i8:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_4_v16i8:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_4_v16i8:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_5_v16i8:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_5_v16i8:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_6_v16i8:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_6_v16i8:
define <8 x i16> @ugt_1_v8i16(<8 x i16> %0) {
; SSE-LABEL: ugt_1_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddw %xmm1, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ugt_1_v8i16:
define <4 x i32> @ugt_1_v4i32(<4 x i32> %0) {
; SSE-LABEL: ugt_1_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: paddd %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddd %xmm1, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ugt_1_v4i32:
;
; SSE41-LABEL: ugt_1_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: paddq %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: paddq %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ugt_1_v2i64:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE3-NEXT: paddq %xmm0, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2]
+; SSE3-NEXT: movdqa %xmm0, %xmm3
+; SSE3-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
; SSE3-NEXT: pand %xmm3, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: paddq %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ne_1_v2i64:
; SSE3-NEXT: pand %xmm0, %xmm3
; SSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSE3-NEXT: pand %xmm4, %xmm0
+; SSE3-NEXT: pand %xmm0, %xmm4
; SSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2]
-; SSE3-NEXT: pand %xmm3, %xmm1
-; SSE3-NEXT: pxor %xmm2, %xmm1
-; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
+; SSE3-NEXT: pand %xmm3, %xmm0
+; SSE3-NEXT: pxor %xmm2, %xmm0
+; SSE3-NEXT: por %xmm4, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ne_1_v2i64:
; SSSE3-NEXT: pand %xmm0, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ne_1_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: paddq %xmm3, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddq %xmm3, %xmm4
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: ne_1_v2i64:
define <4 x i32> @ne_1_v4i32(<4 x i32> %0) {
; SSE-LABEL: ne_1_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: paddd %xmm3, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: paddd %xmm3, %xmm4
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ne_1_v4i32:
define <8 x i16> @ne_1_v8i16(<8 x i16> %0) {
; SSE-LABEL: ne_1_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpeqw %xmm1, %xmm2
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE-NEXT: pcmpeqw %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: paddw %xmm3, %xmm4
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ne_1_v8i16:
define <16 x i8> @ne_1_v16i8(<16 x i8> %0) {
; SSE-LABEL: ne_1_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpeqb %xmm1, %xmm2
; SSE-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: paddb %xmm3, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE-NEXT: pcmpeqb %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: paddb %xmm3, %xmm4
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: ne_1_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_undef:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_undef:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_undef:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_undef:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm2
-; SSE-NEXT: addsd %xmm1, %xmm2
+; SSE-NEXT: addsd %xmm2, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: addsd %xmm1, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE-NEXT: addsd %xmm0, %xmm4
-; SSE-NEXT: addsd %xmm1, %xmm4
+; SSE-NEXT: addsd %xmm4, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: addsd %xmm1, %xmm4
-; SSE-NEXT: addsd %xmm2, %xmm4
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: addsd %xmm2, %xmm4
-; SSE-NEXT: addsd %xmm3, %xmm4
+; SSE-NEXT: addsd %xmm2, %xmm0
+; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-NEXT: addsd %xmm3, %xmm4
-; SSE-NEXT: movapd %xmm4, %xmm0
+; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm8
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT: addsd %xmm8, %xmm0
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
define float @test_v3f32(<3 x float> %a0) {
; SSE2-LABEL: test_v3f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: cmpunordss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: maxss %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm3, %xmm1
-; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: maxss %xmm1, %xmm2
-; SSE2-NEXT: cmpunordss %xmm1, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: andnps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm0, %xmm1
-; SSE2-NEXT: orps %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: maxss %xmm2, %xmm1
+; SSE2-NEXT: cmpunordss %xmm2, %xmm2
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v3f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
-; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: maxss %xmm1, %xmm2
-; SSE41-NEXT: cmpunordss %xmm1, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm3
-; SSE41-NEXT: andps %xmm0, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm0, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: andps %xmm1, %xmm3
+; SSE41-NEXT: maxss %xmm0, %xmm1
+; SSE41-NEXT: andnps %xmm1, %xmm2
+; SSE41-NEXT: orps %xmm3, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: maxss %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm2
+; SSE41-NEXT: andps %xmm2, %xmm0
+; SSE41-NEXT: andnps %xmm1, %xmm2
+; SSE41-NEXT: orps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v3f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: mulss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: mulss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_zero:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: mulss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_zero:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: mulss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: mulss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: mulss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_undef:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: mulss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_undef:
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: mulss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_zero:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: mulss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm2
-; SSE-NEXT: mulsd %xmm1, %xmm2
+; SSE-NEXT: mulsd %xmm2, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: mulsd %xmm1, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE-NEXT: mulsd %xmm0, %xmm4
-; SSE-NEXT: mulsd %xmm1, %xmm4
+; SSE-NEXT: mulsd %xmm4, %xmm0
+; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: mulsd %xmm1, %xmm4
-; SSE-NEXT: mulsd %xmm2, %xmm4
+; SSE-NEXT: mulsd %xmm1, %xmm0
+; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: mulsd %xmm2, %xmm4
-; SSE-NEXT: mulsd %xmm3, %xmm4
+; SSE-NEXT: mulsd %xmm2, %xmm0
+; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-NEXT: mulsd %xmm3, %xmm4
-; SSE-NEXT: movapd %xmm4, %xmm0
+; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_one:
; SSE-LABEL: test_v16f64_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm8
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT: mulsd %xmm8, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pand %xmm10, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm1
; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pxor %xmm9, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pand %xmm10, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm8, %xmm1
; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pxor %xmm9, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllw %xmm3, %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw %xmm1, %xmm3
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: psllw %xmm3, %xmm5
+; SSE2-NEXT: psllw %xmm1, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_rotate_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw %xmm3, %xmm2
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw %xmm2, %xmm3
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE41-NEXT: psllw %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufb %xmm3, %xmm5
-; SSE41-NEXT: pand %xmm5, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; SSE41-NEXT: psubb %xmm1, %xmm3
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm2, %xmm5
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm2, %xmm5
+; SSE41-NEXT: pand %xmm3, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: psrlw %xmm1, %xmm4
; SSE41-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_rotate_v16i8:
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X86-SSE2-NEXT: psubb %xmm1, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
-; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psllw %xmm3, %xmm1
+; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: psllw %xmm1, %xmm3
; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; X86-SSE2-NEXT: psllw %xmm3, %xmm5
+; X86-SSE2-NEXT: psllw %xmm1, %xmm5
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-SSE2-NEXT: pand %xmm3, %xmm1
; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X86-SSE2-NEXT: pand %xmm0, %xmm2
-; X86-SSE2-NEXT: por %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
%splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
; SSE2-NEXT: psraw $2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: andps %xmm0, %xmm1
; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: andnps %xmm2, %xmm0
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; X86-SSE-NEXT: psraw $2, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-SSE-NEXT: movaps %xmm2, %xmm0
-; X86-SSE-NEXT: andps %xmm1, %xmm0
+; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; X86-SSE-NEXT: movaps %xmm2, %xmm1
+; X86-SSE-NEXT: andps %xmm0, %xmm1
; X86-SSE-NEXT: psraw $1, %xmm2
-; X86-SSE-NEXT: andnps %xmm2, %xmm1
+; X86-SSE-NEXT: andnps %xmm2, %xmm0
; X86-SSE-NEXT: orps %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
; SSE2-NEXT: psraw $2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm0, %xmm2
; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm0
; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; X86-SSE-NEXT: psraw $2, %xmm1
; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
-; X86-SSE-NEXT: movaps %xmm1, %xmm0
-; X86-SSE-NEXT: andps %xmm2, %xmm0
+; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535]
+; X86-SSE-NEXT: movaps %xmm1, %xmm2
+; X86-SSE-NEXT: andps %xmm0, %xmm2
; X86-SSE-NEXT: psraw $1, %xmm1
-; X86-SSE-NEXT: andnps %xmm1, %xmm2
+; X86-SSE-NEXT: andnps %xmm1, %xmm0
; X86-SSE-NEXT: orps %xmm2, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE2-NEXT: addps %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR22390:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSSE3-NEXT: addps %xmm0, %xmm2
-; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: addps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR22390:
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm8
+; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm5, %xmm0
; SSE-NEXT: packuswb %xmm6, %xmm0
; SSE-NEXT: retq
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm3, %xmm0
; SSE-NEXT: retq
;
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64u:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64u:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8u:
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8u:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: psrlw $1, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pmaxub %xmm0, %xmm1
-; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pmaxub %xmm0, %xmm2
+; SSE-NEXT: pcmpeqb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uge_v16i8:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: psrlw $1, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pminub %xmm0, %xmm1
-; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pminub %xmm0, %xmm2
+; SSE-NEXT: pcmpeqb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ule_v16i8:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test3:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test4:
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test19:
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test20:
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test27:
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test28:
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test43:
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test44:
define <8 x i32> @test47(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test47:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test47:
define <8 x i32> @test48(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test48:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test48:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test49:
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test50:
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test65:
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: test66:
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test73:
; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test74:
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test89:
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test90:
define <8 x i32> @test93(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test93:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test93:
define <8 x i32> @test94(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test94:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE4-LABEL: test94:
; SSE2-NEXT: pcmpgtb %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test99:
; SSE2-NEXT: pcmpgtb %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test100:
; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test115:
; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test116:
define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test119:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm8
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm9
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test119:
define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test120:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm8
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm9
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test120:
; SSE2-NEXT: pcmpgtb %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test129:
; SSE2-NEXT: pcmpgtb %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test130:
; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test145:
; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: pandn %xmm4, %xmm8
-; SSE2-NEXT: por %xmm0, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test146:
define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test149:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm8
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm9
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test149:
define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test150:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm10
; SSE2-NEXT: por %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm8
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm9
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test150:
; SSE2-LABEL: vsel_nonzero_constants:
; SSE2: # %bb.0:
; SSE2-NEXT: cmplepd %xmm0, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movapd %xmm1, %xmm2
-; SSE2-NEXT: andnpd %xmm0, %xmm2
-; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: orpd %xmm2, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: orpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vsel_nonzero_constants:
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %eax, %xmm0
-; CHECK-NEXT: addss %xmm0, %xmm1
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
; CHECK-LABEL: shr2_nosplat:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrlq $8, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $1, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: xorps %xmm2, %xmm1
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: psrlq $8, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psrlq $1, %xmm2
+; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; CHECK-NEXT: xorpd %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%B = lshr <2 x i64> %A, < i64 8, i64 1>